├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .travis.yml ├── .travis └── test-coverage.sh ├── README.md ├── asm ├── bench_gen.sh ├── c128 │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── bench_test.go │ ├── doc.go │ ├── dotc.go │ ├── dotu.go │ ├── scal.go │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go ├── c64 │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── bench_test.go │ ├── conj.go │ ├── doc.go │ ├── dotc.go │ ├── dotu.go │ ├── scal.go │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go ├── f32 │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── bench_test.go │ ├── ddot.go │ ├── doc.go │ ├── dot.go │ ├── scal.go │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go └── f64 │ ├── abssum_amd64.s │ ├── abssuminc_amd64.s │ ├── add_amd64.s │ ├── addconst_amd64.s │ ├── asm_test.go │ ├── axpy.go │ ├── axpy_test.go │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── benchAxpy_test.go │ ├── benchScal_test.go │ ├── bench_other_test.go │ ├── bench_test.go │ ├── cumprod_amd64.s │ ├── cumsum_amd64.s │ ├── div_amd64.s │ ├── divto_amd64.s │ ├── doc.go │ ├── dot.go │ ├── dot_amd64.s │ ├── dot_test.go │ ├── l1norm_amd64.s │ ├── linfnorm_amd64.s │ ├── scal.go │ ├── scal_test.go │ ├── scalinc_amd64.s │ ├── scalincto_amd64.s │ ├── scalunitary_amd64.s │ ├── scalunitaryto_amd64.s │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go └── binding └── binding.go /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### This repository is no longer actively maintained. 2 | 3 | Development of the packages in this repository has moved to https://github.com/gonum/gonum. 4 | Please file issues [there](https://github.com/gonum/gonum/issues) after having checked that your issue has not been fixed. 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### This repository is no longer actively maintained. 2 | 3 | Development of the packages in this repository has moved to https://github.com/gonum/gonum. 4 | Please send pull requests [there](https://github.com/gonum/gonum/pulls) after having checked that your addition has not already been made. 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | language: go 4 | 5 | # Versions of go that are explicitly supported by gonum. 6 | go: 7 | - 1.5.4 8 | - 1.6.3 9 | - 1.7.3 10 | 11 | # Required for coverage, testing and generate. 12 | before_install: 13 | - go get golang.org/x/tools/cmd/cover 14 | - go get golang.org/x/tools/cmd/goimports 15 | - go get github.com/mattn/goveralls 16 | - go get github.com/klauspost/asmfmt/cmd/asmfmt 17 | 18 | # Get deps, build, test, and ensure the code is gofmt'ed. 19 | # If we are building as gonum, then we have access to the coveralls api key, so we can run coverage as well. 20 | script: 21 | - go get -d -t -v ./... 22 | - go build -v -x ./... 23 | - go test -v -x -a ./... 24 | - go test -v -x -a -tags noasm ./... 25 | - go test -v -x -a -tags appengine ./... 
26 | - test -z "$(gofmt -d .)" 27 | - diff <(asmfmt -d .) <("") 28 | - if [[ $TRAVIS_SECURE_ENV_VARS = "true" ]]; then bash ./.travis/test-coverage.sh; fi 29 | -------------------------------------------------------------------------------- /.travis/test-coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROFILE_OUT=$PWD/profile.out 4 | ACC_OUT=$PWD/acc.out 5 | 6 | testCover() { 7 | # set the return value to 0 (successful) 8 | retval=0 9 | # get the directory to check from the parameter. Default to '.' 10 | d=${1:-.} 11 | # skip if there are no Go files here 12 | ls $d/*.go &> /dev/null || return $retval 13 | # switch to the directory to check 14 | pushd $d > /dev/null 15 | # create the coverage profile 16 | coverageresult=`go test -v -coverprofile=$PROFILE_OUT -tags noasm` 17 | # output the result so we can check the shell output 18 | echo ${coverageresult} 19 | # append the results to acc.out if coverage didn't fail, else set the retval to 1 (failed) 20 | ( [[ ${coverageresult} == *FAIL* ]] && retval=1 ) || ( [ -f $PROFILE_OUT ] && grep -v "mode: set" $PROFILE_OUT >> $ACC_OUT ) 21 | # return to our working dir 22 | popd > /dev/null 23 | # return our return value 24 | return $retval 25 | } 26 | 27 | # Init acc.out 28 | echo "mode: set" > $ACC_OUT 29 | 30 | # Run test coverage on all directories containing go files 31 | find . -maxdepth 10 -type d | while read d; do testCover $d || exit; done 32 | 33 | # Upload the coverage profile to coveralls.io 34 | [ -n "$COVERALLS_TOKEN" ] && goveralls -coverprofile=$ACC_OUT -service=travis-ci -repotoken $COVERALLS_TOKEN 35 | 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gonum Internal [![Build Status](https://travis-ci.org/gonum/internal.svg?branch=master)](https://travis-ci.org/gonum/internal) [![Coverage Status](https://coveralls.io/repos/gonum/internal/badge.svg?branch=master&service=github)](https://coveralls.io/github/gonum/internal?branch=master) [![GoDoc](https://godoc.org/github.com/gonum/internal?status.svg)](https://godoc.org/github.com/gonum/internal) 2 | 3 | # This repository is no longer maintained. Development has moved to https://github.com/gonum/gonum. 4 | 5 | This is the set of internal packages for the Gonum project. 6 | 7 | ## Issues 8 | 9 | If you find any bugs, feel free to file an issue on the GitHub [issue tracker for gonum/gonum](https://github.com/gonum/gonum/issues) if the bug exists in that repository; no code changes will be made to this repository. Other discussions should be taken to the gonum-dev Google Group. 10 | 11 | https://groups.google.com/forum/#!forum/gonum-dev 12 | 13 | ## License 14 | 15 | Please see [github.com/gonum/license](https://github.com/gonum/license) for general license information, contributors, authors, etc. on the Gonum suite of packages. 16 | -------------------------------------------------------------------------------- /asm/bench_gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright ©2016 The gonum Authors. All rights reserved. 4 | # Use of this source code is governed by a BSD-style 5 | # license that can be found in the LICENSE file.
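# The pipelines below derive the f32, f64, and c128 benchmark files from the
# hand-written c64 one. Each `gofmt -r 'pattern -> replacement'` stage applies
# a purely syntactic rewrite to the Go source flowing through the pipe, and
# the sed stages rename the C64/c64 identifiers for the target package. A
# minimal sketch of one such rewrite (demo.go is a hypothetical input, not a
# file in this repository):
#
#   $ cat demo.go
#   package demo
#
#   var v complex64 = 1 + 1i
#
#   $ gofmt -r 'complex64 -> float32' demo.go | gofmt -r '1 + 1i -> 1'
#   package demo
#
#   var v float32 = 1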
6 | 7 | cat c64/bench_test.go \ 8 | | gofmt -r 'complex(float32(n), float32(n)) -> float32(n)' \ 9 | | gofmt -r 'complex64 -> float32' \ 10 | | gofmt -r '1 + 1i -> 1' \ 11 | | gofmt -r '2 + 2i -> 2' \ 12 | | sed 's/C64/F32/g' \ 13 | | sed 's/c64/f32/g' \ 14 | > f32/bench_test.go 15 | 16 | cat c64/bench_test.go \ 17 | | gofmt -r 'complex(float32(n), float32(n)) -> float64(n)' \ 18 | | gofmt -r 'complex64 -> float64' \ 19 | | gofmt -r '1 + 1i -> 1' \ 20 | | gofmt -r '2 + 2i -> 2' \ 21 | | sed 's/C64/F64/g' \ 22 | | sed 's/c64/f64/g' \ 23 | > f64/bench_test.go 24 | 25 | cat c64/bench_test.go \ 26 | | gofmt -r 'float32 -> float64' \ 27 | | gofmt -r 'complex64 -> complex128' \ 28 | | sed 's/C64/C128/g' \ 29 | | sed 's/c64/c128/g' \ 30 | > c128/bench_test.go 31 | -------------------------------------------------------------------------------- /asm/c128/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 28 | TEXT ·AxpyInc(SB), NOSPLIT, $0 29 | MOVQ x_base+16(FP), SI // SI = &x 30 | MOVQ y_base+40(FP), DI // DI = &y 31 | MOVQ n+64(FP), CX // CX = n 32 | CMPQ CX, $0 // if n==0 { return } 33 | JE axpyi_end 34 | MOVQ ix+88(FP), R8 // R8 = ix // Load the first index 35 | SHLQ $4, R8 // R8 *= sizeof(complex128) 36 | MOVQ iy+96(FP), R9 // R9 = iy 37 | SHLQ $4, R9 // R9 *= sizeof(complex128) 38 | LEAQ (SI)(R8*1), SI // SI = &(x[ix]) 39 | LEAQ (DI)(R9*1), DI // DI = &(y[iy]) 40 | MOVQ DI, DX // DX = DI // Separate Read/Write pointers 41 | MOVQ incX+72(FP), R8 // R8 = incX 42 | SHLQ $4, R8 // R8 *= sizeof(complex128) 43 | MOVQ incY+80(FP), R9 // R9 = iy 44 | SHLQ $4, R9 // R9 *= sizeof(complex128) 45 | MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) } 46 | MOVAPS X0, X1 47 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 48 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 49 | MOVAPS X1, X11 50 | MOVQ CX, BX 51 | ANDQ $3, CX // CX = n % 4 52 | SHRQ $2, BX // BX = floor( n / 4 ) 53 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 54 | 55 | axpyi_loop: // do { 56 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 57 | MOVUPS (SI)(R8*1), X4 58 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 59 | MOVUPS (SI), X6 60 | MOVUPS (SI)(R8*1), X8 61 | 62 | // X_(i+1) = { real(x[i], real(x[i]) } 63 | MOVDDUP_X2_X3 64 | MOVDDUP_X4_X5 65 | MOVDDUP_X6_X7 66 | MOVDDUP_X8_X9 67 | 68 | // X_i = { imag(x[i]), imag(x[i]) } 69 | SHUFPD $0x3, X2, X2 
70 | SHUFPD $0x3, X4, X4 71 | SHUFPD $0x3, X6, X6 72 | SHUFPD $0x3, X8, X8 73 | 74 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 75 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 76 | MULPD X1, X2 77 | MULPD X0, X3 78 | MULPD X11, X4 79 | MULPD X10, X5 80 | MULPD X1, X6 81 | MULPD X0, X7 82 | MULPD X11, X8 83 | MULPD X10, X9 84 | 85 | // X_(i+1) = { 86 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 87 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 88 | // } 89 | ADDSUBPD_X2_X3 90 | ADDSUBPD_X4_X5 91 | ADDSUBPD_X6_X7 92 | ADDSUBPD_X8_X9 93 | 94 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 95 | ADDPD (DX), X3 96 | ADDPD (DX)(R9*1), X5 97 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 98 | ADDPD (DX), X7 99 | ADDPD (DX)(R9*1), X9 100 | MOVUPS X3, (DI) // dst[i] = X_(i+1) 101 | MOVUPS X5, (DI)(R9*1) 102 | LEAQ (DI)(R9*2), DI 103 | MOVUPS X7, (DI) 104 | MOVUPS X9, (DI)(R9*1) 105 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 106 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 107 | LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) 108 | DECQ BX 109 | JNZ axpyi_loop // } while --BX > 0 110 | CMPQ CX, $0 // if CX == 0 { return } 111 | JE axpyi_end 112 | 113 | axpyi_tail: // do { 114 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 115 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 116 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 117 | MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 118 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 119 | 120 | // X_(i+1) = { 121 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 122 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 123 | // } 124 | ADDSUBPD_X2_X3 125 | 126 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 127 | ADDPD (DI), X3 128 | MOVUPS X3, (DI) // y[i] = X_i 129 | ADDQ R8, SI // SI = &(SI[incX]) 130 | ADDQ R9, DI // DI = &(DI[incY]) 131 | LOOP axpyi_tail // } while --CX > 0 132 | 133 | axpyi_end: 134 | RET 135 | -------------------------------------------------------------------------------- /asm/c128/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
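// The Go reference semantics for the kernels in this file and in
// axpyinc_amd64.s (documented in stubs_amd64.go) are, for AxpyIncTo:
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
//
// Each complex product is formed with the SSE3 MOVDDUP/ADDSUBPD idiom:
// MOVDDUP broadcasts real(x[i]) into both qword lanes, SHUFPD $0x3 broadcasts
// imag(x[i]), and after the four MULPDs, ADDSUBPD subtracts in the low lane
// and adds in the high lane, yielding
//
//	real(result) = real(a)*real(x[i]) - imag(a)*imag(x[i])
//	imag(result) = imag(a)*real(x[i]) + real(a)*imag(x[i])
//
// which is ordinary complex multiplication.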
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 28 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 29 | MOVQ dst_base+0(FP), DI // DI = &dst 30 | MOVQ x_base+56(FP), SI // SI = &x 31 | MOVQ y_base+80(FP), DX // DX = &y 32 | MOVQ n+104(FP), CX // CX = n 33 | CMPQ CX, $0 // if n==0 { return } 34 | JE axpyi_end 35 | MOVQ ix+128(FP), R8 // R8 = ix // Load the first index 36 | SHLQ $4, R8 // R8 *= sizeof(complex128) 37 | MOVQ iy+136(FP), R9 // R9 = iy 38 | SHLQ $4, R9 // R9 *= sizeof(complex128) 39 | MOVQ idst+32(FP), R10 // R10 = idst 40 | SHLQ $4, R10 // R10 *= sizeof(complex128) 41 | LEAQ (SI)(R8*1), SI // SI = &(x[ix]) 42 | LEAQ (DX)(R9*1), DX // DX = &(y[iy]) 43 | LEAQ (DI)(R10*1), DI // DI = &(dst[idst]) 44 | MOVQ incX+112(FP), R8 // R8 = incX 45 | SHLQ $4, R8 // R8 *= sizeof(complex128) 46 | MOVQ incY+120(FP), R9 // R9 = incY 47 | SHLQ $4, R9 // R9 *= sizeof(complex128) 48 | MOVQ incDst+24(FP), R10 // R10 = incDst 49 | SHLQ $4, R10 // R10 *= sizeof(complex128) 50 | MOVUPS alpha+40(FP), X0 // X0 = { imag(a), real(a) } 51 | MOVAPS X0, X1 52 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 53 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 54 | MOVAPS X1, X11 55 | MOVQ CX, BX 56 | ANDQ $3, CX // CX = n % 4 57 | SHRQ $2, BX // BX = floor( n / 4 ) 58 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 59 | 60 | axpyi_loop: // do { 61 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 62 | MOVUPS (SI)(R8*1), X4 63 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 64 | 65 | MOVUPS (SI), X6 66 | MOVUPS (SI)(R8*1), X8 67 | 68 | // X_(i+1) = { real(x[i], real(x[i]) } 69 | MOVDDUP_X2_X3 70 | MOVDDUP_X4_X5 71 | MOVDDUP_X6_X7 72 | MOVDDUP_X8_X9 73 | 74 | // X_i = { imag(x[i]), imag(x[i]) } 75 | SHUFPD $0x3, X2, X2 76 | SHUFPD $0x3, X4, X4 77 | SHUFPD $0x3, X6, X6 78 | SHUFPD $0x3, X8, X8 79 | 80 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 81 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 82 | MULPD X1, X2 83 | MULPD X0, X3 84 | MULPD X11, X4 85 | MULPD X10, X5 86 | MULPD X1, X6 87 | MULPD X0, X7 88 | MULPD X11, X8 89 | MULPD X10, X9 90 | 91 | // X_(i+1) = { 92 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 93 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 94 | // } 95 | ADDSUBPD_X2_X3 96 | ADDSUBPD_X4_X5 97 | ADDSUBPD_X6_X7 98 | ADDSUBPD_X8_X9 99 | 100 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 101 | ADDPD (DX), X3 102 | ADDPD (DX)(R9*1), X5 103 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 104 | ADDPD (DX), X7 105 | ADDPD (DX)(R9*1), X9 106 | MOVUPS X3, (DI) // dst[i] = 
X_(i+1) 107 | MOVUPS X5, (DI)(R10*1) 108 | LEAQ (DI)(R10*2), DI 109 | MOVUPS X7, (DI) 110 | MOVUPS X9, (DI)(R10*1) 111 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 112 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 113 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) 114 | DECQ BX 115 | JNZ axpyi_loop // } while --BX > 0 116 | CMPQ CX, $0 // if CX == 0 { return } 117 | JE axpyi_end 118 | 119 | axpyi_tail: // do { 120 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 121 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 122 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 123 | MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 124 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 125 | 126 | // X_(i+1) = { 127 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 128 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 129 | // } 130 | ADDSUBPD_X2_X3 131 | 132 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 133 | ADDPD (DX), X3 134 | MOVUPS X3, (DI) // y[i] X_(i+1) 135 | ADDQ R8, SI // SI += incX 136 | ADDQ R9, DX // DX += incY 137 | ADDQ R10, DI // DI += incDst 138 | LOOP axpyi_tail // } while --CX > 0 139 | 140 | axpyi_end: 141 | RET 142 | -------------------------------------------------------------------------------- /asm/c128/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyUnitary(alpha complex128, x, y []complex128) 28 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 29 | MOVQ x_base+16(FP), SI // SI = &x 30 | MOVQ y_base+40(FP), DI // DI = &y 31 | MOVQ x_len+24(FP), CX // CX = min( len(x), len(y) ) 32 | CMPQ y_len+48(FP), CX 33 | CMOVQLE y_len+48(FP), CX 34 | CMPQ CX, $0 // if CX == 0 { return } 35 | JE caxy_end 36 | PXOR X0, X0 // Clear work registers and cache-align loop 37 | PXOR X1, X1 38 | MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) } 39 | MOVAPS X0, X1 40 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 41 | XORQ AX, AX // i = 0 42 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 43 | MOVAPS X1, X11 44 | MOVQ CX, BX 45 | ANDQ $3, CX // CX = n % 4 46 | SHRQ $2, BX // BX = floor( n / 4 ) 47 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 48 | 49 | caxy_loop: // do { 50 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 51 | MOVUPS 16(SI)(AX*8), X4 52 | MOVUPS 32(SI)(AX*8), X6 53 | MOVUPS 48(SI)(AX*8), X8 54 | 55 | // X_(i+1) = { real(x[i], real(x[i]) } 56 | 
MOVDDUP_X2_X3 57 | MOVDDUP_X4_X5 58 | MOVDDUP_X6_X7 59 | MOVDDUP_X8_X9 60 | 61 | // X_i = { imag(x[i]), imag(x[i]) } 62 | SHUFPD $0x3, X2, X2 63 | SHUFPD $0x3, X4, X4 64 | SHUFPD $0x3, X6, X6 65 | SHUFPD $0x3, X8, X8 66 | 67 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 68 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 69 | MULPD X1, X2 70 | MULPD X0, X3 71 | MULPD X11, X4 72 | MULPD X10, X5 73 | MULPD X1, X6 74 | MULPD X0, X7 75 | MULPD X11, X8 76 | MULPD X10, X9 77 | 78 | // X_(i+1) = { 79 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 80 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 81 | // } 82 | ADDSUBPD_X2_X3 83 | ADDSUBPD_X4_X5 84 | ADDSUBPD_X6_X7 85 | ADDSUBPD_X8_X9 86 | 87 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 88 | ADDPD (DI)(AX*8), X3 89 | ADDPD 16(DI)(AX*8), X5 90 | ADDPD 32(DI)(AX*8), X7 91 | ADDPD 48(DI)(AX*8), X9 92 | MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 93 | MOVUPS X5, 16(DI)(AX*8) 94 | MOVUPS X7, 32(DI)(AX*8) 95 | MOVUPS X9, 48(DI)(AX*8) 96 | ADDQ $8, AX // i += 8 97 | DECQ BX 98 | JNZ caxy_loop // } while --BX > 0 99 | CMPQ CX, $0 // if CX == 0 { return } 100 | JE caxy_end 101 | 102 | caxy_tail: // do { 103 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 104 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 105 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 106 | MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 107 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 108 | 109 | // X_(i+1) = { 110 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 111 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 112 | // } 113 | ADDSUBPD_X2_X3 114 | 115 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 116 | ADDPD (DI)(AX*8), X3 117 | MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 118 | ADDQ $2, AX // i += 2 119 | LOOP caxy_tail // } while --CX > 0 120 | 121 | caxy_end: 122 | RET 123 | -------------------------------------------------------------------------------- /asm/c128/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
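// Control flow note: like AxpyUnitary above, AxpyUnitaryTo processes four
// complex128 values per loop iteration and finishes the remainder one value
// at a time. A rough Go equivalent of the structure (illustrative sketch
// only, not the generated code):
//
//	n := len(x)
//	if len(y) < n {
//		n = len(y)
//	}
//	if len(dst) < n {
//		n = len(dst)
//	}
//	i := 0
//	for ; i < n-n%4; i += 4 {
//		// caxy_loop: four independent multiply/add chains
//		// in X2..X9 keep the SSE units busy
//	}
//	for ; i < n; i++ { // caxy_tail
//		dst[i] = alpha*x[i] + y[i]
//	}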
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) 28 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 29 | MOVQ dst_base+0(FP), DI // DI = &dst 30 | MOVQ x_base+40(FP), SI // SI = &x 31 | MOVQ y_base+64(FP), DX // DX = &y 32 | MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) ) 33 | CMPQ y_len+72(FP), CX 34 | CMOVQLE y_len+72(FP), CX 35 | CMPQ dst_len+8(FP), CX 36 | CMOVQLE dst_len+8(FP), CX 37 | CMPQ CX, $0 // if CX == 0 { return } 38 | JE caxy_end 39 | MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) } 40 | MOVAPS X0, X1 41 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 42 | XORQ AX, AX // i = 0 43 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 44 | MOVAPS X1, X11 45 | MOVQ CX, BX 46 | ANDQ $3, CX // CX = n % 4 47 | SHRQ $2, BX // BX = floor( n / 4 ) 48 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 49 | 50 | caxy_loop: // do { 51 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 52 | MOVUPS 16(SI)(AX*8), X4 53 | MOVUPS 32(SI)(AX*8), X6 54 | MOVUPS 48(SI)(AX*8), X8 55 | 56 | // X_(i+1) = { real(x[i]), real(x[i]) } 57 | MOVDDUP_X2_X3 // Load and duplicate real elements (xr, xr) 58 | MOVDDUP_X4_X5 59 | MOVDDUP_X6_X7 60 | MOVDDUP_X8_X9 61 | 62 | // X_i = { imag(x[i]), imag(x[i]) } 63 | SHUFPD $0x3, X2, X2 // duplicate imag elements (xi, xi) 64 | SHUFPD $0x3, X4, X4 65 | SHUFPD $0x3, X6, X6 66 | SHUFPD $0x3, X8, X8 67 | 68 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 69 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 70 | MULPD X1, X2 71 | MULPD X0, X3 72 | MULPD X11, X4 73 | MULPD X10, X5 74 | MULPD X1, X6 75 | MULPD X0, X7 76 | MULPD X11, X8 77 | MULPD X10, X9 78 | 79 | // X_(i+1) = { 80 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 81 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 82 | // } 83 | ADDSUBPD_X2_X3 84 | ADDSUBPD_X4_X5 85 | ADDSUBPD_X6_X7 86 | ADDSUBPD_X8_X9 87 | 88 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 89 | ADDPD (DX)(AX*8), X3 90 | ADDPD 16(DX)(AX*8), X5 91 | ADDPD 32(DX)(AX*8), X7 92 | ADDPD 48(DX)(AX*8), X9 93 | MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1) 94 | MOVUPS X5, 16(DI)(AX*8) 95 | MOVUPS X7, 32(DI)(AX*8) 96 | MOVUPS X9, 48(DI)(AX*8) 97 | ADDQ $8, AX // i += 8 98 | DECQ BX 99 | JNZ caxy_loop // } while --BX > 0 100 | CMPQ CX, $0 // if CX == 0 { return } 101 | JE caxy_end 102 | 103 | caxy_tail: // Same calculation, but read in values to avoid trampling memory 104 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 105 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) } 106 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 107 | MULPD X1,
X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 108 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 109 | 110 | // X_(i+1) = { 111 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 112 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 113 | // } 114 | ADDSUBPD_X2_X3 115 | 116 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 117 | ADDPD (DX)(AX*8), X3 118 | MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 119 | ADDQ $2, AX // i += 2 120 | LOOP caxy_tail // } while --CX > 0 121 | 122 | caxy_end: 123 | RET 124 | -------------------------------------------------------------------------------- /asm/c128/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package c128 provides complex128 vector primitives. 9 | package c128 10 | -------------------------------------------------------------------------------- /asm/c128/dotc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | import "math/cmplx" 8 | 9 | // DotcUnitary is 10 | // for i, v := range x { 11 | // sum += y[i] * cmplx.Conj(v) 12 | // } 13 | // return sum 14 | func DotcUnitary(x, y []complex128) (sum complex128) { 15 | for i, v := range x { 16 | sum += y[i] * cmplx.Conj(v) 17 | } 18 | return sum 19 | } 20 | 21 | // DotcInc is 22 | // for i := 0; i < int(n); i++ { 23 | // sum += y[iy] * cmplx.Conj(x[ix]) 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | // return sum 28 | func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) { 29 | for i := 0; i < int(n); i++ { 30 | sum += y[iy] * cmplx.Conj(x[ix]) 31 | ix += incX 32 | iy += incY 33 | } 34 | return sum 35 | } 36 | -------------------------------------------------------------------------------- /asm/c128/dotu.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | // DotuUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * v 10 | // } 11 | // return sum 12 | func DotuUnitary(x, y []complex128) (sum complex128) { 13 | for i, v := range x { 14 | sum += y[i] * v 15 | } 16 | return sum 17 | } 18 | 19 | // DotuInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * x[ix] 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * x[ix] 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/c128/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | // ScalUnitary is 8 | // for i := range x { 9 | // x[i] *= alpha 10 | // } 11 | func ScalUnitary(alpha complex128, x []complex128) { 12 | for i := range x { 13 | x[i] *= alpha 14 | } 15 | } 16 | 17 | // ScalUnitaryTo is 18 | // for i, v := range x { 19 | // dst[i] = alpha * v 20 | // } 21 | func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) { 22 | for i, v := range x { 23 | dst[i] = alpha * v 24 | } 25 | } 26 | 27 | // ScalInc is 28 | // var ix uintptr 29 | // for i := 0; i < int(n); i++ { 30 | // x[ix] *= alpha 31 | // ix += incX 32 | // } 33 | func ScalInc(alpha complex128, x []complex128, n, incX uintptr) { 34 | var ix uintptr 35 | for i := 0; i < int(n); i++ { 36 | x[ix] *= alpha 37 | ix += incX 38 | } 39 | } 40 | 41 | // ScalIncTo is 42 | // var idst, ix uintptr 43 | // for i := 0; i < int(n); i++ { 44 | // dst[idst] = alpha * x[ix] 45 | // ix += incX 46 | // idst += incDst 47 | // } 48 | func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) { 49 | var idst, ix uintptr 50 | for i := 0; i < int(n); i++ { 51 | dst[idst] = alpha * x[ix] 52 | ix += incX 53 | idst += incDst 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /asm/c128/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package c128 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex128, x, y []complex128) 14 | 15 | // AxpyUnitaryTo is 16 | // for i, v := range x { 17 | // dst[i] = alpha*v + y[i] 18 | // } 19 | func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) 20 | 21 | // AxpyInc is 22 | // for i := 0; i < int(n); i++ { 23 | // y[iy] += alpha * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 28 | 29 | // AxpyIncTo is 30 | // for i := 0; i < int(n); i++ { 31 | // dst[idst] = alpha*x[ix] + y[iy] 32 | // ix += incX 33 | // iy += incY 34 | // idst += incDst 35 | // } 36 | func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 37 | -------------------------------------------------------------------------------- /asm/c128/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
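// Build-tag note: terms separated by spaces on a +build line are ORed, so
// this fallback file is compiled when the target is not amd64, or when the
// noasm or appengine tag is set. stubs_amd64.go carries the complementary
// constraint (amd64 via its filename suffix, plus !noasm,!appengine, where
// the comma means AND), so exactly one implementation of each function is
// ever built.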
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package c128 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex128, x, y []complex128) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/c128/stubs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | import "testing" 8 | 9 | var tests = []struct { 10 | incX, incY, incDst int 11 | ix, iy, idst uintptr 12 | a complex128 13 | dst, x, y []complex128 14 | ex []complex128 15 | }{ 16 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 17 | a: 1 + 1i, 18 | dst: []complex128{5}, 19 | x: []complex128{1}, 20 | y: []complex128{1i}, 21 | ex: []complex128{1 + 2i}}, 22 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 23 | a: 1 + 2i, 24 | dst: []complex128{0, 0, 0}, 25 | x: []complex128{0, 0, 0}, 26 | y: []complex128{1, 1, 1}, 27 | ex: []complex128{1, 1, 1}}, 28 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 29 | a: 1 + 2i, 30 | dst: []complex128{0, 0, 0}, 31 | x: []complex128{0, 0}, 32 | y: []complex128{1, 1, 1}, 33 | ex: []complex128{1, 1}}, 34 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 35 | a: 1 + 2i, 36 | dst: []complex128{1i, 1i, 1i}, 37 | x: []complex128{1i, 1i, 1i}, 38 | y: []complex128{1, 2, 1}, 39 | ex: []complex128{-1 + 1i, 1i, -1 + 1i}}, 40 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 41 | a: -1i, 42 | dst: []complex128{1i, 1i, 1i}, 43 | x: []complex128{1i, 1i, 1i}, 44 | y: []complex128{1, 2, 1}, 45 | ex: []complex128{2, 3, 2}}, 46 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 47 | a: -1i, 48 | dst: []complex128{1i, 1i, 1i}, 49 | x: []complex128{1i, 1i, 1i, 1i, 1i}[1:4], 50 | y: []complex128{1, 1, 2, 1, 1}[1:4], 51 | ex: []complex128{2, 3, 2}}, 52 | {incX: 2, incY: 4, incDst: 3, ix: 0, iy: 0, idst: 0, 53 | a: -2, 54 | dst: []complex128{1i, 1i, 1i, 1i, 1i}, 55 | x: []complex128{2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i}, 56 | y: []complex128{1, 1, 2, 1, 1}, 57 | ex: []complex128{-3 - 2i, -3 - 2i, -2 - 2i, -3 - 2i, -3 - 2i}}, 58 | // Run big test twice, once aligned once unaligned. 
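	// The tests wrap each slice in guard regions whose lengths vary with the
	// case index (xg_ln, yg_ln = 4+cas%2, 4+cas%3 below), shifting the slice
	// bases so the ten-element cases are presented to the kernels at
	// different data alignments, exercising both the unrolled loop body and
	// its scalar tail.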
59 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 60 | a: 1 - 1i, 61 | dst: make([]complex128, 10), 62 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 63 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 64 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 65 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 66 | a: 1 - 1i, 67 | dst: make([]complex128, 10), 68 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 69 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 70 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 71 | {incX: -2, incY: -2, incDst: -3, ix: 18, iy: 18, idst: 27, 72 | a: 1 - 1i, 73 | dst: make([]complex128, 10), 74 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 75 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 76 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 77 | {incX: -2, incY: 2, incDst: -3, ix: 18, iy: 0, idst: 27, 78 | a: 1 - 1i, 79 | dst: make([]complex128, 10), 80 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 81 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 82 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 83 | } 84 | 85 | func guardVector(vec []complex128, guard_val complex128, guard_len int) (guarded []complex128) { 86 | guarded = make([]complex128, len(vec)+guard_len*2) 87 | copy(guarded[guard_len:], vec) 88 | for i := 0; i < guard_len; i++ { 89 | guarded[i] = guard_val 90 | guarded[len(guarded)-1-i] = guard_val 91 | } 92 | return guarded 93 | } 94 | 95 | func isValidGuard(vec []complex128, guard_val complex128, guard_len int) bool { 96 | for i := 0; i < guard_len; i++ { 97 | if vec[i] != guard_val || vec[len(vec)-1-i] != guard_val { 98 | return false 99 | } 100 | } 101 | return true 102 | } 103 | 104 | func TestAxpyUnitary(t *testing.T) { 105 | var x_gd, y_gd complex128 = 1, 1 106 | for cas, test := range tests { 107 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 108 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 109 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 110 | AxpyUnitary(test.a, x, y) 111 | for i := range test.ex { 112 | if y[i] != test.ex[i] { 113 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i]) 114 | } 115 | } 116 | if !isValidGuard(test.x, x_gd, xg_ln) { 117 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 118 | } 119 | if !isValidGuard(test.y, y_gd, yg_ln) { 120 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 121 | } 122 | } 123 | } 124 | 125 | func TestAxpyUnitaryTo(t *testing.T) { 126 | var x_gd, y_gd, dst_gd complex128 = 1, 1, 0 127 | for cas, test := range tests { 128 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 129 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 130 | test.dst = guardVector(test.dst, dst_gd, xg_ln) 131 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 132 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 133 | AxpyUnitaryTo(dst, test.a, x, y) 134 | for i := range test.ex { 135 | if dst[i] != test.ex[i] { 136 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i]) 137 | } 138 | } 139 | if !isValidGuard(test.x, x_gd, xg_ln) { 140 | t.Errorf("Test %d Guard violated in x 
vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 141 | } 142 | if !isValidGuard(test.y, y_gd, yg_ln) { 143 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 144 | } 145 | if !isValidGuard(test.dst, dst_gd, xg_ln) { 146 | t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:]) 147 | } 148 | 149 | } 150 | } 151 | 152 | func guardIncVector(vec []complex128, guard_val complex128, incV uintptr, guard_len int) (guarded []complex128) { 153 | inc := int(incV) 154 | s_ln := len(vec) * inc 155 | if inc < 0 { 156 | s_ln = len(vec) * -inc 157 | } 158 | guarded = make([]complex128, s_ln+guard_len*2) 159 | for i, cas := 0, 0; i < len(guarded); i++ { 160 | switch { 161 | case i < guard_len, i > guard_len+s_ln: 162 | guarded[i] = guard_val 163 | case (i-guard_len)%(inc) == 0 && cas < len(vec): 164 | guarded[i] = vec[cas] 165 | cas++ 166 | default: 167 | guarded[i] = guard_val 168 | } 169 | } 170 | return guarded 171 | } 172 | 173 | func checkValidIncGuard(t *testing.T, vec []complex128, guard_val complex128, incV uintptr, guard_len int) { 174 | inc := int(incV) 175 | s_ln := len(vec) - 2*guard_len 176 | if inc < 0 { 177 | s_ln = len(vec) * -inc 178 | } 179 | 180 | for i := range vec { 181 | switch { 182 | case vec[i] == guard_val: 183 | // Correct value 184 | case i < guard_len: 185 | t.Errorf("Front guard violated at %d %v", i, vec[:guard_len]) 186 | case i > guard_len+s_ln: 187 | t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:]) 188 | case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec): 189 | // Ignore input values 190 | default: 191 | t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln]) 192 | } 193 | } 194 | } 195 | 196 | func TestAxpyInc(t *testing.T) { 197 | var x_gd, y_gd complex128 = 1, 1 198 | for cas, test := range tests { 199 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 200 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 201 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 202 | AxpyInc(test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 203 | for i := range test.ex { 204 | if y[int(test.iy)+i*int(test.incY)] != test.ex[i] { 205 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i*int(test.incY)], test.ex[i]) 206 | } 207 | } 208 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 209 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 210 | } 211 | } 212 | 213 | func TestAxpyIncTo(t *testing.T) { 214 | var x_gd, y_gd, dst_gd complex128 = 1, 1, 0 215 | for cas, test := range tests { 216 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 217 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 218 | test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln) 219 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 220 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 221 | AxpyIncTo(dst, uintptr(test.incDst), test.idst, test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 222 | for i := range test.ex { 223 | if dst[int(test.idst)+i*int(test.incDst)] != test.ex[i] { 224 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i*int(test.incDst)], 
test.ex[i]) 225 | } 226 | } 227 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 228 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 229 | checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln) 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /asm/c64/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 38 | TEXT ·AxpyInc(SB), NOSPLIT, $0 39 | MOVQ x_base+8(FP), SI // SI = &x 40 | MOVQ y_base+32(FP), DI // DI = &y 41 | MOVQ n+56(FP), CX // CX = n 42 | CMPQ CX, $0 // if n==0 { return } 43 | JE axpyi_end 44 | MOVQ ix+80(FP), R8 // R8 = ix 45 | MOVQ iy+88(FP), R9 // R9 = iy 46 | LEAQ (SI)(R8*8), SI // SI = &(x[ix]) 47 | LEAQ (DI)(R9*8), DI // DI = &(y[iy]) 48 | MOVQ DI, DX // DX = DI // Read/Write pointers 49 | MOVQ incX+64(FP), R8 // R8 = incX 50 | SHLQ $3, R8 // R8 *= sizeof(complex64) 51 | MOVQ incY+72(FP), R9 // R9 = incY 52 | SHLQ $3, R9 // R9 *= sizeof(complex64) 53 | MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 54 | MOVAPS X0, X1 55 | SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) } 56 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 57 | MOVAPS X1, X11 58 | MOVQ CX, BX 59 | ANDQ $3, CX // CX = n % 4 60 | SHRQ $2, BX // BX = floor( n / 4 ) 61 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 62 | 63 | axpyi_loop: // do { 64 | MOVSD (SI), X3 // X_i = { imag(x[i+1]), real(x[i+1]) } 65 | MOVSD (SI)(R8*1), X5 66 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 67 | MOVSD (SI), X7 68 | MOVSD (SI)(R8*1), X9 69 | 70 | // X_(i-1) = { imag(x[i]), imag(x[i]) } 71 | MOVSHDUP_X3_X2 72 | MOVSHDUP_X5_X4 73 | MOVSHDUP_X7_X6 74 | MOVSHDUP_X9_X8 75 | 76 | // X_i = { real(x[i]), real(x[i]) } 77 | MOVSLDUP_X3_X3 78 | MOVSLDUP_X5_X5 79 | MOVSLDUP_X7_X7 80 | MOVSLDUP_X9_X9 81 | 82 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 83 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 84 
| MULPS X1, X2 85 | MULPS X0, X3 86 | MULPS X11, X4 87 | MULPS X10, X5 88 | MULPS X1, X6 89 | MULPS X0, X7 90 | MULPS X11, X8 91 | MULPS X10, X9 92 | 93 | // X_i = { 94 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 95 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 96 | // } 97 | ADDSUBPS_X2_X3 98 | ADDSUBPS_X4_X5 99 | ADDSUBPS_X6_X7 100 | ADDSUBPS_X8_X9 101 | 102 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 103 | MOVSD (DX), X2 104 | MOVSD (DX)(R9*1), X4 105 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 106 | MOVSD (DX), X6 107 | MOVSD (DX)(R9*1), X8 108 | ADDPS X2, X3 109 | ADDPS X4, X5 110 | ADDPS X6, X7 111 | ADDPS X8, X9 112 | 113 | MOVSD X3, (DI) // y[i] = X_i 114 | MOVSD X5, (DI)(R9*1) 115 | LEAQ (DI)(R9*2), DI // DI = &(DI[incDst]) 116 | MOVSD X7, (DI) 117 | MOVSD X9, (DI)(R9*1) 118 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 119 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 120 | LEAQ (DI)(R9*2), DI // DI = &(DI[incDst]) 121 | DECQ BX 122 | JNZ axpyi_loop // } while --BX > 0 123 | CMPQ CX, $0 // if CX == 0 { return } 124 | JE axpyi_end 125 | 126 | axpyi_tail: // do { 127 | MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } 128 | MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } 129 | MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } 130 | 131 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 132 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 133 | MULPS X1, X2 134 | MULPS X0, X3 135 | 136 | // X_i = { 137 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 138 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 139 | // } 140 | ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i) 141 | 142 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 143 | MOVSD (DI), X4 144 | ADDPS X4, X3 145 | MOVSD X3, (DI) // y[i] = X_i 146 | ADDQ R8, SI // SI += incX 147 | ADDQ R9, DI // DI += incY 148 | LOOP axpyi_tail // } while --CX > 0 149 | 150 | axpyi_end: 151 | RET 152 | -------------------------------------------------------------------------------- /asm/c64/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
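// complex64 packing note: a complex64 is 8 bytes, so MOVSD loads a single
// value into the low half of an XMM register. Where the c128 kernels use
// MOVDDUP/SHUFPD to broadcast the real and imaginary parts, the kernels in
// this file use the SSE3 pair MOVSLDUP (duplicate the even, real, lanes) and
// MOVSHDUP (duplicate the odd, imaginary, lanes), then ADDSUBPS to combine
// the partial products into
//
//	real(result) = real(a)*real(x[i]) - imag(a)*imag(x[i])
//	imag(result) = imag(a)*real(x[i]) + real(a)*imag(x[i])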
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 38 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 39 | MOVQ dst_base+0(FP), DI // DI = &dst 40 | MOVQ x_base+48(FP), SI // SI = &x 41 | MOVQ y_base+72(FP), DX // DX = &y 42 | MOVQ n+96(FP), CX // CX = n 43 | CMPQ CX, $0 // if n==0 { return } 44 | JE axpyi_end 45 | MOVQ ix+120(FP), R8 // Load the first index 46 | MOVQ iy+128(FP), R9 47 | MOVQ idst+32(FP), R10 48 | LEAQ (SI)(R8*8), SI // SI = &(x[ix]) 49 | LEAQ (DX)(R9*8), DX // DX = &(y[iy]) 50 | LEAQ (DI)(R10*8), DI // DI = &(dst[idst]) 51 | MOVQ incX+104(FP), R8 // Incrementors*8 for easy iteration (ADDQ) 52 | SHLQ $3, R8 53 | MOVQ incY+112(FP), R9 54 | SHLQ $3, R9 55 | MOVQ incDst+24(FP), R10 56 | SHLQ $3, R10 57 | MOVSD alpha+40(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 58 | MOVAPS X0, X1 59 | SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) } 60 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 61 | MOVAPS X1, X11 62 | MOVQ CX, BX 63 | ANDQ $3, CX // CX = n % 4 64 | SHRQ $2, BX // BX = floor( n / 4 ) 65 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 66 | 67 | axpyi_loop: // do { 68 | MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } 69 | MOVSD (SI)(R8*1), X5 70 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 71 | MOVSD (SI), X7 72 | MOVSD (SI)(R8*1), X9 73 | 74 | // X_(i-1) = { imag(x[i]), imag(x[i]) } 75 | MOVSHDUP_X3_X2 76 | MOVSHDUP_X5_X4 77 | MOVSHDUP_X7_X6 78 | MOVSHDUP_X9_X8 79 | 80 | // X_i = { real(x[i]), real(x[i]) } 81 | MOVSLDUP_X3_X3 82 | MOVSLDUP_X5_X5 83 | MOVSLDUP_X7_X7 84 | MOVSLDUP_X9_X9 85 | 86 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 87 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 88 | MULPS X1, X2 89 | MULPS X0, X3 90 | MULPS X11, X4 91 | MULPS X10, X5 92 | MULPS X1, X6 93 | MULPS X0, X7 94 | MULPS X11, X8 95 | MULPS X10, X9 96 | 97 | // X_i = { 98 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 99 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 100 | // } 101 | ADDSUBPS_X2_X3 102 | ADDSUBPS_X4_X5 103 | ADDSUBPS_X6_X7 104 | ADDSUBPS_X8_X9 105 | 106 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 107 | MOVSD (DX), X2 108 | 
MOVSD (DX)(R9*1), X4 109 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 110 | MOVSD (DX), X6 111 | MOVSD (DX)(R9*1), X8 112 | ADDPS X2, X3 113 | ADDPS X4, X5 114 | ADDPS X6, X7 115 | ADDPS X8, X9 116 | 117 | MOVSD X3, (DI) // y[i] = X_i 118 | MOVSD X5, (DI)(R10*1) 119 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst]) 120 | MOVSD X7, (DI) 121 | MOVSD X9, (DI)(R10*1) 122 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 123 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 124 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst]) 125 | DECQ BX 126 | JNZ axpyi_loop // } while --BX > 0 127 | CMPQ CX, $0 // if CX == 0 { return } 128 | JE axpyi_end 129 | 130 | axpyi_tail: 131 | MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } 132 | MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } 133 | MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } 134 | 135 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 136 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 137 | MULPS X1, X2 138 | MULPS X0, X3 139 | 140 | // X_i = { 141 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 142 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 143 | // } 144 | ADDSUBPS_X2_X3 145 | 146 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 147 | MOVSD (DX), X4 148 | ADDPS X4, X3 149 | MOVSD X3, (DI) // y[i] = X_i 150 | ADDQ R8, SI // SI += incX 151 | ADDQ R9, DX // DX += incY 152 | ADDQ R10, DI // DI += incDst 153 | LOOP axpyi_tail // } while --CX > 0 154 | 155 | axpyi_end: 156 | RET 157 | -------------------------------------------------------------------------------- /asm/c64/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
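// Alignment note: the main loop below adds y with ADDPS using a memory
// operand, which requires a 16-byte-aligned address. When &y[0] is not
// 16-byte aligned, one complex64 (8 bytes) is peeled off before the main
// loop so the remaining packed accesses land on an aligned boundary.
// Illustrative Go-style sketch of the peel (assumes the usual 8-byte
// alignment of slice data):
//
//	i := 0
//	if uintptr(unsafe.Pointer(&y[0]))%16 != 0 {
//		y[0] += alpha * x[0] // handle one value scalar-style
//		i = 1                // main loop starts at y[1], now 16-byte aligned
//	}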
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyUnitary(alpha complex64, x, y []complex64) 38 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 39 | MOVQ x_base+8(FP), SI // SI = &x 40 | MOVQ y_base+32(FP), DI // DI = &y 41 | MOVQ x_len+16(FP), CX // CX = min( len(x), len(y) ) 42 | CMPQ y_len+40(FP), CX 43 | CMOVQLE y_len+40(FP), CX 44 | CMPQ CX, $0 // if CX == 0 { return } 45 | JE caxy_end 46 | PXOR X0, X0 // Clear work registers and cache-align loop 47 | PXOR X1, X1 48 | MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 49 | SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) } 50 | MOVAPS X0, X1 51 | SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) } 52 | XORQ AX, AX // i = 0 53 | MOVQ DI, BX // Align on 16-byte boundary for ADDPS 54 | ANDQ $15, BX // BX = &y & 15 55 | JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim } 56 | 57 | // Trim first value in unaligned buffer 58 | XORPS X2, X2 // Clear work registers and cache-align loop 59 | XORPS X3, X3 60 | XORPS X4, X4 61 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 62 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 63 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 64 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 65 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 66 | 67 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) } 68 | ADDSUBPS_X2_X3 69 | MOVSD (DI)(AX*8), X4 // X3 += y[i] 70 | ADDPS X4, X3 71 | MOVSD X3, (DI)(AX*8) // y[i] = X3 72 | INCQ AX // i++ 73 | DECQ CX // --CX 74 | JZ caxy_end // if CX == 0 { return } 75 | 76 | caxy_no_trim: 77 | MOVAPS X0, X10 // Copy X0 and X1 for pipelineing 78 | MOVAPS X1, X11 79 | MOVQ CX, BX 80 | ANDQ $7, CX // CX = n % 8 81 | SHRQ $3, BX // BX = floor( n / 8 ) 82 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 83 | 84 | caxy_loop: // do { 85 | // X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) } 86 | MOVUPS (SI)(AX*8), X3 87 | MOVUPS 16(SI)(AX*8), X5 88 | MOVUPS 32(SI)(AX*8), X7 89 | MOVUPS 48(SI)(AX*8), X9 90 | 91 | // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } 92 | MOVSHDUP_X3_X2 93 | MOVSHDUP_X5_X4 94 | MOVSHDUP_X7_X6 95 | MOVSHDUP_X9_X8 96 | 97 | // X_i = { real(x[i]), 
real(x[i]), real(x[i+1]), real(x[i+1]) } 98 | MOVSLDUP_X3_X3 99 | MOVSLDUP_X5_X5 100 | MOVSLDUP_X7_X7 101 | MOVSLDUP_X9_X9 102 | 103 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]), 104 | // imag(a) * real(x[i+1]), real(a) * real(x[i+1]) } 105 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]), 106 | // real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) } 107 | MULPS X1, X2 108 | MULPS X0, X3 109 | MULPS X11, X4 110 | MULPS X10, X5 111 | MULPS X1, X6 112 | MULPS X0, X7 113 | MULPS X11, X8 114 | MULPS X10, X9 115 | 116 | // X_i = { 117 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 118 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 119 | // imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]), 120 | // real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]), 121 | // } 122 | ADDSUBPS_X2_X3 123 | ADDSUBPS_X4_X5 124 | ADDSUBPS_X6_X7 125 | ADDSUBPS_X8_X9 126 | 127 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]), 128 | // imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) } 129 | ADDPS (DI)(AX*8), X3 130 | ADDPS 16(DI)(AX*8), X5 131 | ADDPS 32(DI)(AX*8), X7 132 | ADDPS 48(DI)(AX*8), X9 133 | MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i 134 | MOVUPS X5, 16(DI)(AX*8) 135 | MOVUPS X7, 32(DI)(AX*8) 136 | MOVUPS X9, 48(DI)(AX*8) 137 | ADDQ $8, AX // i += 8 138 | DECQ BX // --BX 139 | JNZ caxy_loop // } while BX > 0 140 | CMPQ CX, $0 // if CX == 0 { return } 141 | JE caxy_end 142 | 143 | caxy_tail: // do { 144 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 145 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 146 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 147 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 148 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 149 | 150 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), 151 | // real(a)*real(x[i]) - imag(a)*imag(x[i]) } 152 | ADDSUBPS_X2_X3 153 | MOVSD (DI)(AX*8), X4 // X3 += y[i] 154 | ADDPS X4, X3 155 | MOVSD X3, (DI)(AX*8) // y[i] = X3 156 | INCQ AX // ++i 157 | LOOP caxy_tail // } while --CX > 0 158 | 159 | caxy_end: 160 | RET 161 | -------------------------------------------------------------------------------- /asm/c64/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
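AxpyUnitary above splits its work three ways: at most one scalar step brings &y onto a 16-byte boundary so the ADDPS loads from y can be aligned (a complex64 element is 8 bytes, and slice allocations are in practice at least 8-byte aligned, so a single trimmed element suffices), the main loop retires eight complex64 per pass (four MOVUPS loads of two values each, with the alpha registers copied to X10/X11 to ease pipelining), and a scalar tail handles the remainder. The same decomposition as a hedged Go sketch, where yAddr stands in for the address of y[0] and is not part of the package:

n := len(x)
if len(y) < n {
	n = len(y) // the CMOVQLE clamp: touch only min(len(x), len(y)) elements
}
head := 0
if n > 0 && yAddr&15 != 0 {
	head = 1 // one 8-byte element reaches the next 16-byte boundary
}
body := (n - head) &^ 7 // ANDQ $7 / SHRQ $3: eight elements per unrolled pass
tail := n - head - body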
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) 38 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 39 | MOVQ dst_base+0(FP), DI // DI = &dst 40 | MOVQ x_base+32(FP), SI // SI = &x 41 | MOVQ y_base+56(FP), DX // DX = &y 42 | MOVQ x_len+40(FP), CX 43 | CMPQ y_len+64(FP), CX // CX = min( len(x), len(y), len(dst) ) 44 | CMOVQLE y_len+64(FP), CX 45 | CMPQ dst_len+8(FP), CX 46 | CMOVQLE dst_len+8(FP), CX 47 | CMPQ CX, $0 // if CX == 0 { return } 48 | JE caxy_end 49 | MOVSD alpha+24(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 50 | SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) } 51 | MOVAPS X0, X1 52 | SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) } 53 | XORQ AX, AX // i = 0 54 | MOVQ DX, BX // Align on 16-byte boundary for ADDPS 55 | ANDQ $15, BX // BX = &y & 15 56 | JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim } 57 | 58 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 59 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 60 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 61 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 62 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 63 | 64 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) } 65 | ADDSUBPS_X2_X3 66 | MOVSD (DX)(AX*8), X4 // X3 += y[i] 67 | ADDPS X4, X3 68 | MOVSD X3, (DI)(AX*8) // dst[i] = X3 69 | INCQ AX // i++ 70 | DECQ CX // --CX 71 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 72 | 73 | caxy_no_trim: 74 | MOVAPS X0, X10 // Copy X0 and X1 for pipelineing 75 | MOVAPS X1, X11 76 | MOVQ CX, BX 77 | ANDQ $7, CX // CX = n % 8 78 | SHRQ $3, BX // BX = floor( n / 8 ) 79 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 80 | 81 | caxy_loop: 82 | // X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) } 83 | MOVUPS (SI)(AX*8), X3 84 | MOVUPS 16(SI)(AX*8), X5 85 | MOVUPS 32(SI)(AX*8), X7 86 | MOVUPS 48(SI)(AX*8), X9 87 | 88 | // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } 89 | MOVSHDUP_X3_X2 90 | MOVSHDUP_X5_X4 91 | MOVSHDUP_X7_X6 92 | MOVSHDUP_X9_X8 93 | 94 | // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) } 95 | MOVSLDUP_X3_X3 96 | MOVSLDUP_X5_X5 97 | 
MOVSLDUP_X7_X7 98 | MOVSLDUP_X9_X9 99 | 100 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]), 101 | // imag(a) * real(x[i+1]), real(a) * real(x[i+1]) } 102 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]), 103 | // real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) } 104 | MULPS X1, X2 105 | MULPS X0, X3 106 | MULPS X11, X4 107 | MULPS X10, X5 108 | MULPS X1, X6 109 | MULPS X0, X7 110 | MULPS X11, X8 111 | MULPS X10, X9 112 | 113 | // X_i = { 114 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 115 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 116 | // imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]), 117 | // real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]), 118 | // } 119 | ADDSUBPS_X2_X3 120 | ADDSUBPS_X4_X5 121 | ADDSUBPS_X6_X7 122 | ADDSUBPS_X8_X9 123 | 124 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]), 125 | // imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) } 126 | ADDPS (DX)(AX*8), X3 127 | ADDPS 16(DX)(AX*8), X5 128 | ADDPS 32(DX)(AX*8), X7 129 | ADDPS 48(DX)(AX*8), X9 130 | MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i 131 | MOVUPS X5, 16(DI)(AX*8) 132 | MOVUPS X7, 32(DI)(AX*8) 133 | MOVUPS X9, 48(DI)(AX*8) 134 | ADDQ $8, AX // i += 8 135 | DECQ BX // --BX 136 | JNZ caxy_loop // } while BX > 0 137 | CMPQ CX, $0 // if CX == 0 { return } 138 | JE caxy_end 139 | 140 | caxy_tail: // do { 141 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 142 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 143 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 144 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 145 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 146 | 147 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), 148 | // real(a)*real(x[i]) - imag(a)*imag(x[i]) } 149 | ADDSUBPS_X2_X3 150 | MOVSD (DX)(AX*8), X4 // X3 += y[i] 151 | ADDPS X4, X3 152 | MOVSD X3, (DI)(AX*8) // y[i] = X3 153 | INCQ AX // ++i 154 | LOOP caxy_tail // } while --CX > 0 155 | 156 | caxy_end: 157 | RET 158 | -------------------------------------------------------------------------------- /asm/c64/conj.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c64 6 | 7 | func conj(c complex64) complex64 { return complex(real(c), -imag(c)) } 8 | -------------------------------------------------------------------------------- /asm/c64/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package c64 provides complex64 vector primitives. 9 | package c64 10 | -------------------------------------------------------------------------------- /asm/c64/dotc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
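A worked call of the DotcUnitary defined below, with illustrative values:

x := []complex64{1 + 2i, 3 - 1i}
y := []complex64{2 - 1i, 1 + 1i}
sum := DotcUnitary(x, y)
// sum = y[0]*conj(x[0]) + y[1]*conj(x[1])
//     = (2-1i)*(1-2i) + (1+1i)*(3+1i)
//     = -5i + (2+4i)
//     = 2 - 1i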
4 | 5 | package c64 6 | 7 | // DotcUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * conj(v) 10 | // } 11 | // return sum 12 | func DotcUnitary(x, y []complex64) (sum complex64) { 13 | for i, v := range x { 14 | sum += y[i] * conj(v) 15 | } 16 | return sum 17 | } 18 | 19 | // DotcInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * conj(x[ix]) 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * conj(x[ix]) 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/c64/dotu.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c64 6 | 7 | // DotuUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * v 10 | // } 11 | // return sum 12 | func DotuUnitary(x, y []complex64) (sum complex64) { 13 | for i, v := range x { 14 | sum += y[i] * v 15 | } 16 | return sum 17 | } 18 | 19 | // DotuInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * x[ix] 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * x[ix] 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/c64/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c64 6 | 7 | // ScalUnitary is 8 | // for i := range x { 9 | // x[i] *= alpha 10 | // } 11 | func ScalUnitary(alpha complex64, x []complex64) { 12 | for i := range x { 13 | x[i] *= alpha 14 | } 15 | } 16 | 17 | // ScalUnitaryTo is 18 | // for i, v := range x { 19 | // dst[i] = alpha * v 20 | // } 21 | func ScalUnitaryTo(dst []complex64, alpha complex64, x []complex64) { 22 | for i, v := range x { 23 | dst[i] = alpha * v 24 | } 25 | } 26 | 27 | // ScalInc is 28 | // var ix uintptr 29 | // for i := 0; i < int(n); i++ { 30 | // x[ix] *= alpha 31 | // ix += incX 32 | // } 33 | func ScalInc(alpha complex64, x []complex64, n, incX uintptr) { 34 | var ix uintptr 35 | for i := 0; i < int(n); i++ { 36 | x[ix] *= alpha 37 | ix += incX 38 | } 39 | } 40 | 41 | // ScalIncTo is 42 | // var idst, ix uintptr 43 | // for i := 0; i < int(n); i++ { 44 | // dst[idst] = alpha * x[ix] 45 | // ix += incX 46 | // idst += incDst 47 | // } 48 | func ScalIncTo(dst []complex64, incDst uintptr, alpha complex64, x []complex64, n, incX uintptr) { 49 | var idst, ix uintptr 50 | for i := 0; i < int(n); i++ { 51 | dst[idst] = alpha * x[ix] 52 | ix += incX 53 | idst += incDst 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /asm/c64/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package c64 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex64, x, y []complex64) 14 | 15 | // AxpyUnitaryTo is 16 | // for i, v := range x { 17 | // dst[i] = alpha*v + y[i] 18 | // } 19 | func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) 20 | 21 | // AxpyInc is 22 | // for i := 0; i < int(n); i++ { 23 | // y[iy] += alpha * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 28 | 29 | // AxpyIncTo is 30 | // for i := 0; i < int(n); i++ { 31 | // dst[idst] = alpha*x[ix] + y[iy] 32 | // ix += incX 33 | // iy += incY 34 | // idst += incDst 35 | // } 36 | func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 37 | -------------------------------------------------------------------------------- /asm/c64/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package c64 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex64, x, y []complex64) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/c64/stubs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
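The stub files above come in matched pairs by build tag: stubs_amd64.go compiles under !noasm,!appengine and holds body-less declarations that the linker binds to the assembly, while stubs_noasm.go compiles under !amd64 noasm appengine and supplies the pure-Go fallbacks; exactly one of the two must be present in any given build or linking fails. The fallback path can be exercised on any platform, as the CI script does, with for example:

go test -tags noasm ./asm/c64/...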
4 | 5 | package c64 6 | 7 | import "testing" 8 | 9 | var tests = []struct { 10 | incX, incY, incDst int 11 | ix, iy, idst uintptr 12 | a complex64 13 | dst, x, y []complex64 14 | ex []complex64 15 | }{ 16 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 17 | a: 1 + 1i, 18 | dst: []complex64{5}, 19 | x: []complex64{1}, 20 | y: []complex64{1i}, 21 | ex: []complex64{1 + 2i}}, 22 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 23 | a: 1 + 2i, 24 | dst: []complex64{0, 0, 0}, 25 | x: []complex64{0, 0, 0}, 26 | y: []complex64{1, 1, 1}, 27 | ex: []complex64{1, 1, 1}}, 28 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 29 | a: 1 + 2i, 30 | dst: []complex64{0, 0, 0}, 31 | x: []complex64{0, 0}, 32 | y: []complex64{1, 1, 1}, 33 | ex: []complex64{1, 1}}, 34 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 35 | a: 1 + 2i, 36 | dst: []complex64{1i, 1i, 1i}, 37 | x: []complex64{1i, 1i, 1i}, 38 | y: []complex64{1, 2, 1}, 39 | ex: []complex64{-1 + 1i, 1i, -1 + 1i}}, 40 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 41 | a: -1i, 42 | dst: []complex64{1i, 1i, 1i}, 43 | x: []complex64{1i, 1i, 1i}, 44 | y: []complex64{1, 2, 1}, 45 | ex: []complex64{2, 3, 2}}, 46 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 47 | a: -1i, 48 | dst: []complex64{1i, 1i, 1i}, 49 | x: []complex64{1i, 1i, 1i, 1i, 1i}[1:4], 50 | y: []complex64{1, 1, 2, 1, 1}[1:4], 51 | ex: []complex64{2, 3, 2}}, 52 | {incX: 2, incY: 4, incDst: 3, ix: 0, iy: 0, idst: 0, 53 | a: -2, 54 | dst: []complex64{1i, 1i, 1i, 1i, 1i}, 55 | x: []complex64{2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i}, 56 | y: []complex64{1, 1, 2, 1, 1}, 57 | ex: []complex64{-3 - 2i, -3 - 2i, -2 - 2i, -3 - 2i, -3 - 2i}}, 58 | // Run big test twice, once aligned once unaligned. 59 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 60 | a: 1 - 1i, 61 | dst: make([]complex64, 10), 62 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 63 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 64 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 65 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 66 | a: 1 - 1i, 67 | dst: make([]complex64, 10), 68 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 69 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 70 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 71 | {incX: -2, incY: -2, incDst: -3, ix: 18, iy: 18, idst: 27, 72 | a: 1 - 1i, 73 | dst: make([]complex64, 10), 74 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 75 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 76 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 77 | {incX: -2, incY: 2, incDst: -3, ix: 18, iy: 0, idst: 27, 78 | a: 1 - 1i, 79 | dst: make([]complex64, 10), 80 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 81 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 82 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 83 | } 84 | 85 | func guardVector(vec []complex64, guard_val complex64, guard_len int) (guarded []complex64) { 86 | guarded = make([]complex64, len(vec)+guard_len*2) 87 | copy(guarded[guard_len:], vec) 88 | for i := 0; i < guard_len; i++ { 89 | guarded[i] = guard_val 90 | guarded[len(guarded)-1-i] = guard_val 91 | } 92 | return guarded 93 | } 94 | 95 | func isValidGuard(vec []complex64, guard_val complex64, guard_len int) bool { 96 | for i := 0; i < guard_len; i++ { 97 | if vec[i] != guard_val || 
vec[len(vec)-1-i] != guard_val { 98 | return false 99 | } 100 | } 101 | return true 102 | } 103 | 104 | func TestAxpyUnitary(t *testing.T) { 105 | var x_gd, y_gd complex64 = 1, 1 106 | for cas, test := range tests { 107 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 108 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 109 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 110 | AxpyUnitary(test.a, x, y) 111 | for i := range test.ex { 112 | if y[i] != test.ex[i] { 113 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i]) 114 | } 115 | } 116 | if !isValidGuard(test.x, x_gd, xg_ln) { 117 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 118 | } 119 | if !isValidGuard(test.y, y_gd, yg_ln) { 120 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 121 | } 122 | } 123 | } 124 | 125 | func TestAxpyUnitaryTo(t *testing.T) { 126 | var x_gd, y_gd, dst_gd complex64 = 1, 1, 0 127 | for cas, test := range tests { 128 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 129 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 130 | test.dst = guardVector(test.dst, dst_gd, xg_ln) 131 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 132 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 133 | AxpyUnitaryTo(dst, test.a, x, y) 134 | for i := range test.ex { 135 | if dst[i] != test.ex[i] { 136 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i]) 137 | } 138 | } 139 | if !isValidGuard(test.x, x_gd, xg_ln) { 140 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 141 | } 142 | if !isValidGuard(test.y, y_gd, yg_ln) { 143 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 144 | } 145 | if !isValidGuard(test.dst, dst_gd, xg_ln) { 146 | t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:]) 147 | } 148 | 149 | } 150 | } 151 | 152 | func guardIncVector(vec []complex64, guard_val complex64, incV uintptr, guard_len int) (guarded []complex64) { 153 | inc := int(incV) 154 | s_ln := len(vec) * inc 155 | if inc < 0 { 156 | s_ln = len(vec) * -inc 157 | } 158 | guarded = make([]complex64, s_ln+guard_len*2) 159 | for i, cas := 0, 0; i < len(guarded); i++ { 160 | switch { 161 | case i < guard_len, i > guard_len+s_ln: 162 | guarded[i] = guard_val 163 | case (i-guard_len)%(inc) == 0 && cas < len(vec): 164 | guarded[i] = vec[cas] 165 | cas++ 166 | default: 167 | guarded[i] = guard_val 168 | } 169 | } 170 | return guarded 171 | } 172 | 173 | func checkValidIncGuard(t *testing.T, vec []complex64, guard_val complex64, incV uintptr, guard_len int) { 174 | inc := int(incV) 175 | s_ln := len(vec) - 2*guard_len 176 | if inc < 0 { 177 | s_ln = len(vec) * -inc 178 | } 179 | 180 | for i := range vec { 181 | switch { 182 | case vec[i] == guard_val: 183 | // Correct value 184 | case i < guard_len: 185 | t.Errorf("Front guard violated at %d %v", i, vec[:guard_len]) 186 | case i > guard_len+s_ln: 187 | t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:]) 188 | case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec): 189 | // Ignore input values 190 | default: 191 | t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln]) 192 | } 193 | } 194 | } 195 | 196 
| func TestAxpyInc(t *testing.T) { 197 | var x_gd, y_gd complex64 = 1, 1 198 | for cas, test := range tests { 199 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 200 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 201 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 202 | AxpyInc(test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 203 | for i := range test.ex { 204 | if y[int(test.iy)+i*int(test.incY)] != test.ex[i] { 205 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[int(test.iy)+i*int(test.incY)], test.ex[i]) 206 | } 207 | } 208 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 209 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 210 | } 211 | } 212 | 213 | func TestAxpyIncTo(t *testing.T) { 214 | var x_gd, y_gd, dst_gd complex64 = 1, 1, 0 215 | for cas, test := range tests { 216 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 217 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 218 | test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln) 219 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 220 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 221 | AxpyIncTo(dst, uintptr(test.incDst), test.idst, test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 222 | for i := range test.ex { 223 | if dst[int(test.idst)+i*int(test.incDst)] != test.ex[i] { 224 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[int(test.idst)+i*int(test.incDst)], test.ex[i]) 225 | } 226 | } 227 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 228 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 229 | checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln) 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /asm/f32/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
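A note on the guard technique used by the c64 tests above: guardVector and guardIncVector surround each operand with sentinel cells, the kernel is handed only the interior slice, and isValidGuard/checkValidIncGuard assert afterwards that no sentinel changed, which catches writes outside the intended stride pattern. For example:

g := guardVector([]complex64{1, 2}, 9, 2)
// g == []complex64{9, 9, 1, 2, 9, 9}; the kernel under test receives
// g[2:4], and isValidGuard(g, 9, 2) later verifies the 9s are intact.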
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 10 | TEXT ·AxpyInc(SB), NOSPLIT, $0 11 | MOVQ n+56(FP), CX // CX = n 12 | CMPQ CX, $0 // if n==0 { return } 13 | JLE axpyi_end 14 | MOVQ x_base+8(FP), SI // SI = &x 15 | MOVQ y_base+32(FP), DI // DI = &y 16 | MOVQ ix+80(FP), R8 // R8 = ix 17 | MOVQ iy+88(FP), R9 // R9 = iy 18 | LEAQ (SI)(R8*4), SI // SI = &(x[ix]) 19 | LEAQ (DI)(R9*4), DI // DI = &(y[iy]) 20 | MOVQ DI, DX // DX = DI Read Pointer for y 21 | MOVQ incX+64(FP), R8 // R8 = incX 22 | SHLQ $2, R8 // R8 *= sizeof(float32) 23 | MOVQ incY+72(FP), R9 // R9 = incY 24 | SHLQ $2, R9 // R9 *= sizeof(float32) 25 | MOVSS alpha+0(FP), X0 // X0 = alpha 26 | MOVSS X0, X1 // X1 = X0 // for pipelining 27 | MOVQ CX, BX 28 | ANDQ $3, BX // BX = n % 4 29 | SHRQ $2, CX // CX = floor( n / 4 ) 30 | JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start } 31 | 32 | axpyi_loop: // Loop unrolled 4x do { 33 | MOVSS (SI), X2 // X_i = x[i] 34 | MOVSS (SI)(R8*1), X3 35 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 36 | MOVSS (SI), X4 37 | MOVSS (SI)(R8*1), X5 38 | MULSS X1, X2 // X_i *= a 39 | MULSS X0, X3 40 | MULSS X1, X4 41 | MULSS X0, X5 42 | ADDSS (DX), X2 // X_i += y[i] 43 | ADDSS (DX)(R9*1), X3 44 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 45 | ADDSS (DX), X4 46 | ADDSS (DX)(R9*1), X5 47 | MOVSS X2, (DI) // y[i] = X_i 48 | MOVSS X3, (DI)(R9*1) 49 | LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) 50 | MOVSS X4, (DI) 51 | MOVSS X5, (DI)(R9*1) 52 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses 53 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 54 | LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) 55 | LOOP axpyi_loop // } while --CX > 0 56 | CMPQ BX, $0 // if BX == 0 { return } 57 | JE axpyi_end 58 | 59 | axpyi_tail_start: // Reset loop registers 60 | MOVQ BX, CX // Loop counter: CX = BX 61 | 62 | axpyi_tail: // do { 63 | MOVSS (SI), X2 // X2 = x[i] 64 | MULSS X1, X2 // X2 *= a 65 | ADDSS (DI), X2 // X2 += y[i] 66 | MOVSS X2, (DI) // y[i] = X2 67 | ADDQ R8, SI // SI = &(SI[incX]) 68 | ADDQ R9, DI // DI = &(DI[incY]) 69 | LOOP axpyi_tail // } while --CX > 0 70 | 71 | axpyi_end: 72 | RET 73 | 74 | -------------------------------------------------------------------------------- /asm/f32/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
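What the strided AxpyInc above computes, as a hedged Go sketch. The kernel itself pre-scales incX and incY by sizeof(float32) (the SHLQ $2 steps) and walks raw addresses with LEAQ; here ix, iy and the increments are plain element counts:

for ; n >= 4; n -= 4 { // the 4x-unrolled body
	y[iy] += a * x[ix]
	y[iy+incY] += a * x[ix+incX]
	y[iy+2*incY] += a * x[ix+2*incX]
	y[iy+3*incY] += a * x[ix+3*incX]
	ix += 4 * incX
	iy += 4 * incY
}
for ; n > 0; n-- { // axpyi_tail
	y[iy] += a * x[ix]
	ix += incX
	iy += incY
}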
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 10 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 11 | MOVQ n+96(FP), CX // CX = n 12 | CMPQ CX, $0 // if n==0 { return } 13 | JLE axpyi_end 14 | MOVQ dst_base+0(FP), DI // DI = &dst 15 | MOVQ x_base+48(FP), SI // SI = &x 16 | MOVQ y_base+72(FP), DX // DX = &y 17 | MOVQ ix+120(FP), R8 // R8 = ix // Load the first index 18 | MOVQ iy+128(FP), R9 // R9 = iy 19 | MOVQ idst+32(FP), R10 // R10 = idst 20 | LEAQ (SI)(R8*4), SI // SI = &(x[ix]) 21 | LEAQ (DX)(R9*4), DX // DX = &(y[iy]) 22 | LEAQ (DI)(R10*4), DI // DI = &(dst[idst]) 23 | MOVQ incX+104(FP), R8 // R8 = incX 24 | SHLQ $2, R8 // R8 *= sizeof(float32) 25 | MOVQ incY+112(FP), R9 // R9 = incY 26 | SHLQ $2, R9 // R9 *= sizeof(float32) 27 | MOVQ incDst+24(FP), R10 // R10 = incDst 28 | SHLQ $2, R10 // R10 *= sizeof(float32) 29 | MOVSS alpha+40(FP), X0 // X0 = alpha 30 | MOVSS X0, X1 // X1 = X0 // for pipelining 31 | MOVQ CX, BX 32 | ANDQ $3, BX // BX = n % 4 33 | SHRQ $2, CX // CX = floor( n / 4 ) 34 | JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start } 35 | 36 | axpyi_loop: // Loop unrolled 4x do { 37 | MOVSS (SI), X2 // X_i = x[i] 38 | MOVSS (SI)(R8*1), X3 39 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 40 | MOVSS (SI), X4 41 | MOVSS (SI)(R8*1), X5 42 | MULSS X1, X2 // X_i *= a 43 | MULSS X0, X3 44 | MULSS X1, X4 45 | MULSS X0, X5 46 | ADDSS (DX), X2 // X_i += y[i] 47 | ADDSS (DX)(R9*1), X3 48 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 49 | ADDSS (DX), X4 50 | ADDSS (DX)(R9*1), X5 51 | MOVSS X2, (DI) // dst[i] = X_i 52 | MOVSS X3, (DI)(R10*1) 53 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) 54 | MOVSS X4, (DI) 55 | MOVSS X5, (DI)(R10*1) 56 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses 57 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 58 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) 59 | LOOP axpyi_loop // } while --CX > 0 60 | CMPQ BX, $0 // if BX == 0 { return } 61 | JE axpyi_end 62 | 63 | axpyi_tail_start: // Reset loop registers 64 | MOVQ BX, CX // Loop counter: CX = BX 65 | 66 | axpyi_tail: // do { 67 | MOVSS (SI), X2 // X2 = x[i] 68 | MULSS X1, X2 // X2 *= a 69 | ADDSS (DX), X2 // X2 += y[i] 70 | MOVSS X2, (DI) // dst[i] = X2 71 | ADDQ R8, SI // SI = &(SI[incX]) 72 | ADDQ R9, DX // DX = &(DX[incY]) 73 | ADDQ R10, DI // DI = &(DI[incY]) 74 | LOOP axpyi_tail // } while --CX > 0 75 | 76 | axpyi_end: 77 | RET 78 | 79 | -------------------------------------------------------------------------------- /asm/f32/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
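A concrete call of the AxpyIncTo defined above, with illustrative values:

dst := make([]float32, 5)
x := []float32{1, 2, 3}
y := []float32{10, 20, 30}
AxpyIncTo(dst, 2, 0, 2, x, y, 3, 1, 1, 0, 0)
// dst == []float32{12, 0, 24, 0, 36}, i.e. dst[2k] = 2*x[k] + y[k]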
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyUnitary(alpha float32, x, y []float32) 10 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 11 | MOVQ x_base+8(FP), SI // SI = &x 12 | MOVQ y_base+32(FP), DI // DI = &y 13 | MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) ) 14 | CMPQ y_len+40(FP), BX 15 | CMOVQLE y_len+40(FP), BX 16 | CMPQ BX, $0 // if BX == 0 { return } 17 | JE axpy_end 18 | MOVSS alpha+0(FP), X0 19 | SHUFPS $0, X0, X0 // X0 = { a, a, a, a } 20 | XORQ AX, AX // i = 0 21 | PXOR X2, X2 // 2 NOP instructions (PXOR) to align 22 | PXOR X3, X3 // loop to cache line 23 | MOVQ DI, CX 24 | ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS 25 | JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim } 26 | 27 | XORQ $0xF, CX // CX = 4 - floor( BX % 16 / 4 ) 28 | INCQ CX 29 | SHRQ $2, CX 30 | 31 | axpy_align: // Trim first value(s) in unaligned buffer do { 32 | MOVSS (SI)(AX*4), X2 // X2 = x[i] 33 | MULSS X0, X2 // X2 *= a 34 | ADDSS (DI)(AX*4), X2 // X2 += y[i] 35 | MOVSS X2, (DI)(AX*4) // y[i] = X2 36 | INCQ AX // i++ 37 | DECQ BX 38 | JZ axpy_end // if --BX == 0 { return } 39 | LOOP axpy_align // } while --CX > 0 40 | 41 | axpy_no_trim: 42 | MOVUPS X0, X1 // Copy X0 to X1 for pipelining 43 | MOVQ BX, CX 44 | ANDQ $0xF, BX // BX = len % 16 45 | SHRQ $4, CX // CX = int( len / 16 ) 46 | JZ axpy_tail4_start // if CX == 0 { return } 47 | 48 | axpy_loop: // Loop unrolled 16x do { 49 | MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4] 50 | MOVUPS 16(SI)(AX*4), X3 51 | MOVUPS 32(SI)(AX*4), X4 52 | MOVUPS 48(SI)(AX*4), X5 53 | MULPS X0, X2 // X2 *= a 54 | MULPS X1, X3 55 | MULPS X0, X4 56 | MULPS X1, X5 57 | ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4] 58 | ADDPS 16(DI)(AX*4), X3 59 | ADDPS 32(DI)(AX*4), X4 60 | ADDPS 48(DI)(AX*4), X5 61 | MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2 62 | MOVUPS X3, 16(DI)(AX*4) 63 | MOVUPS X4, 32(DI)(AX*4) 64 | MOVUPS X5, 48(DI)(AX*4) 65 | ADDQ $16, AX // i += 16 66 | LOOP axpy_loop // while (--CX) > 0 67 | CMPQ BX, $0 // if BX == 0 { return } 68 | JE axpy_end 69 | 70 | axpy_tail4_start: // Reset loop counter for 4-wide tail loop 71 | MOVQ BX, CX // CX = floor( BX / 4 ) 72 | SHRQ $2, CX 73 | JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start } 74 | 75 | axpy_tail4: // Loop unrolled 4x do { 76 | MOVUPS (SI)(AX*4), X2 // X2 = x[i] 77 | MULPS X0, X2 // X2 *= a 78 | ADDPS (DI)(AX*4), X2 // X2 += y[i] 79 | MOVUPS X2, (DI)(AX*4) // y[i] = X2 80 | ADDQ $4, AX // i += 4 81 | LOOP axpy_tail4 // } while --CX > 0 82 | 83 | axpy_tail_start: // Reset loop counter for 1-wide tail loop 84 | MOVQ BX, CX // CX = BX % 4 85 | ANDQ $3, CX 86 | JZ axpy_end // if CX == 0 { return } 87 | 88 | axpy_tail: 89 | MOVSS (SI)(AX*4), X1 // X1 = x[i] 90 | MULSS X0, X1 // X1 *= a 91 | ADDSS (DI)(AX*4), X1 // X1 += y[i] 92 | MOVSS X1, (DI)(AX*4) // y[i] = X1 93 | INCQ AX // i++ 94 | LOOP axpy_tail // } while --CX > 0 95 | 96 | axpy_end: 97 | RET 98 | -------------------------------------------------------------------------------- /asm/f32/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
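The alignment preamble in AxpyUnitary above (and repeated in AxpyUnitaryTo below) is worth unpacking. With c = &y & 15, which is nonzero here because the zero case branched to axpy_no_trim, and which is a multiple of 4 for a []float32, the XORQ $0xF / INCQ / SHRQ $2 sequence computes ((c ^ 15) + 1) >> 2 = (16 - c) / 4, the number of scalar float32 steps needed to reach the next 16-byte boundary so the ADDPS loop can use aligned accesses of y:

c = 4  -> (4^15 + 1) >> 2  = 12 >> 2 = 3 elements
c = 8  -> (8^15 + 1) >> 2  =  8 >> 2 = 2 elements
c = 12 -> (12^15 + 1) >> 2 =  4 >> 2 = 1 element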
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) 10 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ x_base+32(FP), SI // SI = &x 13 | MOVQ y_base+56(FP), DX // DX = &y 14 | MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) ) 15 | CMPQ y_len+64(FP), BX 16 | CMOVQLE y_len+64(FP), BX 17 | CMPQ dst_len+8(FP), BX 18 | CMOVQLE dst_len+8(FP), BX 19 | CMPQ BX, $0 // if BX == 0 { return } 20 | JE axpy_end 21 | MOVSS alpha+24(FP), X0 22 | SHUFPS $0, X0, X0 // X0 = { a, a, a, a, } 23 | XORQ AX, AX // i = 0 24 | MOVQ DX, CX 25 | ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS 26 | JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim } 27 | 28 | XORQ $0xF, CX // CX = 4 - floor ( B % 16 / 4 ) 29 | INCQ CX 30 | SHRQ $2, CX 31 | 32 | axpy_align: // Trim first value(s) in unaligned buffer do { 33 | MOVSS (SI)(AX*4), X2 // X2 = x[i] 34 | MULSS X0, X2 // X2 *= a 35 | ADDSS (DX)(AX*4), X2 // X2 += y[i] 36 | MOVSS X2, (DI)(AX*4) // y[i] = X2 37 | INCQ AX // i++ 38 | DECQ BX 39 | JZ axpy_end // if --BX == 0 { return } 40 | LOOP axpy_align // } while --CX > 0 41 | 42 | axpy_no_trim: 43 | MOVUPS X0, X1 // Copy X0 to X1 for pipelining 44 | MOVQ BX, CX 45 | ANDQ $0xF, BX // BX = len % 16 46 | SHRQ $4, CX // CX = floor( len / 16 ) 47 | JZ axpy_tail4_start // if CX == 0 { return } 48 | 49 | axpy_loop: // Loop unrolled 16x do { 50 | MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4] 51 | MOVUPS 16(SI)(AX*4), X3 52 | MOVUPS 32(SI)(AX*4), X4 53 | MOVUPS 48(SI)(AX*4), X5 54 | MULPS X0, X2 // X2 *= a 55 | MULPS X1, X3 56 | MULPS X0, X4 57 | MULPS X1, X5 58 | ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4] 59 | ADDPS 16(DX)(AX*4), X3 60 | ADDPS 32(DX)(AX*4), X4 61 | ADDPS 48(DX)(AX*4), X5 62 | MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2 63 | MOVUPS X3, 16(DI)(AX*4) 64 | MOVUPS X4, 32(DI)(AX*4) 65 | MOVUPS X5, 48(DI)(AX*4) 66 | ADDQ $16, AX // i += 16 67 | LOOP axpy_loop // while (--CX) > 0 68 | CMPQ BX, $0 // if BX == 0 { return } 69 | JE axpy_end 70 | 71 | axpy_tail4_start: // Reset loop counter for 4-wide tail loop 72 | MOVQ BX, CX // CX = floor( BX / 4 ) 73 | SHRQ $2, CX 74 | JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start } 75 | 76 | axpy_tail4: // Loop unrolled 4x do { 77 | MOVUPS (SI)(AX*4), X2 // X2 = x[i] 78 | MULPS X0, X2 // X2 *= a 79 | ADDPS (DX)(AX*4), X2 // X2 += y[i] 80 | MOVUPS X2, (DI)(AX*4) // y[i] = X2 81 | ADDQ $4, AX // i += 4 82 | LOOP axpy_tail4 // } while --CX > 0 83 | 84 | axpy_tail_start: // Reset loop counter for 1-wide tail loop 85 | MOVQ BX, CX // CX = BX % 4 86 | ANDQ $3, CX 87 | JZ axpy_end // if CX == 0 { return } 88 | 89 | axpy_tail: 90 | MOVSS (SI)(AX*4), X1 // X1 = x[i] 91 | MULSS X0, X1 // X1 *= a 92 | ADDSS (DX)(AX*4), X1 // X1 += y[i] 93 | MOVSS X1, (DI)(AX*4) // y[i] = X1 94 | INCQ AX // i++ 95 | LOOP axpy_tail // } while --CX > 0 96 | 97 | axpy_end: 98 | RET 99 | -------------------------------------------------------------------------------- /asm/f32/ddot.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
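DdotUnitary and DdotInc below differ from the Dot variants in dot.go only in carrying the accumulator as a float64, which stops rounding error from compounding over long float32 vectors. A small illustration; the exact values are data-dependent, but the qualitative gap is representative:

x := make([]float32, 1<<20)
for i := range x {
	x[i] = 0.1
}
d := DdotUnitary(x, x) // float64 sum: close to the true 1<<20 * 0.1 * 0.1
s := DotUnitary(x, x)  // float32 sum: typically drifts in the low digits
_ = float64(s) - d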
4 | 5 | package f32 6 | 7 | // DdotUnitary is 8 | // for i, v := range x { 9 | // sum += float64(y[i]) * float64(v) 10 | // } 11 | // return 12 | func DdotUnitary(x, y []float32) (sum float64) { 13 | for i, v := range x { 14 | sum += float64(y[i]) * float64(v) 15 | } 16 | return 17 | } 18 | 19 | // DdotInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += float64(y[iy]) * float64(x[ix]) 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return 26 | func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) { 27 | for i := 0; i < int(n); i++ { 28 | sum += float64(y[iy]) * float64(x[ix]) 29 | ix += incX 30 | iy += incY 31 | } 32 | return 33 | } 34 | -------------------------------------------------------------------------------- /asm/f32/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package f32 provides float32 vector primitives. 9 | package f32 10 | -------------------------------------------------------------------------------- /asm/f32/dot.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f32 6 | 7 | // DotUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * v 10 | // } 11 | // return sum 12 | func DotUnitary(x, y []float32) (sum float32) { 13 | for i, v := range x { 14 | sum += y[i] * v 15 | } 16 | return sum 17 | } 18 | 19 | // DotInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * x[ix] 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * x[ix] 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/f32/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
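A worked call of the DotInc above, reading every second element of x against a dense y, with illustrative values:

x := []float32{1, -1, 2, -1, 3} // incX = 2 visits 1, 2, 3
y := []float32{4, 5, 6}
sum := DotInc(x, y, 3, 2, 1, 0, 0)
// sum == 1*4 + 2*5 + 3*6 == 32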
4 | 5 | package f32 6 | 7 | // ScalUnitary is 8 | // for i := range x { 9 | // x[i] *= alpha 10 | // } 11 | func ScalUnitary(alpha float32, x []float32) { 12 | for i := range x { 13 | x[i] *= alpha 14 | } 15 | } 16 | 17 | // ScalUnitaryTo is 18 | // for i, v := range x { 19 | // dst[i] = alpha * v 20 | // } 21 | func ScalUnitaryTo(dst []float32, alpha float32, x []float32) { 22 | for i, v := range x { 23 | dst[i] = alpha * v 24 | } 25 | } 26 | 27 | // ScalInc is 28 | // var ix uintptr 29 | // for i := 0; i < int(n); i++ { 30 | // x[ix] *= alpha 31 | // ix += incX 32 | // } 33 | func ScalInc(alpha float32, x []float32, n, incX uintptr) { 34 | var ix uintptr 35 | for i := 0; i < int(n); i++ { 36 | x[ix] *= alpha 37 | ix += incX 38 | } 39 | } 40 | 41 | // ScalIncTo is 42 | // var idst, ix uintptr 43 | // for i := 0; i < int(n); i++ { 44 | // dst[idst] = alpha * x[ix] 45 | // ix += incX 46 | // idst += incDst 47 | // } 48 | func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) { 49 | var idst, ix uintptr 50 | for i := 0; i < int(n); i++ { 51 | dst[idst] = alpha * x[ix] 52 | ix += incX 53 | idst += incDst 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /asm/f32/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package f32 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha float32, x, y []float32) 14 | 15 | // AxpyUnitaryTo is 16 | // for i, v := range x { 17 | // dst[i] = alpha*v + y[i] 18 | // } 19 | func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) 20 | 21 | // AxpyInc is 22 | // for i := 0; i < int(n); i++ { 23 | // y[iy] += alpha * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 28 | 29 | // AxpyIncTo is 30 | // for i := 0; i < int(n); i++ { 31 | // dst[idst] = alpha*x[ix] + y[iy] 32 | // ix += incX 33 | // iy += incY 34 | // idst += incDst 35 | // } 36 | func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 37 | -------------------------------------------------------------------------------- /asm/f32/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
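And a strided scale with the ScalInc defined above, again with illustrative values:

x := []float32{1, 7, 2, 7, 3}
ScalInc(2, x, 3, 2) // scales elements 0, 2 and 4 in place
// x == []float32{2, 7, 4, 7, 6}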
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f32 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha float32, x, y []float32) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/f32/stubs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f32 6 | 7 | import ( 8 | "math" 9 | "testing" 10 | ) 11 | 12 | var ( 13 | nan = float32(math.NaN()) 14 | inf = float32(math.Inf(1)) 15 | ) 16 | 17 | var tests = []struct { 18 | incX, incY, incDst uintptr 19 | ix, iy, idst uintptr 20 | a float32 21 | dst, x, y []float32 22 | ex []float32 23 | }{ 24 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 25 | a: 3, 26 | dst: []float32{5}, 27 | x: []float32{2}, 28 | y: []float32{1}, 29 | ex: []float32{7}}, 30 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 31 | a: 5, 32 | dst: []float32{0, 0, 0}, 33 | x: []float32{0, 0, 0}, 34 | y: []float32{1, 1, 1}, 35 | ex: []float32{1, 1, 1}}, 36 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 37 | a: 5, 38 | dst: []float32{0, 0, 0}, 39 | x: []float32{0, 0}, 40 | y: []float32{1, 1, 1}, 41 | ex: []float32{1, 1}}, 42 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 43 | a: -1, 44 | dst: []float32{-1, -1, -1}, 45 | x: []float32{1, 1, 1}, 46 | y: []float32{1, 2, 1}, 47 | ex: []float32{0, 1, 0}}, 48 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 49 | a: -1, 50 | dst: []float32{1, 1, 1}, 51 | x: []float32{1, 2, 1}, 52 | y: []float32{-1, -2, -1}, 53 | ex: []float32{-2, -4, -2}}, 54 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 55 | a: 2.5, 56 | dst: []float32{1, 1, 1, 1, 1}, 57 | x: []float32{1, 2, 3, 2, 1}, 58 | y: []float32{0, 0, 0, 0, 0}, 59 | ex: []float32{2.5, 5, 7.5, 5, 2.5}}, 60 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, // Run big test twice, once aligned once unaligned. 
61 | a: 16.5, 62 | dst: make([]float32, 20), 63 | x: []float32{.5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5}, 64 | y: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 65 | ex: []float32{9.25, 10.25, 11.25, 12.25, 13.25, 14.25, 15.25, 16.25, 17.25, 18.25, 9.25, 10.25, 11.25, 12.25, 13.25, 14.25, 15.25, 16.25, 17.25, 18.25}}, 66 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 67 | a: 16.5, 68 | dst: make([]float32, 10), 69 | x: []float32{.5, .5, .5, .5, .5, .5, .5, .5, .5, .5}, 70 | y: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 71 | ex: []float32{9.25, 10.25, 11.25, 12.25, 13.25, 14.25, 15.25, 16.25, 17.25, 18.25}}, 72 | } 73 | 74 | func guardVector(vec []float32, guard_val float32, guard_len int) (guarded []float32) { 75 | guarded = make([]float32, len(vec)+guard_len*2) 76 | copy(guarded[guard_len:], vec) 77 | for i := 0; i < guard_len; i++ { 78 | guarded[i] = guard_val 79 | guarded[len(guarded)-1-i] = guard_val 80 | } 81 | return guarded 82 | } 83 | 84 | func isValidGuard(vec []float32, guard_val float32, guard_len int) bool { 85 | for i := 0; i < guard_len; i++ { 86 | if vec[i] != guard_val || vec[len(vec)-1-i] != guard_val { 87 | return false 88 | } 89 | } 90 | return true 91 | } 92 | 93 | func same(x, y float32) bool { 94 | a, b := float64(x), float64(y) 95 | return a == b || (math.IsNaN(a) && math.IsNaN(b)) 96 | } 97 | 98 | func TestAxpyUnitary(t *testing.T) { 99 | var x_gd, y_gd float32 = 1, 1 100 | for cas, test := range tests { 101 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 102 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 103 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 104 | AxpyUnitary(test.a, x, y) 105 | for i := range test.ex { 106 | if !same(y[i], test.ex[i]) { 107 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i]) 108 | } 109 | } 110 | if !isValidGuard(test.x, x_gd, xg_ln) { 111 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 112 | } 113 | if !isValidGuard(test.y, y_gd, yg_ln) { 114 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 115 | } 116 | } 117 | } 118 | 119 | func TestAxpyUnitaryTo(t *testing.T) { 120 | var x_gd, y_gd, dst_gd float32 = 1, 1, 0 121 | for cas, test := range tests { 122 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 123 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 124 | test.dst = guardVector(test.dst, dst_gd, xg_ln) 125 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 126 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 127 | AxpyUnitaryTo(dst, test.a, x, y) 128 | for i := range test.ex { 129 | if !same(test.ex[i], dst[i]) { 130 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i]) 131 | } 132 | } 133 | if !isValidGuard(test.x, x_gd, xg_ln) { 134 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 135 | } 136 | if !isValidGuard(test.y, y_gd, yg_ln) { 137 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 138 | } 139 | if !isValidGuard(test.dst, dst_gd, xg_ln) { 140 | t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:]) 141 | } 142 | } 143 | } 144 | 145 | func guardIncVector(vec []float32, guard_val float32, incV uintptr, 
guard_len int) (guarded []float32) { 146 | inc := int(incV) 147 | s_ln := len(vec) * (inc) 148 | guarded = make([]float32, s_ln+guard_len*2) 149 | for i, j := 0, 0; i < len(guarded); i++ { 150 | switch { 151 | case i < guard_len, i > guard_len+s_ln: 152 | guarded[i] = guard_val 153 | case (i-guard_len)%(inc) == 0 && j < len(vec): 154 | guarded[i] = vec[j] 155 | j++ 156 | default: 157 | guarded[i] = guard_val 158 | } 159 | } 160 | return guarded 161 | } 162 | 163 | func checkValidIncGuard(t *testing.T, vec []float32, guard_val float32, incV uintptr, guard_len int) { 164 | inc := int(incV) 165 | s_ln := len(vec) - 2*guard_len 166 | for i := range vec { 167 | switch { 168 | case same(vec[i], guard_val): 169 | // Correct value 170 | case i < guard_len: 171 | t.Errorf("Front guard violated at %d %v", i, vec[:guard_len]) 172 | case i > guard_len+s_ln: 173 | t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:]) 174 | case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec): 175 | // Ignore input values 176 | default: 177 | t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln]) 178 | } 179 | } 180 | } 181 | 182 | func TestAxpyInc(t *testing.T) { 183 | var x_gd, y_gd float32 = 1, 1 184 | for cas, test := range tests { 185 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 186 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 187 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 188 | AxpyInc(test.a, x, y, uintptr(len(test.ex)), test.incX, test.incY, test.ix, test.iy) 189 | for i := range test.ex { 190 | if !same(y[i*int(test.incY)], test.ex[i]) { 191 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i*int(test.incY)], test.ex[i]) 192 | } 193 | } 194 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 195 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 196 | } 197 | } 198 | 199 | func TestAxpyIncTo(t *testing.T) { 200 | var x_gd, y_gd, dst_gd float32 = 1, 1, 0 201 | for cas, test := range tests { 202 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 203 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 204 | test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln) 205 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 206 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 207 | AxpyIncTo(dst, test.incDst, test.idst, test.a, x, y, uintptr(len(test.ex)), test.incX, test.incY, test.ix, test.iy) 208 | for i := range test.ex { 209 | if !same(dst[i*int(test.incDst)], test.ex[i]) { 210 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i*int(test.incDst)], test.ex[i]) 211 | } 212 | } 213 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 214 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 215 | checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln) 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /asm/f64/abssum_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
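L1Norm below accumulates absolute values without an absolute-value instruction: each step computes p_sum = max(p_sum + x[i], p_sum - x[i]), and for finite values max(s+v, s-v) = s + |v|, so the running sum gains |x[i]| per element; the eight PXOR-cleared registers simply run this recurrence in parallel lanes that are folded together at the end. The scalar equivalent in Go:

var sum float64
for _, v := range x {
	sum = math.Max(sum+v, sum-v) // == sum + math.Abs(v) for finite v
}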
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func L1Norm(x []float64) float64 10 | TEXT ·L1Norm(SB), NOSPLIT, $0 11 | MOVQ x_base+0(FP), SI // SI = &x 12 | MOVQ x_len+8(FP), CX // CX = len(x) 13 | XORQ AX, AX // i = 0 14 | PXOR X0, X0 // p_sum_i = 0 15 | PXOR X1, X1 16 | PXOR X2, X2 17 | PXOR X3, X3 18 | PXOR X4, X4 19 | PXOR X5, X5 20 | PXOR X6, X6 21 | PXOR X7, X7 22 | CMPQ CX, $0 // if CX == 0 { return 0 } 23 | JE absum_end 24 | MOVQ CX, BX 25 | ANDQ $7, BX // BX = len(x) % 8 26 | SHRQ $3, CX // CX = floor( len(x) / 8 ) 27 | JZ absum_tail_start // if CX == 0 { goto absum_tail_start } 28 | 29 | absum_loop: // do { 30 | // p_sum += max( p_sum + x[i], p_sum - x[i] ) 31 | MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1] 32 | MOVUPS 16(SI)(AX*8), X9 33 | MOVUPS 32(SI)(AX*8), X10 34 | MOVUPS 48(SI)(AX*8), X11 35 | ADDPD X8, X0 // p_sum_i += X_i ( positive values ) 36 | ADDPD X9, X2 37 | ADDPD X10, X4 38 | ADDPD X11, X6 39 | SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) 40 | SUBPD X9, X3 41 | SUBPD X10, X5 42 | SUBPD X11, X7 43 | MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) 44 | MAXPD X3, X2 45 | MAXPD X5, X4 46 | MAXPD X7, X6 47 | MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i 48 | MOVAPS X2, X3 49 | MOVAPS X4, X5 50 | MOVAPS X6, X7 51 | ADDQ $8, AX // i += 8 52 | LOOP absum_loop // } while --CX > 0 53 | 54 | // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) 55 | ADDPD X3, X0 56 | ADDPD X5, X7 57 | ADDPD X7, X0 58 | 59 | // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] 60 | MOVAPS X0, X1 61 | SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) 62 | ADDSD X1, X0 63 | CMPQ BX, $0 64 | JE absum_end // if BX == 0 { goto absum_end } 65 | 66 | absum_tail_start: // Reset loop registers 67 | MOVQ BX, CX // Loop counter: CX = BX 68 | XORPS X8, X8 // X_8 = 0 69 | 70 | absum_tail: // do { 71 | // p_sum += max( p_sum + x[i], p_sum - x[i] ) 72 | MOVSD (SI)(AX*8), X8 // X_8 = x[i] 73 | MOVSD X0, X1 // p_sum_1 = p_sum_0 74 | ADDSD X8, X0 // p_sum_0 += X_8 75 | SUBSD X8, X1 // p_sum_1 -= X_8 76 | MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) 77 | INCQ AX // i++ 78 | LOOP absum_tail // } while --CX > 0 79 | 80 | absum_end: // return p_sum_0 81 | MOVSD X0, sum+24(FP) 82 | RET 83 | -------------------------------------------------------------------------------- /asm/f64/abssuminc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
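Two details in the strided L1NormInc below: x86 scaled-index addressing only supports factors of 1, 2, 4 and 8, so the kernel materializes DX = 3*incX bytes (the IMULQ $3) to reach the fourth element of each unrolled group, and each MOVSD/MOVHPD pair packs two strided float64 values into one XMM register so the same ADDPD/SUBPD/MAXPD recurrence applies. What it computes, in plain Go:

var sum float64
for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
	sum += math.Abs(x[ix])
}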
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func L1NormInc(x []float64, n, incX int) (sum float64) 10 | TEXT ·L1NormInc(SB), NOSPLIT, $0 11 | MOVQ x_base+0(FP), SI // SI = &x 12 | MOVQ n+24(FP), CX // CX = n 13 | MOVQ incX+32(FP), AX // AX = increment * sizeof( float64 ) 14 | SHLQ $3, AX 15 | MOVQ AX, DX // DX = AX * 3 16 | IMULQ $3, DX 17 | PXOR X0, X0 // p_sum_i = 0 18 | PXOR X1, X1 19 | PXOR X2, X2 20 | PXOR X3, X3 21 | PXOR X4, X4 22 | PXOR X5, X5 23 | PXOR X6, X6 24 | PXOR X7, X7 25 | CMPQ CX, $0 // if CX == 0 { return 0 } 26 | JE absum_end 27 | MOVQ CX, BX 28 | ANDQ $7, BX // BX = n % 8 29 | SHRQ $3, CX // CX = floor( n / 8 ) 30 | JZ absum_tail_start // if CX == 0 { goto absum_tail_start } 31 | 32 | absum_loop: // do { 33 | // p_sum = max( p_sum + x[i], p_sum - x[i] ) 34 | MOVSD (SI), X8 // X_i[0] = x[i] 35 | MOVSD (SI)(AX*1), X9 36 | MOVSD (SI)(AX*2), X10 37 | MOVSD (SI)(DX*1), X11 38 | LEAQ (SI)(AX*4), SI // SI = SI + 4 39 | MOVHPD (SI), X8 // X_i[1] = x[i+4] 40 | MOVHPD (SI)(AX*1), X9 41 | MOVHPD (SI)(AX*2), X10 42 | MOVHPD (SI)(DX*1), X11 43 | ADDPD X8, X0 // p_sum_i += X_i ( positive values ) 44 | ADDPD X9, X2 45 | ADDPD X10, X4 46 | ADDPD X11, X6 47 | SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) 48 | SUBPD X9, X3 49 | SUBPD X10, X5 50 | SUBPD X11, X7 51 | MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) 52 | MAXPD X3, X2 53 | MAXPD X5, X4 54 | MAXPD X7, X6 55 | MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i 56 | MOVAPS X2, X3 57 | MOVAPS X4, X5 58 | MOVAPS X6, X7 59 | LEAQ (SI)(AX*4), SI // SI = SI + 4 60 | LOOP absum_loop // } while --CX > 0 61 | 62 | // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) 63 | ADDPD X3, X0 64 | ADDPD X5, X7 65 | ADDPD X7, X0 66 | 67 | // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] 68 | MOVAPS X0, X1 69 | SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) 70 | ADDSD X1, X0 71 | CMPQ BX, $0 72 | JE absum_end // if BX == 0 { goto absum_end } 73 | 74 | absum_tail_start: // Reset loop registers 75 | MOVQ BX, CX // Loop counter: CX = BX 76 | XORPS X8, X8 // X_8 = 0 77 | 78 | absum_tail: // do { 79 | // p_sum += max( p_sum + x[i], p_sum - x[i] ) 80 | MOVSD (SI), X8 // X_8 = x[i] 81 | MOVSD X0, X1 // p_sum_1 = p_sum_0 82 | ADDSD X8, X0 // p_sum_0 += X_8 83 | SUBSD X8, X1 // p_sum_1 -= X_8 84 | MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) 85 | ADDQ AX, SI // i++ 86 | LOOP absum_tail // } while --CX > 0 87 | 88 | absum_end: // return p_sum_0 89 | MOVSD X0, sum+40(FP) 90 | RET 91 | -------------------------------------------------------------------------------- /asm/f64/add_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
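Add below is the in-place elementwise sum dst[i] += s[i] over min(len(dst), len(s)) elements (the CMOVQLE clamps to the shorter operand). It trims at most one leading element because the unrolled loop feeds dst straight to ADDPD as a memory operand, and legacy SSE memory operands must be 16-byte aligned; a []float64 is 8-byte aligned, so one element always suffices. The pure-Go equivalent:

n := len(dst)
if len(s) < n {
	n = len(s)
}
for i := 0; i < n; i++ {
	dst[i] += s[i]
}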
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func Add(dst, s []float64) 10 | TEXT ·Add(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ dst_len+8(FP), CX // CX = len(dst) 13 | MOVQ s_base+24(FP), SI // SI = &s 14 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 15 | CMOVQLE s_len+32(FP), CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE add_end 18 | XORQ AX, AX 19 | MOVQ DI, BX 20 | ANDQ $0x0F, BX // BX = &dst & 15 21 | JZ add_no_trim // if BX == 0 { goto add_no_trim } 22 | 23 | // Align on 16-byte boundary 24 | MOVSD (SI)(AX*8), X0 // X0 = s[i] 25 | ADDSD (DI)(AX*8), X0 // X0 += dst[i] 26 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 27 | INCQ AX // i++ 28 | DECQ CX // --CX 29 | JE add_end // if CX == 0 { return } 30 | 31 | add_no_trim: 32 | MOVQ CX, BX 33 | ANDQ $7, BX // BX = len(dst) % 8 34 | SHRQ $3, CX // CX = floor( len(dst) / 8 ) 35 | JZ add_tail_start // if CX == 0 { goto add_tail_start } 36 | 37 | add_loop: // Loop unrolled 8x do { 38 | MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] 39 | MOVUPS 16(SI)(AX*8), X1 40 | MOVUPS 32(SI)(AX*8), X2 41 | MOVUPS 48(SI)(AX*8), X3 42 | ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1] 43 | ADDPD 16(DI)(AX*8), X1 44 | ADDPD 32(DI)(AX*8), X2 45 | ADDPD 48(DI)(AX*8), X3 46 | MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i 47 | MOVUPS X1, 16(DI)(AX*8) 48 | MOVUPS X2, 32(DI)(AX*8) 49 | MOVUPS X3, 48(DI)(AX*8) 50 | ADDQ $8, AX // i += 8 51 | LOOP add_loop // } while --CX > 0 52 | CMPQ BX, $0 // if BX == 0 { return } 53 | JE add_end 54 | 55 | add_tail_start: // Reset loop registers 56 | MOVQ BX, CX // Loop counter: CX = BX 57 | 58 | add_tail: // do { 59 | MOVSD (SI)(AX*8), X0 // X0 = s[i] 60 | ADDSD (DI)(AX*8), X0 // X0 += dst[i] 61 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 62 | INCQ AX // ++i 63 | LOOP add_tail // } while --CX > 0 64 | 65 | add_end: 66 | RET 67 | -------------------------------------------------------------------------------- /asm/f64/addconst_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
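Add shows the three-phase shape most unitary kernels in this package share: peel at most one element so later packed memory operands are 16-byte aligned, run an 8-wide unrolled body, then drain the len%8 leftovers one scalar at a time. The same decomposition in Go (illustrative sketch, alignment peel omitted):

```go
package f64sketch

// addSketch: clamp to min(len(dst), len(s)), 8-wide main block, scalar tail.
func addSketch(dst, s []float64) {
	n := len(dst)
	if len(s) < n {
		n = len(s)
	}
	i := 0
	for ; i+8 <= n; i += 8 { // add_loop: unrolled 8x in the assembly
		for j := 0; j < 8; j++ {
			dst[i+j] += s[i+j]
		}
	}
	for ; i < n; i++ { // add_tail
		dst[i] += s[i]
	}
}
```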
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func Addconst(alpha float64, x []float64) 10 | TEXT ·AddConst(SB), NOSPLIT, $0 11 | MOVQ x_base+8(FP), SI // SI = &x 12 | MOVQ x_len+16(FP), CX // CX = len(x) 13 | CMPQ CX, $0 // if len(x) == 0 { return } 14 | JE ac_end 15 | MOVSD alpha+0(FP), X4 // X4 = { a, a } 16 | SHUFPD $0, X4, X4 17 | MOVUPS X4, X5 // X5 = X4 18 | XORQ AX, AX // i = 0 19 | MOVQ CX, BX 20 | ANDQ $7, BX // BX = len(x) % 8 21 | SHRQ $3, CX // CX = floor( len(x) / 8 ) 22 | JZ ac_tail_start // if CX == 0 { goto ac_tail_start } 23 | 24 | ac_loop: // Loop unrolled 8x do { 25 | MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] 26 | MOVUPS 16(SI)(AX*8), X1 27 | MOVUPS 32(SI)(AX*8), X2 28 | MOVUPS 48(SI)(AX*8), X3 29 | ADDPD X4, X0 // X_i += a 30 | ADDPD X5, X1 31 | ADDPD X4, X2 32 | ADDPD X5, X3 33 | MOVUPS X0, (SI)(AX*8) // s[i:i+1] = X_i 34 | MOVUPS X1, 16(SI)(AX*8) 35 | MOVUPS X2, 32(SI)(AX*8) 36 | MOVUPS X3, 48(SI)(AX*8) 37 | ADDQ $8, AX // i += 8 38 | LOOP ac_loop // } while --CX > 0 39 | CMPQ BX, $0 // if BX == 0 { return } 40 | JE ac_end 41 | 42 | ac_tail_start: // Reset loop counters 43 | MOVQ BX, CX // Loop counter: CX = BX 44 | 45 | ac_tail: // do { 46 | MOVSD (SI)(AX*8), X0 // X0 = s[i] 47 | ADDSD X4, X0 // X0 += a 48 | MOVSD X0, (SI)(AX*8) // s[i] = X0 49 | INCQ AX // ++i 50 | LOOP ac_tail // } while --CX > 0 51 | 52 | ac_end: 53 | RET 54 | -------------------------------------------------------------------------------- /asm/f64/asm_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f64 6 | 7 | import ( 8 | "math" 9 | "math/rand" 10 | "testing" 11 | ) 12 | 13 | var ( 14 | nan = math.NaN() 15 | inf = math.Inf(1) 16 | ) 17 | 18 | // newGuardedVector allocates a new slice and returns it as three subslices. 19 | // v is a strided vector that contains elements of data at indices i*inc and 20 | // NaN elsewhere. frontGuard and backGuard are filled with NaN values, and 21 | // their backing arrays are directly adjacent to v in memory. The three slices 22 | // can be used to detect invalid memory reads and writes. 23 | func newGuardedVector(data []float64, inc int) (v, frontGuard, backGuard []float64) { 24 | if inc < 0 { 25 | inc = -inc 26 | } 27 | guard := 2 * inc 28 | size := (len(data)-1)*inc + 1 29 | whole := make([]float64, size+2*guard) 30 | v = whole[guard : len(whole)-guard] 31 | for i := range whole { 32 | whole[i] = math.NaN() 33 | } 34 | for i, d := range data { 35 | v[i*inc] = d 36 | } 37 | return v, whole[:guard], whole[len(whole)-guard:] 38 | } 39 | 40 | // allNaN returns true if x contains only NaN values, and false otherwise. 41 | func allNaN(x []float64) bool { 42 | for _, v := range x { 43 | if !math.IsNaN(v) { 44 | return false 45 | } 46 | } 47 | return true 48 | } 49 | 50 | // equalStrided returns true if the strided vector x contains elements of the 51 | // dense vector ref at indices i*inc, false otherwise. 52 | func equalStrided(ref, x []float64, inc int) bool { 53 | if inc < 0 { 54 | inc = -inc 55 | } 56 | for i, v := range ref { 57 | if !same(x[i*inc], v) { 58 | return false 59 | } 60 | } 61 | return true 62 | } 63 | 64 | // nonStridedWrite returns false if all elements of x at non-stride indices are 65 | // equal to NaN, true otherwise. 
66 | func nonStridedWrite(x []float64, inc int) bool { 67 | if inc < 0 { 68 | inc = -inc 69 | } 70 | for i, v := range x { 71 | if i%inc != 0 && !math.IsNaN(v) { 72 | return true 73 | } 74 | } 75 | return false 76 | } 77 | 78 | // guardVector copies the source vector (vec) into a new slice with guards. 79 | // Guards guarded[:gdLn] and guarded[len-gdLn:] will be filled with sigil value gdVal. 80 | func guardVector(vec []float64, gdVal float64, gdLn int) (guarded []float64) { 81 | guarded = make([]float64, len(vec)+gdLn*2) 82 | copy(guarded[gdLn:], vec) 83 | for i := 0; i < gdLn; i++ { 84 | guarded[i] = gdVal 85 | guarded[len(guarded)-1-i] = gdVal 86 | } 87 | return guarded 88 | } 89 | 90 | // isValidGuard will test for violated guards, generated by guardVector. 91 | func isValidGuard(vec []float64, gdVal float64, gdLn int) bool { 92 | for i := 0; i < gdLn; i++ { 93 | if !same(vec[i], gdVal) || !same(vec[len(vec)-1-i], gdVal) { 94 | return false 95 | } 96 | } 97 | return true 98 | } 99 | 100 | // guardIncVector copies the source vector (vec) into a new incremented slice with guards. 101 | // End guards will be length gdLen. 102 | // Internal and end guards will be filled with sigil value gdVal. 103 | func guardIncVector(vec []float64, gdVal float64, inc, gdLen int) (guarded []float64) { 104 | if inc < 0 { 105 | inc = -inc 106 | } 107 | inrLen := len(vec) * inc 108 | guarded = make([]float64, inrLen+gdLen*2) 109 | for i := range guarded { 110 | guarded[i] = gdVal 111 | } 112 | for i, v := range vec { 113 | guarded[gdLen+i*inc] = v 114 | } 115 | return guarded 116 | } 117 | 118 | // checkValidIncGuard will test for violated guards, generated by guardIncVector 119 | func checkValidIncGuard(t *testing.T, vec []float64, gdVal float64, inc, gdLen int) { 120 | srcLn := len(vec) - 2*gdLen 121 | for i := range vec { 122 | switch { 123 | case same(vec[i], gdVal): 124 | // Correct value 125 | case (i-gdLen)%inc == 0 && (i-gdLen)/inc < len(vec): 126 | // Ignore input values 127 | case i < gdLen: 128 | t.Errorf("Front guard violated at %d %v", i, vec[:gdLen]) 129 | case i > gdLen+srcLn: 130 | t.Errorf("Back guard violated at %d %v", i-gdLen-srcLn, vec[gdLen+srcLn:]) 131 | default: 132 | t.Errorf("Internal guard violated at %d %v", i-gdLen, vec[gdLen:gdLen+srcLn]) 133 | } 134 | } 135 | } 136 | 137 | // same tests for nan-aware equality. 138 | func same(a, b float64) bool { 139 | return a == b || (math.IsNaN(a) && math.IsNaN(b)) 140 | } 141 | 142 | var ( // Offset sets for testing alignment handling in Unitary assembly functions. 143 | align1 = []int{0, 1} 144 | align2 = newIncSet(0, 1) 145 | align3 = newIncToSet(0, 1) 146 | ) 147 | 148 | type incSet struct { 149 | x, y int 150 | } 151 | 152 | // genInc will generate all (x,y) combinations of the input increment set. 153 | func newIncSet(inc ...int) []incSet { 154 | n := len(inc) 155 | is := make([]incSet, n*n) 156 | for x := range inc { 157 | for y := range inc { 158 | is[x*n+y] = incSet{inc[x], inc[y]} 159 | } 160 | } 161 | return is 162 | } 163 | 164 | type incToSet struct { 165 | dst, x, y int 166 | } 167 | 168 | // genIncTo will generate all (dst,x,y) combinations of the input increment set. 
169 | func newIncToSet(inc ...int) []incToSet { 170 | n := len(inc) 171 | is := make([]incToSet, n*n*n) 172 | for i, dst := range inc { 173 | for x := range inc { 174 | for y := range inc { 175 | is[i*n*n+x*n+y] = incToSet{dst, inc[x], inc[y]} 176 | } 177 | } 178 | } 179 | return is 180 | } 181 | 182 | var benchSink []float64 183 | 184 | func randomSlice(n, inc int) []float64 { 185 | if inc < 0 { 186 | inc = -inc 187 | } 188 | x := make([]float64, (n-1)*inc+1) 189 | for i := range x { 190 | x[i] = rand.Float64() 191 | } 192 | return x 193 | } 194 | 195 | func randSlice(n, inc int, r *rand.Rand) []float64 { 196 | if inc < 0 { 197 | inc = -inc 198 | } 199 | x := make([]float64, (n-1)*inc+1) 200 | for i := range x { 201 | x[i] = r.Float64() 202 | } 203 | return x 204 | } 205 | -------------------------------------------------------------------------------- /asm/f64/axpy.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha float64, x, y []float64) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/f64/axpy_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f64 6 | 7 | import ( 8 | "fmt" 9 | "testing" 10 | ) 11 | 12 | const ( 13 | msgVal = "%v: unexpected value at %v Got: %v Expected: %v" 14 | msgGuard = "%v: Guard violated in %s vector %v %v" 15 | ) 16 | 17 | var axpyTests = []struct { 18 | alpha float64 19 | x []float64 20 | y []float64 21 | want []float64 22 | wantRev []float64 // Result when x is traversed in reverse direction. 
23 | }{ 24 | { 25 | alpha: 0, 26 | x: []float64{}, 27 | y: []float64{}, 28 | want: []float64{}, 29 | wantRev: []float64{}, 30 | }, 31 | { 32 | alpha: 0, 33 | x: []float64{2}, 34 | y: []float64{-3}, 35 | want: []float64{-3}, 36 | wantRev: []float64{-3}, 37 | }, 38 | { 39 | alpha: 1, 40 | x: []float64{2}, 41 | y: []float64{-3}, 42 | want: []float64{-1}, 43 | wantRev: []float64{-1}, 44 | }, 45 | { 46 | alpha: 3, 47 | x: []float64{2}, 48 | y: []float64{-3}, 49 | want: []float64{3}, 50 | wantRev: []float64{3}, 51 | }, 52 | { 53 | alpha: -3, 54 | x: []float64{2}, 55 | y: []float64{-3}, 56 | want: []float64{-9}, 57 | wantRev: []float64{-9}, 58 | }, 59 | { 60 | alpha: 1, 61 | x: []float64{1, 5}, 62 | y: []float64{2, -3}, 63 | want: []float64{3, 2}, 64 | wantRev: []float64{7, -2}, 65 | }, 66 | { 67 | alpha: 1, 68 | x: []float64{2, 3, 4}, 69 | y: []float64{-3, -2, -1}, 70 | want: []float64{-1, 1, 3}, 71 | wantRev: []float64{1, 1, 1}, 72 | }, 73 | { 74 | alpha: 0, 75 | x: []float64{0, 0, 1, 1, 2, -3, -4}, 76 | y: []float64{0, 1, 0, 3, -4, 5, -6}, 77 | want: []float64{0, 1, 0, 3, -4, 5, -6}, 78 | wantRev: []float64{0, 1, 0, 3, -4, 5, -6}, 79 | }, 80 | { 81 | alpha: 1, 82 | x: []float64{0, 0, 1, 1, 2, -3, -4}, 83 | y: []float64{0, 1, 0, 3, -4, 5, -6}, 84 | want: []float64{0, 1, 1, 4, -2, 2, -10}, 85 | wantRev: []float64{-4, -2, 2, 4, -3, 5, -6}, 86 | }, 87 | { 88 | alpha: 3, 89 | x: []float64{0, 0, 1, 1, 2, -3, -4}, 90 | y: []float64{0, 1, 0, 3, -4, 5, -6}, 91 | want: []float64{0, 1, 3, 6, 2, -4, -18}, 92 | wantRev: []float64{-12, -8, 6, 6, -1, 5, -6}, 93 | }, 94 | { 95 | alpha: -3, 96 | x: []float64{0, 0, 1, 1, 2, -3, -4, 0, 0, 1, 1, 2, -3, -4}, 97 | y: []float64{0, 1, 0, 3, -4, 5, -6, 0, 1, 0, 3, -4, 5, -6}, 98 | want: []float64{0, 1, -3, 0, -10, 14, 6, 0, 1, -3, 0, -10, 14, 6}, 99 | wantRev: []float64{12, 10, -6, 0, -7, 5, -6, 12, 10, -6, 0, -7, 5, -6}, 100 | }, 101 | { 102 | alpha: -5, 103 | x: []float64{0, 0, 1, 1, 2, -3, -4, 5, 1, 2, -3, -4, 5}, 104 | y: []float64{0, 1, 0, 3, -4, 5, -6, 7, 3, -4, 5, -6, 7}, 105 | want: []float64{0, 1, -5, -2, -14, 20, 14, -18, -2, -14, 20, 14, -18}, 106 | wantRev: []float64{-25, 21, 15, -7, -9, -20, 14, 22, -7, -9, 0, -6, 7}, 107 | }, 108 | } 109 | 110 | func TestAxpyUnitary(t *testing.T) { 111 | const xGdVal, yGdVal = -1, 0.5 112 | for i, test := range axpyTests { 113 | for _, align := range align2 { 114 | prefix := fmt.Sprintf("Test %v (x:%v y:%v)", i, align.x, align.y) 115 | xgLn, ygLn := 4+align.x, 4+align.y 116 | xg, yg := guardVector(test.x, xGdVal, xgLn), guardVector(test.y, yGdVal, ygLn) 117 | x, y := xg[xgLn:len(xg)-xgLn], yg[ygLn:len(yg)-ygLn] 118 | AxpyUnitary(test.alpha, x, y) 119 | for i := range test.want { 120 | if !same(y[i], test.want[i]) { 121 | t.Errorf(msgVal, prefix, i, y[i], test.want[i]) 122 | } 123 | } 124 | if !isValidGuard(xg, xGdVal, xgLn) { 125 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 126 | } 127 | if !isValidGuard(yg, yGdVal, ygLn) { 128 | t.Errorf(msgGuard, prefix, "y", yg[:ygLn], yg[len(yg)-ygLn:]) 129 | } 130 | if !equalStrided(test.x, x, 1) { 131 | t.Errorf("%v: modified read-only x argument", prefix) 132 | } 133 | } 134 | } 135 | } 136 | 137 | func TestAxpyUnitaryTo(t *testing.T) { 138 | const dstGdVal, xGdVal, yGdVal = 1, -1, 0.5 139 | for i, test := range axpyTests { 140 | for _, align := range align3 { 141 | prefix := fmt.Sprintf("Test %v (x:%v y:%v dst:%v)", i, align.x, align.y, align.dst) 142 | 143 | dgLn, xgLn, ygLn := 4+align.dst, 4+align.x, 4+align.y 144 | dstOrig := make([]float64, len(test.x)) 145 
| xg, yg := guardVector(test.x, xGdVal, xgLn), guardVector(test.y, yGdVal, ygLn) 146 | dstg := guardVector(dstOrig, dstGdVal, dgLn) 147 | x, y := xg[xgLn:len(xg)-xgLn], yg[ygLn:len(yg)-ygLn] 148 | dst := dstg[dgLn : len(dstg)-dgLn] 149 | 150 | AxpyUnitaryTo(dst, test.alpha, x, y) 151 | for i := range test.want { 152 | if !same(dst[i], test.want[i]) { 153 | t.Errorf(msgVal, prefix, i, dst[i], test.want[i]) 154 | } 155 | } 156 | if !isValidGuard(xg, xGdVal, xgLn) { 157 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 158 | } 159 | if !isValidGuard(yg, yGdVal, ygLn) { 160 | t.Errorf(msgGuard, prefix, "y", yg[:ygLn], yg[len(yg)-ygLn:]) 161 | } 162 | if !isValidGuard(dstg, dstGdVal, dgLn) { 163 | t.Errorf(msgGuard, prefix, "dst", dstg[:dgLn], dstg[len(dstg)-dgLn:]) 164 | } 165 | if !equalStrided(test.x, x, 1) { 166 | t.Errorf("%v: modified read-only x argument", prefix) 167 | } 168 | if !equalStrided(test.y, y, 1) { 169 | t.Errorf("%v: modified read-only y argument", prefix) 170 | } 171 | } 172 | } 173 | } 174 | 175 | func TestAxpyInc(t *testing.T) { 176 | const xGdVal, yGdVal = -1, 0.5 177 | gdLn := 4 178 | for i, test := range axpyTests { 179 | n := len(test.x) 180 | for _, inc := range newIncSet(-7, -4, -3, -2, -1, 1, 2, 3, 4, 7) { 181 | var ix, iy int 182 | if inc.x < 0 { 183 | ix = (-n + 1) * inc.x 184 | } 185 | if inc.y < 0 { 186 | iy = (-n + 1) * inc.y 187 | } 188 | prefix := fmt.Sprintf("test %v, inc.x = %v, inc.y = %v", i, inc.x, inc.y) 189 | xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) 190 | yg := guardIncVector(test.y, yGdVal, inc.y, gdLn) 191 | x, y := xg[gdLn:len(xg)-gdLn], yg[gdLn:len(yg)-gdLn] 192 | 193 | AxpyInc(test.alpha, x, y, uintptr(n), 194 | uintptr(inc.x), uintptr(inc.y), uintptr(ix), uintptr(iy)) 195 | 196 | want := test.want 197 | if inc.x*inc.y < 0 { 198 | want = test.wantRev 199 | } 200 | if inc.y < 0 { 201 | inc.y = -inc.y 202 | } 203 | for i := range want { 204 | if !same(y[i*inc.y], want[i]) { 205 | t.Errorf(msgVal, prefix, i, y[iy+i*inc.y], want[i]) 206 | } 207 | } 208 | if !equalStrided(test.x, x, inc.x) { 209 | t.Errorf("%v: modified read-only x argument", prefix) 210 | } 211 | checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) 212 | checkValidIncGuard(t, yg, yGdVal, inc.y, gdLn) 213 | } 214 | } 215 | } 216 | 217 | func TestAxpyIncTo(t *testing.T) { 218 | const dstGdVal, xGdVal, yGdVal = 1, -1, 0.5 219 | var want []float64 220 | gdLn := 4 221 | for i, test := range axpyTests { 222 | n := len(test.x) 223 | for _, inc := range newIncToSet(-7, -4, -3, -2, -1, 1, 2, 3, 4, 7) { 224 | var ix, iy, idst uintptr 225 | if inc.x < 0 { 226 | ix = uintptr((-n + 1) * inc.x) 227 | } 228 | if inc.y < 0 { 229 | iy = uintptr((-n + 1) * inc.y) 230 | } 231 | if inc.dst < 0 { 232 | idst = uintptr((-n + 1) * inc.dst) 233 | } 234 | 235 | prefix := fmt.Sprintf("Test %v: (x: %v, y: %v, dst:%v)", i, inc.x, inc.y, inc.dst) 236 | dstOrig := make([]float64, len(test.want)) 237 | xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) 238 | yg := guardIncVector(test.y, yGdVal, inc.y, gdLn) 239 | dstg := guardIncVector(dstOrig, dstGdVal, inc.dst, gdLn) 240 | x, y := xg[gdLn:len(xg)-gdLn], yg[gdLn:len(yg)-gdLn] 241 | dst := dstg[gdLn : len(dstg)-gdLn] 242 | 243 | AxpyIncTo(dst, uintptr(inc.dst), idst, 244 | test.alpha, x, y, uintptr(n), 245 | uintptr(inc.x), uintptr(inc.y), ix, iy) 246 | want = test.want 247 | if inc.x*inc.y < 0 { 248 | want = test.wantRev 249 | } 250 | var iW, incW int = 0, 1 251 | if inc.y*inc.dst < 0 { 252 | iW, incW = len(want)-1, -1 253 | } 254 | if inc.dst < 0 
{ 255 | inc.dst = -inc.dst 256 | } 257 | for i := range want { 258 | if !same(dst[i*inc.dst], want[iW+i*incW]) { 259 | t.Errorf(msgVal, prefix, i, dst[i*inc.dst], want[iW+i*incW]) 260 | } 261 | } 262 | 263 | checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) 264 | checkValidIncGuard(t, yg, yGdVal, inc.y, gdLn) 265 | checkValidIncGuard(t, dstg, dstGdVal, inc.dst, gdLn) 266 | if !equalStrided(test.x, x, inc.x) { 267 | t.Errorf("%v: modified read-only x argument", prefix) 268 | } 269 | if !equalStrided(test.y, y, inc.y) { 270 | t.Errorf("%v: modified read-only y argument", prefix) 271 | } 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /asm/f64/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
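The ix/iy values computed in the tests above, and consumed by the AxpyInc kernel that follows, use the BLAS start-index convention: a negative increment begins at the far end of the vector. A small sketch of that rule (hypothetical helper):

```go
package f64sketch

// startIndex returns the first logical index for a vector of n elements
// traversed with increment inc; a negative inc starts at the end.
func startIndex(n, inc int) int {
	if inc < 0 {
		return (-n + 1) * inc
	}
	return 0
}

// startIndex(4, -1) == 3, so traversal visits x[3], x[2], x[1], x[0].
```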
36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DI 43 | #define DST_PTR DI 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define INC_X R8 48 | #define INCx3_X R11 49 | #define INC_Y R9 50 | #define INCx3_Y R12 51 | #define INC_DST R9 52 | #define INCx3_DST R12 53 | #define ALPHA X0 54 | #define ALPHA_2 X1 55 | 56 | // func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 57 | TEXT ·AxpyInc(SB), NOSPLIT, $0 58 | MOVQ x_base+8(FP), X_PTR // X_PTR = &x 59 | MOVQ y_base+32(FP), Y_PTR // Y_PTR = &y 60 | MOVQ n+56(FP), LEN // LEN = n 61 | CMPQ LEN, $0 // if LEN == 0 { return } 62 | JE end 63 | 64 | MOVQ ix+80(FP), INC_X 65 | MOVQ iy+88(FP), INC_Y 66 | LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) 67 | LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) 68 | MOVQ Y_PTR, DST_PTR // DST_PTR = Y_PTR // Write pointer 69 | 70 | MOVQ incX+64(FP), INC_X // INC_X = incX * sizeof(float64) 71 | SHLQ $3, INC_X 72 | MOVQ incY+72(FP), INC_Y // INC_Y = incY * sizeof(float64) 73 | SHLQ $3, INC_Y 74 | 75 | MOVSD alpha+0(FP), ALPHA // ALPHA = alpha 76 | MOVQ LEN, TAIL 77 | ANDQ $3, TAIL // TAIL = n % 4 78 | SHRQ $2, LEN // LEN = floor( n / 4 ) 79 | JZ tail_start // if LEN == 0 { goto tail_start } 80 | 81 | MOVAPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 82 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 83 | LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 84 | 85 | loop: // do { // y[i] += alpha * x[i] unrolled 4x. 86 | MOVSD (X_PTR), X2 // X_i = x[i] 87 | MOVSD (X_PTR)(INC_X*1), X3 88 | MOVSD (X_PTR)(INC_X*2), X4 89 | MOVSD (X_PTR)(INCx3_X*1), X5 90 | 91 | MULSD ALPHA, X2 // X_i *= a 92 | MULSD ALPHA_2, X3 93 | MULSD ALPHA, X4 94 | MULSD ALPHA_2, X5 95 | 96 | ADDSD (Y_PTR), X2 // X_i += y[i] 97 | ADDSD (Y_PTR)(INC_Y*1), X3 98 | ADDSD (Y_PTR)(INC_Y*2), X4 99 | ADDSD (Y_PTR)(INCx3_Y*1), X5 100 | 101 | MOVSD X2, (DST_PTR) // y[i] = X_i 102 | MOVSD X3, (DST_PTR)(INC_DST*1) 103 | MOVSD X4, (DST_PTR)(INC_DST*2) 104 | MOVSD X5, (DST_PTR)(INCx3_DST*1) 105 | 106 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 107 | LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) 108 | DECQ LEN 109 | JNZ loop // } while --LEN > 0 110 | CMPQ TAIL, $0 // if TAIL == 0 { return } 111 | JE end 112 | 113 | tail_start: // Reset Loop registers 114 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 115 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 116 | JZ tail_one 117 | 118 | tail_two: 119 | MOVSD (X_PTR), X2 // X_i = x[i] 120 | MOVSD (X_PTR)(INC_X*1), X3 121 | MULSD ALPHA, X2 // X_i *= a 122 | MULSD ALPHA, X3 123 | ADDSD (Y_PTR), X2 // X_i += y[i] 124 | ADDSD (Y_PTR)(INC_Y*1), X3 125 | MOVSD X2, (DST_PTR) // y[i] = X_i 126 | MOVSD X3, (DST_PTR)(INC_DST*1) 127 | 128 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 129 | LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) 130 | 131 | ANDQ $1, TAIL 132 | JZ end // if TAIL == 0 { goto end } 133 | 134 | tail_one: 135 | // y[i] += alpha * x[i] for the last n % 4 iterations. 136 | MOVSD (X_PTR), X2 // X2 = x[i] 137 | MULSD ALPHA, X2 // X2 *= a 138 | ADDSD (Y_PTR), X2 // X2 += y[i] 139 | MOVSD X2, (DST_PTR) // y[i] = X2 140 | 141 | end: 142 | RET 143 | -------------------------------------------------------------------------------- /asm/f64/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
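The frame offsets in the kernel below follow from Go's stack-based calling convention: every []float64 argument occupies three 8-byte words (base pointer, len, cap). Sketching that layout for AxpyIncTo's signature (the struct is illustrative, not a declared type in this package):

```go
package f64sketch

// sliceHeader is the 24-byte shape of a slice argument on the frame.
// It is why, for AxpyIncTo(dst, incDst, idst, alpha, x, y, ...), the
// assembly reads dst_base at +0(FP), incDst at +24(FP), idst at
// +32(FP), alpha at +40(FP), x_base at +48(FP), and y_base at +72(FP).
type sliceHeader struct {
	base uintptr // +0
	len  int     // +8
	cap  int     // +16
}
```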
36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DI 43 | #define DST_PTR DX 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define INC_X R8 48 | #define INCx3_X R11 49 | #define INC_Y R9 50 | #define INCx3_Y R12 51 | #define INC_DST R10 52 | #define INCx3_DST R13 53 | #define ALPHA X0 54 | #define ALPHA_2 X1 55 | 56 | // func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 57 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 58 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst 59 | MOVQ x_base+48(FP), X_PTR // X_PTR := &x 60 | MOVQ y_base+72(FP), Y_PTR // Y_PTR := &y 61 | MOVQ n+96(FP), LEN // LEN := n 62 | CMPQ LEN, $0 // if LEN == 0 { return } 63 | JE end 64 | 65 | MOVQ ix+120(FP), INC_X 66 | LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) 67 | MOVQ iy+128(FP), INC_Y 68 | LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) 69 | MOVQ idst+32(FP), INC_DST 70 | LEAQ (DST_PTR)(INC_DST*8), DST_PTR // DST_PTR = &(dst[idst]) 71 | 72 | MOVQ incX+104(FP), INC_X // INC_X = incX * sizeof(float64) 73 | SHLQ $3, INC_X 74 | MOVQ incY+112(FP), INC_Y // INC_Y = incY * sizeof(float64) 75 | SHLQ $3, INC_Y 76 | MOVQ incDst+24(FP), INC_DST // INC_DST = incDst * sizeof(float64) 77 | SHLQ $3, INC_DST 78 | MOVSD alpha+40(FP), ALPHA 79 | 80 | MOVQ LEN, TAIL 81 | ANDQ $3, TAIL // TAIL = n % 4 82 | SHRQ $2, LEN // LEN = floor( n / 4 ) 83 | JZ tail_start // if LEN == 0 { goto tail_start } 84 | 85 | MOVSD ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 86 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 87 | LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 88 | LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 89 | 90 | loop: // do { // dst[i] = alpha*x[i] + y[i] unrolled 4x.
91 | MOVSD (X_PTR), X2 // X_i = x[i] 92 | MOVSD (X_PTR)(INC_X*1), X3 93 | MOVSD (X_PTR)(INC_X*2), X4 94 | MOVSD (X_PTR)(INCx3_X*1), X5 95 | 96 | MULSD ALPHA, X2 // X_i *= a 97 | MULSD ALPHA_2, X3 98 | MULSD ALPHA, X4 99 | MULSD ALPHA_2, X5 100 | 101 | ADDSD (Y_PTR), X2 // X_i += y[i] 102 | ADDSD (Y_PTR)(INC_Y*1), X3 103 | ADDSD (Y_PTR)(INC_Y*2), X4 104 | ADDSD (Y_PTR)(INCx3_Y*1), X5 105 | 106 | MOVSD X2, (DST_PTR) // y[i] = X_i 107 | MOVSD X3, (DST_PTR)(INC_DST*1) 108 | MOVSD X4, (DST_PTR)(INC_DST*2) 109 | MOVSD X5, (DST_PTR)(INCx3_DST*1) 110 | 111 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 112 | LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) 113 | LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4] 114 | DECQ LEN 115 | JNZ loop // } while --LEN > 0 116 | CMPQ TAIL, $0 // if TAIL == 0 { return } 117 | JE end 118 | 119 | tail_start: // Reset Loop registers 120 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 121 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 122 | JZ tail_one 123 | 124 | tail_two: 125 | MOVSD (X_PTR), X2 // X_i = x[i] 126 | MOVSD (X_PTR)(INC_X*1), X3 127 | MULSD ALPHA, X2 // X_i *= a 128 | MULSD ALPHA, X3 129 | ADDSD (Y_PTR), X2 // X_i += y[i] 130 | ADDSD (Y_PTR)(INC_Y*1), X3 131 | MOVSD X2, (DST_PTR) // y[i] = X_i 132 | MOVSD X3, (DST_PTR)(INC_DST*1) 133 | 134 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 135 | LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) 136 | LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incY*2] 137 | 138 | ANDQ $1, TAIL 139 | JZ end // if TAIL == 0 { goto end } 140 | 141 | tail_one: 142 | MOVSD (X_PTR), X2 // X2 = x[i] 143 | MULSD ALPHA, X2 // X2 *= a 144 | ADDSD (Y_PTR), X2 // X2 += y[i] 145 | MOVSD X2, (DST_PTR) // y[i] = X2 146 | 147 | end: 148 | RET 149 | -------------------------------------------------------------------------------- /asm/f64/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DI 43 | #define DST_PTR DI 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define ALPHA X0 48 | #define ALPHA_2 X1 49 | 50 | // func AxpyUnitary(alpha float64, x, y []float64) 51 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 52 | MOVQ x_base+8(FP), X_PTR // X_PTR := &x 53 | MOVQ y_base+32(FP), Y_PTR // Y_PTR := &y 54 | MOVQ x_len+16(FP), LEN // LEN = min( len(x), len(y) ) 55 | CMPQ y_len+40(FP), LEN 56 | CMOVQLE y_len+40(FP), LEN 57 | CMPQ LEN, $0 // if LEN == 0 { return } 58 | JE end 59 | XORQ IDX, IDX 60 | MOVSD alpha+0(FP), ALPHA // ALPHA := { alpha, alpha } 61 | SHUFPD $0, ALPHA, ALPHA 62 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining 63 | MOVQ Y_PTR, TAIL // Check memory alignment 64 | ANDQ $15, TAIL // TAIL = &y % 16 65 | JZ no_trim // if TAIL == 0 { goto no_trim } 66 | 67 | // Align on 16-byte boundary 68 | MOVSD (X_PTR), X2 // X2 := x[0] 69 | MULSD ALPHA, X2 // X2 *= a 70 | ADDSD (Y_PTR), X2 // X2 += y[0] 71 | MOVSD X2, (DST_PTR) // y[0] = X2 72 | INCQ IDX // i++ 73 | DECQ LEN // LEN-- 74 | JZ end // if LEN == 0 { return } 75 | 76 | no_trim: 77 | MOVQ LEN, TAIL 78 | ANDQ $7, TAIL // TAIL := n % 8 79 | SHRQ $3, LEN // LEN = floor( n / 8 ) 80 | JZ tail_start // if LEN == 0 { goto tail2_start } 81 | 82 | loop: // do { 83 | // y[i] += alpha * x[i] unrolled 8x. 
84 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 85 | MOVUPS 16(X_PTR)(IDX*8), X3 86 | MOVUPS 32(X_PTR)(IDX*8), X4 87 | MOVUPS 48(X_PTR)(IDX*8), X5 88 | 89 | MULPD ALPHA, X2 // X_i *= a 90 | MULPD ALPHA_2, X3 91 | MULPD ALPHA, X4 92 | MULPD ALPHA_2, X5 93 | 94 | ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] 95 | ADDPD 16(Y_PTR)(IDX*8), X3 96 | ADDPD 32(Y_PTR)(IDX*8), X4 97 | ADDPD 48(Y_PTR)(IDX*8), X5 98 | 99 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i 100 | MOVUPS X3, 16(DST_PTR)(IDX*8) 101 | MOVUPS X4, 32(DST_PTR)(IDX*8) 102 | MOVUPS X5, 48(DST_PTR)(IDX*8) 103 | 104 | ADDQ $8, IDX // i += 8 105 | DECQ LEN 106 | JNZ loop // } while --LEN > 0 107 | CMPQ TAIL, $0 // if TAIL == 0 { return } 108 | JE end 109 | 110 | tail_start: // Reset loop registers 111 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 112 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 113 | JZ tail_one // if TAIL == 0 { goto tail } 114 | 115 | tail_two: // do { 116 | MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] 117 | MULPD ALPHA, X2 // X2 *= a 118 | ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] 119 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 120 | ADDQ $2, IDX // i += 2 121 | DECQ LEN 122 | JNZ tail_two // } while --LEN > 0 123 | 124 | ANDQ $1, TAIL 125 | JZ end // if TAIL == 0 { goto end } 126 | 127 | tail_one: 128 | MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] 129 | MULSD ALPHA, X2 // X2 *= a 130 | ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] 131 | MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 132 | 133 | end: 134 | RET 135 | -------------------------------------------------------------------------------- /asm/f64/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DX 43 | #define DST_PTR DI 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define ALPHA X0 48 | #define ALPHA_2 X1 49 | 50 | // func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) 51 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 52 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst 53 | MOVQ x_base+32(FP), X_PTR // X_PTR := &x 54 | MOVQ y_base+56(FP), Y_PTR // Y_PTR := &y 55 | MOVQ x_len+40(FP), LEN // LEN = min( len(x), len(y), len(dst) ) 56 | CMPQ y_len+64(FP), LEN 57 | CMOVQLE y_len+64(FP), LEN 58 | CMPQ dst_len+8(FP), LEN 59 | CMOVQLE dst_len+8(FP), LEN 60 | 61 | CMPQ LEN, $0 62 | JE end // if LEN == 0 { return } 63 | 64 | XORQ IDX, IDX // IDX = 0 65 | MOVSD alpha+24(FP), ALPHA 66 | SHUFPD $0, ALPHA, ALPHA // ALPHA := { alpha, alpha } 67 | MOVQ Y_PTR, TAIL // Check memory alignment 68 | ANDQ $15, TAIL // TAIL = &y % 16 69 | JZ no_trim // if TAIL == 0 { goto no_trim } 70 | 71 | // Align on 16-byte boundary 72 | MOVSD (X_PTR), X2 // X2 := x[0] 73 | MULSD ALPHA, X2 // X2 *= a 74 | ADDSD (Y_PTR), X2 // X2 += y[0] 75 | MOVSD X2, (DST_PTR) // y[0] = X2 76 | INCQ IDX // i++ 77 | DECQ LEN // LEN-- 78 | JZ end // if LEN == 0 { return } 79 | 80 | no_trim: 81 | MOVQ LEN, TAIL 82 | ANDQ $7, TAIL // TAIL := n % 8 83 | SHRQ $3, LEN // LEN = floor( n / 8 ) 84 | JZ tail_start // if LEN == 0 { goto tail_start } 85 | 86 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining 87 | 88 | loop: // do { 89 | // y[i] += alpha * x[i] unrolled 8x. 
90 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 91 | MOVUPS 16(X_PTR)(IDX*8), X3 92 | MOVUPS 32(X_PTR)(IDX*8), X4 93 | MOVUPS 48(X_PTR)(IDX*8), X5 94 | 95 | MULPD ALPHA, X2 // X_i *= alpha 96 | MULPD ALPHA_2, X3 97 | MULPD ALPHA, X4 98 | MULPD ALPHA_2, X5 99 | 100 | ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] 101 | ADDPD 16(Y_PTR)(IDX*8), X3 102 | ADDPD 32(Y_PTR)(IDX*8), X4 103 | ADDPD 48(Y_PTR)(IDX*8), X5 104 | 105 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i 106 | MOVUPS X3, 16(DST_PTR)(IDX*8) 107 | MOVUPS X4, 32(DST_PTR)(IDX*8) 108 | MOVUPS X5, 48(DST_PTR)(IDX*8) 109 | 110 | ADDQ $8, IDX // i += 8 111 | DECQ LEN 112 | JNZ loop // } while --LEN > 0 113 | CMPQ TAIL, $0 // if TAIL == 0 { return } 114 | JE end 115 | 116 | tail_start: // Reset loop registers 117 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 118 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 119 | JZ tail_one // if LEN == 0 { goto tail } 120 | 121 | tail_two: // do { 122 | MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] 123 | MULPD ALPHA, X2 // X2 *= alpha 124 | ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] 125 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 126 | ADDQ $2, IDX // i += 2 127 | DECQ LEN 128 | JNZ tail_two // } while --LEN > 0 129 | 130 | ANDQ $1, TAIL 131 | JZ end // if TAIL == 0 { goto end } 132 | 133 | tail_one: 134 | MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] 135 | MULSD ALPHA, X2 // X2 *= a 136 | ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] 137 | MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 138 | 139 | end: 140 | RET 141 | -------------------------------------------------------------------------------- /asm/f64/benchAxpy_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
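AxpyUnitary and AxpyUnitaryTo share the tail scheme just shown: the n%8 leftovers drain as floor(tail/2) two-lane steps, then at most one scalar step. In Go terms (sketch; i and tail are assumed already produced by the main loop):

```go
package f64sketch

// drainTail mirrors the tail_two/tail_one labels for dst = alpha*x + y.
func drainTail(dst []float64, alpha float64, x, y []float64, i, tail int) {
	for k := 0; k < tail/2; k++ { // tail_two: one 2-wide step
		dst[i] = alpha*x[i] + y[i]
		dst[i+1] = alpha*x[i+1] + y[i+1]
		i += 2
	}
	if tail&1 == 1 { // tail_one: final odd element
		dst[i] = alpha*x[i] + y[i]
	}
}
```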
4 | 5 | // +build go1.7 6 | 7 | package f64 8 | 9 | import ( 10 | "fmt" 11 | "testing" 12 | ) 13 | 14 | const ( 15 | testLen = 1e5 16 | ) 17 | 18 | var ( 19 | a = 2.0 20 | x = make([]float64, testLen) 21 | y = make([]float64, testLen) 22 | z = make([]float64, testLen) 23 | ) 24 | 25 | func init() { 26 | for n := range x { 27 | x[n] = float64(n) 28 | y[n] = float64(n) 29 | } 30 | } 31 | 32 | func BenchmarkAxpyUnitary(t *testing.B) { 33 | naiveaxpyu := func(a float64, x, y []float64) { 34 | for i, v := range x { 35 | y[i] += a * v 36 | } 37 | } 38 | tests := []struct { 39 | name string 40 | f func(a float64, x, y []float64) 41 | }{ 42 | {"AxpyUnitary", AxpyUnitary}, 43 | {"NaiveAxpyUnitary", naiveaxpyu}, 44 | } 45 | for _, test := range tests { 46 | for _, ln := range []uintptr{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5} { 47 | t.Run(fmt.Sprintf("%s-%d", test.name, ln), func(b *testing.B) { 48 | b.SetBytes(int64(64 * ln)) 49 | x, y := x[:ln], y[:ln] 50 | b.ResetTimer() 51 | for i := 0; i < b.N; i++ { 52 | test.f(a, x, y) 53 | } 54 | }) 55 | } 56 | } 57 | } 58 | 59 | func BenchmarkAxpyUnitaryTo(t *testing.B) { 60 | naiveaxpyut := func(d []float64, a float64, x, y []float64) { 61 | for i, v := range x { 62 | d[i] = y[i] + a*v 63 | } 64 | } 65 | tests := []struct { 66 | name string 67 | f func(z []float64, a float64, x, y []float64) 68 | }{ 69 | {"AxpyUnitaryTo", AxpyUnitaryTo}, 70 | {"NaiveAxpyUnitaryTo", naiveaxpyut}, 71 | } 72 | for _, test := range tests { 73 | for _, ln := range []uintptr{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5} { 74 | t.Run(fmt.Sprintf("%s-%d", test.name, ln), func(b *testing.B) { 75 | b.SetBytes(int64(64 * ln)) 76 | x, y, z := x[:ln], y[:ln], z[:ln] 77 | b.ResetTimer() 78 | for i := 0; i < b.N; i++ { 79 | test.f(z, a, x, y) 80 | } 81 | }) 82 | } 83 | } 84 | } 85 | 86 | var incsAxpy = []struct { 87 | len uintptr 88 | inc []int 89 | }{ 90 | {1, []int{1}}, 91 | {2, []int{1, 2, 4, 10}}, 92 | {3, []int{1, 2, 4, 10}}, 93 | {4, []int{1, 2, 4, 10}}, 94 | {5, []int{1, 2, 4, 10}}, 95 | {10, []int{1, 2, 4, 10}}, 96 | {500, []int{1, 2, 4, 10}}, 97 | {1e3, []int{1, 2, 4, 10}}, 98 | {1e4, []int{1, 2, 4, 10, -1, -2, -4, -10}}, 99 | } 100 | 101 | func BenchmarkAxpyInc(t *testing.B) { 102 | naiveaxpyinc := func(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 103 | for i := 0; i < int(n); i++ { 104 | y[iy] += alpha * x[ix] 105 | ix += incX 106 | iy += incY 107 | } 108 | } 109 | tests := []struct { 110 | name string 111 | f func(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 112 | }{ 113 | {"AxpyInc", AxpyInc}, 114 | {"NaiveAxpyInc", naiveaxpyinc}, 115 | } 116 | for _, test := range tests { 117 | for _, tt := range incsAxpy { 118 | for _, inc := range tt.inc { 119 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", test.name, tt.len, inc), func(b *testing.B) { 120 | b.SetBytes(int64(64 * tt.len)) 121 | var idx, tstInc uintptr = 0, uintptr(inc) 122 | if inc < 0 { 123 | idx = uintptr((-int(tt.len) + 1) * inc) 124 | } 125 | for i := 0; i < b.N; i++ { 126 | test.f(a, x, y, uintptr(tt.len), tstInc, tstInc, idx, idx) 127 | } 128 | }) 129 | } 130 | } 131 | } 132 | } 133 | 134 | func BenchmarkAxpyIncTo(t *testing.B) { 135 | naiveaxpyincto := func(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 136 | for i := 0; i < int(n); i++ { 137 | dst[idst] = alpha*x[ix] + y[iy] 138 | ix += incX 139 | iy += incY 140 | idst += incDst 141 | } 142 | } 143 | tests := []struct { 144 | name string 145 | f func(dst []float64, incDst, 
idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 146 | }{ 147 | {"AxpyIncTo", AxpyIncTo}, 148 | {"NaiveAxpyIncTo", naiveaxpyincto}, 149 | } 150 | for _, test := range tests { 151 | for _, tt := range incsAxpy { 152 | for _, inc := range tt.inc { 153 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", test.name, tt.len, inc), func(b *testing.B) { 154 | b.SetBytes(int64(64 * tt.len)) 155 | var idx, tstInc uintptr = 0, uintptr(inc) 156 | if inc < 0 { 157 | idx = uintptr((-int(tt.len) + 1) * inc) 158 | } 159 | for i := 0; i < b.N; i++ { 160 | test.f(z, tstInc, idx, a, x, y, uintptr(tt.len), 161 | tstInc, tstInc, idx, idx) 162 | } 163 | }) 164 | } 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /asm/f64/benchScal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build go1.7 6 | 7 | package f64 8 | 9 | import ( 10 | "fmt" 11 | "testing" 12 | ) 13 | 14 | var uniScal = []int64{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4} 15 | 16 | func BenchmarkScalUnitary(t *testing.B) { 17 | tstName := "ScalUnitary" 18 | for _, ln := range uniScal { 19 | t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) { 20 | b.SetBytes(64 * ln) 21 | x := x[:ln] 22 | b.ResetTimer() 23 | for i := 0; i < b.N; i++ { 24 | ScalUnitary(a, x) 25 | } 26 | }) 27 | } 28 | } 29 | 30 | func BenchmarkScalUnitaryTo(t *testing.B) { 31 | tstName := "ScalUnitaryTo" 32 | for _, ln := range uniScal { 33 | t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) { 34 | b.SetBytes(int64(64 * ln)) 35 | x, y := x[:ln], y[:ln] 36 | b.ResetTimer() 37 | for i := 0; i < b.N; i++ { 38 | ScalUnitaryTo(y, a, x) 39 | } 40 | }) 41 | } 42 | } 43 | 44 | var incScal = []struct { 45 | len uintptr 46 | inc []int 47 | }{ 48 | {1, []int{1}}, 49 | {3, []int{1, 2, 4, 10}}, 50 | {10, []int{1, 2, 4, 10}}, 51 | {30, []int{1, 2, 4, 10}}, 52 | {1e2, []int{1, 2, 4, 10}}, 53 | {3e2, []int{1, 2, 4, 10}}, 54 | {1e3, []int{1, 2, 4, 10}}, 55 | {3e3, []int{1, 2, 4, 10}}, 56 | {1e4, []int{1, 2, 4, 10}}, 57 | } 58 | 59 | func BenchmarkScalInc(t *testing.B) { 60 | tstName := "ScalInc" 61 | for _, tt := range incScal { 62 | for _, inc := range tt.inc { 63 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) { 64 | b.SetBytes(int64(64 * tt.len)) 65 | tstInc := uintptr(inc) 66 | for i := 0; i < b.N; i++ { 67 | ScalInc(a, x, uintptr(tt.len), tstInc) 68 | } 69 | }) 70 | } 71 | } 72 | } 73 | 74 | func BenchmarkScalIncTo(t *testing.B) { 75 | tstName := "ScalIncTo" 76 | for _, tt := range incScal { 77 | for _, inc := range tt.inc { 78 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) { 79 | b.SetBytes(int64(64 * tt.len)) 80 | tstInc := uintptr(inc) 81 | for i := 0; i < b.N; i++ { 82 | ScalIncTo(z, tstInc, a, x, uintptr(tt.len), tstInc) 83 | } 84 | }) 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /asm/f64/cumprod_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
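Both benchmark files carry the go1.7 build tag because they rely on testing.B.Run sub-benchmarks keyed by length and stride. A minimal instance of the same pattern (hypothetical benchmark; the 8*n byte count assumes one float64 processed per element):

```go
package f64

import (
	"fmt"
	"testing"
)

func BenchmarkScalExample(b *testing.B) {
	for _, n := range []int{10, 1000} {
		b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
			x := make([]float64, n)
			b.SetBytes(int64(8 * n)) // report MB/s per sub-benchmark
			for i := 0; i < b.N; i++ {
				ScalUnitary(2, x)
			}
		})
	}
}
```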
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | TEXT ·CumProd(SB), NOSPLIT, $0 10 | MOVQ dst_base+0(FP), DI // DI = &dst 11 | MOVQ dst_len+8(FP), CX // CX = len(dst) 12 | MOVQ s_base+24(FP), SI // SI = &s 13 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 14 | CMOVQLE s_len+32(FP), CX 15 | MOVQ CX, ret_len+56(FP) // len(ret) = CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE cp_end 18 | XORQ AX, AX // i = 0 19 | 20 | MOVSD (SI), X5 // p_prod = { s[0], s[0] } 21 | SHUFPD $0, X5, X5 22 | MOVSD X5, (DI) // dst[0] = s[0] 23 | INCQ AX // ++i 24 | DECQ CX // -- CX 25 | JZ cp_end // if CX == 0 { return } 26 | 27 | MOVQ CX, BX 28 | ANDQ $3, BX // BX = CX % 4 29 | SHRQ $2, CX // CX = floor( CX / 4 ) 30 | JZ cp_tail_start // if CX == 0 { goto cp_tail_start } 31 | 32 | cp_loop: // Loop unrolled 4x do { 33 | MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] 34 | MOVUPS 16(SI)(AX*8), X2 35 | MOVAPS X0, X1 // X1 = X0 36 | MOVAPS X2, X3 37 | SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } 38 | SHUFPD $1, X3, X3 39 | MULPD X0, X1 // X1 *= X0 40 | MULPD X2, X3 41 | SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } 42 | SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } 43 | SHUFPD $2, X3, X2 44 | SHUFPD $3, X3, X3 45 | MULPD X5, X0 // X0 *= p_prod 46 | MULPD X1, X5 // p_prod *= X1 47 | MULPD X5, X2 48 | MOVUPS X0, (DI)(AX*8) // dst[i] = X0 49 | MOVUPS X2, 16(DI)(AX*8) 50 | MULPD X3, X5 51 | ADDQ $4, AX // i += 4 52 | LOOP cp_loop // } while --CX > 0 53 | 54 | // if BX == 0 { return } 55 | CMPQ BX, $0 56 | JE cp_end 57 | 58 | cp_tail_start: // Reset loop registers 59 | MOVQ BX, CX // Loop counter: CX = BX 60 | 61 | cp_tail: // do { 62 | MULSD (SI)(AX*8), X5 // p_prod *= s[i] 63 | MOVSD X5, (DI)(AX*8) // dst[i] = p_prod 64 | INCQ AX // ++i 65 | LOOP cp_tail // } while --CX > 0 66 | 67 | cp_end: 68 | MOVQ DI, ret_base+48(FP) // &ret = &dst 69 | MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) 70 | MOVQ SI, ret_cap+64(FP) 71 | RET 72 | -------------------------------------------------------------------------------- /asm/f64/cumsum_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
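CumProd's SHUFPD/MULPD sequence builds within-pair running products and carries the cross-pair product in X5, but the observable semantics are a plain prefix product over min(len(dst), len(s)) elements. In Go (sketch, not the package's registered fallback):

```go
package f64sketch

// cumProd: dst[i] becomes the product of s[0..i]; the clamped prefix
// of dst is returned, aliasing dst as the assembly's ret slot does.
func cumProd(dst, s []float64) []float64 {
	if len(s) < len(dst) {
		dst = dst[:len(s)]
	}
	prod := 1.0
	for i, v := range s[:len(dst)] {
		prod *= v
		dst[i] = prod
	}
	return dst
}
```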
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | TEXT ·CumSum(SB), NOSPLIT, $0 10 | MOVQ dst_base+0(FP), DI // DI = &dst 11 | MOVQ dst_len+8(FP), CX // CX = len(dst) 12 | MOVQ s_base+24(FP), SI // SI = &s 13 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 14 | CMOVQLE s_len+32(FP), CX 15 | MOVQ CX, ret_len+56(FP) // len(ret) = CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE cs_end 18 | XORQ AX, AX // i = 0 19 | PXOR X5, X5 // p_sum = 0 20 | MOVQ CX, BX 21 | ANDQ $3, BX // BX = CX % 4 22 | SHRQ $2, CX // CX = floor( CX / 4 ) 23 | JZ cs_tail_start // if CX == 0 { goto cs_tail_start } 24 | 25 | cs_loop: // Loop unrolled 4x do { 26 | MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] 27 | MOVUPS 16(SI)(AX*8), X2 28 | MOVAPS X0, X1 // X1 = X0 29 | MOVAPS X2, X3 30 | SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } 31 | SHUFPD $1, X3, X3 32 | ADDPD X0, X1 // X1 += X0 33 | ADDPD X2, X3 34 | SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } 35 | SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } 36 | SHUFPD $2, X3, X2 37 | SHUFPD $3, X3, X3 38 | ADDPD X5, X0 // X0 += p_sum 39 | ADDPD X1, X5 // p_sum += X1 40 | ADDPD X5, X2 41 | MOVUPS X0, (DI)(AX*8) // dst[i] = X0 42 | MOVUPS X2, 16(DI)(AX*8) 43 | ADDPD X3, X5 44 | ADDQ $4, AX // i += 4 45 | LOOP cs_loop // } while --CX > 0 46 | 47 | // if BX == 0 { return } 48 | CMPQ BX, $0 49 | JE cs_end 50 | 51 | cs_tail_start: // Reset loop registers 52 | MOVQ BX, CX // Loop counter: CX = BX 53 | 54 | cs_tail: // do { 55 | ADDSD (SI)(AX*8), X5 // p_sum += s[i] 56 | MOVSD X5, (DI)(AX*8) // dst[i] = p_sum 57 | INCQ AX // ++i 58 | LOOP cs_tail // } while --CX > 0 59 | 60 | cs_end: 61 | MOVQ DI, ret_base+48(FP) // &ret = &dst 62 | MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) 63 | MOVQ SI, ret_cap+64(FP) 64 | RET 65 | -------------------------------------------------------------------------------- /asm/f64/div_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
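The epilogue shared by both cumulative kernels stores dst's pointer, the clamped length, and dst's cap into the return slot, which in Go is just returning a reslice. For CumSum (sketch):

```go
package f64sketch

// cumSum shows the return convention: the result aliases dst[:n] with
// n = min(len(dst), len(s)), matching the ret_base/ret_len/ret_cap stores.
func cumSum(dst, s []float64) []float64 {
	n := len(dst)
	if len(s) < n {
		n = len(s)
	}
	sum := 0.0
	for i := 0; i < n; i++ {
		sum += s[i]
		dst[i] = sum
	}
	return dst[:n]
}
```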
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func Div(dst, s []float64) 10 | TEXT ·Div(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ dst_len+8(FP), CX // CX = len(dst) 13 | MOVQ s_base+24(FP), SI // SI = &s 14 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 15 | CMOVQLE s_len+32(FP), CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE div_end 18 | XORQ AX, AX // i = 0 19 | MOVQ SI, BX 20 | ANDQ $15, BX // BX = &s & 15 21 | JZ div_no_trim // if BX == 0 { goto div_no_trim } 22 | 23 | // Align on 16-byte boundary 24 | MOVSD (DI)(AX*8), X0 // X0 = dst[i] 25 | DIVSD (SI)(AX*8), X0 // X0 /= s[i] 26 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 27 | INCQ AX // ++i 28 | DECQ CX // --CX 29 | JZ div_end // if CX == 0 { return } 30 | 31 | div_no_trim: 32 | MOVQ CX, BX 33 | ANDQ $7, BX // BX = len(dst) % 8 34 | SHRQ $3, CX // CX = floor( len(dst) / 8 ) 35 | JZ div_tail_start // if CX == 0 { goto div_tail_start } 36 | 37 | div_loop: // Loop unrolled 8x do { 38 | MOVUPS (DI)(AX*8), X0 // X0 = dst[i:i+1] 39 | MOVUPS 16(DI)(AX*8), X1 40 | MOVUPS 32(DI)(AX*8), X2 41 | MOVUPS 48(DI)(AX*8), X3 42 | DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1] 43 | DIVPD 16(SI)(AX*8), X1 44 | DIVPD 32(SI)(AX*8), X2 45 | DIVPD 48(SI)(AX*8), X3 46 | MOVUPS X0, (DI)(AX*8) // dst[i] = X0 47 | MOVUPS X1, 16(DI)(AX*8) 48 | MOVUPS X2, 32(DI)(AX*8) 49 | MOVUPS X3, 48(DI)(AX*8) 50 | ADDQ $8, AX // i += 8 51 | LOOP div_loop // } while --CX > 0 52 | CMPQ BX, $0 // if BX == 0 { return } 53 | JE div_end 54 | 55 | div_tail_start: // Reset loop registers 56 | MOVQ BX, CX // Loop counter: CX = BX 57 | 58 | div_tail: // do { 59 | MOVSD (DI)(AX*8), X0 // X0 = dst[i] 60 | DIVSD (SI)(AX*8), X0 // X0 /= s[i] 61 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 62 | INCQ AX // ++i 63 | LOOP div_tail // } while --CX > 0 64 | 65 | div_end: 66 | RET 67 | 68 | -------------------------------------------------------------------------------- /asm/f64/divto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
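Div's one-element peel tests &s rather than &dst: the unrolled body feeds DIVPD its divisor straight from memory, and packed SSE instructions with a memory operand fault unless that operand is 16-byte aligned, while the MOVUPS loads and stores of dst tolerate any alignment. Since float64 data is 8-byte aligned, one scalar element always fixes parity; a sketch of that decision (unsafe used only to inspect the address; s assumed non-empty, as the kernel has already returned on length 0):

```go
package f64sketch

import "unsafe"

// peelCount reports how many scalar iterations are needed before the
// packed loop may use s as an aligned memory operand: 0 or 1.
func peelCount(s []float64) int {
	if uintptr(unsafe.Pointer(&s[0]))&15 != 0 {
		return 1
	}
	return 0
}
```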
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func DivTo(dst, x, y []float64) 10 | TEXT ·DivTo(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ dst_len+8(FP), CX // CX = len(dst) 13 | MOVQ x_base+24(FP), SI // SI = &x 14 | MOVQ y_base+48(FP), DX // DX = &y 15 | CMPQ x_len+32(FP), CX // CX = min( len(dst), len(x), len(y) ) 16 | CMOVQLE x_len+32(FP), CX 17 | CMPQ y_len+56(FP), CX 18 | CMOVQLE y_len+56(FP), CX 19 | MOVQ CX, ret_len+80(FP) // len(ret) = CX 20 | CMPQ CX, $0 // if CX == 0 { return } 21 | JE div_end 22 | XORQ AX, AX // i = 0 23 | MOVQ DX, BX 24 | ANDQ $15, BX // BX = &y & 0xF 25 | JZ div_no_trim // if BX == 0 { goto div_no_trim } 26 | 27 | // Align on 16-byte boundary 28 | MOVSD (SI)(AX*8), X0 // X0 = x[i] 29 | DIVSD (DX)(AX*8), X0 // X0 /= y[i] 30 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 31 | INCQ AX // ++i 32 | DECQ CX // --CX 33 | JZ div_end // if CX == 0 { return } 34 | 35 | div_no_trim: 36 | MOVQ CX, BX 37 | ANDQ $7, BX // BX = len(dst) % 8 38 | SHRQ $3, CX // CX = floor( len(dst) / 8 ) 39 | JZ div_tail_start // if CX == 0 { goto div_tail_start } 40 | 41 | div_loop: // Loop unrolled 8x do { 42 | MOVUPS (SI)(AX*8), X0 // X0 = x[i:i+1] 43 | MOVUPS 16(SI)(AX*8), X1 44 | MOVUPS 32(SI)(AX*8), X2 45 | MOVUPS 48(SI)(AX*8), X3 46 | DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1] 47 | DIVPD 16(DX)(AX*8), X1 48 | DIVPD 32(DX)(AX*8), X2 49 | DIVPD 48(DX)(AX*8), X3 50 | MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0 51 | MOVUPS X1, 16(DI)(AX*8) 52 | MOVUPS X2, 32(DI)(AX*8) 53 | MOVUPS X3, 48(DI)(AX*8) 54 | ADDQ $8, AX // i += 8 55 | LOOP div_loop // } while --CX > 0 56 | CMPQ BX, $0 // if BX == 0 { return } 57 | JE div_end 58 | 59 | div_tail_start: // Reset loop registers 60 | MOVQ BX, CX // Loop counter: CX = BX 61 | 62 | div_tail: // do { 63 | MOVSD (SI)(AX*8), X0 // X0 = x[i] 64 | DIVSD (DX)(AX*8), X0 // X0 /= y[i] 65 | MOVSD X0, (DI)(AX*8) 66 | INCQ AX // ++i 67 | LOOP div_tail // } while --CX > 0 68 | 69 | div_end: 70 | MOVQ DI, ret_base+72(FP) // &ret = &dst 71 | MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst) 72 | MOVQ DI, ret_cap+88(FP) 73 | RET 74 | -------------------------------------------------------------------------------- /asm/f64/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package f64 provides float64 vector primitives. 9 | package f64 10 | -------------------------------------------------------------------------------- /asm/f64/dot.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | // DotUnitary is 10 | // for i, v := range x { 11 | // sum += y[i] * v 12 | // } 13 | // return sum 14 | func DotUnitary(x, y []float64) (sum float64) { 15 | for i, v := range x { 16 | sum += y[i] * v 17 | } 18 | return sum 19 | } 20 | 21 | // DotInc is 22 | // for i := 0; i < int(n); i++ { 23 | // sum += y[iy] * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | // return sum 28 | func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) { 29 | for i := 0; i < int(n); i++ { 30 | sum += y[iy] * x[ix] 31 | ix += incX 32 | iy += incY 33 | } 34 | return sum 35 | } 36 | -------------------------------------------------------------------------------- /asm/f64/dot_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | // func DotUnitary(x, y []float64) (sum float64) 42 | // This function assumes len(y) >= len(x). 43 | TEXT ·DotUnitary(SB), NOSPLIT, $0 44 | MOVQ x+0(FP), R8 45 | MOVQ x_len+8(FP), DI // n = len(x) 46 | MOVQ y+24(FP), R9 47 | 48 | MOVSD $(0.0), X7 // sum = 0 49 | MOVSD $(0.0), X8 // sum2 = 0 50 | 51 | MOVQ $0, SI // i = 0 52 | SUBQ $4, DI // n -= 4 53 | JL tail_uni // if n < 0 goto tail_uni 54 | 55 | loop_uni: 56 | // sum += x[i] * y[i] unrolled 4x.
57 | MOVUPD 0(R8)(SI*8), X0 58 | MOVUPD 0(R9)(SI*8), X1 59 | MOVUPD 16(R8)(SI*8), X2 60 | MOVUPD 16(R9)(SI*8), X3 61 | MULPD X1, X0 62 | MULPD X3, X2 63 | ADDPD X0, X7 64 | ADDPD X2, X8 65 | 66 | ADDQ $4, SI // i += 4 67 | SUBQ $4, DI // n -= 4 68 | JGE loop_uni // if n >= 0 goto loop_uni 69 | 70 | tail_uni: 71 | ADDQ $4, DI // n += 4 72 | JLE end_uni // if n <= 0 goto end_uni 73 | 74 | onemore_uni: 75 | // sum += x[i] * y[i] for the remaining 1-3 elements. 76 | MOVSD 0(R8)(SI*8), X0 77 | MOVSD 0(R9)(SI*8), X1 78 | MULSD X1, X0 79 | ADDSD X0, X7 80 | 81 | ADDQ $1, SI // i++ 82 | SUBQ $1, DI // n-- 83 | JNZ onemore_uni // if n != 0 goto onemore_uni 84 | 85 | end_uni: 86 | // Add the four sums together. 87 | ADDPD X8, X7 88 | MOVSD X7, X0 89 | UNPCKHPD X7, X7 90 | ADDSD X0, X7 91 | MOVSD X7, sum+48(FP) // Return final sum. 92 | RET 93 | 94 | // func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) 95 | TEXT ·DotInc(SB), NOSPLIT, $0 96 | MOVQ x+0(FP), R8 97 | MOVQ y+24(FP), R9 98 | MOVQ n+48(FP), CX 99 | MOVQ incX+56(FP), R11 100 | MOVQ incY+64(FP), R12 101 | MOVQ ix+72(FP), R13 102 | MOVQ iy+80(FP), R14 103 | 104 | MOVSD $(0.0), X7 // sum = 0 105 | LEAQ (R8)(R13*8), SI // p = &x[ix] 106 | LEAQ (R9)(R14*8), DI // q = &y[iy] 107 | SHLQ $3, R11 // incX *= sizeof(float64) 108 | SHLQ $3, R12 // incY *= sizeof(float64) 109 | 110 | SUBQ $2, CX // n -= 2 111 | JL tail_inc // if n < 0 goto tail_inc 112 | 113 | loop_inc: 114 | // sum += *p * *q unrolled 2x. 115 | MOVHPD (SI), X0 116 | MOVHPD (DI), X1 117 | ADDQ R11, SI // p += incX 118 | ADDQ R12, DI // q += incY 119 | MOVLPD (SI), X0 120 | MOVLPD (DI), X1 121 | ADDQ R11, SI // p += incX 122 | ADDQ R12, DI // q += incY 123 | 124 | MULPD X1, X0 125 | ADDPD X0, X7 126 | 127 | SUBQ $2, CX // n -= 2 128 | JGE loop_inc // if n >= 0 goto loop_inc 129 | 130 | tail_inc: 131 | ADDQ $2, CX // n += 2 132 | JLE end_inc // if n <= 0 goto end_inc 133 | 134 | // sum += *p * *q for the last iteration if n is odd. 135 | MOVSD (SI), X0 136 | MULSD (DI), X0 137 | ADDSD X0, X7 138 | 139 | end_inc: 140 | // Add the two sums together. 141 | MOVSD X7, X0 142 | UNPCKHPD X7, X7 143 | ADDSD X0, X7 144 | MOVSD X7, sum+88(FP) // Return final sum. 145 | RET 146 | -------------------------------------------------------------------------------- /asm/f64/dot_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
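DotUnitary above accumulates into two vector registers (X7 and X8) so the two ADDPD dependency chains can run in parallel, then folds the four lanes in end_uni. Note that this reassociates the floating-point sum, so the result can differ in the last bits from a strictly sequential loop. A scalar Go sketch of the same idea, with invented names, assuming len(y) >= len(x) as the kernel does:

```go
package f64sketch

// dotSketch mirrors DotUnitary's accumulation scheme in scalar Go:
// four partial sums that are only combined at the end. It sketches the
// dependency-breaking idea, not SIMD itself.
func dotSketch(x, y []float64) float64 {
	var s0, s1, s2, s3 float64
	i := 0
	for ; i <= len(x)-4; i += 4 { // loop_uni: 4 products per pass
		s0 += x[i] * y[i]
		s1 += x[i+1] * y[i+1]
		s2 += x[i+2] * y[i+2]
		s3 += x[i+3] * y[i+3]
	}
	for ; i < len(x); i++ { // onemore_uni: 1-3 leftover elements
		s0 += x[i] * y[i]
	}
	return (s0 + s2) + (s1 + s3) // end_uni: fold the partial sums
}
```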
4 | 5 | package f64 6 | 7 | import ( 8 | "fmt" 9 | "math" 10 | "math/rand" 11 | "testing" 12 | ) 13 | 14 | func TestDotUnitary(t *testing.T) { 15 | for i, test := range []struct { 16 | xData []float64 17 | yData []float64 18 | 19 | want float64 20 | }{ 21 | { 22 | xData: []float64{2}, 23 | yData: []float64{-3}, 24 | want: -6, 25 | }, 26 | { 27 | xData: []float64{2, 3}, 28 | yData: []float64{-3, 4}, 29 | want: 6, 30 | }, 31 | { 32 | xData: []float64{2, 3, -4}, 33 | yData: []float64{-3, 4, 5}, 34 | want: -14, 35 | }, 36 | { 37 | xData: []float64{2, 3, -4, -5}, 38 | yData: []float64{-3, 4, 5, -6}, 39 | want: 16, 40 | }, 41 | { 42 | xData: []float64{0, 2, 3, -4, -5}, 43 | yData: []float64{0, -3, 4, 5, -6}, 44 | want: 16, 45 | }, 46 | { 47 | xData: []float64{0, 0, 2, 3, -4, -5}, 48 | yData: []float64{0, 1, -3, 4, 5, -6}, 49 | want: 16, 50 | }, 51 | { 52 | xData: []float64{0, 0, 1, 1, 2, -3, -4}, 53 | yData: []float64{0, 1, 0, 3, -4, 5, -6}, 54 | want: 4, 55 | }, 56 | { 57 | xData: []float64{0, 0, 1, 1, 2, -3, -4, 5}, 58 | yData: []float64{0, 1, 0, 3, -4, 5, -6, 7}, 59 | want: 39, 60 | }, 61 | } { 62 | const msgGuard = "test %v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" 63 | 64 | x, xFront, xBack := newGuardedVector(test.xData, 1) 65 | y, yFront, yBack := newGuardedVector(test.yData, 1) 66 | got := DotUnitary(x, y) 67 | 68 | if !allNaN(xFront) || !allNaN(xBack) { 69 | t.Errorf(msgGuard, i, "x", xFront, xBack) 70 | } 71 | if !allNaN(yFront) || !allNaN(yBack) { 72 | t.Errorf(msgGuard, i, "y", yFront, yBack) 73 | } 74 | if !equalStrided(test.xData, x, 1) { 75 | t.Errorf("test %v: modified read-only x argument", i) 76 | } 77 | if !equalStrided(test.yData, y, 1) { 78 | t.Errorf("test %v: modified read-only y argument", i) 79 | } 80 | if math.IsNaN(got) { 81 | t.Errorf("test %v: invalid memory read", i) 82 | continue 83 | } 84 | 85 | if got != test.want { 86 | t.Errorf("test %v: unexpected result. want %v, got %v", i, test.want, got) 87 | } 88 | } 89 | } 90 | 91 | func TestDotInc(t *testing.T) { 92 | for i, test := range []struct { 93 | xData []float64 94 | yData []float64 95 | 96 | want float64 97 | wantRev float64 // Result when one of the vectors is reversed. 
98 | }{ 99 | { 100 | xData: []float64{2}, 101 | yData: []float64{-3}, 102 | want: -6, 103 | wantRev: -6, 104 | }, 105 | { 106 | xData: []float64{2, 3}, 107 | yData: []float64{-3, 4}, 108 | want: 6, 109 | wantRev: -1, 110 | }, 111 | { 112 | xData: []float64{2, 3, -4}, 113 | yData: []float64{-3, 4, 5}, 114 | want: -14, 115 | wantRev: 34, 116 | }, 117 | { 118 | xData: []float64{2, 3, -4, -5}, 119 | yData: []float64{-3, 4, 5, -6}, 120 | want: 16, 121 | wantRev: 2, 122 | }, 123 | { 124 | xData: []float64{0, 2, 3, -4, -5}, 125 | yData: []float64{0, -3, 4, 5, -6}, 126 | want: 16, 127 | wantRev: 34, 128 | }, 129 | { 130 | xData: []float64{0, 0, 2, 3, -4, -5}, 131 | yData: []float64{0, 1, -3, 4, 5, -6}, 132 | want: 16, 133 | wantRev: -5, 134 | }, 135 | { 136 | xData: []float64{0, 0, 1, 1, 2, -3, -4}, 137 | yData: []float64{0, 1, 0, 3, -4, 5, -6}, 138 | want: 4, 139 | wantRev: -4, 140 | }, 141 | { 142 | xData: []float64{0, 0, 1, 1, 2, -3, -4, 5}, 143 | yData: []float64{0, 1, 0, 3, -4, 5, -6, 7}, 144 | want: 39, 145 | wantRev: 3, 146 | }, 147 | } { 148 | const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" 149 | 150 | for _, incX := range []int{-7, -3, -2, -1, 1, 2, 3, 7} { 151 | for _, incY := range []int{-7, -3, -2, -1, 1, 2, 3, 7} { 152 | n := len(test.xData) 153 | x, xFront, xBack := newGuardedVector(test.xData, incX) 154 | y, yFront, yBack := newGuardedVector(test.yData, incY) 155 | 156 | var ix, iy int 157 | if incX < 0 { 158 | ix = (-n + 1) * incX 159 | } 160 | if incY < 0 { 161 | iy = (-n + 1) * incY 162 | } 163 | got := DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) 164 | 165 | prefix := fmt.Sprintf("test %v, incX = %v, incY = %v", i, incX, incY) 166 | if !allNaN(xFront) || !allNaN(xBack) { 167 | t.Errorf(msgGuard, prefix, "x", xFront, xBack) 168 | } 169 | if !allNaN(yFront) || !allNaN(yBack) { 170 | t.Errorf(msgGuard, prefix, "y", yFront, yBack) 171 | } 172 | if nonStridedWrite(x, incX) || !equalStrided(test.xData, x, incX) { 173 | t.Errorf("%v: modified read-only x argument", prefix) 174 | } 175 | if nonStridedWrite(y, incY) || !equalStrided(test.yData, y, incY) { 176 | t.Errorf("%v: modified read-only y argument", prefix) 177 | } 178 | if math.IsNaN(got) { 179 | t.Errorf("%v: invalid memory read", prefix) 180 | continue 181 | } 182 | 183 | want := test.want 184 | if incX*incY < 0 { 185 | want = test.wantRev 186 | } 187 | if got != want { 188 | t.Errorf("%v: unexpected result. 
want %v, got %v", prefix, want, got) 189 | } 190 | } 191 | } 192 | } 193 | } 194 | 195 | func BenchmarkDotUnitaryN1(b *testing.B) { dotUnitaryBenchmark(b, 1) } 196 | func BenchmarkDotUnitaryN2(b *testing.B) { dotUnitaryBenchmark(b, 2) } 197 | func BenchmarkDotUnitaryN3(b *testing.B) { dotUnitaryBenchmark(b, 3) } 198 | func BenchmarkDotUnitaryN4(b *testing.B) { dotUnitaryBenchmark(b, 4) } 199 | func BenchmarkDotUnitaryN10(b *testing.B) { dotUnitaryBenchmark(b, 10) } 200 | func BenchmarkDotUnitaryN100(b *testing.B) { dotUnitaryBenchmark(b, 100) } 201 | func BenchmarkDotUnitaryN1000(b *testing.B) { dotUnitaryBenchmark(b, 1000) } 202 | func BenchmarkDotUnitaryN10000(b *testing.B) { dotUnitaryBenchmark(b, 10000) } 203 | func BenchmarkDotUnitaryN100000(b *testing.B) { dotUnitaryBenchmark(b, 100000) } 204 | 205 | var r float64 206 | 207 | func dotUnitaryBenchmark(b *testing.B, n int) { 208 | x := make([]float64, n) 209 | for i := range x { 210 | x[i] = rand.Float64() 211 | } 212 | y := make([]float64, n) 213 | for i := range y { 214 | y[i] = rand.Float64() 215 | } 216 | b.ResetTimer() 217 | for i := 0; i < b.N; i++ { 218 | r = DotUnitary(x, y) 219 | } 220 | } 221 | 222 | func BenchmarkDotIncN1Inc1(b *testing.B) { dotIncBenchmark(b, 1, 1) } 223 | 224 | func BenchmarkDotIncN2Inc1(b *testing.B) { dotIncBenchmark(b, 2, 1) } 225 | func BenchmarkDotIncN2Inc2(b *testing.B) { dotIncBenchmark(b, 2, 2) } 226 | func BenchmarkDotIncN2Inc4(b *testing.B) { dotIncBenchmark(b, 2, 4) } 227 | func BenchmarkDotIncN2Inc10(b *testing.B) { dotIncBenchmark(b, 2, 10) } 228 | 229 | func BenchmarkDotIncN3Inc1(b *testing.B) { dotIncBenchmark(b, 3, 1) } 230 | func BenchmarkDotIncN3Inc2(b *testing.B) { dotIncBenchmark(b, 3, 2) } 231 | func BenchmarkDotIncN3Inc4(b *testing.B) { dotIncBenchmark(b, 3, 4) } 232 | func BenchmarkDotIncN3Inc10(b *testing.B) { dotIncBenchmark(b, 3, 10) } 233 | 234 | func BenchmarkDotIncN4Inc1(b *testing.B) { dotIncBenchmark(b, 4, 1) } 235 | func BenchmarkDotIncN4Inc2(b *testing.B) { dotIncBenchmark(b, 4, 2) } 236 | func BenchmarkDotIncN4Inc4(b *testing.B) { dotIncBenchmark(b, 4, 4) } 237 | func BenchmarkDotIncN4Inc10(b *testing.B) { dotIncBenchmark(b, 4, 10) } 238 | 239 | func BenchmarkDotIncN10Inc1(b *testing.B) { dotIncBenchmark(b, 10, 1) } 240 | func BenchmarkDotIncN10Inc2(b *testing.B) { dotIncBenchmark(b, 10, 2) } 241 | func BenchmarkDotIncN10Inc4(b *testing.B) { dotIncBenchmark(b, 10, 4) } 242 | func BenchmarkDotIncN10Inc10(b *testing.B) { dotIncBenchmark(b, 10, 10) } 243 | 244 | func BenchmarkDotIncN1000Inc1(b *testing.B) { dotIncBenchmark(b, 1000, 1) } 245 | func BenchmarkDotIncN1000Inc2(b *testing.B) { dotIncBenchmark(b, 1000, 2) } 246 | func BenchmarkDotIncN1000Inc4(b *testing.B) { dotIncBenchmark(b, 1000, 4) } 247 | func BenchmarkDotIncN1000Inc10(b *testing.B) { dotIncBenchmark(b, 1000, 10) } 248 | 249 | func BenchmarkDotIncN100000Inc1(b *testing.B) { dotIncBenchmark(b, 100000, 1) } 250 | func BenchmarkDotIncN100000Inc2(b *testing.B) { dotIncBenchmark(b, 100000, 2) } 251 | func BenchmarkDotIncN100000Inc4(b *testing.B) { dotIncBenchmark(b, 100000, 4) } 252 | func BenchmarkDotIncN100000Inc10(b *testing.B) { dotIncBenchmark(b, 100000, 10) } 253 | 254 | func BenchmarkDotIncN100000IncM1(b *testing.B) { dotIncBenchmark(b, 100000, -1) } 255 | func BenchmarkDotIncN100000IncM2(b *testing.B) { dotIncBenchmark(b, 100000, -2) } 256 | func BenchmarkDotIncN100000IncM4(b *testing.B) { dotIncBenchmark(b, 100000, -4) } 257 | func BenchmarkDotIncN100000IncM10(b *testing.B) { dotIncBenchmark(b, 100000, -10) 
} 258 | 259 | func dotIncBenchmark(b *testing.B, n, inc int) { 260 | absInc := inc 261 | if inc < 0 { 262 | absInc = -inc 263 | } 264 | x := make([]float64, (n-1)*absInc+1) 265 | for i := range x { 266 | x[i] = rand.Float64() 267 | } 268 | y := make([]float64, (n-1)*absInc+1) 269 | for i := range y { 270 | y[i] = rand.Float64() 271 | } 272 | var ini int 273 | if inc < 0 { 274 | ini = (-n + 1) * inc 275 | } 276 | b.ResetTimer() 277 | for i := 0; i < b.N; i++ { 278 | r = DotInc(x, y, uintptr(n), uintptr(inc), uintptr(inc), uintptr(ini), uintptr(ini)) 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /asm/f64/l1norm_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func L1Dist(s, t []float64) float64 10 | TEXT ·L1Dist(SB), NOSPLIT, $0 11 | MOVQ s_base+0(FP), DI // DI = &s 12 | MOVQ t_base+24(FP), SI // SI = &t 13 | MOVQ s_len+8(FP), CX // CX = len(s) 14 | CMPQ t_len+32(FP), CX // CX = min( CX, len(t) ) 15 | CMOVQLE t_len+32(FP), CX 16 | PXOR X3, X3 // norm = 0 17 | CMPQ CX, $0 // if CX == 0 { return 0 } 18 | JE l1_end 19 | XORQ AX, AX // i = 0 20 | MOVQ CX, BX 21 | ANDQ $1, BX // BX = CX % 2 22 | SHRQ $1, CX // CX = floor( CX / 2 ) 23 | JZ l1_tail_start // if CX == 0 { goto l1_tail_start } 24 | 25 | l1_loop: // Loop unrolled 2x do { 26 | MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] 27 | MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] 28 | MOVAPS X0, X2 29 | SUBPD X1, X0 30 | SUBPD X2, X1 31 | MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 32 | ADDPD X0, X3 // norm += X0 33 | ADDQ $2, AX // i += 2 34 | LOOP l1_loop // } while --CX > 0 35 | CMPQ BX, $0 // if BX == 0 { return } 36 | JE l1_end 37 | 38 | l1_tail_start: // Reset loop registers 39 | MOVQ BX, CX // Loop counter: CX = BX 40 | PXOR X0, X0 // reset X0, X1 to break dependencies 41 | PXOR X1, X1 42 | 43 | l1_tail: 44 | MOVSD (SI)(AX*8), X0 // X0 = t[i] 45 | MOVSD (DI)(AX*8), X1 // X1 = s[i] 46 | MOVAPD X0, X2 47 | SUBSD X1, X0 48 | SUBSD X2, X1 49 | MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 50 | ADDSD X0, X3 // norm += X0 51 | 52 | l1_end: 53 | MOVAPS X3, X2 54 | SHUFPD $1, X2, X2 55 | ADDSD X3, X2 // X2 = X3[1] + X3[0] 56 | MOVSD X2, ret+48(FP) // return X2 57 | RET 58 | 59 | -------------------------------------------------------------------------------- /asm/f64/linfnorm_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
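L1Dist above never computes an explicit absolute value: it forms both t[i]-s[i] and s[i]-t[i] and keeps the larger with MAXPD, which equals |t[i]-s[i]| for finite inputs (NaNs propagate differently than math.Abs would, and the kernel does not try to hide that). A one-function Go sketch of the trick, name invented here:

```go
package f64sketch

// absDiff shows the branch-free idea behind L1Dist's inner step: for
// finite a and b, |a-b| == max(a-b, b-a), so two subtractions and a
// max stand in for an explicit absolute value.
func absDiff(a, b float64) float64 {
	d, e := a-b, b-a
	if e > d {
		return e
	}
	return d
}
```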
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func LinfDist(s, t []float64) float64 10 | TEXT ·LinfDist(SB), NOSPLIT, $0 11 | MOVQ s_base+0(FP), DI // DI = &s 12 | MOVQ t_base+24(FP), SI // SI = &t 13 | MOVQ s_len+8(FP), CX // CX = len(s) 14 | CMPQ t_len+32(FP), CX // CX = min( CX, len(t) ) 15 | CMOVQLE t_len+32(FP), CX 16 | PXOR X3, X3 // norm = 0 17 | CMPQ CX, $0 // if CX == 0 { return 0 } 18 | JE l1_end 19 | XORQ AX, AX // i = 0 20 | MOVQ CX, BX 21 | ANDQ $1, BX // BX = CX % 2 22 | SHRQ $1, CX // CX = floor( CX / 2 ) 23 | JZ l1_tail_start // if CX == 0 { goto l1_tail_start } 24 | 25 | l1_loop: // Loop unrolled 2x do { 26 | MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] 27 | MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] 28 | MOVAPS X0, X2 29 | SUBPD X1, X0 30 | SUBPD X2, X1 31 | MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 32 | MAXPD X0, X3 // norm = max( norm, X0 ) 33 | ADDQ $2, AX // i += 2 34 | LOOP l1_loop // } while --CX > 0 35 | CMPQ BX, $0 // if BX == 0 { return } 36 | JE l1_end 37 | 38 | l1_tail_start: // Reset loop registers 39 | MOVQ BX, CX // Loop counter: CX = BX 40 | PXOR X0, X0 // reset X0, X1 to break dependencies 41 | PXOR X1, X1 42 | 43 | l1_tail: 44 | MOVSD (SI)(AX*8), X0 // X0 = t[i] 45 | MOVSD (DI)(AX*8), X1 // X1 = s[i] 46 | MOVAPD X0, X2 47 | SUBSD X1, X0 48 | SUBSD X2, X1 49 | MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 50 | MAXSD X0, X3 // norm = max( norm, X0 ) 51 | 52 | l1_end: 53 | MOVAPS X3, X2 54 | SHUFPD $1, X2, X2 55 | MAXSD X3, X2 // X2 = max( X3[1], X3[0] ) 56 | MOVSD X2, ret+48(FP) // return X2 57 | RET 58 | -------------------------------------------------------------------------------- /asm/f64/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | // ScalUnitary is 10 | // for i := range x { 11 | // x[i] *= alpha 12 | // } 13 | func ScalUnitary(alpha float64, x []float64) { 14 | for i := range x { 15 | x[i] *= alpha 16 | } 17 | } 18 | 19 | // ScalUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha * v 22 | // } 23 | func ScalUnitaryTo(dst []float64, alpha float64, x []float64) { 24 | for i, v := range x { 25 | dst[i] = alpha * v 26 | } 27 | } 28 | 29 | // ScalInc is 30 | // var ix uintptr 31 | // for i := 0; i < int(n); i++ { 32 | // x[ix] *= alpha 33 | // ix += incX 34 | // } 35 | func ScalInc(alpha float64, x []float64, n, incX uintptr) { 36 | var ix uintptr 37 | for i := 0; i < int(n); i++ { 38 | x[ix] *= alpha 39 | ix += incX 40 | } 41 | } 42 | 43 | // ScalIncTo is 44 | // var idst, ix uintptr 45 | // for i := 0; i < int(n); i++ { 46 | // dst[idst] = alpha * x[ix] 47 | // ix += incX 48 | // idst += incDst 49 | // } 50 | func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) { 51 | var idst, ix uintptr 52 | for i := 0; i < int(n); i++ { 53 | dst[idst] = alpha * x[ix] 54 | ix += incX 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/f64/scal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
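Both distance kernels above keep two independent per-lane partial results and fold them only in the epilogue: SHUFPD swaps the lanes, then ADDSD (L1Dist) or MAXSD (LinfDist) combines them. A plain-Go sketch of that two-lane structure for the L1 case, names invented here:

```go
package f64sketch

import "math"

// l1DistSketch mirrors the two-lane structure of the L1Dist kernel:
// even and odd indices accumulate into separate partial norms that are
// only added together at the end (the SHUFPD/ADDSD epilogue).
func l1DistSketch(s, t []float64) float64 {
	n := len(s)
	if len(t) < n {
		n = len(t)
	}
	var norm0, norm1 float64
	i := 0
	for ; i <= n-2; i += 2 { // l1_loop: two lanes per pass
		norm0 += math.Abs(t[i] - s[i])
		norm1 += math.Abs(t[i+1] - s[i+1])
	}
	if i < n { // l1_tail: odd leftover element
		norm0 += math.Abs(t[i] - s[i])
	}
	return norm0 + norm1 // l1_end: fold the lanes
}
```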
4 | 5 | package f64 6 | 7 | import ( 8 | "fmt" 9 | "math/rand" 10 | "testing" 11 | ) 12 | 13 | var scalTests = []struct { 14 | alpha float64 15 | x []float64 16 | want []float64 17 | }{ 18 | { 19 | alpha: 0, 20 | x: []float64{}, 21 | want: []float64{}, 22 | }, 23 | { 24 | alpha: 0, 25 | x: []float64{1}, 26 | want: []float64{0}, 27 | }, 28 | { 29 | alpha: 1, 30 | x: []float64{1}, 31 | want: []float64{1}, 32 | }, 33 | { 34 | alpha: 2, 35 | x: []float64{1, -2}, 36 | want: []float64{2, -4}, 37 | }, 38 | { 39 | alpha: 2, 40 | x: []float64{1, -2, 3}, 41 | want: []float64{2, -4, 6}, 42 | }, 43 | { 44 | alpha: 2, 45 | x: []float64{1, -2, 3, 4}, 46 | want: []float64{2, -4, 6, 8}, 47 | }, 48 | { 49 | alpha: 2, 50 | x: []float64{1, -2, 3, 4, -5}, 51 | want: []float64{2, -4, 6, 8, -10}, 52 | }, 53 | { 54 | alpha: 2, 55 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7}, 56 | want: []float64{0, 2, -4, 6, 8, -10, 12, -14}, 57 | }, 58 | { 59 | alpha: 2, 60 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8}, 61 | want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16}, 62 | }, 63 | { 64 | alpha: 2, 65 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9}, 66 | want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16, 18}, 67 | }, 68 | { 69 | alpha: 3, 70 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9, 12}, 71 | want: []float64{0, 3, -6, 9, 12, -15, 18, -21, 24, 27, 36}, 72 | }, 73 | } 74 | 75 | func TestScalUnitary(t *testing.T) { 76 | const xGdVal = -0.5 77 | for i, test := range scalTests { 78 | for _, align := range align1 { 79 | prefix := fmt.Sprintf("Test %v (x:%v)", i, align) 80 | xgLn := 4 + align 81 | xg := guardVector(test.x, xGdVal, xgLn) 82 | x := xg[xgLn : len(xg)-xgLn] 83 | 84 | ScalUnitary(test.alpha, x) 85 | 86 | for i := range test.want { 87 | if !same(x[i], test.want[i]) { 88 | t.Errorf(msgVal, prefix, i, x[i], test.want[i]) 89 | } 90 | } 91 | if !isValidGuard(xg, xGdVal, xgLn) { 92 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 93 | } 94 | } 95 | } 96 | } 97 | 98 | func TestScalUnitaryTo(t *testing.T) { 99 | const xGdVal, dstGdVal = -1, 0.5 100 | rng := rand.New(rand.NewSource(42)) 101 | for i, test := range scalTests { 102 | n := len(test.x) 103 | for _, align := range align2 { 104 | prefix := fmt.Sprintf("Test %v (x:%v dst:%v)", i, align.x, align.y) 105 | xgLn, dgLn := 4+align.x, 4+align.y 106 | xg := guardVector(test.x, xGdVal, xgLn) 107 | dg := guardVector(randSlice(n, 1, rng), dstGdVal, dgLn) 108 | x, dst := xg[xgLn:len(xg)-xgLn], dg[dgLn:len(dg)-dgLn] 109 | 110 | ScalUnitaryTo(dst, test.alpha, x) 111 | 112 | for i := range test.want { 113 | if !same(dst[i], test.want[i]) { 114 | t.Errorf(msgVal, prefix, i, dst[i], test.want[i]) 115 | } 116 | } 117 | if !isValidGuard(xg, xGdVal, xgLn) { 118 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 119 | } 120 | if !isValidGuard(dg, dstGdVal, dgLn) { 121 | t.Errorf(msgGuard, prefix, "y", dg[:dgLn], dg[len(dg)-dgLn:]) 122 | } 123 | if !equalStrided(test.x, x, 1) { 124 | t.Errorf("%v: modified read-only x argument", prefix) 125 | } 126 | } 127 | } 128 | } 129 | 130 | func TestScalInc(t *testing.T) { 131 | const xGdVal = -0.5 132 | gdLn := 4 133 | for i, test := range scalTests { 134 | n := len(test.x) 135 | for _, incX := range []int{1, 2, 3, 4, 7, 10} { 136 | prefix := fmt.Sprintf("Test %v (x:%v)", i, incX) 137 | xg := guardIncVector(test.x, xGdVal, incX, gdLn) 138 | x := xg[gdLn : len(xg)-gdLn] 139 | 140 | ScalInc(test.alpha, x, uintptr(n), uintptr(incX)) 141 | 142 | for i := range test.want { 143 | if !same(x[i*incX], test.want[i]) { 144 | 
t.Errorf(msgVal, prefix, i, x[i*incX], test.want[i]) 145 | } 146 | } 147 | checkValidIncGuard(t, xg, xGdVal, incX, gdLn) 148 | } 149 | } 150 | } 151 | 152 | func TestScalIncTo(t *testing.T) { 153 | const xGdVal, dstGdVal = -1, 0.5 154 | gdLn := 4 155 | rng := rand.New(rand.NewSource(42)) 156 | for i, test := range scalTests { 157 | n := len(test.x) 158 | for _, inc := range newIncSet(1, 2, 3, 4, 7, 10) { 159 | prefix := fmt.Sprintf("test %v (x:%v dst:%v)", i, inc.x, inc.y) 160 | xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) 161 | dg := guardIncVector(randSlice(n, 1, rng), dstGdVal, inc.y, gdLn) 162 | x, dst := xg[gdLn:len(xg)-gdLn], dg[gdLn:len(dg)-gdLn] 163 | 164 | ScalIncTo(dst, uintptr(inc.y), test.alpha, x, uintptr(n), uintptr(inc.x)) 165 | 166 | for i := range test.want { 167 | if !same(dst[i*inc.y], test.want[i]) { 168 | t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i]) 169 | } 170 | } 171 | checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) 172 | checkValidIncGuard(t, dg, dstGdVal, inc.y, gdLn) 173 | if !equalStrided(test.x, x, inc.x) { 174 | t.Errorf("%v: modified read-only x argument", prefix) 175 | } 176 | 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /asm/f64/scalinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
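The tests above bracket every input in guard regions and re-check them after the call, so an out-of-bounds write by a kernel is caught immediately; dot_test.go uses NaN guards, while scal_test.go uses fixed sentinel constants. The real helpers (guardVector, guardIncVector, isValidGuard, newGuardedVector) live in the package's shared test files and are not shown in this listing; the following is only a sketch of the idea, with all names invented here:

```go
package f64sketch

import "math"

// guardSketch illustrates the guard-vector testing idea: embed the
// data between runs of a sentinel, hand the kernel only the interior,
// then check the sentinels afterwards to detect out-of-bounds writes.
func guardSketch(data []float64, kernel func([]float64)) bool {
	const pad = 4
	buf := make([]float64, len(data)+2*pad)
	for i := 0; i < pad; i++ {
		buf[i] = math.NaN()
		buf[len(buf)-1-i] = math.NaN()
	}
	copy(buf[pad:], data)
	kernel(buf[pad : pad+len(data)])
	for i := 0; i < pad; i++ {
		if !math.IsNaN(buf[i]) || !math.IsNaN(buf[len(buf)-1-i]) {
			return false // kernel wrote outside its slice
		}
	}
	return true
}
```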
36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define LEN CX 43 | #define TAIL BX 44 | #define INC_X R8 45 | #define INCx3_X R9 46 | #define ALPHA X0 47 | #define ALPHA_2 X1 48 | 49 | // func ScalInc(alpha float64, x []float64, n, incX uintptr) 50 | TEXT ·ScalInc(SB), NOSPLIT, $0 51 | MOVSD alpha+0(FP), ALPHA // ALPHA = alpha 52 | MOVQ x_base+8(FP), X_PTR // X_PTR = &x 53 | MOVQ incX+40(FP), INC_X // INC_X = incX 54 | SHLQ $3, INC_X // INC_X *= sizeof(float64) 55 | MOVQ n+32(FP), LEN // LEN = n 56 | CMPQ LEN, $0 57 | JE end // if LEN == 0 { return } 58 | 59 | MOVQ LEN, TAIL 60 | ANDQ $3, TAIL // TAIL = LEN % 4 61 | SHRQ $2, LEN // LEN = floor( LEN / 4 ) 62 | JZ tail_start // if LEN == 0 { goto tail_start } 63 | 64 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 65 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 66 | 67 | loop: // do { // x[i] *= alpha unrolled 4x. 68 | MOVSD (X_PTR), X2 // X_i = x[i] 69 | MOVSD (X_PTR)(INC_X*1), X3 70 | MOVSD (X_PTR)(INC_X*2), X4 71 | MOVSD (X_PTR)(INCx3_X*1), X5 72 | 73 | MULSD ALPHA, X2 // X_i *= a 74 | MULSD ALPHA_2, X3 75 | MULSD ALPHA, X4 76 | MULSD ALPHA_2, X5 77 | 78 | MOVSD X2, (X_PTR) // x[i] = X_i 79 | MOVSD X3, (X_PTR)(INC_X*1) 80 | MOVSD X4, (X_PTR)(INC_X*2) 81 | MOVSD X5, (X_PTR)(INCx3_X*1) 82 | 83 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 84 | DECQ LEN 85 | JNZ loop // } while --LEN > 0 86 | CMPQ TAIL, $0 87 | JE end // if TAIL == 0 { return } 88 | 89 | tail_start: // Reset loop registers 90 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 91 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 92 | JZ tail_one 93 | 94 | tail_two: // do { 95 | MOVSD (X_PTR), X2 // X_i = x[i] 96 | MOVSD (X_PTR)(INC_X*1), X3 97 | MULSD ALPHA, X2 // X_i *= a 98 | MULSD ALPHA, X3 99 | MOVSD X2, (X_PTR) // x[i] = X_i 100 | MOVSD X3, (X_PTR)(INC_X*1) 101 | 102 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 103 | 104 | ANDQ $1, TAIL 105 | JZ end 106 | 107 | tail_one: 108 | MOVSD (X_PTR), X2 // X_i = x[i] 109 | MULSD ALPHA, X2 // X_i *= ALPHA 110 | MOVSD X2, (X_PTR) // x[i] = X_i 111 | 112 | end: 113 | RET 114 | -------------------------------------------------------------------------------- /asm/f64/scalincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 
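ScalInc above premultiplies incX by sizeof(float64) and advances a pointer through x, handling four strided elements per pass with a second copy of alpha (ALPHA_2, as the source notes, for pipelining), then a pair and at most one single element in the tail. A plain-Go sketch of the traversal, names invented, assuming a positive increment as the kernel does:

```go
package f64sketch

// scalIncSketch mirrors ScalInc's strided traversal: in the assembly a
// byte-scaled pointer advances through x; here an index plays that
// role. Blocks of 4, then a pair, then at most one single, matching
// the kernel's tail handling of n % 4 elements.
func scalIncSketch(alpha float64, x []float64, n, incX int) {
	ix := 0
	i := 0
	for ; i <= n-4; i += 4 { // unrolled 4x
		x[ix] *= alpha
		x[ix+incX] *= alpha
		x[ix+2*incX] *= alpha
		x[ix+3*incX] *= alpha
		ix += 4 * incX
	}
	if n-i >= 2 { // tail_two: one remaining pair
		x[ix] *= alpha
		x[ix+incX] *= alpha
		ix += 2 * incX
		i += 2
	}
	if i < n { // tail_one: last single element
		x[ix] *= alpha
	}
}
```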
24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define DST_PTR DI 43 | #define LEN CX 44 | #define TAIL BX 45 | #define INC_X R8 46 | #define INCx3_X R9 47 | #define INC_DST R10 48 | #define INCx3_DST R11 49 | #define ALPHA X0 50 | #define ALPHA_2 X1 51 | 52 | // func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) 53 | TEXT ·ScalIncTo(SB), NOSPLIT, $0 54 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst 55 | MOVQ incDst+24(FP), INC_DST // INC_DST = incDst 56 | SHLQ $3, INC_DST // INC_DST *= sizeof(float64) 57 | MOVSD alpha+32(FP), ALPHA // ALPHA = alpha 58 | MOVQ x_base+40(FP), X_PTR // X_PTR = &x 59 | MOVQ n+64(FP), LEN // LEN = n 60 | MOVQ incX+72(FP), INC_X // INC_X = incX 61 | SHLQ $3, INC_X // INC_X *= sizeof(float64) 62 | CMPQ LEN, $0 63 | JE end // if LEN == 0 { return } 64 | 65 | MOVQ LEN, TAIL 66 | ANDQ $3, TAIL // TAIL = LEN % 4 67 | SHRQ $2, LEN // LEN = floor( LEN / 4 ) 68 | JZ tail_start // if LEN == 0 { goto tail_start } 69 | 70 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 71 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 72 | LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 73 | 74 | loop: // do { // x[i] *= alpha unrolled 4x. 
75 | MOVSD (X_PTR), X2 // X_i = x[i] 76 | MOVSD (X_PTR)(INC_X*1), X3 77 | MOVSD (X_PTR)(INC_X*2), X4 78 | MOVSD (X_PTR)(INCx3_X*1), X5 79 | 80 | MULSD ALPHA, X2 // X_i *= a 81 | MULSD ALPHA_2, X3 82 | MULSD ALPHA, X4 83 | MULSD ALPHA_2, X5 84 | 85 | MOVSD X2, (DST_PTR) // dst[i] = X_i 86 | MOVSD X3, (DST_PTR)(INC_DST*1) 87 | MOVSD X4, (DST_PTR)(INC_DST*2) 88 | MOVSD X5, (DST_PTR)(INCx3_DST*1) 89 | 90 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 91 | LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4]) 92 | DECQ LEN 93 | JNZ loop // } while --LEN > 0 94 | CMPQ TAIL, $0 95 | JE end // if TAIL == 0 { return } 96 | 97 | tail_start: // Reset loop registers 98 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 99 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 100 | JZ tail_one 101 | 102 | tail_two: 103 | MOVSD (X_PTR), X2 // X_i = x[i] 104 | MOVSD (X_PTR)(INC_X*1), X3 105 | MULSD ALPHA, X2 // X_i *= a 106 | MULSD ALPHA, X3 107 | MOVSD X2, (DST_PTR) // dst[i] = X_i 108 | MOVSD X3, (DST_PTR)(INC_DST*1) 109 | 110 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 111 | LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2]) 112 | 113 | ANDQ $1, TAIL 114 | JZ end 115 | 116 | tail_one: 117 | MOVSD (X_PTR), X2 // X_i = x[i] 118 | MULSD ALPHA, X2 // X_i *= ALPHA 119 | MOVSD X2, (DST_PTR) // x[i] = X_i 120 | 121 | end: 122 | RET 123 | -------------------------------------------------------------------------------- /asm/f64/scalunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP] 42 | 43 | #define X_PTR SI 44 | #define DST_PTR DI 45 | #define IDX AX 46 | #define LEN CX 47 | #define TAIL BX 48 | #define ALPHA X0 49 | #define ALPHA_2 X1 50 | 51 | // func ScalUnitary(alpha float64, x []float64) 52 | TEXT ·ScalUnitary(SB), NOSPLIT, $0 53 | MOVDDUP_ALPHA // ALPHA = { alpha, alpha } 54 | MOVQ x_base+8(FP), X_PTR // X_PTR = &x 55 | MOVQ x_len+16(FP), LEN // LEN = len(x) 56 | CMPQ LEN, $0 57 | JE end // if LEN == 0 { return } 58 | XORQ IDX, IDX // IDX = 0 59 | 60 | MOVQ LEN, TAIL 61 | ANDQ $7, TAIL // TAIL = LEN % 8 62 | SHRQ $3, LEN // LEN = floor( LEN / 8 ) 63 | JZ tail_start // if LEN == 0 { goto tail_start } 64 | 65 | MOVUPS ALPHA, ALPHA_2 66 | 67 | loop: // do { // x[i] *= alpha unrolled 8x. 68 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 69 | MOVUPS 16(X_PTR)(IDX*8), X3 70 | MOVUPS 32(X_PTR)(IDX*8), X4 71 | MOVUPS 48(X_PTR)(IDX*8), X5 72 | 73 | MULPD ALPHA, X2 // X_i *= ALPHA 74 | MULPD ALPHA_2, X3 75 | MULPD ALPHA, X4 76 | MULPD ALPHA_2, X5 77 | 78 | MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i 79 | MOVUPS X3, 16(X_PTR)(IDX*8) 80 | MOVUPS X4, 32(X_PTR)(IDX*8) 81 | MOVUPS X5, 48(X_PTR)(IDX*8) 82 | 83 | ADDQ $8, IDX // i += 8 84 | DECQ LEN 85 | JNZ loop // while --LEN > 0 86 | CMPQ TAIL, $0 87 | JE end // if TAIL == 0 { return } 88 | 89 | tail_start: // Reset loop registers 90 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 91 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 92 | JZ tail_one // if LEN == 0 { goto tail_one } 93 | 94 | tail_two: // do { 95 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 96 | MULPD ALPHA, X2 // X_i *= ALPHA 97 | MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i 98 | ADDQ $2, IDX // i += 2 99 | DECQ LEN 100 | JNZ tail_two // while --LEN > 0 101 | 102 | ANDQ $1, TAIL 103 | JZ end // if TAIL == 0 { return } 104 | 105 | tail_one: 106 | // x[i] *= alpha for the remaining element. 107 | MOVSD (X_PTR)(IDX*8), X2 108 | MULSD ALPHA, X2 109 | MOVSD X2, (X_PTR)(IDX*8) 110 | 111 | end: 112 | RET 113 | -------------------------------------------------------------------------------- /asm/f64/scalunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved.
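ScalUnitary above broadcasts alpha into both halves of an XMM register with a hand-encoded MOVDDUP (the LONG/WORD byte sequence in the #define, needed because the assembler of this era lacked the mnemonic), and keeps a second copy in ALPHA_2 so alternating multiplies do not contend for one register. A Go stand-in for what the broadcast buys, purely illustrative, names invented:

```go
package f64sketch

// broadcastSketch shows what MOVDDUP achieves for ScalUnitary: one
// scalar duplicated across a two-wide "register", so each vector
// multiply scales two elements at once. Plain Go stands in for SSE3.
func broadcastSketch(alpha float64, x []float64) {
	lane := [2]float64{alpha, alpha} // MOVDDUP: ALPHA = { alpha, alpha }
	for i := 0; i+1 < len(x); i += 2 {
		x[i] *= lane[0] // one MULPD covers both lanes
		x[i+1] *= lane[1]
	}
	if len(x)%2 == 1 {
		x[len(x)-1] *= alpha // scalar tail (MULSD)
	}
}
```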
10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/ 42 | 43 | #define X_PTR SI 44 | #define DST_PTR DI 45 | #define IDX AX 46 | #define LEN CX 47 | #define TAIL BX 48 | #define ALPHA X0 49 | #define ALPHA_2 X1 50 | 51 | // func ScalUnitaryTo(dst []float64, alpha float64, x []float64) 52 | // This function assumes len(dst) >= len(x). 53 | TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0 54 | MOVQ x_base+32(FP), X_PTR // X_PTR = &x 55 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst 56 | MOVDDUP_ALPHA // ALPHA = { alpha, alpha } 57 | MOVQ x_len+40(FP), LEN // LEN = len(x) 58 | CMPQ LEN, $0 59 | JE end // if LEN == 0 { return } 60 | 61 | XORQ IDX, IDX // IDX = 0 62 | MOVQ LEN, TAIL 63 | ANDQ $7, TAIL // TAIL = LEN % 8 64 | SHRQ $3, LEN // LEN = floor( LEN / 8 ) 65 | JZ tail_start // if LEN == 0 { goto tail_start } 66 | 67 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 68 | 69 | loop: // do { // dst[i] = alpha * x[i] unrolled 8x. 
70 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 71 | MOVUPS 16(X_PTR)(IDX*8), X3 72 | MOVUPS 32(X_PTR)(IDX*8), X4 73 | MOVUPS 48(X_PTR)(IDX*8), X5 74 | 75 | MULPD ALPHA, X2 // X_i *= ALPHA 76 | MULPD ALPHA_2, X3 77 | MULPD ALPHA, X4 78 | MULPD ALPHA_2, X5 79 | 80 | MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i 81 | MOVUPS X3, 16(DST_PTR)(IDX*8) 82 | MOVUPS X4, 32(DST_PTR)(IDX*8) 83 | MOVUPS X5, 48(DST_PTR)(IDX*8) 84 | 85 | ADDQ $8, IDX // i += 8 86 | DECQ LEN 87 | JNZ loop // while --LEN > 0 88 | CMPQ TAIL, $0 89 | JE end // if TAIL == 0 { return } 90 | 91 | tail_start: // Reset loop counters 92 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 93 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 94 | JZ tail_one // if LEN == 0 { goto tail_one } 95 | 96 | tail_two: // do { 97 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 98 | MULPD ALPHA, X2 // X_i *= ALPHA 99 | MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i 100 | ADDQ $2, IDX // i += 2 101 | DECQ LEN 102 | JNZ tail_two // while --LEN > 0 103 | 104 | ANDQ $1, TAIL 105 | JZ end // if TAIL == 0 { return } 106 | 107 | tail_one: 108 | MOVSD (X_PTR)(IDX*8), X2 // X_i = x[i] 109 | MULSD ALPHA, X2 // X_i *= ALPHA 110 | MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i 111 | 112 | end: 113 | RET 114 | -------------------------------------------------------------------------------- /asm/f64/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package f64 8 | 9 | // L1Norm is 10 | // for _, v := range x { 11 | // sum += math.Abs(v) 12 | // } 13 | // return sum 14 | func L1Norm(x []float64) (sum float64) 15 | 16 | // L1NormInc is 17 | // for i := 0; i < n*incX; i += incX { 18 | // sum += math.Abs(x[i]) 19 | // } 20 | // return sum 21 | func L1NormInc(x []float64, n, incX int) (sum float64) 22 | 23 | // AddConst is 24 | // for i := range x { 25 | // x[i] += alpha 26 | // } 27 | func AddConst(alpha float64, x []float64) 28 | 29 | // Add is 30 | // for i, v := range s { 31 | // dst[i] += v 32 | // } 33 | func Add(dst, s []float64) 34 | 35 | // AxpyUnitary is 36 | // for i, v := range x { 37 | // y[i] += alpha * v 38 | // } 39 | func AxpyUnitary(alpha float64, x, y []float64) 40 | 41 | // AxpyUnitaryTo is 42 | // for i, v := range x { 43 | // dst[i] = alpha*v + y[i] 44 | // } 45 | func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) 46 | 47 | // AxpyInc is 48 | // for i := 0; i < int(n); i++ { 49 | // y[iy] += alpha * x[ix] 50 | // ix += incX 51 | // iy += incY 52 | // } 53 | func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 54 | 55 | // AxpyIncTo is 56 | // for i := 0; i < int(n); i++ { 57 | // dst[idst] = alpha*x[ix] + y[iy] 58 | // ix += incX 59 | // iy += incY 60 | // idst += incDst 61 | // } 62 | func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 63 | 64 | // CumSum is 65 | // if len(s) == 0 { 66 | // return dst 67 | // } 68 | // dst[0] = s[0] 69 | // for i, v := range s[1:] { 70 | // dst[i+1] = dst[i] + v 71 | // } 72 | // return dst 73 | func CumSum(dst, s []float64) []float64 74 | 75 | // CumProd is 76 | // if len(s) == 0 { 77 | // return dst 78 | // } 79 | // dst[0] = s[0] 80 | // for i, v := range s[1:] { 81 | // dst[i+1] = dst[i] * v 82 | // } 83 | // return dst 84 | func CumProd(dst, s []float64) []float64 85 | 86 | // 
Div is 87 | // for i, v := range s { 88 | // dst[i] /= v 89 | // } 90 | func Div(dst, s []float64) 91 | 92 | // DivTo is 93 | // for i, v := range x { 94 | // dst[i] = v / y[i] 95 | // } 96 | // return dst 97 | func DivTo(dst, x, y []float64) []float64 98 | 99 | // DotUnitary is 100 | // for i, v := range x { 101 | // sum += y[i] * v 102 | // } 103 | // return sum 104 | func DotUnitary(x, y []float64) (sum float64) 105 | 106 | // DotInc is 107 | // for i := 0; i < int(n); i++ { 108 | // sum += y[iy] * x[ix] 109 | // ix += incX 110 | // iy += incY 111 | // } 112 | // return sum 113 | func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) 114 | 115 | // L1Dist is 116 | // var norm float64 117 | // for i, v := range s { 118 | // norm += math.Abs(t[i] - v) 119 | // } 120 | // return norm 121 | func L1Dist(s, t []float64) float64 122 | 123 | // LinfDist is 124 | // var norm float64 125 | // if len(s) == 0 { 126 | // return 0 127 | // } 128 | // norm = math.Abs(t[0] - s[0]) 129 | // for i, v := range s[1:] { 130 | // absDiff := math.Abs(t[i+1] - v) 131 | // if absDiff > norm || math.IsNaN(norm) { 132 | // norm = absDiff 133 | // } 134 | // } 135 | // return norm 136 | func LinfDist(s, t []float64) float64 137 | 138 | // ScalUnitary is 139 | // for i := range x { 140 | // x[i] *= alpha 141 | // } 142 | func ScalUnitary(alpha float64, x []float64) 143 | 144 | // ScalUnitaryTo is 145 | // for i, v := range x { 146 | // dst[i] = alpha * v 147 | // } 148 | func ScalUnitaryTo(dst []float64, alpha float64, x []float64) 149 | 150 | // ScalInc is 151 | // var ix uintptr 152 | // for i := 0; i < int(n); i++ { 153 | // x[ix] *= alpha 154 | // ix += incX 155 | // } 156 | func ScalInc(alpha float64, x []float64, n, incX uintptr) 157 | 158 | // ScalIncTo is 159 | // var idst, ix uintptr 160 | // for i := 0; i < int(n); i++ { 161 | // dst[idst] = alpha * x[ix] 162 | // ix += incX 163 | // idst += incDst 164 | // } 165 | func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) 166 | -------------------------------------------------------------------------------- /asm/f64/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
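stubs_amd64.go above contains only body-less declarations; the linker resolves each one to the TEXT symbol of the same name in the matching .s file, while the noasm and appengine build tags select the pure-Go fallbacks instead. A minimal two-file sketch of the pattern, with a hypothetical package and function:

```go
//+build !amd64 noasm appengine

// Package kernels is a hypothetical sketch of the stub pattern used
// above. This file is the pure-Go fallback. A sibling file guarded by
// //+build !noasm,!appengine would hold only the declaration
//	func Sum(x []float64) float64
// with no body; the linker resolves that symbol to a TEXT ·Sum
// definition in a sum_amd64.s file.
package kernels

// Sum is
//	for _, v := range x {
//		sum += v
//	}
//	return sum
func Sum(x []float64) (sum float64) {
	for _, v := range x {
		sum += v
	}
	return sum
}
```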
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | import "math" 10 | 11 | // L1Norm is 12 | // for _, v := range x { 13 | // sum += math.Abs(v) 14 | // } 15 | // return sum 16 | func L1Norm(x []float64) (sum float64) { 17 | for _, v := range x { 18 | sum += math.Abs(v) 19 | } 20 | return sum 21 | } 22 | 23 | // L1NormInc is 24 | // for i := 0; i < n*incX; i += incX { 25 | // sum += math.Abs(x[i]) 26 | // } 27 | // return sum 28 | func L1NormInc(x []float64, n, incX int) (sum float64) { 29 | for i := 0; i < n*incX; i += incX { 30 | sum += math.Abs(x[i]) 31 | } 32 | return sum 33 | } 34 | 35 | // Add is 36 | // for i, v := range s { 37 | // dst[i] += v 38 | // } 39 | func Add(dst, s []float64) { 40 | for i, v := range s { 41 | dst[i] += v 42 | } 43 | } 44 | 45 | // AddConst is 46 | // for i := range x { 47 | // x[i] += alpha 48 | // } 49 | func AddConst(alpha float64, x []float64) { 50 | for i := range x { 51 | x[i] += alpha 52 | } 53 | } 54 | 55 | // CumSum is 56 | // if len(s) == 0 { 57 | // return dst 58 | // } 59 | // dst[0] = s[0] 60 | // for i, v := range s[1:] { 61 | // dst[i+1] = dst[i] + v 62 | // } 63 | // return dst 64 | func CumSum(dst, s []float64) []float64 { 65 | if len(s) == 0 { 66 | return dst 67 | } 68 | dst[0] = s[0] 69 | for i, v := range s[1:] { 70 | dst[i+1] = dst[i] + v 71 | } 72 | return dst 73 | } 74 | 75 | // CumProd is 76 | // if len(s) == 0 { 77 | // return dst 78 | // } 79 | // dst[0] = s[0] 80 | // for i, v := range s[1:] { 81 | // dst[i+1] = dst[i] * v 82 | // } 83 | // return dst 84 | func CumProd(dst, s []float64) []float64 { 85 | if len(s) == 0 { 86 | return dst 87 | } 88 | dst[0] = s[0] 89 | for i, v := range s[1:] { 90 | dst[i+1] = dst[i] * v 91 | } 92 | return dst 93 | } 94 | 95 | // Div is 96 | // for i, v := range s { 97 | // dst[i] /= v 98 | // } 99 | func Div(dst, s []float64) { 100 | for i, v := range s { 101 | dst[i] /= v 102 | } 103 | } 104 | 105 | // DivTo is 106 | // for i, v := range s { 107 | // dst[i] = v / t[i] 108 | // } 109 | // return dst 110 | func DivTo(dst, s, t []float64) []float64 { 111 | for i, v := range s { 112 | dst[i] = v / t[i] 113 | } 114 | return dst 115 | } 116 | 117 | // L1Dist is 118 | // var norm float64 119 | // for i, v := range s { 120 | // norm += math.Abs(t[i] - v) 121 | // } 122 | // return norm 123 | func L1Dist(s, t []float64) float64 { 124 | var norm float64 125 | for i, v := range s { 126 | norm += math.Abs(t[i] - v) 127 | } 128 | return norm 129 | } 130 | 131 | // LinfDist is 132 | // var norm float64 133 | // if len(s) == 0 { 134 | // return 0 135 | // } 136 | // norm = math.Abs(t[0] - s[0]) 137 | // for i, v := range s[1:] { 138 | // absDiff := math.Abs(t[i+1] - v) 139 | // if absDiff > norm || math.IsNaN(norm) { 140 | // norm = absDiff 141 | // } 142 | // } 143 | // return norm 144 | func LinfDist(s, t []float64) float64 { 145 | var norm float64 146 | if len(s) == 0 { 147 | return 0 148 | } 149 | norm = math.Abs(t[0] - s[0]) 150 | for i, v := range s[1:] { 151 | absDiff := math.Abs(t[i+1] - v) 152 | if absDiff > norm || math.IsNaN(norm) { 153 | norm = absDiff 154 | } 155 | } 156 | return norm 157 | } 158 | --------------------------------------------------------------------------------
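One subtlety in the pure-Go LinfDist above is the math.IsNaN(norm) test: every comparison against a NaN norm is false, so without it a NaN absorbed from the first pair could never be displaced by later finite differences. A self-contained example, copying the fallback body so it runs standalone:

```go
package main

import (
	"fmt"
	"math"
)

// linfDist duplicates the pure-Go fallback above so this example is
// self-contained; it demonstrates why the math.IsNaN(norm) test matters.
func linfDist(s, t []float64) float64 {
	var norm float64
	if len(s) == 0 {
		return 0
	}
	norm = math.Abs(t[0] - s[0])
	for i, v := range s[1:] {
		absDiff := math.Abs(t[i+1] - v)
		if absDiff > norm || math.IsNaN(norm) {
			norm = absDiff
		}
	}
	return norm
}

func main() {
	nan := math.NaN()
	// The first pair yields a NaN difference; the IsNaN test lets the
	// later finite difference |4-1| = 3 replace it.
	fmt.Println(linfDist([]float64{nan, 1}, []float64{0, 4})) // prints 3
}
```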