├── .github ├── ISSUE_TEMPLATE.md └── PULL_REQUEST_TEMPLATE.md ├── .travis.yml ├── .travis └── test-coverage.sh ├── README.md ├── asm ├── bench_gen.sh ├── c128 │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── bench_test.go │ ├── doc.go │ ├── dotc.go │ ├── dotu.go │ ├── scal.go │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go ├── c64 │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── bench_test.go │ ├── conj.go │ ├── doc.go │ ├── dotc.go │ ├── dotu.go │ ├── scal.go │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go ├── f32 │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── bench_test.go │ ├── ddot.go │ ├── doc.go │ ├── dot.go │ ├── scal.go │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go └── f64 │ ├── abssum_amd64.s │ ├── abssuminc_amd64.s │ ├── add_amd64.s │ ├── addconst_amd64.s │ ├── asm_test.go │ ├── axpy.go │ ├── axpy_test.go │ ├── axpyinc_amd64.s │ ├── axpyincto_amd64.s │ ├── axpyunitary_amd64.s │ ├── axpyunitaryto_amd64.s │ ├── benchAxpy_test.go │ ├── benchScal_test.go │ ├── bench_other_test.go │ ├── bench_test.go │ ├── cumprod_amd64.s │ ├── cumsum_amd64.s │ ├── div_amd64.s │ ├── divto_amd64.s │ ├── doc.go │ ├── dot.go │ ├── dot_amd64.s │ ├── dot_test.go │ ├── l1norm_amd64.s │ ├── linfnorm_amd64.s │ ├── scal.go │ ├── scal_test.go │ ├── scalinc_amd64.s │ ├── scalincto_amd64.s │ ├── scalunitary_amd64.s │ ├── scalunitaryto_amd64.s │ ├── stubs_amd64.go │ ├── stubs_noasm.go │ └── stubs_test.go └── binding └── binding.go /.github/ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### This repository is no longer actively maintained. 2 | 3 | Development of the packages in this repository has moved to https://github.com/gonum/gonum. 4 | Please file issues [there](https://github.com/gonum/gonum/issues) after having checked that your issue has not been fixed. 5 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | ### This repository is no longer actively maintained. 2 | 3 | Development of the packages in this repository has moved to https://github.com/gonum/gonum. 4 | Please send pull requests [there](https://github.com/gonum/gonum/pulls) after having checked that your addition has not already been made. 5 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | sudo: false 2 | 3 | language: go 4 | 5 | # Versions of go that are explicitly supported by gonum. 6 | go: 7 | - 1.5.4 8 | - 1.6.3 9 | - 1.7.3 10 | 11 | # Required for coverage, testing and generate. 12 | before_install: 13 | - go get golang.org/x/tools/cmd/cover 14 | - go get golang.org/x/tools/cmd/goimports 15 | - go get github.com/mattn/goveralls 16 | - go get github.com/klauspost/asmfmt/cmd/asmfmt 17 | 18 | # Get deps, build, test, and ensure the code is gofmt'ed. 19 | # If we are building as gonum, then we have access to the coveralls api key, so we can run coverage as well. 20 | script: 21 | - go get -d -t -v ./... 22 | - go build -v -x ./... 23 | - go test -v -x -a ./... 24 | - go test -v -x -a -tags noasm ./... 25 | - go test -v -x -a -tags appengine ./... 
26 | - test -z "$(gofmt -d .)" 27 | - diff <(asmfmt -d .) <("") 28 | - if [[ $TRAVIS_SECURE_ENV_VARS = "true" ]]; then bash ./.travis/test-coverage.sh; fi 29 | -------------------------------------------------------------------------------- /.travis/test-coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROFILE_OUT=$PWD/profile.out 4 | ACC_OUT=$PWD/acc.out 5 | 6 | testCover() { 7 | # set the return value to 0 (successful) 8 | retval=0 9 | # get the directory to check from the parameter. Default to '.' 10 | d=${1:-.} 11 | # skip if there are no Go files here 12 | ls $d/*.go &> /dev/null || return $retval 13 | # switch to the directory to check 14 | pushd $d > /dev/null 15 | # create the coverage profile 16 | coverageresult=`go test -v -coverprofile=$PROFILE_OUT -tags noasm` 17 | # output the result so we can check the shell output 18 | echo ${coverageresult} 19 | # append the results to acc.out if coverage didn't fail, else set the retval to 1 (failed) 20 | ( [[ ${coverageresult} == *FAIL* ]] && retval=1 ) || ( [ -f $PROFILE_OUT ] && grep -v "mode: set" $PROFILE_OUT >> $ACC_OUT ) 21 | # return to our working dir 22 | popd > /dev/null 23 | # return our return value 24 | return $retval 25 | } 26 | 27 | # Init acc.out 28 | echo "mode: set" > $ACC_OUT 29 | 30 | # Run test coverage on all directories containing go files 31 | find . -maxdepth 10 -type d | while read d; do testCover $d || exit; done 32 | 33 | # Upload the coverage profile to coveralls.io 34 | [ -n "$COVERALLS_TOKEN" ] && goveralls -coverprofile=$ACC_OUT -service=travis-ci -repotoken $COVERALLS_TOKEN 35 | 36 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Gonum Internal [![Build Status](https://travis-ci.org/gonum/internal.svg?branch=master)](https://travis-ci.org/gonum/internal) [![Coverage Status](https://coveralls.io/repos/gonum/internal/badge.svg?branch=master&service=github)](https://coveralls.io/github/gonum/internal?branch=master) [![GoDoc](https://godoc.org/github.com/gonum/internal?status.svg)](https://godoc.org/github.com/gonum/internal) 2 | 3 | # This repository is no longer maintained. Development has moved to https://github.com/gonum/gonum. 4 | 5 | This is the set of internal packages for the Gonum project. 6 | 7 | ## Issues 8 | 9 | If you find any bugs, feel free to file an issue on the GitHub [issue tracker for gonum/gonum](https://github.com/gonum/gonum/issues) if the bug exists in that repository; no code changes will be made to this repository. Other discussions should be taken to the gonum-dev Google Group. 10 | 11 | https://groups.google.com/forum/#!forum/gonum-dev 12 | 13 | ## License 14 | 15 | Please see [github.com/gonum/license](https://github.com/gonum/license) for general license information, contributors, authors, etc. on the Gonum suite of packages. 16 | -------------------------------------------------------------------------------- /asm/bench_gen.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copyright ©2016 The gonum Authors. All rights reserved. 4 | # Use of this source code is governed by a BSD-style 5 | # license that can be found in the LICENSE file.
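# The pipelines below derive the f32, f64, and c128 benchmark files from the
# hand-written c64 one. Each `gofmt -r 'pattern -> replacement'` stage applies
# a purely syntactic rewrite to the Go source flowing through the pipe, and
# the sed stages rename the C64/c64 identifiers for the target package. A
# minimal sketch of one such rewrite (demo.go is a hypothetical input, not a
# file in this repository):
#
#   $ cat demo.go
#   package demo
#
#   var v complex64 = 1 + 1i
#
#   $ gofmt -r 'complex64 -> float32' demo.go | gofmt -r '1 + 1i -> 1'
#   package demo
#
#   var v float32 = 1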
6 | 7 | cat c64/bench_test.go \ 8 | | gofmt -r 'complex(float32(n), float32(n)) -> float32(n)' \ 9 | | gofmt -r 'complex64 -> float32' \ 10 | | gofmt -r '1 + 1i -> 1' \ 11 | | gofmt -r '2 + 2i -> 2' \ 12 | | sed 's/C64/F32/g' \ 13 | | sed 's/c64/f32/g' \ 14 | > f32/bench_test.go 15 | 16 | cat c64/bench_test.go \ 17 | | gofmt -r 'complex(float32(n), float32(n)) -> float64(n)' \ 18 | | gofmt -r 'complex64 -> float64' \ 19 | | gofmt -r '1 + 1i -> 1' \ 20 | | gofmt -r '2 + 2i -> 2' \ 21 | | sed 's/C64/F64/g' \ 22 | | sed 's/c64/f64/g' \ 23 | > f64/bench_test.go 24 | 25 | cat c64/bench_test.go \ 26 | | gofmt -r 'float32 -> float64' \ 27 | | gofmt -r 'complex64 -> complex128' \ 28 | | sed 's/C64/C128/g' \ 29 | | sed 's/c64/c128/g' \ 30 | > c128/bench_test.go 31 | -------------------------------------------------------------------------------- /asm/c128/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 28 | TEXT ·AxpyInc(SB), NOSPLIT, $0 29 | MOVQ x_base+16(FP), SI // SI = &x 30 | MOVQ y_base+40(FP), DI // DI = &y 31 | MOVQ n+64(FP), CX // CX = n 32 | CMPQ CX, $0 // if n==0 { return } 33 | JE axpyi_end 34 | MOVQ ix+88(FP), R8 // R8 = ix // Load the first index 35 | SHLQ $4, R8 // R8 *= sizeof(complex128) 36 | MOVQ iy+96(FP), R9 // R9 = iy 37 | SHLQ $4, R9 // R9 *= sizeof(complex128) 38 | LEAQ (SI)(R8*1), SI // SI = &(x[ix]) 39 | LEAQ (DI)(R9*1), DI // DI = &(y[iy]) 40 | MOVQ DI, DX // DX = DI // Separate Read/Write pointers 41 | MOVQ incX+72(FP), R8 // R8 = incX 42 | SHLQ $4, R8 // R8 *= sizeof(complex128) 43 | MOVQ incY+80(FP), R9 // R9 = iy 44 | SHLQ $4, R9 // R9 *= sizeof(complex128) 45 | MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) } 46 | MOVAPS X0, X1 47 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 48 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 49 | MOVAPS X1, X11 50 | MOVQ CX, BX 51 | ANDQ $3, CX // CX = n % 4 52 | SHRQ $2, BX // BX = floor( n / 4 ) 53 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 54 | 55 | axpyi_loop: // do { 56 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 57 | MOVUPS (SI)(R8*1), X4 58 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 59 | MOVUPS (SI), X6 60 | MOVUPS (SI)(R8*1), X8 61 | 62 | // X_(i+1) = { real(x[i], real(x[i]) } 63 | MOVDDUP_X2_X3 64 | MOVDDUP_X4_X5 65 | MOVDDUP_X6_X7 66 | MOVDDUP_X8_X9 67 | 68 | // X_i = { imag(x[i]), imag(x[i]) } 69 | SHUFPD $0x3, X2, X2 
70 | SHUFPD $0x3, X4, X4 71 | SHUFPD $0x3, X6, X6 72 | SHUFPD $0x3, X8, X8 73 | 74 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 75 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 76 | MULPD X1, X2 77 | MULPD X0, X3 78 | MULPD X11, X4 79 | MULPD X10, X5 80 | MULPD X1, X6 81 | MULPD X0, X7 82 | MULPD X11, X8 83 | MULPD X10, X9 84 | 85 | // X_(i+1) = { 86 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 87 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 88 | // } 89 | ADDSUBPD_X2_X3 90 | ADDSUBPD_X4_X5 91 | ADDSUBPD_X6_X7 92 | ADDSUBPD_X8_X9 93 | 94 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 95 | ADDPD (DX), X3 96 | ADDPD (DX)(R9*1), X5 97 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 98 | ADDPD (DX), X7 99 | ADDPD (DX)(R9*1), X9 100 | MOVUPS X3, (DI) // dst[i] = X_(i+1) 101 | MOVUPS X5, (DI)(R9*1) 102 | LEAQ (DI)(R9*2), DI 103 | MOVUPS X7, (DI) 104 | MOVUPS X9, (DI)(R9*1) 105 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 106 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 107 | LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) 108 | DECQ BX 109 | JNZ axpyi_loop // } while --BX > 0 110 | CMPQ CX, $0 // if CX == 0 { return } 111 | JE axpyi_end 112 | 113 | axpyi_tail: // do { 114 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 115 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 116 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 117 | MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 118 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 119 | 120 | // X_(i+1) = { 121 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 122 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 123 | // } 124 | ADDSUBPD_X2_X3 125 | 126 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 127 | ADDPD (DI), X3 128 | MOVUPS X3, (DI) // y[i] = X_i 129 | ADDQ R8, SI // SI = &(SI[incX]) 130 | ADDQ R9, DI // DI = &(DI[incY]) 131 | LOOP axpyi_tail // } while --CX > 0 132 | 133 | axpyi_end: 134 | RET 135 | -------------------------------------------------------------------------------- /asm/c128/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
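// The Go reference semantics for the kernels in this file and in
// axpyinc_amd64.s (documented in stubs_amd64.go) are, for AxpyIncTo:
//
//	for i := 0; i < int(n); i++ {
//		dst[idst] = alpha*x[ix] + y[iy]
//		ix += incX
//		iy += incY
//		idst += incDst
//	}
//
// Each complex product is formed with the SSE3 MOVDDUP/ADDSUBPD idiom:
// MOVDDUP broadcasts real(x[i]) into both qword lanes, SHUFPD $0x3 broadcasts
// imag(x[i]), and after the four MULPDs, ADDSUBPD subtracts in the low lane
// and adds in the high lane, yielding
//
//	real(result) = real(a)*real(x[i]) - imag(a)*imag(x[i])
//	imag(result) = imag(a)*real(x[i]) + real(a)*imag(x[i])
//
// which is ordinary complex multiplication.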
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 28 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 29 | MOVQ dst_base+0(FP), DI // DI = &dst 30 | MOVQ x_base+56(FP), SI // SI = &x 31 | MOVQ y_base+80(FP), DX // DX = &y 32 | MOVQ n+104(FP), CX // CX = n 33 | CMPQ CX, $0 // if n==0 { return } 34 | JE axpyi_end 35 | MOVQ ix+128(FP), R8 // R8 = ix // Load the first index 36 | SHLQ $4, R8 // R8 *= sizeof(complex128) 37 | MOVQ iy+136(FP), R9 // R9 = iy 38 | SHLQ $4, R9 // R9 *= sizeof(complex128) 39 | MOVQ idst+32(FP), R10 // R10 = idst 40 | SHLQ $4, R10 // R10 *= sizeof(complex128) 41 | LEAQ (SI)(R8*1), SI // SI = &(x[ix]) 42 | LEAQ (DX)(R9*1), DX // DX = &(y[iy]) 43 | LEAQ (DI)(R10*1), DI // DI = &(dst[idst]) 44 | MOVQ incX+112(FP), R8 // R8 = incX 45 | SHLQ $4, R8 // R8 *= sizeof(complex128) 46 | MOVQ incY+120(FP), R9 // R9 = incY 47 | SHLQ $4, R9 // R9 *= sizeof(complex128) 48 | MOVQ incDst+24(FP), R10 // R10 = incDst 49 | SHLQ $4, R10 // R10 *= sizeof(complex128) 50 | MOVUPS alpha+40(FP), X0 // X0 = { imag(a), real(a) } 51 | MOVAPS X0, X1 52 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 53 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 54 | MOVAPS X1, X11 55 | MOVQ CX, BX 56 | ANDQ $3, CX // CX = n % 4 57 | SHRQ $2, BX // BX = floor( n / 4 ) 58 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 59 | 60 | axpyi_loop: // do { 61 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 62 | MOVUPS (SI)(R8*1), X4 63 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 64 | 65 | MOVUPS (SI), X6 66 | MOVUPS (SI)(R8*1), X8 67 | 68 | // X_(i+1) = { real(x[i], real(x[i]) } 69 | MOVDDUP_X2_X3 70 | MOVDDUP_X4_X5 71 | MOVDDUP_X6_X7 72 | MOVDDUP_X8_X9 73 | 74 | // X_i = { imag(x[i]), imag(x[i]) } 75 | SHUFPD $0x3, X2, X2 76 | SHUFPD $0x3, X4, X4 77 | SHUFPD $0x3, X6, X6 78 | SHUFPD $0x3, X8, X8 79 | 80 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 81 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 82 | MULPD X1, X2 83 | MULPD X0, X3 84 | MULPD X11, X4 85 | MULPD X10, X5 86 | MULPD X1, X6 87 | MULPD X0, X7 88 | MULPD X11, X8 89 | MULPD X10, X9 90 | 91 | // X_(i+1) = { 92 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 93 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 94 | // } 95 | ADDSUBPD_X2_X3 96 | ADDSUBPD_X4_X5 97 | ADDSUBPD_X6_X7 98 | ADDSUBPD_X8_X9 99 | 100 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 101 | ADDPD (DX), X3 102 | ADDPD (DX)(R9*1), X5 103 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 104 | ADDPD (DX), X7 105 | ADDPD (DX)(R9*1), X9 106 | MOVUPS X3, (DI) // dst[i] = 
X_(i+1) 107 | MOVUPS X5, (DI)(R10*1) 108 | LEAQ (DI)(R10*2), DI 109 | MOVUPS X7, (DI) 110 | MOVUPS X9, (DI)(R10*1) 111 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 112 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 113 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) 114 | DECQ BX 115 | JNZ axpyi_loop // } while --BX > 0 116 | CMPQ CX, $0 // if CX == 0 { return } 117 | JE axpyi_end 118 | 119 | axpyi_tail: // do { 120 | MOVUPS (SI), X2 // X_i = { imag(x[i]), real(x[i]) } 121 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 122 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 123 | MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 124 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 125 | 126 | // X_(i+1) = { 127 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 128 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 129 | // } 130 | ADDSUBPD_X2_X3 131 | 132 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 133 | ADDPD (DX), X3 134 | MOVUPS X3, (DI) // y[i] X_(i+1) 135 | ADDQ R8, SI // SI += incX 136 | ADDQ R9, DX // DX += incY 137 | ADDQ R10, DI // DI += incDst 138 | LOOP axpyi_tail // } while --CX > 0 139 | 140 | axpyi_end: 141 | RET 142 | -------------------------------------------------------------------------------- /asm/c128/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyUnitary(alpha complex128, x, y []complex128) 28 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 29 | MOVQ x_base+16(FP), SI // SI = &x 30 | MOVQ y_base+40(FP), DI // DI = &y 31 | MOVQ x_len+24(FP), CX // CX = min( len(x), len(y) ) 32 | CMPQ y_len+48(FP), CX 33 | CMOVQLE y_len+48(FP), CX 34 | CMPQ CX, $0 // if CX == 0 { return } 35 | JE caxy_end 36 | PXOR X0, X0 // Clear work registers and cache-align loop 37 | PXOR X1, X1 38 | MOVUPS alpha+0(FP), X0 // X0 = { imag(a), real(a) } 39 | MOVAPS X0, X1 40 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 41 | XORQ AX, AX // i = 0 42 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 43 | MOVAPS X1, X11 44 | MOVQ CX, BX 45 | ANDQ $3, CX // CX = n % 4 46 | SHRQ $2, BX // BX = floor( n / 4 ) 47 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 48 | 49 | caxy_loop: // do { 50 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 51 | MOVUPS 16(SI)(AX*8), X4 52 | MOVUPS 32(SI)(AX*8), X6 53 | MOVUPS 48(SI)(AX*8), X8 54 | 55 | // X_(i+1) = { real(x[i], real(x[i]) } 56 | 
MOVDDUP_X2_X3 57 | MOVDDUP_X4_X5 58 | MOVDDUP_X6_X7 59 | MOVDDUP_X8_X9 60 | 61 | // X_i = { imag(x[i]), imag(x[i]) } 62 | SHUFPD $0x3, X2, X2 63 | SHUFPD $0x3, X4, X4 64 | SHUFPD $0x3, X6, X6 65 | SHUFPD $0x3, X8, X8 66 | 67 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 68 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 69 | MULPD X1, X2 70 | MULPD X0, X3 71 | MULPD X11, X4 72 | MULPD X10, X5 73 | MULPD X1, X6 74 | MULPD X0, X7 75 | MULPD X11, X8 76 | MULPD X10, X9 77 | 78 | // X_(i+1) = { 79 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 80 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 81 | // } 82 | ADDSUBPD_X2_X3 83 | ADDSUBPD_X4_X5 84 | ADDSUBPD_X6_X7 85 | ADDSUBPD_X8_X9 86 | 87 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 88 | ADDPD (DI)(AX*8), X3 89 | ADDPD 16(DI)(AX*8), X5 90 | ADDPD 32(DI)(AX*8), X7 91 | ADDPD 48(DI)(AX*8), X9 92 | MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 93 | MOVUPS X5, 16(DI)(AX*8) 94 | MOVUPS X7, 32(DI)(AX*8) 95 | MOVUPS X9, 48(DI)(AX*8) 96 | ADDQ $8, AX // i += 8 97 | DECQ BX 98 | JNZ caxy_loop // } while --BX > 0 99 | CMPQ CX, $0 // if CX == 0 { return } 100 | JE caxy_end 101 | 102 | caxy_tail: // do { 103 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 104 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i], real(x[i]) } 105 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 106 | MULPD X1, X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 107 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 108 | 109 | // X_(i+1) = { 110 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 111 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 112 | // } 113 | ADDSUBPD_X2_X3 114 | 115 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 116 | ADDPD (DI)(AX*8), X3 117 | MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 118 | ADDQ $2, AX // i += 2 119 | LOOP caxy_tail // } while --CX > 0 120 | 121 | caxy_end: 122 | RET 123 | -------------------------------------------------------------------------------- /asm/c128/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
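// Control flow note: like AxpyUnitary above, AxpyUnitaryTo processes four
// complex128 values per loop iteration and finishes the remainder one value
// at a time. A rough Go equivalent of the structure (illustrative sketch
// only, not the generated code):
//
//	n := len(x)
//	if len(y) < n {
//		n = len(y)
//	}
//	if len(dst) < n {
//		n = len(dst)
//	}
//	i := 0
//	for ; i < n-n%4; i += 4 {
//		// caxy_loop: four independent multiply/add chains
//		// in X2..X9 keep the SSE units busy
//	}
//	for ; i < n; i++ { // caxy_tail
//		dst[i] = alpha*x[i] + y[i]
//	}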
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVDDUP X2, X3 10 | #define MOVDDUP_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xDA 11 | // MOVDDUP X4, X5 12 | #define MOVDDUP_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xEC 13 | // MOVDDUP X6, X7 14 | #define MOVDDUP_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0x12; BYTE $0xFE 15 | // MOVDDUP X8, X9 16 | #define MOVDDUP_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC8 17 | 18 | // ADDSUBPD X2, X3 19 | #define ADDSUBPD_X2_X3 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 20 | // ADDSUBPD X4, X5 21 | #define ADDSUBPD_X4_X5 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | // ADDSUBPD X6, X7 23 | #define ADDSUBPD_X6_X7 BYTE $0x66; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 24 | // ADDSUBPD X8, X9 25 | #define ADDSUBPD_X8_X9 BYTE $0x66; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 26 | 27 | // func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) 28 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 29 | MOVQ dst_base+0(FP), DI // DI = &dst 30 | MOVQ x_base+40(FP), SI // SI = &x 31 | MOVQ y_base+64(FP), DX // DX = &y 32 | MOVQ x_len+48(FP), CX // CX = min( len(x), len(y), len(dst) ) 33 | CMPQ y_len+72(FP), CX 34 | CMOVQLE y_len+72(FP), CX 35 | CMPQ dst_len+8(FP), CX 36 | CMOVQLE dst_len+8(FP), CX 37 | CMPQ CX, $0 // if CX == 0 { return } 38 | JE caxy_end 39 | MOVUPS alpha+24(FP), X0 // X0 = { imag(a), real(a) } 40 | MOVAPS X0, X1 41 | SHUFPD $0x1, X1, X1 // X1 = { real(a), imag(a) } 42 | XORQ AX, AX // i = 0 43 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 44 | MOVAPS X1, X11 45 | MOVQ CX, BX 46 | ANDQ $3, CX // CX = n % 4 47 | SHRQ $2, BX // BX = floor( n / 4 ) 48 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 49 | 50 | caxy_loop: // do { 51 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 52 | MOVUPS 16(SI)(AX*8), X4 53 | MOVUPS 32(SI)(AX*8), X6 54 | MOVUPS 48(SI)(AX*8), X8 55 | 56 | // X_(i+1) = { real(x[i]), real(x[i]) } 57 | MOVDDUP_X2_X3 // Load and duplicate real elements (xr, xr) 58 | MOVDDUP_X4_X5 59 | MOVDDUP_X6_X7 60 | MOVDDUP_X8_X9 61 | 62 | // X_i = { imag(x[i]), imag(x[i]) } 63 | SHUFPD $0x3, X2, X2 // duplicate imag elements (xi, xi) 64 | SHUFPD $0x3, X4, X4 65 | SHUFPD $0x3, X6, X6 66 | SHUFPD $0x3, X8, X8 67 | 68 | // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 69 | // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 70 | MULPD X1, X2 71 | MULPD X0, X3 72 | MULPD X11, X4 73 | MULPD X10, X5 74 | MULPD X1, X6 75 | MULPD X0, X7 76 | MULPD X11, X8 77 | MULPD X10, X9 78 | 79 | // X_(i+1) = { 80 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 81 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 82 | // } 83 | ADDSUBPD_X2_X3 84 | ADDSUBPD_X4_X5 85 | ADDSUBPD_X6_X7 86 | ADDSUBPD_X8_X9 87 | 88 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 89 | ADDPD (DX)(AX*8), X3 90 | ADDPD 16(DX)(AX*8), X5 91 | ADDPD 32(DX)(AX*8), X7 92 | ADDPD 48(DX)(AX*8), X9 93 | MOVUPS X3, (DI)(AX*8) // dst[i] = X_(i+1) 94 | MOVUPS X5, 16(DI)(AX*8) 95 | MOVUPS X7, 32(DI)(AX*8) 96 | MOVUPS X9, 48(DI)(AX*8) 97 | ADDQ $8, AX // i += 8 98 | DECQ BX 99 | JNZ caxy_loop // } while --BX > 0 100 | CMPQ CX, $0 // if CX == 0 { return } 101 | JE caxy_end 102 | 103 | caxy_tail: // Same calculation, but read in values to avoid trampling memory 104 | MOVUPS (SI)(AX*8), X2 // X_i = { imag(x[i]), real(x[i]) } 105 | MOVDDUP_X2_X3 // X_(i+1) = { real(x[i]), real(x[i]) } 106 | SHUFPD $0x3, X2, X2 // X_i = { imag(x[i]), imag(x[i]) } 107 | MULPD X1,
X2 // X_i = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 108 | MULPD X0, X3 // X_(i+1) = { imag(a) * real(x[i]), real(a) * real(x[i]) } 109 | 110 | // X_(i+1) = { 111 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 112 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]) 113 | // } 114 | ADDSUBPD_X2_X3 115 | 116 | // X_(i+1) = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 117 | ADDPD (DX)(AX*8), X3 118 | MOVUPS X3, (DI)(AX*8) // y[i] = X_(i+1) 119 | ADDQ $2, AX // i += 2 120 | LOOP caxy_tail // } while --CX > 0 121 | 122 | caxy_end: 123 | RET 124 | -------------------------------------------------------------------------------- /asm/c128/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package c128 provides complex128 vector primitives. 9 | package c128 10 | -------------------------------------------------------------------------------- /asm/c128/dotc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | import "math/cmplx" 8 | 9 | // DotcUnitary is 10 | // for i, v := range x { 11 | // sum += y[i] * cmplx.Conj(v) 12 | // } 13 | // return sum 14 | func DotcUnitary(x, y []complex128) (sum complex128) { 15 | for i, v := range x { 16 | sum += y[i] * cmplx.Conj(v) 17 | } 18 | return sum 19 | } 20 | 21 | // DotcInc is 22 | // for i := 0; i < int(n); i++ { 23 | // sum += y[iy] * cmplx.Conj(x[ix]) 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | // return sum 28 | func DotcInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) { 29 | for i := 0; i < int(n); i++ { 30 | sum += y[iy] * cmplx.Conj(x[ix]) 31 | ix += incX 32 | iy += incY 33 | } 34 | return sum 35 | } 36 | -------------------------------------------------------------------------------- /asm/c128/dotu.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | // DotuUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * v 10 | // } 11 | // return sum 12 | func DotuUnitary(x, y []complex128) (sum complex128) { 13 | for i, v := range x { 14 | sum += y[i] * v 15 | } 16 | return sum 17 | } 18 | 19 | // DotuInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * x[ix] 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotuInc(x, y []complex128, n, incX, incY, ix, iy uintptr) (sum complex128) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * x[ix] 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/c128/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | // ScalUnitary is 8 | // for i := range x { 9 | // x[i] *= alpha 10 | // } 11 | func ScalUnitary(alpha complex128, x []complex128) { 12 | for i := range x { 13 | x[i] *= alpha 14 | } 15 | } 16 | 17 | // ScalUnitaryTo is 18 | // for i, v := range x { 19 | // dst[i] = alpha * v 20 | // } 21 | func ScalUnitaryTo(dst []complex128, alpha complex128, x []complex128) { 22 | for i, v := range x { 23 | dst[i] = alpha * v 24 | } 25 | } 26 | 27 | // ScalInc is 28 | // var ix uintptr 29 | // for i := 0; i < int(n); i++ { 30 | // x[ix] *= alpha 31 | // ix += incX 32 | // } 33 | func ScalInc(alpha complex128, x []complex128, n, incX uintptr) { 34 | var ix uintptr 35 | for i := 0; i < int(n); i++ { 36 | x[ix] *= alpha 37 | ix += incX 38 | } 39 | } 40 | 41 | // ScalIncTo is 42 | // var idst, ix uintptr 43 | // for i := 0; i < int(n); i++ { 44 | // dst[idst] = alpha * x[ix] 45 | // ix += incX 46 | // idst += incDst 47 | // } 48 | func ScalIncTo(dst []complex128, incDst uintptr, alpha complex128, x []complex128, n, incX uintptr) { 49 | var idst, ix uintptr 50 | for i := 0; i < int(n); i++ { 51 | dst[idst] = alpha * x[ix] 52 | ix += incX 53 | idst += incDst 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /asm/c128/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package c128 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex128, x, y []complex128) 14 | 15 | // AxpyUnitaryTo is 16 | // for i, v := range x { 17 | // dst[i] = alpha*v + y[i] 18 | // } 19 | func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) 20 | 21 | // AxpyInc is 22 | // for i := 0; i < int(n); i++ { 23 | // y[iy] += alpha * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 28 | 29 | // AxpyIncTo is 30 | // for i := 0; i < int(n); i++ { 31 | // dst[idst] = alpha*x[ix] + y[iy] 32 | // ix += incX 33 | // iy += incY 34 | // idst += incDst 35 | // } 36 | func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) 37 | -------------------------------------------------------------------------------- /asm/c128/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
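// Build-tag note: terms separated by spaces on a +build line are ORed, so
// this fallback file is compiled when the target is not amd64, or when the
// noasm or appengine tag is set. stubs_amd64.go carries the complementary
// constraint (amd64 via its filename suffix, plus !noasm,!appengine, where
// the comma means AND), so exactly one implementation of each function is
// ever built.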
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package c128 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex128, x, y []complex128) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []complex128, alpha complex128, x, y []complex128) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []complex128, incDst, idst uintptr, alpha complex128, x, y []complex128, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/c128/stubs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c128 6 | 7 | import "testing" 8 | 9 | var tests = []struct { 10 | incX, incY, incDst int 11 | ix, iy, idst uintptr 12 | a complex128 13 | dst, x, y []complex128 14 | ex []complex128 15 | }{ 16 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 17 | a: 1 + 1i, 18 | dst: []complex128{5}, 19 | x: []complex128{1}, 20 | y: []complex128{1i}, 21 | ex: []complex128{1 + 2i}}, 22 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 23 | a: 1 + 2i, 24 | dst: []complex128{0, 0, 0}, 25 | x: []complex128{0, 0, 0}, 26 | y: []complex128{1, 1, 1}, 27 | ex: []complex128{1, 1, 1}}, 28 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 29 | a: 1 + 2i, 30 | dst: []complex128{0, 0, 0}, 31 | x: []complex128{0, 0}, 32 | y: []complex128{1, 1, 1}, 33 | ex: []complex128{1, 1}}, 34 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 35 | a: 1 + 2i, 36 | dst: []complex128{1i, 1i, 1i}, 37 | x: []complex128{1i, 1i, 1i}, 38 | y: []complex128{1, 2, 1}, 39 | ex: []complex128{-1 + 1i, 1i, -1 + 1i}}, 40 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 41 | a: -1i, 42 | dst: []complex128{1i, 1i, 1i}, 43 | x: []complex128{1i, 1i, 1i}, 44 | y: []complex128{1, 2, 1}, 45 | ex: []complex128{2, 3, 2}}, 46 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 47 | a: -1i, 48 | dst: []complex128{1i, 1i, 1i}, 49 | x: []complex128{1i, 1i, 1i, 1i, 1i}[1:4], 50 | y: []complex128{1, 1, 2, 1, 1}[1:4], 51 | ex: []complex128{2, 3, 2}}, 52 | {incX: 2, incY: 4, incDst: 3, ix: 0, iy: 0, idst: 0, 53 | a: -2, 54 | dst: []complex128{1i, 1i, 1i, 1i, 1i}, 55 | x: []complex128{2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i}, 56 | y: []complex128{1, 1, 2, 1, 1}, 57 | ex: []complex128{-3 - 2i, -3 - 2i, -2 - 2i, -3 - 2i, -3 - 2i}}, 58 | // Run big test twice, once aligned once unaligned. 
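	// The tests wrap each slice in guard regions whose lengths vary with the
	// case index (xg_ln, yg_ln = 4+cas%2, 4+cas%3 below), shifting the slice
	// bases so the ten-element cases are presented to the kernels at
	// different data alignments, exercising both the unrolled loop body and
	// its scalar tail.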
59 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 60 | a: 1 - 1i, 61 | dst: make([]complex128, 10), 62 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 63 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 64 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 65 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 66 | a: 1 - 1i, 67 | dst: make([]complex128, 10), 68 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 69 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 70 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 71 | {incX: -2, incY: -2, incDst: -3, ix: 18, iy: 18, idst: 27, 72 | a: 1 - 1i, 73 | dst: make([]complex128, 10), 74 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 75 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 76 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 77 | {incX: -2, incY: 2, incDst: -3, ix: 18, iy: 0, idst: 27, 78 | a: 1 - 1i, 79 | dst: make([]complex128, 10), 80 | x: []complex128{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 81 | y: []complex128{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 82 | ex: []complex128{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 83 | } 84 | 85 | func guardVector(vec []complex128, guard_val complex128, guard_len int) (guarded []complex128) { 86 | guarded = make([]complex128, len(vec)+guard_len*2) 87 | copy(guarded[guard_len:], vec) 88 | for i := 0; i < guard_len; i++ { 89 | guarded[i] = guard_val 90 | guarded[len(guarded)-1-i] = guard_val 91 | } 92 | return guarded 93 | } 94 | 95 | func isValidGuard(vec []complex128, guard_val complex128, guard_len int) bool { 96 | for i := 0; i < guard_len; i++ { 97 | if vec[i] != guard_val || vec[len(vec)-1-i] != guard_val { 98 | return false 99 | } 100 | } 101 | return true 102 | } 103 | 104 | func TestAxpyUnitary(t *testing.T) { 105 | var x_gd, y_gd complex128 = 1, 1 106 | for cas, test := range tests { 107 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 108 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 109 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 110 | AxpyUnitary(test.a, x, y) 111 | for i := range test.ex { 112 | if y[i] != test.ex[i] { 113 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i]) 114 | } 115 | } 116 | if !isValidGuard(test.x, x_gd, xg_ln) { 117 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 118 | } 119 | if !isValidGuard(test.y, y_gd, yg_ln) { 120 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 121 | } 122 | } 123 | } 124 | 125 | func TestAxpyUnitaryTo(t *testing.T) { 126 | var x_gd, y_gd, dst_gd complex128 = 1, 1, 0 127 | for cas, test := range tests { 128 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 129 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 130 | test.dst = guardVector(test.dst, dst_gd, xg_ln) 131 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 132 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 133 | AxpyUnitaryTo(dst, test.a, x, y) 134 | for i := range test.ex { 135 | if dst[i] != test.ex[i] { 136 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i]) 137 | } 138 | } 139 | if !isValidGuard(test.x, x_gd, xg_ln) { 140 | t.Errorf("Test %d Guard violated in x 
vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 141 | } 142 | if !isValidGuard(test.y, y_gd, yg_ln) { 143 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 144 | } 145 | if !isValidGuard(test.dst, dst_gd, xg_ln) { 146 | t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:]) 147 | } 148 | 149 | } 150 | } 151 | 152 | func guardIncVector(vec []complex128, guard_val complex128, incV uintptr, guard_len int) (guarded []complex128) { 153 | inc := int(incV) 154 | s_ln := len(vec) * inc 155 | if inc < 0 { 156 | s_ln = len(vec) * -inc 157 | } 158 | guarded = make([]complex128, s_ln+guard_len*2) 159 | for i, cas := 0, 0; i < len(guarded); i++ { 160 | switch { 161 | case i < guard_len, i > guard_len+s_ln: 162 | guarded[i] = guard_val 163 | case (i-guard_len)%(inc) == 0 && cas < len(vec): 164 | guarded[i] = vec[cas] 165 | cas++ 166 | default: 167 | guarded[i] = guard_val 168 | } 169 | } 170 | return guarded 171 | } 172 | 173 | func checkValidIncGuard(t *testing.T, vec []complex128, guard_val complex128, incV uintptr, guard_len int) { 174 | inc := int(incV) 175 | s_ln := len(vec) - 2*guard_len 176 | if inc < 0 { 177 | s_ln = len(vec) * -inc 178 | } 179 | 180 | for i := range vec { 181 | switch { 182 | case vec[i] == guard_val: 183 | // Correct value 184 | case i < guard_len: 185 | t.Errorf("Front guard violated at %d %v", i, vec[:guard_len]) 186 | case i > guard_len+s_ln: 187 | t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:]) 188 | case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec): 189 | // Ignore input values 190 | default: 191 | t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln]) 192 | } 193 | } 194 | } 195 | 196 | func TestAxpyInc(t *testing.T) { 197 | var x_gd, y_gd complex128 = 1, 1 198 | for cas, test := range tests { 199 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 200 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 201 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 202 | AxpyInc(test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 203 | for i := range test.ex { 204 | if y[int(test.iy)+i*int(test.incY)] != test.ex[i] { 205 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i*int(test.incY)], test.ex[i]) 206 | } 207 | } 208 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 209 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 210 | } 211 | } 212 | 213 | func TestAxpyIncTo(t *testing.T) { 214 | var x_gd, y_gd, dst_gd complex128 = 1, 1, 0 215 | for cas, test := range tests { 216 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 217 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 218 | test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln) 219 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 220 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 221 | AxpyIncTo(dst, uintptr(test.incDst), test.idst, test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 222 | for i := range test.ex { 223 | if dst[int(test.idst)+i*int(test.incDst)] != test.ex[i] { 224 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i*int(test.incDst)], 
test.ex[i]) 225 | } 226 | } 227 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 228 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 229 | checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln) 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /asm/c64/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 38 | TEXT ·AxpyInc(SB), NOSPLIT, $0 39 | MOVQ x_base+8(FP), SI // SI = &x 40 | MOVQ y_base+32(FP), DI // DI = &y 41 | MOVQ n+56(FP), CX // CX = n 42 | CMPQ CX, $0 // if n==0 { return } 43 | JE axpyi_end 44 | MOVQ ix+80(FP), R8 // R8 = ix 45 | MOVQ iy+88(FP), R9 // R9 = iy 46 | LEAQ (SI)(R8*8), SI // SI = &(x[ix]) 47 | LEAQ (DI)(R9*8), DI // DI = &(y[iy]) 48 | MOVQ DI, DX // DX = DI // Read/Write pointers 49 | MOVQ incX+64(FP), R8 // R8 = incX 50 | SHLQ $3, R8 // R8 *= sizeof(complex64) 51 | MOVQ incY+72(FP), R9 // R9 = incY 52 | SHLQ $3, R9 // R9 *= sizeof(complex64) 53 | MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 54 | MOVAPS X0, X1 55 | SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) } 56 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 57 | MOVAPS X1, X11 58 | MOVQ CX, BX 59 | ANDQ $3, CX // CX = n % 4 60 | SHRQ $2, BX // BX = floor( n / 4 ) 61 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 62 | 63 | axpyi_loop: // do { 64 | MOVSD (SI), X3 // X_i = { imag(x[i+1]), real(x[i+1]) } 65 | MOVSD (SI)(R8*1), X5 66 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 67 | MOVSD (SI), X7 68 | MOVSD (SI)(R8*1), X9 69 | 70 | // X_(i-1) = { imag(x[i]), imag(x[i]) } 71 | MOVSHDUP_X3_X2 72 | MOVSHDUP_X5_X4 73 | MOVSHDUP_X7_X6 74 | MOVSHDUP_X9_X8 75 | 76 | // X_i = { real(x[i]), real(x[i]) } 77 | MOVSLDUP_X3_X3 78 | MOVSLDUP_X5_X5 79 | MOVSLDUP_X7_X7 80 | MOVSLDUP_X9_X9 81 | 82 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 83 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 84 
| MULPS X1, X2 85 | MULPS X0, X3 86 | MULPS X11, X4 87 | MULPS X10, X5 88 | MULPS X1, X6 89 | MULPS X0, X7 90 | MULPS X11, X8 91 | MULPS X10, X9 92 | 93 | // X_i = { 94 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 95 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 96 | // } 97 | ADDSUBPS_X2_X3 98 | ADDSUBPS_X4_X5 99 | ADDSUBPS_X6_X7 100 | ADDSUBPS_X8_X9 101 | 102 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 103 | MOVSD (DX), X2 104 | MOVSD (DX)(R9*1), X4 105 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 106 | MOVSD (DX), X6 107 | MOVSD (DX)(R9*1), X8 108 | ADDPS X2, X3 109 | ADDPS X4, X5 110 | ADDPS X6, X7 111 | ADDPS X8, X9 112 | 113 | MOVSD X3, (DI) // y[i] = X_i 114 | MOVSD X5, (DI)(R9*1) 115 | LEAQ (DI)(R9*2), DI // DI = &(DI[incDst]) 116 | MOVSD X7, (DI) 117 | MOVSD X9, (DI)(R9*1) 118 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 119 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 120 | LEAQ (DI)(R9*2), DI // DI = &(DI[incDst]) 121 | DECQ BX 122 | JNZ axpyi_loop // } while --BX > 0 123 | CMPQ CX, $0 // if CX == 0 { return } 124 | JE axpyi_end 125 | 126 | axpyi_tail: // do { 127 | MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } 128 | MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } 129 | MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } 130 | 131 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 132 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 133 | MULPS X1, X2 134 | MULPS X0, X3 135 | 136 | // X_i = { 137 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 138 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 139 | // } 140 | ADDSUBPS_X2_X3 // (ai*x1r+ar*x1i, ar*x1r-ai*x1i) 141 | 142 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 143 | MOVSD (DI), X4 144 | ADDPS X4, X3 145 | MOVSD X3, (DI) // y[i] = X_i 146 | ADDQ R8, SI // SI += incX 147 | ADDQ R9, DI // DI += incY 148 | LOOP axpyi_tail // } while --CX > 0 149 | 150 | axpyi_end: 151 | RET 152 | -------------------------------------------------------------------------------- /asm/c64/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
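// complex64 packing note: a complex64 is 8 bytes, so MOVSD loads a single
// value into the low half of an XMM register. Where the c128 kernels use
// MOVDDUP/SHUFPD to broadcast the real and imaginary parts, the kernels in
// this file use the SSE3 pair MOVSLDUP (duplicate the even, real, lanes) and
// MOVSHDUP (duplicate the odd, imaginary, lanes), then ADDSUBPS to combine
// the partial products into
//
//	real(result) = real(a)*real(x[i]) - imag(a)*imag(x[i])
//	imag(result) = imag(a)*real(x[i]) + real(a)*imag(x[i])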
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 38 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 39 | MOVQ dst_base+0(FP), DI // DI = &dst 40 | MOVQ x_base+48(FP), SI // SI = &x 41 | MOVQ y_base+72(FP), DX // DX = &y 42 | MOVQ n+96(FP), CX // CX = n 43 | CMPQ CX, $0 // if n==0 { return } 44 | JE axpyi_end 45 | MOVQ ix+120(FP), R8 // Load the first index 46 | MOVQ iy+128(FP), R9 47 | MOVQ idst+32(FP), R10 48 | LEAQ (SI)(R8*8), SI // SI = &(x[ix]) 49 | LEAQ (DX)(R9*8), DX // DX = &(y[iy]) 50 | LEAQ (DI)(R10*8), DI // DI = &(dst[idst]) 51 | MOVQ incX+104(FP), R8 // Incrementors*8 for easy iteration (ADDQ) 52 | SHLQ $3, R8 53 | MOVQ incY+112(FP), R9 54 | SHLQ $3, R9 55 | MOVQ incDst+24(FP), R10 56 | SHLQ $3, R10 57 | MOVSD alpha+40(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 58 | MOVAPS X0, X1 59 | SHUFPS $0x11, X1, X1 // X1 = { 0, 0, real(a), imag(a) } 60 | MOVAPS X0, X10 // Copy X0 and X1 for pipelining 61 | MOVAPS X1, X11 62 | MOVQ CX, BX 63 | ANDQ $3, CX // CX = n % 4 64 | SHRQ $2, BX // BX = floor( n / 4 ) 65 | JZ axpyi_tail // if BX == 0 { goto axpyi_tail } 66 | 67 | axpyi_loop: // do { 68 | MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } 69 | MOVSD (SI)(R8*1), X5 70 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 71 | MOVSD (SI), X7 72 | MOVSD (SI)(R8*1), X9 73 | 74 | // X_(i-1) = { imag(x[i]), imag(x[i]) } 75 | MOVSHDUP_X3_X2 76 | MOVSHDUP_X5_X4 77 | MOVSHDUP_X7_X6 78 | MOVSHDUP_X9_X8 79 | 80 | // X_i = { real(x[i]), real(x[i]) } 81 | MOVSLDUP_X3_X3 82 | MOVSLDUP_X5_X5 83 | MOVSLDUP_X7_X7 84 | MOVSLDUP_X9_X9 85 | 86 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 87 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 88 | MULPS X1, X2 89 | MULPS X0, X3 90 | MULPS X11, X4 91 | MULPS X10, X5 92 | MULPS X1, X6 93 | MULPS X0, X7 94 | MULPS X11, X8 95 | MULPS X10, X9 96 | 97 | // X_i = { 98 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 99 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 100 | // } 101 | ADDSUBPS_X2_X3 102 | ADDSUBPS_X4_X5 103 | ADDSUBPS_X6_X7 104 | ADDSUBPS_X8_X9 105 | 106 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 107 | MOVSD (DX), X2 108 | 
MOVSD (DX)(R9*1), X4 109 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 110 | MOVSD (DX), X6 111 | MOVSD (DX)(R9*1), X8 112 | ADDPS X2, X3 113 | ADDPS X4, X5 114 | ADDPS X6, X7 115 | ADDPS X8, X9 116 | 117 | MOVSD X3, (DI) // y[i] = X_i 118 | MOVSD X5, (DI)(R10*1) 119 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst]) 120 | MOVSD X7, (DI) 121 | MOVSD X9, (DI)(R10*1) 122 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 123 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 124 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst]) 125 | DECQ BX 126 | JNZ axpyi_loop // } while --BX > 0 127 | CMPQ CX, $0 // if CX == 0 { return } 128 | JE axpyi_end 129 | 130 | axpyi_tail: 131 | MOVSD (SI), X3 // X_i = { imag(x[i]), real(x[i]) } 132 | MOVSHDUP_X3_X2 // X_(i-1) = { imag(x[i]), imag(x[i]) } 133 | MOVSLDUP_X3_X3 // X_i = { real(x[i]), real(x[i]) } 134 | 135 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]) } 136 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 137 | MULPS X1, X2 138 | MULPS X0, X3 139 | 140 | // X_i = { 141 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 142 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 143 | // } 144 | ADDSUBPS_X2_X3 145 | 146 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]) } 147 | MOVSD (DX), X4 148 | ADDPS X4, X3 149 | MOVSD X3, (DI) // y[i] = X_i 150 | ADDQ R8, SI // SI += incX 151 | ADDQ R9, DX // DX += incY 152 | ADDQ R10, DI // DI += incDst 153 | LOOP axpyi_tail // } while --CX > 0 154 | 155 | axpyi_end: 156 | RET 157 | -------------------------------------------------------------------------------- /asm/c64/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
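// Alignment note: the main loop below adds y with ADDPS using a memory
// operand, which requires a 16-byte-aligned address. When &y[0] is not
// 16-byte aligned, one complex64 (8 bytes) is peeled off before the main
// loop so the remaining packed accesses land on an aligned boundary.
// Illustrative Go-style sketch of the peel (assumes the usual 8-byte
// alignment of slice data):
//
//	i := 0
//	if uintptr(unsafe.Pointer(&y[0]))%16 != 0 {
//		y[0] += alpha * x[0] // handle one value scalar-style
//		i = 1                // main loop starts at y[1], now 16-byte aligned
//	}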
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyUnitary(alpha complex64, x, y []complex64) 38 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 39 | MOVQ x_base+8(FP), SI // SI = &x 40 | MOVQ y_base+32(FP), DI // DI = &y 41 | MOVQ x_len+16(FP), CX // CX = min( len(x), len(y) ) 42 | CMPQ y_len+40(FP), CX 43 | CMOVQLE y_len+40(FP), CX 44 | CMPQ CX, $0 // if CX == 0 { return } 45 | JE caxy_end 46 | PXOR X0, X0 // Clear work registers and cache-align loop 47 | PXOR X1, X1 48 | MOVSD alpha+0(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 49 | SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) } 50 | MOVAPS X0, X1 51 | SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) } 52 | XORQ AX, AX // i = 0 53 | MOVQ DI, BX // Align on 16-byte boundary for ADDPS 54 | ANDQ $15, BX // BX = &y & 15 55 | JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim } 56 | 57 | // Trim first value in unaligned buffer 58 | XORPS X2, X2 // Clear work registers and cache-align loop 59 | XORPS X3, X3 60 | XORPS X4, X4 61 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 62 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 63 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 64 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 65 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 66 | 67 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) } 68 | ADDSUBPS_X2_X3 69 | MOVSD (DI)(AX*8), X4 // X3 += y[i] 70 | ADDPS X4, X3 71 | MOVSD X3, (DI)(AX*8) // y[i] = X3 72 | INCQ AX // i++ 73 | DECQ CX // --CX 74 | JZ caxy_end // if CX == 0 { return } 75 | 76 | caxy_no_trim: 77 | MOVAPS X0, X10 // Copy X0 and X1 for pipelineing 78 | MOVAPS X1, X11 79 | MOVQ CX, BX 80 | ANDQ $7, CX // CX = n % 8 81 | SHRQ $3, BX // BX = floor( n / 8 ) 82 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 83 | 84 | caxy_loop: // do { 85 | // X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) } 86 | MOVUPS (SI)(AX*8), X3 87 | MOVUPS 16(SI)(AX*8), X5 88 | MOVUPS 32(SI)(AX*8), X7 89 | MOVUPS 48(SI)(AX*8), X9 90 | 91 | // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } 92 | MOVSHDUP_X3_X2 93 | MOVSHDUP_X5_X4 94 | MOVSHDUP_X7_X6 95 | MOVSHDUP_X9_X8 96 | 97 | // X_i = { real(x[i]), 
real(x[i]), real(x[i+1]), real(x[i+1]) } 98 | MOVSLDUP_X3_X3 99 | MOVSLDUP_X5_X5 100 | MOVSLDUP_X7_X7 101 | MOVSLDUP_X9_X9 102 | 103 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]), 104 | // imag(a) * real(x[i+1]), real(a) * real(x[i+1]) } 105 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]), 106 | // real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) } 107 | MULPS X1, X2 108 | MULPS X0, X3 109 | MULPS X11, X4 110 | MULPS X10, X5 111 | MULPS X1, X6 112 | MULPS X0, X7 113 | MULPS X11, X8 114 | MULPS X10, X9 115 | 116 | // X_i = { 117 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 118 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 119 | // imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]), 120 | // real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]), 121 | // } 122 | ADDSUBPS_X2_X3 123 | ADDSUBPS_X4_X5 124 | ADDSUBPS_X6_X7 125 | ADDSUBPS_X8_X9 126 | 127 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]), 128 | // imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) } 129 | ADDPS (DI)(AX*8), X3 130 | ADDPS 16(DI)(AX*8), X5 131 | ADDPS 32(DI)(AX*8), X7 132 | ADDPS 48(DI)(AX*8), X9 133 | MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i 134 | MOVUPS X5, 16(DI)(AX*8) 135 | MOVUPS X7, 32(DI)(AX*8) 136 | MOVUPS X9, 48(DI)(AX*8) 137 | ADDQ $8, AX // i += 8 138 | DECQ BX // --BX 139 | JNZ caxy_loop // } while BX > 0 140 | CMPQ CX, $0 // if CX == 0 { return } 141 | JE caxy_end 142 | 143 | caxy_tail: // do { 144 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 145 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 146 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 147 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 148 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 149 | 150 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), 151 | // real(a)*real(x[i]) - imag(a)*imag(x[i]) } 152 | ADDSUBPS_X2_X3 153 | MOVSD (DI)(AX*8), X4 // X3 += y[i] 154 | ADDPS X4, X3 155 | MOVSD X3, (DI)(AX*8) // y[i] = X3 156 | INCQ AX // ++i 157 | LOOP caxy_tail // } while --CX > 0 158 | 159 | caxy_end: 160 | RET 161 | -------------------------------------------------------------------------------- /asm/c64/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
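AxpyUnitary above splits its work three ways: at most one scalar step brings &y onto a 16-byte boundary so the ADDPS loads from y can be aligned (a complex64 element is 8 bytes, and slice allocations are in practice at least 8-byte aligned, so a single trimmed element suffices), the main loop retires eight complex64 per pass (four MOVUPS loads of two values each, with the alpha registers copied to X10/X11 to ease pipelining), and a scalar tail handles the remainder. The same decomposition as a hedged Go sketch, where yAddr stands in for the address of y[0] and is not part of the package:

n := len(x)
if len(y) < n {
	n = len(y) // the CMOVQLE clamp: touch only min(len(x), len(y)) elements
}
head := 0
if n > 0 && yAddr&15 != 0 {
	head = 1 // one 8-byte element reaches the next 16-byte boundary
}
body := (n - head) &^ 7 // ANDQ $7 / SHRQ $3: eight elements per unrolled pass
tail := n - head - body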
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // MOVSHDUP X3, X2 10 | #define MOVSHDUP_X3_X2 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xD3 11 | // MOVSLDUP X3, X3 12 | #define MOVSLDUP_X3_X3 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xDB 13 | // ADDSUBPS X2, X3 14 | #define ADDSUBPS_X2_X3 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xDA 15 | 16 | // MOVSHDUP X5, X4 17 | #define MOVSHDUP_X5_X4 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xE5 18 | // MOVSLDUP X5, X5 19 | #define MOVSLDUP_X5_X5 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xED 20 | // ADDSUBPS X4, X5 21 | #define ADDSUBPS_X4_X5 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xEC 22 | 23 | // MOVSHDUP X7, X6 24 | #define MOVSHDUP_X7_X6 BYTE $0xF3; BYTE $0x0F; BYTE $0x16; BYTE $0xF7 25 | // MOVSLDUP X7, X7 26 | #define MOVSLDUP_X7_X7 BYTE $0xF3; BYTE $0x0F; BYTE $0x12; BYTE $0xFF 27 | // ADDSUBPS X6, X7 28 | #define ADDSUBPS_X6_X7 BYTE $0xF2; BYTE $0x0F; BYTE $0xD0; BYTE $0xFE 29 | 30 | // MOVSHDUP X9, X8 31 | #define MOVSHDUP_X9_X8 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x16; BYTE $0xC1 32 | // MOVSLDUP X9, X9 33 | #define MOVSLDUP_X9_X9 BYTE $0xF3; BYTE $0x45; BYTE $0x0F; BYTE $0x12; BYTE $0xC9 34 | // ADDSUBPS X8, X9 35 | #define ADDSUBPS_X8_X9 BYTE $0xF2; BYTE $0x45; BYTE $0x0F; BYTE $0xD0; BYTE $0xC8 36 | 37 | // func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) 38 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 39 | MOVQ dst_base+0(FP), DI // DI = &dst 40 | MOVQ x_base+32(FP), SI // SI = &x 41 | MOVQ y_base+56(FP), DX // DX = &y 42 | MOVQ x_len+40(FP), CX 43 | CMPQ y_len+64(FP), CX // CX = min( len(x), len(y), len(dst) ) 44 | CMOVQLE y_len+64(FP), CX 45 | CMPQ dst_len+8(FP), CX 46 | CMOVQLE dst_len+8(FP), CX 47 | CMPQ CX, $0 // if CX == 0 { return } 48 | JE caxy_end 49 | MOVSD alpha+24(FP), X0 // X0 = { 0, 0, imag(a), real(a) } 50 | SHUFPD $0, X0, X0 // X0 = { imag(a), real(a), imag(a), real(a) } 51 | MOVAPS X0, X1 52 | SHUFPS $0x11, X1, X1 // X1 = { real(a), imag(a), real(a), imag(a) } 53 | XORQ AX, AX // i = 0 54 | MOVQ DX, BX // Align on 16-byte boundary for ADDPS 55 | ANDQ $15, BX // BX = &y & 15 56 | JZ caxy_no_trim // if BX == 0 { goto caxy_no_trim } 57 | 58 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 59 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 60 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 61 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 62 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 63 | 64 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), real(a)*real(x[i]) - imag(a)*imag(x[i]) } 65 | ADDSUBPS_X2_X3 66 | MOVSD (DX)(AX*8), X4 // X3 += y[i] 67 | ADDPS X4, X3 68 | MOVSD X3, (DI)(AX*8) // dst[i] = X3 69 | INCQ AX // i++ 70 | DECQ CX // --CX 71 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 72 | 73 | caxy_no_trim: 74 | MOVAPS X0, X10 // Copy X0 and X1 for pipelineing 75 | MOVAPS X1, X11 76 | MOVQ CX, BX 77 | ANDQ $7, CX // CX = n % 8 78 | SHRQ $3, BX // BX = floor( n / 8 ) 79 | JZ caxy_tail // if BX == 0 { goto caxy_tail } 80 | 81 | caxy_loop: 82 | // X_i = { imag(x[i]), real(x[i]), imag(x[i+1]), real(x[i+1]) } 83 | MOVUPS (SI)(AX*8), X3 84 | MOVUPS 16(SI)(AX*8), X5 85 | MOVUPS 32(SI)(AX*8), X7 86 | MOVUPS 48(SI)(AX*8), X9 87 | 88 | // X_(i-1) = { imag(x[i]), imag(x[i]), imag(x[i]+1), imag(x[i]+1) } 89 | MOVSHDUP_X3_X2 90 | MOVSHDUP_X5_X4 91 | MOVSHDUP_X7_X6 92 | MOVSHDUP_X9_X8 93 | 94 | // X_i = { real(x[i]), real(x[i]), real(x[i+1]), real(x[i+1]) } 95 | MOVSLDUP_X3_X3 96 | MOVSLDUP_X5_X5 97 | 
MOVSLDUP_X7_X7 98 | MOVSLDUP_X9_X9 99 | 100 | // X_i = { imag(a) * real(x[i]), real(a) * real(x[i]), 101 | // imag(a) * real(x[i+1]), real(a) * real(x[i+1]) } 102 | // X_(i-1) = { real(a) * imag(x[i]), imag(a) * imag(x[i]), 103 | // real(a) * imag(x[i+1]), imag(a) * imag(x[i+1]) } 104 | MULPS X1, X2 105 | MULPS X0, X3 106 | MULPS X11, X4 107 | MULPS X10, X5 108 | MULPS X1, X6 109 | MULPS X0, X7 110 | MULPS X11, X8 111 | MULPS X10, X9 112 | 113 | // X_i = { 114 | // imag(result[i]): imag(a)*real(x[i]) + real(a)*imag(x[i]), 115 | // real(result[i]): real(a)*real(x[i]) - imag(a)*imag(x[i]), 116 | // imag(result[i+1]): imag(a)*real(x[i+1]) + real(a)*imag(x[i+1]), 117 | // real(result[i+1]): real(a)*real(x[i+1]) - imag(a)*imag(x[i+1]), 118 | // } 119 | ADDSUBPS_X2_X3 120 | ADDSUBPS_X4_X5 121 | ADDSUBPS_X6_X7 122 | ADDSUBPS_X8_X9 123 | 124 | // X_i = { imag(result[i]) + imag(y[i]), real(result[i]) + real(y[i]), 125 | // imag(result[i+1]) + imag(y[i+1]), real(result[i+1]) + real(y[i+1]) } 126 | ADDPS (DX)(AX*8), X3 127 | ADDPS 16(DX)(AX*8), X5 128 | ADDPS 32(DX)(AX*8), X7 129 | ADDPS 48(DX)(AX*8), X9 130 | MOVUPS X3, (DI)(AX*8) // y[i:i+1] = X_i 131 | MOVUPS X5, 16(DI)(AX*8) 132 | MOVUPS X7, 32(DI)(AX*8) 133 | MOVUPS X9, 48(DI)(AX*8) 134 | ADDQ $8, AX // i += 8 135 | DECQ BX // --BX 136 | JNZ caxy_loop // } while BX > 0 137 | CMPQ CX, $0 // if CX == 0 { return } 138 | JE caxy_end 139 | 140 | caxy_tail: // do { 141 | MOVSD (SI)(AX*8), X3 // X3 = { imag(x[i]), real(x[i]) } 142 | MOVSHDUP_X3_X2 // X2 = { imag(x[i]), imag(x[i]) } 143 | MOVSLDUP_X3_X3 // X3 = { real(x[i]), real(x[i]) } 144 | MULPS X1, X2 // X2 = { real(a) * imag(x[i]), imag(a) * imag(x[i]) } 145 | MULPS X0, X3 // X3 = { imag(a) * real(x[i]), real(a) * real(x[i]) } 146 | 147 | // X3 = { imag(a)*real(x[i]) + real(a)*imag(x[i]), 148 | // real(a)*real(x[i]) - imag(a)*imag(x[i]) } 149 | ADDSUBPS_X2_X3 150 | MOVSD (DX)(AX*8), X4 // X3 += y[i] 151 | ADDPS X4, X3 152 | MOVSD X3, (DI)(AX*8) // y[i] = X3 153 | INCQ AX // ++i 154 | LOOP caxy_tail // } while --CX > 0 155 | 156 | caxy_end: 157 | RET 158 | -------------------------------------------------------------------------------- /asm/c64/conj.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c64 6 | 7 | func conj(c complex64) complex64 { return complex(real(c), -imag(c)) } 8 | -------------------------------------------------------------------------------- /asm/c64/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package c64 provides complex64 vector primitives. 9 | package c64 10 | -------------------------------------------------------------------------------- /asm/c64/dotc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
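A worked call of the DotcUnitary defined below, with illustrative values:

x := []complex64{1 + 2i, 3 - 1i}
y := []complex64{2 - 1i, 1 + 1i}
sum := DotcUnitary(x, y)
// sum = y[0]*conj(x[0]) + y[1]*conj(x[1])
//     = (2-1i)*(1-2i) + (1+1i)*(3+1i)
//     = -5i + (2+4i)
//     = 2 - 1i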
4 | 5 | package c64 6 | 7 | // DotcUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * conj(v) 10 | // } 11 | // return sum 12 | func DotcUnitary(x, y []complex64) (sum complex64) { 13 | for i, v := range x { 14 | sum += y[i] * conj(v) 15 | } 16 | return sum 17 | } 18 | 19 | // DotcInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * conj(x[ix]) 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotcInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * conj(x[ix]) 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/c64/dotu.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c64 6 | 7 | // DotuUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * v 10 | // } 11 | // return sum 12 | func DotuUnitary(x, y []complex64) (sum complex64) { 13 | for i, v := range x { 14 | sum += y[i] * v 15 | } 16 | return sum 17 | } 18 | 19 | // DotuInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * x[ix] 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotuInc(x, y []complex64, n, incX, incY, ix, iy uintptr) (sum complex64) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * x[ix] 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/c64/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package c64 6 | 7 | // ScalUnitary is 8 | // for i := range x { 9 | // x[i] *= alpha 10 | // } 11 | func ScalUnitary(alpha complex64, x []complex64) { 12 | for i := range x { 13 | x[i] *= alpha 14 | } 15 | } 16 | 17 | // ScalUnitaryTo is 18 | // for i, v := range x { 19 | // dst[i] = alpha * v 20 | // } 21 | func ScalUnitaryTo(dst []complex64, alpha complex64, x []complex64) { 22 | for i, v := range x { 23 | dst[i] = alpha * v 24 | } 25 | } 26 | 27 | // ScalInc is 28 | // var ix uintptr 29 | // for i := 0; i < int(n); i++ { 30 | // x[ix] *= alpha 31 | // ix += incX 32 | // } 33 | func ScalInc(alpha complex64, x []complex64, n, incX uintptr) { 34 | var ix uintptr 35 | for i := 0; i < int(n); i++ { 36 | x[ix] *= alpha 37 | ix += incX 38 | } 39 | } 40 | 41 | // ScalIncTo is 42 | // var idst, ix uintptr 43 | // for i := 0; i < int(n); i++ { 44 | // dst[idst] = alpha * x[ix] 45 | // ix += incX 46 | // idst += incDst 47 | // } 48 | func ScalIncTo(dst []complex64, incDst uintptr, alpha complex64, x []complex64, n, incX uintptr) { 49 | var idst, ix uintptr 50 | for i := 0; i < int(n); i++ { 51 | dst[idst] = alpha * x[ix] 52 | ix += incX 53 | idst += incDst 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /asm/c64/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package c64 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex64, x, y []complex64) 14 | 15 | // AxpyUnitaryTo is 16 | // for i, v := range x { 17 | // dst[i] = alpha*v + y[i] 18 | // } 19 | func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) 20 | 21 | // AxpyInc is 22 | // for i := 0; i < int(n); i++ { 23 | // y[iy] += alpha * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 28 | 29 | // AxpyIncTo is 30 | // for i := 0; i < int(n); i++ { 31 | // dst[idst] = alpha*x[ix] + y[iy] 32 | // ix += incX 33 | // iy += incY 34 | // idst += incDst 35 | // } 36 | func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) 37 | -------------------------------------------------------------------------------- /asm/c64/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package c64 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha complex64, x, y []complex64) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []complex64, alpha complex64, x, y []complex64) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []complex64, incDst, idst uintptr, alpha complex64, x, y []complex64, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/c64/stubs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
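The stub files above come in matched pairs by build tag: stubs_amd64.go compiles under !noasm,!appengine and holds body-less declarations that the linker binds to the assembly, while stubs_noasm.go compiles under !amd64 noasm appengine and supplies the pure-Go fallbacks; exactly one of the two must be present in any given build or linking fails. The fallback path can be exercised on any platform, as the CI script does, with for example:

go test -tags noasm ./asm/c64/...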
4 | 5 | package c64 6 | 7 | import "testing" 8 | 9 | var tests = []struct { 10 | incX, incY, incDst int 11 | ix, iy, idst uintptr 12 | a complex64 13 | dst, x, y []complex64 14 | ex []complex64 15 | }{ 16 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 17 | a: 1 + 1i, 18 | dst: []complex64{5}, 19 | x: []complex64{1}, 20 | y: []complex64{1i}, 21 | ex: []complex64{1 + 2i}}, 22 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 23 | a: 1 + 2i, 24 | dst: []complex64{0, 0, 0}, 25 | x: []complex64{0, 0, 0}, 26 | y: []complex64{1, 1, 1}, 27 | ex: []complex64{1, 1, 1}}, 28 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 29 | a: 1 + 2i, 30 | dst: []complex64{0, 0, 0}, 31 | x: []complex64{0, 0}, 32 | y: []complex64{1, 1, 1}, 33 | ex: []complex64{1, 1}}, 34 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 35 | a: 1 + 2i, 36 | dst: []complex64{1i, 1i, 1i}, 37 | x: []complex64{1i, 1i, 1i}, 38 | y: []complex64{1, 2, 1}, 39 | ex: []complex64{-1 + 1i, 1i, -1 + 1i}}, 40 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 41 | a: -1i, 42 | dst: []complex64{1i, 1i, 1i}, 43 | x: []complex64{1i, 1i, 1i}, 44 | y: []complex64{1, 2, 1}, 45 | ex: []complex64{2, 3, 2}}, 46 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 47 | a: -1i, 48 | dst: []complex64{1i, 1i, 1i}, 49 | x: []complex64{1i, 1i, 1i, 1i, 1i}[1:4], 50 | y: []complex64{1, 1, 2, 1, 1}[1:4], 51 | ex: []complex64{2, 3, 2}}, 52 | {incX: 2, incY: 4, incDst: 3, ix: 0, iy: 0, idst: 0, 53 | a: -2, 54 | dst: []complex64{1i, 1i, 1i, 1i, 1i}, 55 | x: []complex64{2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i}, 56 | y: []complex64{1, 1, 2, 1, 1}, 57 | ex: []complex64{-3 - 2i, -3 - 2i, -2 - 2i, -3 - 2i, -3 - 2i}}, 58 | // Run big test twice, once aligned once unaligned. 59 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 60 | a: 1 - 1i, 61 | dst: make([]complex64, 10), 62 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 63 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 64 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 65 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 66 | a: 1 - 1i, 67 | dst: make([]complex64, 10), 68 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 69 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 70 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 71 | {incX: -2, incY: -2, incDst: -3, ix: 18, iy: 18, idst: 27, 72 | a: 1 - 1i, 73 | dst: make([]complex64, 10), 74 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 75 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 76 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 77 | {incX: -2, incY: 2, incDst: -3, ix: 18, iy: 0, idst: 27, 78 | a: 1 - 1i, 79 | dst: make([]complex64, 10), 80 | x: []complex64{1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i, 1i}, 81 | y: []complex64{1, 1, 2, 1, 1, 1, 1, 2, 1, 1}, 82 | ex: []complex64{2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 2 + 1i, 3 + 1i, 2 + 1i, 2 + 1i}}, 83 | } 84 | 85 | func guardVector(vec []complex64, guard_val complex64, guard_len int) (guarded []complex64) { 86 | guarded = make([]complex64, len(vec)+guard_len*2) 87 | copy(guarded[guard_len:], vec) 88 | for i := 0; i < guard_len; i++ { 89 | guarded[i] = guard_val 90 | guarded[len(guarded)-1-i] = guard_val 91 | } 92 | return guarded 93 | } 94 | 95 | func isValidGuard(vec []complex64, guard_val complex64, guard_len int) bool { 96 | for i := 0; i < guard_len; i++ { 97 | if vec[i] != guard_val || 
vec[len(vec)-1-i] != guard_val { 98 | return false 99 | } 100 | } 101 | return true 102 | } 103 | 104 | func TestAxpyUnitary(t *testing.T) { 105 | var x_gd, y_gd complex64 = 1, 1 106 | for cas, test := range tests { 107 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 108 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 109 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 110 | AxpyUnitary(test.a, x, y) 111 | for i := range test.ex { 112 | if y[i] != test.ex[i] { 113 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i]) 114 | } 115 | } 116 | if !isValidGuard(test.x, x_gd, xg_ln) { 117 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 118 | } 119 | if !isValidGuard(test.y, y_gd, yg_ln) { 120 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 121 | } 122 | } 123 | } 124 | 125 | func TestAxpyUnitaryTo(t *testing.T) { 126 | var x_gd, y_gd, dst_gd complex64 = 1, 1, 0 127 | for cas, test := range tests { 128 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 129 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 130 | test.dst = guardVector(test.dst, dst_gd, xg_ln) 131 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 132 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 133 | AxpyUnitaryTo(dst, test.a, x, y) 134 | for i := range test.ex { 135 | if dst[i] != test.ex[i] { 136 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i]) 137 | } 138 | } 139 | if !isValidGuard(test.x, x_gd, xg_ln) { 140 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 141 | } 142 | if !isValidGuard(test.y, y_gd, yg_ln) { 143 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 144 | } 145 | if !isValidGuard(test.dst, dst_gd, xg_ln) { 146 | t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:]) 147 | } 148 | 149 | } 150 | } 151 | 152 | func guardIncVector(vec []complex64, guard_val complex64, incV uintptr, guard_len int) (guarded []complex64) { 153 | inc := int(incV) 154 | s_ln := len(vec) * inc 155 | if inc < 0 { 156 | s_ln = len(vec) * -inc 157 | } 158 | guarded = make([]complex64, s_ln+guard_len*2) 159 | for i, cas := 0, 0; i < len(guarded); i++ { 160 | switch { 161 | case i < guard_len, i > guard_len+s_ln: 162 | guarded[i] = guard_val 163 | case (i-guard_len)%(inc) == 0 && cas < len(vec): 164 | guarded[i] = vec[cas] 165 | cas++ 166 | default: 167 | guarded[i] = guard_val 168 | } 169 | } 170 | return guarded 171 | } 172 | 173 | func checkValidIncGuard(t *testing.T, vec []complex64, guard_val complex64, incV uintptr, guard_len int) { 174 | inc := int(incV) 175 | s_ln := len(vec) - 2*guard_len 176 | if inc < 0 { 177 | s_ln = len(vec) * -inc 178 | } 179 | 180 | for i := range vec { 181 | switch { 182 | case vec[i] == guard_val: 183 | // Correct value 184 | case i < guard_len: 185 | t.Errorf("Front guard violated at %d %v", i, vec[:guard_len]) 186 | case i > guard_len+s_ln: 187 | t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:]) 188 | case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec): 189 | // Ignore input values 190 | default: 191 | t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln]) 192 | } 193 | } 194 | } 195 | 196 
| func TestAxpyInc(t *testing.T) { 197 | var x_gd, y_gd complex64 = 1, 1 198 | for cas, test := range tests { 199 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 200 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 201 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 202 | AxpyInc(test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 203 | for i := range test.ex { 204 | if y[int(test.iy)+i*int(test.incY)] != test.ex[i] { 205 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[int(test.iy)+i*int(test.incY)], test.ex[i]) 206 | } 207 | } 208 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 209 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 210 | } 211 | } 212 | 213 | func TestAxpyIncTo(t *testing.T) { 214 | var x_gd, y_gd, dst_gd complex64 = 1, 1, 0 215 | for cas, test := range tests { 216 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 217 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 218 | test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln) 219 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 220 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 221 | AxpyIncTo(dst, uintptr(test.incDst), test.idst, test.a, x, y, uintptr(len(test.ex)), uintptr(test.incX), uintptr(test.incY), test.ix, test.iy) 222 | for i := range test.ex { 223 | if dst[int(test.idst)+i*int(test.incDst)] != test.ex[i] { 224 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[int(test.idst)+i*int(test.incDst)], test.ex[i]) 225 | } 226 | } 227 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 228 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 229 | checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln) 230 | } 231 | } 232 | -------------------------------------------------------------------------------- /asm/f32/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
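A note on the guard technique used by the c64 tests above: guardVector and guardIncVector surround each operand with sentinel cells, the kernel is handed only the interior slice, and isValidGuard/checkValidIncGuard assert afterwards that no sentinel changed, which catches writes outside the intended stride pattern. For example:

g := guardVector([]complex64{1, 2}, 9, 2)
// g == []complex64{9, 9, 1, 2, 9, 9}; the kernel under test receives
// g[2:4], and isValidGuard(g, 9, 2) later verifies the 9s are intact.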
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 10 | TEXT ·AxpyInc(SB), NOSPLIT, $0 11 | MOVQ n+56(FP), CX // CX = n 12 | CMPQ CX, $0 // if n==0 { return } 13 | JLE axpyi_end 14 | MOVQ x_base+8(FP), SI // SI = &x 15 | MOVQ y_base+32(FP), DI // DI = &y 16 | MOVQ ix+80(FP), R8 // R8 = ix 17 | MOVQ iy+88(FP), R9 // R9 = iy 18 | LEAQ (SI)(R8*4), SI // SI = &(x[ix]) 19 | LEAQ (DI)(R9*4), DI // DI = &(y[iy]) 20 | MOVQ DI, DX // DX = DI Read Pointer for y 21 | MOVQ incX+64(FP), R8 // R8 = incX 22 | SHLQ $2, R8 // R8 *= sizeof(float32) 23 | MOVQ incY+72(FP), R9 // R9 = incY 24 | SHLQ $2, R9 // R9 *= sizeof(float32) 25 | MOVSS alpha+0(FP), X0 // X0 = alpha 26 | MOVSS X0, X1 // X1 = X0 // for pipelining 27 | MOVQ CX, BX 28 | ANDQ $3, BX // BX = n % 4 29 | SHRQ $2, CX // CX = floor( n / 4 ) 30 | JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start } 31 | 32 | axpyi_loop: // Loop unrolled 4x do { 33 | MOVSS (SI), X2 // X_i = x[i] 34 | MOVSS (SI)(R8*1), X3 35 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 36 | MOVSS (SI), X4 37 | MOVSS (SI)(R8*1), X5 38 | MULSS X1, X2 // X_i *= a 39 | MULSS X0, X3 40 | MULSS X1, X4 41 | MULSS X0, X5 42 | ADDSS (DX), X2 // X_i += y[i] 43 | ADDSS (DX)(R9*1), X3 44 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 45 | ADDSS (DX), X4 46 | ADDSS (DX)(R9*1), X5 47 | MOVSS X2, (DI) // y[i] = X_i 48 | MOVSS X3, (DI)(R9*1) 49 | LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) 50 | MOVSS X4, (DI) 51 | MOVSS X5, (DI)(R9*1) 52 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses 53 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 54 | LEAQ (DI)(R9*2), DI // DI = &(DI[incY*2]) 55 | LOOP axpyi_loop // } while --CX > 0 56 | CMPQ BX, $0 // if BX == 0 { return } 57 | JE axpyi_end 58 | 59 | axpyi_tail_start: // Reset loop registers 60 | MOVQ BX, CX // Loop counter: CX = BX 61 | 62 | axpyi_tail: // do { 63 | MOVSS (SI), X2 // X2 = x[i] 64 | MULSS X1, X2 // X2 *= a 65 | ADDSS (DI), X2 // X2 += y[i] 66 | MOVSS X2, (DI) // y[i] = X2 67 | ADDQ R8, SI // SI = &(SI[incX]) 68 | ADDQ R9, DI // DI = &(DI[incY]) 69 | LOOP axpyi_tail // } while --CX > 0 70 | 71 | axpyi_end: 72 | RET 73 | 74 | -------------------------------------------------------------------------------- /asm/f32/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
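What the strided AxpyInc above computes, as a hedged Go sketch. The kernel itself pre-scales incX and incY by sizeof(float32) (the SHLQ $2 steps) and walks raw addresses with LEAQ; here ix, iy and the increments are plain element counts:

for ; n >= 4; n -= 4 { // the 4x-unrolled body
	y[iy] += a * x[ix]
	y[iy+incY] += a * x[ix+incX]
	y[iy+2*incY] += a * x[ix+2*incX]
	y[iy+3*incY] += a * x[ix+3*incX]
	ix += 4 * incX
	iy += 4 * incY
}
for ; n > 0; n-- { // axpyi_tail
	y[iy] += a * x[ix]
	ix += incX
	iy += incY
}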
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 10 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 11 | MOVQ n+96(FP), CX // CX = n 12 | CMPQ CX, $0 // if n==0 { return } 13 | JLE axpyi_end 14 | MOVQ dst_base+0(FP), DI // DI = &dst 15 | MOVQ x_base+48(FP), SI // SI = &x 16 | MOVQ y_base+72(FP), DX // DX = &y 17 | MOVQ ix+120(FP), R8 // R8 = ix // Load the first index 18 | MOVQ iy+128(FP), R9 // R9 = iy 19 | MOVQ idst+32(FP), R10 // R10 = idst 20 | LEAQ (SI)(R8*4), SI // SI = &(x[ix]) 21 | LEAQ (DX)(R9*4), DX // DX = &(y[iy]) 22 | LEAQ (DI)(R10*4), DI // DI = &(dst[idst]) 23 | MOVQ incX+104(FP), R8 // R8 = incX 24 | SHLQ $2, R8 // R8 *= sizeof(float32) 25 | MOVQ incY+112(FP), R9 // R9 = incY 26 | SHLQ $2, R9 // R9 *= sizeof(float32) 27 | MOVQ incDst+24(FP), R10 // R10 = incDst 28 | SHLQ $2, R10 // R10 *= sizeof(float32) 29 | MOVSS alpha+40(FP), X0 // X0 = alpha 30 | MOVSS X0, X1 // X1 = X0 // for pipelining 31 | MOVQ CX, BX 32 | ANDQ $3, BX // BX = n % 4 33 | SHRQ $2, CX // CX = floor( n / 4 ) 34 | JZ axpyi_tail_start // if CX == 0 { goto axpyi_tail_start } 35 | 36 | axpyi_loop: // Loop unrolled 4x do { 37 | MOVSS (SI), X2 // X_i = x[i] 38 | MOVSS (SI)(R8*1), X3 39 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) 40 | MOVSS (SI), X4 41 | MOVSS (SI)(R8*1), X5 42 | MULSS X1, X2 // X_i *= a 43 | MULSS X0, X3 44 | MULSS X1, X4 45 | MULSS X0, X5 46 | ADDSS (DX), X2 // X_i += y[i] 47 | ADDSS (DX)(R9*1), X3 48 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 49 | ADDSS (DX), X4 50 | ADDSS (DX)(R9*1), X5 51 | MOVSS X2, (DI) // dst[i] = X_i 52 | MOVSS X3, (DI)(R10*1) 53 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) 54 | MOVSS X4, (DI) 55 | MOVSS X5, (DI)(R10*1) 56 | LEAQ (SI)(R8*2), SI // SI = &(SI[incX*2]) // Increment addresses 57 | LEAQ (DX)(R9*2), DX // DX = &(DX[incY*2]) 58 | LEAQ (DI)(R10*2), DI // DI = &(DI[incDst*2]) 59 | LOOP axpyi_loop // } while --CX > 0 60 | CMPQ BX, $0 // if BX == 0 { return } 61 | JE axpyi_end 62 | 63 | axpyi_tail_start: // Reset loop registers 64 | MOVQ BX, CX // Loop counter: CX = BX 65 | 66 | axpyi_tail: // do { 67 | MOVSS (SI), X2 // X2 = x[i] 68 | MULSS X1, X2 // X2 *= a 69 | ADDSS (DX), X2 // X2 += y[i] 70 | MOVSS X2, (DI) // dst[i] = X2 71 | ADDQ R8, SI // SI = &(SI[incX]) 72 | ADDQ R9, DX // DX = &(DX[incY]) 73 | ADDQ R10, DI // DI = &(DI[incY]) 74 | LOOP axpyi_tail // } while --CX > 0 75 | 76 | axpyi_end: 77 | RET 78 | 79 | -------------------------------------------------------------------------------- /asm/f32/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
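A concrete call of the AxpyIncTo defined above, with illustrative values:

dst := make([]float32, 5)
x := []float32{1, 2, 3}
y := []float32{10, 20, 30}
AxpyIncTo(dst, 2, 0, 2, x, y, 3, 1, 1, 0, 0)
// dst == []float32{12, 0, 24, 0, 36}, i.e. dst[2k] = 2*x[k] + y[k]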
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyUnitary(alpha float32, x, y []float32) 10 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 11 | MOVQ x_base+8(FP), SI // SI = &x 12 | MOVQ y_base+32(FP), DI // DI = &y 13 | MOVQ x_len+16(FP), BX // BX = min( len(x), len(y) ) 14 | CMPQ y_len+40(FP), BX 15 | CMOVQLE y_len+40(FP), BX 16 | CMPQ BX, $0 // if BX == 0 { return } 17 | JE axpy_end 18 | MOVSS alpha+0(FP), X0 19 | SHUFPS $0, X0, X0 // X0 = { a, a, a, a } 20 | XORQ AX, AX // i = 0 21 | PXOR X2, X2 // 2 NOP instructions (PXOR) to align 22 | PXOR X3, X3 // loop to cache line 23 | MOVQ DI, CX 24 | ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS 25 | JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim } 26 | 27 | XORQ $0xF, CX // CX = 4 - floor( BX % 16 / 4 ) 28 | INCQ CX 29 | SHRQ $2, CX 30 | 31 | axpy_align: // Trim first value(s) in unaligned buffer do { 32 | MOVSS (SI)(AX*4), X2 // X2 = x[i] 33 | MULSS X0, X2 // X2 *= a 34 | ADDSS (DI)(AX*4), X2 // X2 += y[i] 35 | MOVSS X2, (DI)(AX*4) // y[i] = X2 36 | INCQ AX // i++ 37 | DECQ BX 38 | JZ axpy_end // if --BX == 0 { return } 39 | LOOP axpy_align // } while --CX > 0 40 | 41 | axpy_no_trim: 42 | MOVUPS X0, X1 // Copy X0 to X1 for pipelining 43 | MOVQ BX, CX 44 | ANDQ $0xF, BX // BX = len % 16 45 | SHRQ $4, CX // CX = int( len / 16 ) 46 | JZ axpy_tail4_start // if CX == 0 { return } 47 | 48 | axpy_loop: // Loop unrolled 16x do { 49 | MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4] 50 | MOVUPS 16(SI)(AX*4), X3 51 | MOVUPS 32(SI)(AX*4), X4 52 | MOVUPS 48(SI)(AX*4), X5 53 | MULPS X0, X2 // X2 *= a 54 | MULPS X1, X3 55 | MULPS X0, X4 56 | MULPS X1, X5 57 | ADDPS (DI)(AX*4), X2 // X2 += y[i:i+4] 58 | ADDPS 16(DI)(AX*4), X3 59 | ADDPS 32(DI)(AX*4), X4 60 | ADDPS 48(DI)(AX*4), X5 61 | MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2 62 | MOVUPS X3, 16(DI)(AX*4) 63 | MOVUPS X4, 32(DI)(AX*4) 64 | MOVUPS X5, 48(DI)(AX*4) 65 | ADDQ $16, AX // i += 16 66 | LOOP axpy_loop // while (--CX) > 0 67 | CMPQ BX, $0 // if BX == 0 { return } 68 | JE axpy_end 69 | 70 | axpy_tail4_start: // Reset loop counter for 4-wide tail loop 71 | MOVQ BX, CX // CX = floor( BX / 4 ) 72 | SHRQ $2, CX 73 | JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start } 74 | 75 | axpy_tail4: // Loop unrolled 4x do { 76 | MOVUPS (SI)(AX*4), X2 // X2 = x[i] 77 | MULPS X0, X2 // X2 *= a 78 | ADDPS (DI)(AX*4), X2 // X2 += y[i] 79 | MOVUPS X2, (DI)(AX*4) // y[i] = X2 80 | ADDQ $4, AX // i += 4 81 | LOOP axpy_tail4 // } while --CX > 0 82 | 83 | axpy_tail_start: // Reset loop counter for 1-wide tail loop 84 | MOVQ BX, CX // CX = BX % 4 85 | ANDQ $3, CX 86 | JZ axpy_end // if CX == 0 { return } 87 | 88 | axpy_tail: 89 | MOVSS (SI)(AX*4), X1 // X1 = x[i] 90 | MULSS X0, X1 // X1 *= a 91 | ADDSS (DI)(AX*4), X1 // X1 += y[i] 92 | MOVSS X1, (DI)(AX*4) // y[i] = X1 93 | INCQ AX // i++ 94 | LOOP axpy_tail // } while --CX > 0 95 | 96 | axpy_end: 97 | RET 98 | -------------------------------------------------------------------------------- /asm/f32/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
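The alignment preamble in AxpyUnitary above (and repeated in AxpyUnitaryTo below) is worth unpacking. With c = &y & 15, which is nonzero here because the zero case branched to axpy_no_trim, and which is a multiple of 4 for a []float32, the XORQ $0xF / INCQ / SHRQ $2 sequence computes ((c ^ 15) + 1) >> 2 = (16 - c) / 4, the number of scalar float32 steps needed to reach the next 16-byte boundary so the ADDPS loop can use aligned accesses of y:

c = 4  -> (4^15 + 1) >> 2  = 12 >> 2 = 3 elements
c = 8  -> (8^15 + 1) >> 2  =  8 >> 2 = 2 elements
c = 12 -> (12^15 + 1) >> 2 =  4 >> 2 = 1 element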
4 | 5 | //+build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) 10 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ x_base+32(FP), SI // SI = &x 13 | MOVQ y_base+56(FP), DX // DX = &y 14 | MOVQ x_len+40(FP), BX // BX = min( len(x), len(y), len(dst) ) 15 | CMPQ y_len+64(FP), BX 16 | CMOVQLE y_len+64(FP), BX 17 | CMPQ dst_len+8(FP), BX 18 | CMOVQLE dst_len+8(FP), BX 19 | CMPQ BX, $0 // if BX == 0 { return } 20 | JE axpy_end 21 | MOVSS alpha+24(FP), X0 22 | SHUFPS $0, X0, X0 // X0 = { a, a, a, a, } 23 | XORQ AX, AX // i = 0 24 | MOVQ DX, CX 25 | ANDQ $0xF, CX // Align on 16-byte boundary for ADDPS 26 | JZ axpy_no_trim // if CX == 0 { goto axpy_no_trim } 27 | 28 | XORQ $0xF, CX // CX = 4 - floor ( B % 16 / 4 ) 29 | INCQ CX 30 | SHRQ $2, CX 31 | 32 | axpy_align: // Trim first value(s) in unaligned buffer do { 33 | MOVSS (SI)(AX*4), X2 // X2 = x[i] 34 | MULSS X0, X2 // X2 *= a 35 | ADDSS (DX)(AX*4), X2 // X2 += y[i] 36 | MOVSS X2, (DI)(AX*4) // y[i] = X2 37 | INCQ AX // i++ 38 | DECQ BX 39 | JZ axpy_end // if --BX == 0 { return } 40 | LOOP axpy_align // } while --CX > 0 41 | 42 | axpy_no_trim: 43 | MOVUPS X0, X1 // Copy X0 to X1 for pipelining 44 | MOVQ BX, CX 45 | ANDQ $0xF, BX // BX = len % 16 46 | SHRQ $4, CX // CX = floor( len / 16 ) 47 | JZ axpy_tail4_start // if CX == 0 { return } 48 | 49 | axpy_loop: // Loop unrolled 16x do { 50 | MOVUPS (SI)(AX*4), X2 // X2 = x[i:i+4] 51 | MOVUPS 16(SI)(AX*4), X3 52 | MOVUPS 32(SI)(AX*4), X4 53 | MOVUPS 48(SI)(AX*4), X5 54 | MULPS X0, X2 // X2 *= a 55 | MULPS X1, X3 56 | MULPS X0, X4 57 | MULPS X1, X5 58 | ADDPS (DX)(AX*4), X2 // X2 += y[i:i+4] 59 | ADDPS 16(DX)(AX*4), X3 60 | ADDPS 32(DX)(AX*4), X4 61 | ADDPS 48(DX)(AX*4), X5 62 | MOVUPS X2, (DI)(AX*4) // dst[i:i+4] = X2 63 | MOVUPS X3, 16(DI)(AX*4) 64 | MOVUPS X4, 32(DI)(AX*4) 65 | MOVUPS X5, 48(DI)(AX*4) 66 | ADDQ $16, AX // i += 16 67 | LOOP axpy_loop // while (--CX) > 0 68 | CMPQ BX, $0 // if BX == 0 { return } 69 | JE axpy_end 70 | 71 | axpy_tail4_start: // Reset loop counter for 4-wide tail loop 72 | MOVQ BX, CX // CX = floor( BX / 4 ) 73 | SHRQ $2, CX 74 | JZ axpy_tail_start // if CX == 0 { goto axpy_tail_start } 75 | 76 | axpy_tail4: // Loop unrolled 4x do { 77 | MOVUPS (SI)(AX*4), X2 // X2 = x[i] 78 | MULPS X0, X2 // X2 *= a 79 | ADDPS (DX)(AX*4), X2 // X2 += y[i] 80 | MOVUPS X2, (DI)(AX*4) // y[i] = X2 81 | ADDQ $4, AX // i += 4 82 | LOOP axpy_tail4 // } while --CX > 0 83 | 84 | axpy_tail_start: // Reset loop counter for 1-wide tail loop 85 | MOVQ BX, CX // CX = BX % 4 86 | ANDQ $3, CX 87 | JZ axpy_end // if CX == 0 { return } 88 | 89 | axpy_tail: 90 | MOVSS (SI)(AX*4), X1 // X1 = x[i] 91 | MULSS X0, X1 // X1 *= a 92 | ADDSS (DX)(AX*4), X1 // X1 += y[i] 93 | MOVSS X1, (DI)(AX*4) // y[i] = X1 94 | INCQ AX // i++ 95 | LOOP axpy_tail // } while --CX > 0 96 | 97 | axpy_end: 98 | RET 99 | -------------------------------------------------------------------------------- /asm/f32/ddot.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
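DdotUnitary and DdotInc below differ from the Dot variants in dot.go only in carrying the accumulator as a float64, which stops rounding error from compounding over long float32 vectors. A small illustration; the exact values are data-dependent, but the qualitative gap is representative:

x := make([]float32, 1<<20)
for i := range x {
	x[i] = 0.1
}
d := DdotUnitary(x, x) // float64 sum: close to the true 1<<20 * 0.1 * 0.1
s := DotUnitary(x, x)  // float32 sum: typically drifts in the low digits
_ = float64(s) - d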
4 | 5 | package f32 6 | 7 | // DdotUnitary is 8 | // for i, v := range x { 9 | // sum += float64(y[i]) * float64(v) 10 | // } 11 | // return 12 | func DdotUnitary(x, y []float32) (sum float64) { 13 | for i, v := range x { 14 | sum += float64(y[i]) * float64(v) 15 | } 16 | return 17 | } 18 | 19 | // DdotInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += float64(y[iy]) * float64(x[ix]) 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return 26 | func DdotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float64) { 27 | for i := 0; i < int(n); i++ { 28 | sum += float64(y[iy]) * float64(x[ix]) 29 | ix += incX 30 | iy += incY 31 | } 32 | return 33 | } 34 | -------------------------------------------------------------------------------- /asm/f32/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package f32 provides float32 vector primitives. 9 | package f32 10 | -------------------------------------------------------------------------------- /asm/f32/dot.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f32 6 | 7 | // DotUnitary is 8 | // for i, v := range x { 9 | // sum += y[i] * v 10 | // } 11 | // return sum 12 | func DotUnitary(x, y []float32) (sum float32) { 13 | for i, v := range x { 14 | sum += y[i] * v 15 | } 16 | return sum 17 | } 18 | 19 | // DotInc is 20 | // for i := 0; i < int(n); i++ { 21 | // sum += y[iy] * x[ix] 22 | // ix += incX 23 | // iy += incY 24 | // } 25 | // return sum 26 | func DotInc(x, y []float32, n, incX, incY, ix, iy uintptr) (sum float32) { 27 | for i := 0; i < int(n); i++ { 28 | sum += y[iy] * x[ix] 29 | ix += incX 30 | iy += incY 31 | } 32 | return sum 33 | } 34 | -------------------------------------------------------------------------------- /asm/f32/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
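A worked call of the DotInc above, reading every second element of x against a dense y, with illustrative values:

x := []float32{1, -1, 2, -1, 3} // incX = 2 visits 1, 2, 3
y := []float32{4, 5, 6}
sum := DotInc(x, y, 3, 2, 1, 0, 0)
// sum == 1*4 + 2*5 + 3*6 == 32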
4 | 5 | package f32 6 | 7 | // ScalUnitary is 8 | // for i := range x { 9 | // x[i] *= alpha 10 | // } 11 | func ScalUnitary(alpha float32, x []float32) { 12 | for i := range x { 13 | x[i] *= alpha 14 | } 15 | } 16 | 17 | // ScalUnitaryTo is 18 | // for i, v := range x { 19 | // dst[i] = alpha * v 20 | // } 21 | func ScalUnitaryTo(dst []float32, alpha float32, x []float32) { 22 | for i, v := range x { 23 | dst[i] = alpha * v 24 | } 25 | } 26 | 27 | // ScalInc is 28 | // var ix uintptr 29 | // for i := 0; i < int(n); i++ { 30 | // x[ix] *= alpha 31 | // ix += incX 32 | // } 33 | func ScalInc(alpha float32, x []float32, n, incX uintptr) { 34 | var ix uintptr 35 | for i := 0; i < int(n); i++ { 36 | x[ix] *= alpha 37 | ix += incX 38 | } 39 | } 40 | 41 | // ScalIncTo is 42 | // var idst, ix uintptr 43 | // for i := 0; i < int(n); i++ { 44 | // dst[idst] = alpha * x[ix] 45 | // ix += incX 46 | // idst += incDst 47 | // } 48 | func ScalIncTo(dst []float32, incDst uintptr, alpha float32, x []float32, n, incX uintptr) { 49 | var idst, ix uintptr 50 | for i := 0; i < int(n); i++ { 51 | dst[idst] = alpha * x[ix] 52 | ix += incX 53 | idst += incDst 54 | } 55 | } 56 | -------------------------------------------------------------------------------- /asm/f32/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package f32 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha float32, x, y []float32) 14 | 15 | // AxpyUnitaryTo is 16 | // for i, v := range x { 17 | // dst[i] = alpha*v + y[i] 18 | // } 19 | func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) 20 | 21 | // AxpyInc is 22 | // for i := 0; i < int(n); i++ { 23 | // y[iy] += alpha * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 28 | 29 | // AxpyIncTo is 30 | // for i := 0; i < int(n); i++ { 31 | // dst[idst] = alpha*x[ix] + y[iy] 32 | // ix += incX 33 | // iy += incY 34 | // idst += incDst 35 | // } 36 | func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) 37 | -------------------------------------------------------------------------------- /asm/f32/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
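And a strided scale with the ScalInc defined above, again with illustrative values:

x := []float32{1, 7, 2, 7, 3}
ScalInc(2, x, 3, 2) // scales elements 0, 2 and 4 in place
// x == []float32{2, 7, 4, 7, 6}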
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f32 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha float32, x, y []float32) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []float32, alpha float32, x, y []float32) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []float32, incDst, idst uintptr, alpha float32, x, y []float32, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/f32/stubs_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f32 6 | 7 | import ( 8 | "math" 9 | "testing" 10 | ) 11 | 12 | var ( 13 | nan = float32(math.NaN()) 14 | inf = float32(math.Inf(1)) 15 | ) 16 | 17 | var tests = []struct { 18 | incX, incY, incDst uintptr 19 | ix, iy, idst uintptr 20 | a float32 21 | dst, x, y []float32 22 | ex []float32 23 | }{ 24 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 25 | a: 3, 26 | dst: []float32{5}, 27 | x: []float32{2}, 28 | y: []float32{1}, 29 | ex: []float32{7}}, 30 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 31 | a: 5, 32 | dst: []float32{0, 0, 0}, 33 | x: []float32{0, 0, 0}, 34 | y: []float32{1, 1, 1}, 35 | ex: []float32{1, 1, 1}}, 36 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 37 | a: 5, 38 | dst: []float32{0, 0, 0}, 39 | x: []float32{0, 0}, 40 | y: []float32{1, 1, 1}, 41 | ex: []float32{1, 1}}, 42 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 43 | a: -1, 44 | dst: []float32{-1, -1, -1}, 45 | x: []float32{1, 1, 1}, 46 | y: []float32{1, 2, 1}, 47 | ex: []float32{0, 1, 0}}, 48 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 49 | a: -1, 50 | dst: []float32{1, 1, 1}, 51 | x: []float32{1, 2, 1}, 52 | y: []float32{-1, -2, -1}, 53 | ex: []float32{-2, -4, -2}}, 54 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 55 | a: 2.5, 56 | dst: []float32{1, 1, 1, 1, 1}, 57 | x: []float32{1, 2, 3, 2, 1}, 58 | y: []float32{0, 0, 0, 0, 0}, 59 | ex: []float32{2.5, 5, 7.5, 5, 2.5}}, 60 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, // Run big test twice, once aligned once unaligned. 
61 | a: 16.5, 62 | dst: make([]float32, 20), 63 | x: []float32{.5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5, .5}, 64 | y: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 65 | ex: []float32{9.25, 10.25, 11.25, 12.25, 13.25, 14.25, 15.25, 16.25, 17.25, 18.25, 9.25, 10.25, 11.25, 12.25, 13.25, 14.25, 15.25, 16.25, 17.25, 18.25}}, 66 | {incX: 2, incY: 2, incDst: 3, ix: 0, iy: 0, idst: 0, 67 | a: 16.5, 68 | dst: make([]float32, 10), 69 | x: []float32{.5, .5, .5, .5, .5, .5, .5, .5, .5, .5}, 70 | y: []float32{1, 2, 3, 4, 5, 6, 7, 8, 9, 10}, 71 | ex: []float32{9.25, 10.25, 11.25, 12.25, 13.25, 14.25, 15.25, 16.25, 17.25, 18.25}}, 72 | } 73 | 74 | func guardVector(vec []float32, guard_val float32, guard_len int) (guarded []float32) { 75 | guarded = make([]float32, len(vec)+guard_len*2) 76 | copy(guarded[guard_len:], vec) 77 | for i := 0; i < guard_len; i++ { 78 | guarded[i] = guard_val 79 | guarded[len(guarded)-1-i] = guard_val 80 | } 81 | return guarded 82 | } 83 | 84 | func isValidGuard(vec []float32, guard_val float32, guard_len int) bool { 85 | for i := 0; i < guard_len; i++ { 86 | if vec[i] != guard_val || vec[len(vec)-1-i] != guard_val { 87 | return false 88 | } 89 | } 90 | return true 91 | } 92 | 93 | func same(x, y float32) bool { 94 | a, b := float64(x), float64(y) 95 | return a == b || (math.IsNaN(a) && math.IsNaN(b)) 96 | } 97 | 98 | func TestAxpyUnitary(t *testing.T) { 99 | var x_gd, y_gd float32 = 1, 1 100 | for cas, test := range tests { 101 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 102 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 103 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 104 | AxpyUnitary(test.a, x, y) 105 | for i := range test.ex { 106 | if !same(y[i], test.ex[i]) { 107 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i], test.ex[i]) 108 | } 109 | } 110 | if !isValidGuard(test.x, x_gd, xg_ln) { 111 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 112 | } 113 | if !isValidGuard(test.y, y_gd, yg_ln) { 114 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 115 | } 116 | } 117 | } 118 | 119 | func TestAxpyUnitaryTo(t *testing.T) { 120 | var x_gd, y_gd, dst_gd float32 = 1, 1, 0 121 | for cas, test := range tests { 122 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 123 | test.x, test.y = guardVector(test.x, x_gd, xg_ln), guardVector(test.y, y_gd, yg_ln) 124 | test.dst = guardVector(test.dst, dst_gd, xg_ln) 125 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 126 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 127 | AxpyUnitaryTo(dst, test.a, x, y) 128 | for i := range test.ex { 129 | if !same(test.ex[i], dst[i]) { 130 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i], test.ex[i]) 131 | } 132 | } 133 | if !isValidGuard(test.x, x_gd, xg_ln) { 134 | t.Errorf("Test %d Guard violated in x vector %v %v", cas, test.x[:xg_ln], test.x[len(test.x)-xg_ln:]) 135 | } 136 | if !isValidGuard(test.y, y_gd, yg_ln) { 137 | t.Errorf("Test %d Guard violated in y vector %v %v", cas, test.y[:yg_ln], test.y[len(test.y)-yg_ln:]) 138 | } 139 | if !isValidGuard(test.dst, dst_gd, xg_ln) { 140 | t.Errorf("Test %d Guard violated in dst vector %v %v", cas, test.dst[:xg_ln], test.dst[len(test.dst)-xg_ln:]) 141 | } 142 | } 143 | } 144 | 145 | func guardIncVector(vec []float32, guard_val float32, incV uintptr, 
guard_len int) (guarded []float32) { 146 | inc := int(incV) 147 | s_ln := len(vec) * (inc) 148 | guarded = make([]float32, s_ln+guard_len*2) 149 | for i, j := 0, 0; i < len(guarded); i++ { 150 | switch { 151 | case i < guard_len, i > guard_len+s_ln: 152 | guarded[i] = guard_val 153 | case (i-guard_len)%(inc) == 0 && j < len(vec): 154 | guarded[i] = vec[j] 155 | j++ 156 | default: 157 | guarded[i] = guard_val 158 | } 159 | } 160 | return guarded 161 | } 162 | 163 | func checkValidIncGuard(t *testing.T, vec []float32, guard_val float32, incV uintptr, guard_len int) { 164 | inc := int(incV) 165 | s_ln := len(vec) - 2*guard_len 166 | for i := range vec { 167 | switch { 168 | case same(vec[i], guard_val): 169 | // Correct value 170 | case i < guard_len: 171 | t.Errorf("Front guard violated at %d %v", i, vec[:guard_len]) 172 | case i > guard_len+s_ln: 173 | t.Errorf("Back guard violated at %d %v", i-guard_len-s_ln, vec[guard_len+s_ln:]) 174 | case (i-guard_len)%inc == 0 && (i-guard_len)/inc < len(vec): 175 | // Ignore input values 176 | default: 177 | t.Errorf("Internal guard violated at %d %v", i-guard_len, vec[guard_len:guard_len+s_ln]) 178 | } 179 | } 180 | } 181 | 182 | func TestAxpyInc(t *testing.T) { 183 | var x_gd, y_gd float32 = 1, 1 184 | for cas, test := range tests { 185 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 186 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 187 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 188 | AxpyInc(test.a, x, y, uintptr(len(test.ex)), test.incX, test.incY, test.ix, test.iy) 189 | for i := range test.ex { 190 | if !same(y[i*int(test.incY)], test.ex[i]) { 191 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, y[i*int(test.incY)], test.ex[i]) 192 | } 193 | } 194 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 195 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 196 | } 197 | } 198 | 199 | func TestAxpyIncTo(t *testing.T) { 200 | var x_gd, y_gd, dst_gd float32 = 1, 1, 0 201 | for cas, test := range tests { 202 | xg_ln, yg_ln := 4+cas%2, 4+cas%3 203 | test.x, test.y = guardIncVector(test.x, x_gd, uintptr(test.incX), xg_ln), guardIncVector(test.y, y_gd, uintptr(test.incY), yg_ln) 204 | test.dst = guardIncVector(test.dst, dst_gd, uintptr(test.incDst), xg_ln) 205 | x, y := test.x[xg_ln:len(test.x)-xg_ln], test.y[yg_ln:len(test.y)-yg_ln] 206 | dst := test.dst[xg_ln : len(test.dst)-xg_ln] 207 | AxpyIncTo(dst, test.incDst, test.idst, test.a, x, y, uintptr(len(test.ex)), test.incX, test.incY, test.ix, test.iy) 208 | for i := range test.ex { 209 | if !same(dst[i*int(test.incDst)], test.ex[i]) { 210 | t.Errorf("Test %d Unexpected result at %d Got: %v Expected: %v", cas, i, dst[i*int(test.incDst)], test.ex[i]) 211 | } 212 | } 213 | checkValidIncGuard(t, test.x, x_gd, uintptr(test.incX), xg_ln) 214 | checkValidIncGuard(t, test.y, y_gd, uintptr(test.incY), yg_ln) 215 | checkValidIncGuard(t, test.dst, dst_gd, uintptr(test.incDst), xg_ln) 216 | } 217 | } 218 | -------------------------------------------------------------------------------- /asm/f64/abssum_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
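L1Norm below accumulates absolute values without an absolute-value instruction: each step computes p_sum = max(p_sum + x[i], p_sum - x[i]), and for finite values max(s+v, s-v) = s + |v|, so the running sum gains |x[i]| per element; the eight PXOR-cleared registers simply run this recurrence in parallel lanes that are folded together at the end. The scalar equivalent in Go:

var sum float64
for _, v := range x {
	sum = math.Max(sum+v, sum-v) // == sum + math.Abs(v) for finite v
}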
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func L1Norm(x []float64) float64 10 | TEXT ·L1Norm(SB), NOSPLIT, $0 11 | MOVQ x_base+0(FP), SI // SI = &x 12 | MOVQ x_len+8(FP), CX // CX = len(x) 13 | XORQ AX, AX // i = 0 14 | PXOR X0, X0 // p_sum_i = 0 15 | PXOR X1, X1 16 | PXOR X2, X2 17 | PXOR X3, X3 18 | PXOR X4, X4 19 | PXOR X5, X5 20 | PXOR X6, X6 21 | PXOR X7, X7 22 | CMPQ CX, $0 // if CX == 0 { return 0 } 23 | JE absum_end 24 | MOVQ CX, BX 25 | ANDQ $7, BX // BX = len(x) % 8 26 | SHRQ $3, CX // CX = floor( len(x) / 8 ) 27 | JZ absum_tail_start // if CX == 0 { goto absum_tail_start } 28 | 29 | absum_loop: // do { 30 | // p_sum += max( p_sum + x[i], p_sum - x[i] ) 31 | MOVUPS (SI)(AX*8), X8 // X_i = x[i:i+1] 32 | MOVUPS 16(SI)(AX*8), X9 33 | MOVUPS 32(SI)(AX*8), X10 34 | MOVUPS 48(SI)(AX*8), X11 35 | ADDPD X8, X0 // p_sum_i += X_i ( positive values ) 36 | ADDPD X9, X2 37 | ADDPD X10, X4 38 | ADDPD X11, X6 39 | SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) 40 | SUBPD X9, X3 41 | SUBPD X10, X5 42 | SUBPD X11, X7 43 | MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) 44 | MAXPD X3, X2 45 | MAXPD X5, X4 46 | MAXPD X7, X6 47 | MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i 48 | MOVAPS X2, X3 49 | MOVAPS X4, X5 50 | MOVAPS X6, X7 51 | ADDQ $8, AX // i += 8 52 | LOOP absum_loop // } while --CX > 0 53 | 54 | // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) 55 | ADDPD X3, X0 56 | ADDPD X5, X7 57 | ADDPD X7, X0 58 | 59 | // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] 60 | MOVAPS X0, X1 61 | SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) 62 | ADDSD X1, X0 63 | CMPQ BX, $0 64 | JE absum_end // if BX == 0 { goto absum_end } 65 | 66 | absum_tail_start: // Reset loop registers 67 | MOVQ BX, CX // Loop counter: CX = BX 68 | XORPS X8, X8 // X_8 = 0 69 | 70 | absum_tail: // do { 71 | // p_sum += max( p_sum + x[i], p_sum - x[i] ) 72 | MOVSD (SI)(AX*8), X8 // X_8 = x[i] 73 | MOVSD X0, X1 // p_sum_1 = p_sum_0 74 | ADDSD X8, X0 // p_sum_0 += X_8 75 | SUBSD X8, X1 // p_sum_1 -= X_8 76 | MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) 77 | INCQ AX // i++ 78 | LOOP absum_tail // } while --CX > 0 79 | 80 | absum_end: // return p_sum_0 81 | MOVSD X0, sum+24(FP) 82 | RET 83 | -------------------------------------------------------------------------------- /asm/f64/abssuminc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
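Two details in the strided L1NormInc below: x86 scaled-index addressing only supports factors of 1, 2, 4 and 8, so the kernel materializes DX = 3*incX bytes (the IMULQ $3) to reach the fourth element of each unrolled group, and each MOVSD/MOVHPD pair packs two strided float64 values into one XMM register so the same ADDPD/SUBPD/MAXPD recurrence applies. What it computes, in plain Go:

var sum float64
for i, ix := 0, 0; i < n; i, ix = i+1, ix+incX {
	sum += math.Abs(x[ix])
}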
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func L1NormInc(x []float64, n, incX int) (sum float64) 10 | TEXT ·L1NormInc(SB), NOSPLIT, $0 11 | MOVQ x_base+0(FP), SI // SI = &x 12 | MOVQ n+24(FP), CX // CX = n 13 | MOVQ incX+32(FP), AX // AX = increment * sizeof( float64 ) 14 | SHLQ $3, AX 15 | MOVQ AX, DX // DX = AX * 3 16 | IMULQ $3, DX 17 | PXOR X0, X0 // p_sum_i = 0 18 | PXOR X1, X1 19 | PXOR X2, X2 20 | PXOR X3, X3 21 | PXOR X4, X4 22 | PXOR X5, X5 23 | PXOR X6, X6 24 | PXOR X7, X7 25 | CMPQ CX, $0 // if CX == 0 { return 0 } 26 | JE absum_end 27 | MOVQ CX, BX 28 | ANDQ $7, BX // BX = n % 8 29 | SHRQ $3, CX // CX = floor( n / 8 ) 30 | JZ absum_tail_start // if CX == 0 { goto absum_tail_start } 31 | 32 | absum_loop: // do { 33 | // p_sum = max( p_sum + x[i], p_sum - x[i] ) 34 | MOVSD (SI), X8 // X_i[0] = x[i] 35 | MOVSD (SI)(AX*1), X9 36 | MOVSD (SI)(AX*2), X10 37 | MOVSD (SI)(DX*1), X11 38 | LEAQ (SI)(AX*4), SI // SI = SI + 4 39 | MOVHPD (SI), X8 // X_i[1] = x[i+4] 40 | MOVHPD (SI)(AX*1), X9 41 | MOVHPD (SI)(AX*2), X10 42 | MOVHPD (SI)(DX*1), X11 43 | ADDPD X8, X0 // p_sum_i += X_i ( positive values ) 44 | ADDPD X9, X2 45 | ADDPD X10, X4 46 | ADDPD X11, X6 47 | SUBPD X8, X1 // p_sum_(i+1) -= X_i ( negative values ) 48 | SUBPD X9, X3 49 | SUBPD X10, X5 50 | SUBPD X11, X7 51 | MAXPD X1, X0 // p_sum_i = max( p_sum_i, p_sum_(i+1) ) 52 | MAXPD X3, X2 53 | MAXPD X5, X4 54 | MAXPD X7, X6 55 | MOVAPS X0, X1 // p_sum_(i+1) = p_sum_i 56 | MOVAPS X2, X3 57 | MOVAPS X4, X5 58 | MOVAPS X6, X7 59 | LEAQ (SI)(AX*4), SI // SI = SI + 4 60 | LOOP absum_loop // } while --CX > 0 61 | 62 | // p_sum_0 = \sum_{i=1}^{3}( p_sum_(i*2) ) 63 | ADDPD X3, X0 64 | ADDPD X5, X7 65 | ADDPD X7, X0 66 | 67 | // p_sum_0[0] = p_sum_0[0] + p_sum_0[1] 68 | MOVAPS X0, X1 69 | SHUFPD $0x3, X0, X0 // lower( p_sum_0 ) = upper( p_sum_0 ) 70 | ADDSD X1, X0 71 | CMPQ BX, $0 72 | JE absum_end // if BX == 0 { goto absum_end } 73 | 74 | absum_tail_start: // Reset loop registers 75 | MOVQ BX, CX // Loop counter: CX = BX 76 | XORPS X8, X8 // X_8 = 0 77 | 78 | absum_tail: // do { 79 | // p_sum += max( p_sum + x[i], p_sum - x[i] ) 80 | MOVSD (SI), X8 // X_8 = x[i] 81 | MOVSD X0, X1 // p_sum_1 = p_sum_0 82 | ADDSD X8, X0 // p_sum_0 += X_8 83 | SUBSD X8, X1 // p_sum_1 -= X_8 84 | MAXSD X1, X0 // p_sum_0 = max( p_sum_0, p_sum_1 ) 85 | ADDQ AX, SI // i++ 86 | LOOP absum_tail // } while --CX > 0 87 | 88 | absum_end: // return p_sum_0 89 | MOVSD X0, sum+40(FP) 90 | RET 91 | -------------------------------------------------------------------------------- /asm/f64/add_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
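Add below is the in-place elementwise sum dst[i] += s[i] over min(len(dst), len(s)) elements (the CMOVQLE clamps to the shorter operand). It trims at most one leading element because the unrolled loop feeds dst straight to ADDPD as a memory operand, and legacy SSE memory operands must be 16-byte aligned; a []float64 is 8-byte aligned, so one element always suffices. The pure-Go equivalent:

n := len(dst)
if len(s) < n {
	n = len(s)
}
for i := 0; i < n; i++ {
	dst[i] += s[i]
}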
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func Add(dst, s []float64) 10 | TEXT ·Add(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ dst_len+8(FP), CX // CX = len(dst) 13 | MOVQ s_base+24(FP), SI // SI = &s 14 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 15 | CMOVQLE s_len+32(FP), CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE add_end 18 | XORQ AX, AX 19 | MOVQ DI, BX 20 | ANDQ $0x0F, BX // BX = &dst & 15 21 | JZ add_no_trim // if BX == 0 { goto add_no_trim } 22 | 23 | // Align on 16-byte boundary 24 | MOVSD (SI)(AX*8), X0 // X0 = s[i] 25 | ADDSD (DI)(AX*8), X0 // X0 += dst[i] 26 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 27 | INCQ AX // i++ 28 | DECQ CX // --CX 29 | JE add_end // if CX == 0 { return } 30 | 31 | add_no_trim: 32 | MOVQ CX, BX 33 | ANDQ $7, BX // BX = len(dst) % 8 34 | SHRQ $3, CX // CX = floor( len(dst) / 8 ) 35 | JZ add_tail_start // if CX == 0 { goto add_tail_start } 36 | 37 | add_loop: // Loop unrolled 8x do { 38 | MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] 39 | MOVUPS 16(SI)(AX*8), X1 40 | MOVUPS 32(SI)(AX*8), X2 41 | MOVUPS 48(SI)(AX*8), X3 42 | ADDPD (DI)(AX*8), X0 // X_i += dst[i:i+1] 43 | ADDPD 16(DI)(AX*8), X1 44 | ADDPD 32(DI)(AX*8), X2 45 | ADDPD 48(DI)(AX*8), X3 46 | MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X_i 47 | MOVUPS X1, 16(DI)(AX*8) 48 | MOVUPS X2, 32(DI)(AX*8) 49 | MOVUPS X3, 48(DI)(AX*8) 50 | ADDQ $8, AX // i += 8 51 | LOOP add_loop // } while --CX > 0 52 | CMPQ BX, $0 // if BX == 0 { return } 53 | JE add_end 54 | 55 | add_tail_start: // Reset loop registers 56 | MOVQ BX, CX // Loop counter: CX = BX 57 | 58 | add_tail: // do { 59 | MOVSD (SI)(AX*8), X0 // X0 = s[i] 60 | ADDSD (DI)(AX*8), X0 // X0 += dst[i] 61 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 62 | INCQ AX // ++i 63 | LOOP add_tail // } while --CX > 0 64 | 65 | add_end: 66 | RET 67 | -------------------------------------------------------------------------------- /asm/f64/addconst_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
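Add shows the three-phase shape most unitary kernels in this package share: peel at most one element so later packed memory operands are 16-byte aligned, run an 8-wide unrolled body, then drain the len%8 leftovers one scalar at a time. The same decomposition in Go (illustrative sketch, alignment peel omitted):

```go
package f64sketch

// addSketch: clamp to min(len(dst), len(s)), 8-wide main block, scalar tail.
func addSketch(dst, s []float64) {
	n := len(dst)
	if len(s) < n {
		n = len(s)
	}
	i := 0
	for ; i+8 <= n; i += 8 { // add_loop: unrolled 8x in the assembly
		for j := 0; j < 8; j++ {
			dst[i+j] += s[i+j]
		}
	}
	for ; i < n; i++ { // add_tail
		dst[i] += s[i]
	}
}
```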
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func Addconst(alpha float64, x []float64) 10 | TEXT ·AddConst(SB), NOSPLIT, $0 11 | MOVQ x_base+8(FP), SI // SI = &x 12 | MOVQ x_len+16(FP), CX // CX = len(x) 13 | CMPQ CX, $0 // if len(x) == 0 { return } 14 | JE ac_end 15 | MOVSD alpha+0(FP), X4 // X4 = { a, a } 16 | SHUFPD $0, X4, X4 17 | MOVUPS X4, X5 // X5 = X4 18 | XORQ AX, AX // i = 0 19 | MOVQ CX, BX 20 | ANDQ $7, BX // BX = len(x) % 8 21 | SHRQ $3, CX // CX = floor( len(x) / 8 ) 22 | JZ ac_tail_start // if CX == 0 { goto ac_tail_start } 23 | 24 | ac_loop: // Loop unrolled 8x do { 25 | MOVUPS (SI)(AX*8), X0 // X_i = s[i:i+1] 26 | MOVUPS 16(SI)(AX*8), X1 27 | MOVUPS 32(SI)(AX*8), X2 28 | MOVUPS 48(SI)(AX*8), X3 29 | ADDPD X4, X0 // X_i += a 30 | ADDPD X5, X1 31 | ADDPD X4, X2 32 | ADDPD X5, X3 33 | MOVUPS X0, (SI)(AX*8) // s[i:i+1] = X_i 34 | MOVUPS X1, 16(SI)(AX*8) 35 | MOVUPS X2, 32(SI)(AX*8) 36 | MOVUPS X3, 48(SI)(AX*8) 37 | ADDQ $8, AX // i += 8 38 | LOOP ac_loop // } while --CX > 0 39 | CMPQ BX, $0 // if BX == 0 { return } 40 | JE ac_end 41 | 42 | ac_tail_start: // Reset loop counters 43 | MOVQ BX, CX // Loop counter: CX = BX 44 | 45 | ac_tail: // do { 46 | MOVSD (SI)(AX*8), X0 // X0 = s[i] 47 | ADDSD X4, X0 // X0 += a 48 | MOVSD X0, (SI)(AX*8) // s[i] = X0 49 | INCQ AX // ++i 50 | LOOP ac_tail // } while --CX > 0 51 | 52 | ac_end: 53 | RET 54 | -------------------------------------------------------------------------------- /asm/f64/asm_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f64 6 | 7 | import ( 8 | "math" 9 | "math/rand" 10 | "testing" 11 | ) 12 | 13 | var ( 14 | nan = math.NaN() 15 | inf = math.Inf(1) 16 | ) 17 | 18 | // newGuardedVector allocates a new slice and returns it as three subslices. 19 | // v is a strided vector that contains elements of data at indices i*inc and 20 | // NaN elsewhere. frontGuard and backGuard are filled with NaN values, and 21 | // their backing arrays are directly adjacent to v in memory. The three slices 22 | // can be used to detect invalid memory reads and writes. 23 | func newGuardedVector(data []float64, inc int) (v, frontGuard, backGuard []float64) { 24 | if inc < 0 { 25 | inc = -inc 26 | } 27 | guard := 2 * inc 28 | size := (len(data)-1)*inc + 1 29 | whole := make([]float64, size+2*guard) 30 | v = whole[guard : len(whole)-guard] 31 | for i := range whole { 32 | whole[i] = math.NaN() 33 | } 34 | for i, d := range data { 35 | v[i*inc] = d 36 | } 37 | return v, whole[:guard], whole[len(whole)-guard:] 38 | } 39 | 40 | // allNaN returns true if x contains only NaN values, and false otherwise. 41 | func allNaN(x []float64) bool { 42 | for _, v := range x { 43 | if !math.IsNaN(v) { 44 | return false 45 | } 46 | } 47 | return true 48 | } 49 | 50 | // equalStrided returns true if the strided vector x contains elements of the 51 | // dense vector ref at indices i*inc, false otherwise. 52 | func equalStrided(ref, x []float64, inc int) bool { 53 | if inc < 0 { 54 | inc = -inc 55 | } 56 | for i, v := range ref { 57 | if !same(x[i*inc], v) { 58 | return false 59 | } 60 | } 61 | return true 62 | } 63 | 64 | // nonStridedWrite returns false if all elements of x at non-stride indices are 65 | // equal to NaN, true otherwise. 
66 | func nonStridedWrite(x []float64, inc int) bool { 67 | if inc < 0 { 68 | inc = -inc 69 | } 70 | for i, v := range x { 71 | if i%inc != 0 && !math.IsNaN(v) { 72 | return true 73 | } 74 | } 75 | return false 76 | } 77 | 78 | // guardVector copies the source vector (vec) into a new slice with guards. 79 | // Guards guarded[:gdLn] and guarded[len-gdLn:] will be filled with sigil value gdVal. 80 | func guardVector(vec []float64, gdVal float64, gdLn int) (guarded []float64) { 81 | guarded = make([]float64, len(vec)+gdLn*2) 82 | copy(guarded[gdLn:], vec) 83 | for i := 0; i < gdLn; i++ { 84 | guarded[i] = gdVal 85 | guarded[len(guarded)-1-i] = gdVal 86 | } 87 | return guarded 88 | } 89 | 90 | // isValidGuard will test for violated guards, generated by guardVector. 91 | func isValidGuard(vec []float64, gdVal float64, gdLn int) bool { 92 | for i := 0; i < gdLn; i++ { 93 | if !same(vec[i], gdVal) || !same(vec[len(vec)-1-i], gdVal) { 94 | return false 95 | } 96 | } 97 | return true 98 | } 99 | 100 | // guardIncVector copies the source vector (vec) into a new incremented slice with guards. 101 | // End guards will be length gdLen. 102 | // Internal and end guards will be filled with sigil value gdVal. 103 | func guardIncVector(vec []float64, gdVal float64, inc, gdLen int) (guarded []float64) { 104 | if inc < 0 { 105 | inc = -inc 106 | } 107 | inrLen := len(vec) * inc 108 | guarded = make([]float64, inrLen+gdLen*2) 109 | for i := range guarded { 110 | guarded[i] = gdVal 111 | } 112 | for i, v := range vec { 113 | guarded[gdLen+i*inc] = v 114 | } 115 | return guarded 116 | } 117 | 118 | // checkValidIncGuard will test for violated guards, generated by guardIncVector 119 | func checkValidIncGuard(t *testing.T, vec []float64, gdVal float64, inc, gdLen int) { 120 | srcLn := len(vec) - 2*gdLen 121 | for i := range vec { 122 | switch { 123 | case same(vec[i], gdVal): 124 | // Correct value 125 | case (i-gdLen)%inc == 0 && (i-gdLen)/inc < len(vec): 126 | // Ignore input values 127 | case i < gdLen: 128 | t.Errorf("Front guard violated at %d %v", i, vec[:gdLen]) 129 | case i > gdLen+srcLn: 130 | t.Errorf("Back guard violated at %d %v", i-gdLen-srcLn, vec[gdLen+srcLn:]) 131 | default: 132 | t.Errorf("Internal guard violated at %d %v", i-gdLen, vec[gdLen:gdLen+srcLn]) 133 | } 134 | } 135 | } 136 | 137 | // same tests for nan-aware equality. 138 | func same(a, b float64) bool { 139 | return a == b || (math.IsNaN(a) && math.IsNaN(b)) 140 | } 141 | 142 | var ( // Offset sets for testing alignment handling in Unitary assembly functions. 143 | align1 = []int{0, 1} 144 | align2 = newIncSet(0, 1) 145 | align3 = newIncToSet(0, 1) 146 | ) 147 | 148 | type incSet struct { 149 | x, y int 150 | } 151 | 152 | // genInc will generate all (x,y) combinations of the input increment set. 153 | func newIncSet(inc ...int) []incSet { 154 | n := len(inc) 155 | is := make([]incSet, n*n) 156 | for x := range inc { 157 | for y := range inc { 158 | is[x*n+y] = incSet{inc[x], inc[y]} 159 | } 160 | } 161 | return is 162 | } 163 | 164 | type incToSet struct { 165 | dst, x, y int 166 | } 167 | 168 | // genIncTo will generate all (dst,x,y) combinations of the input increment set. 
169 | func newIncToSet(inc ...int) []incToSet { 170 | n := len(inc) 171 | is := make([]incToSet, n*n*n) 172 | for i, dst := range inc { 173 | for x := range inc { 174 | for y := range inc { 175 | is[i*n*n+x*n+y] = incToSet{dst, inc[x], inc[y]} 176 | } 177 | } 178 | } 179 | return is 180 | } 181 | 182 | var benchSink []float64 183 | 184 | func randomSlice(n, inc int) []float64 { 185 | if inc < 0 { 186 | inc = -inc 187 | } 188 | x := make([]float64, (n-1)*inc+1) 189 | for i := range x { 190 | x[i] = rand.Float64() 191 | } 192 | return x 193 | } 194 | 195 | func randSlice(n, inc int, r *rand.Rand) []float64 { 196 | if inc < 0 { 197 | inc = -inc 198 | } 199 | x := make([]float64, (n-1)*inc+1) 200 | for i := range x { 201 | x[i] = r.Float64() 202 | } 203 | return x 204 | } 205 | -------------------------------------------------------------------------------- /asm/f64/axpy.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | // AxpyUnitary is 10 | // for i, v := range x { 11 | // y[i] += alpha * v 12 | // } 13 | func AxpyUnitary(alpha float64, x, y []float64) { 14 | for i, v := range x { 15 | y[i] += alpha * v 16 | } 17 | } 18 | 19 | // AxpyUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha*v + y[i] 22 | // } 23 | func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) { 24 | for i, v := range x { 25 | dst[i] = alpha*v + y[i] 26 | } 27 | } 28 | 29 | // AxpyInc is 30 | // for i := 0; i < int(n); i++ { 31 | // y[iy] += alpha * x[ix] 32 | // ix += incX 33 | // iy += incY 34 | // } 35 | func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 36 | for i := 0; i < int(n); i++ { 37 | y[iy] += alpha * x[ix] 38 | ix += incX 39 | iy += incY 40 | } 41 | } 42 | 43 | // AxpyIncTo is 44 | // for i := 0; i < int(n); i++ { 45 | // dst[idst] = alpha*x[ix] + y[iy] 46 | // ix += incX 47 | // iy += incY 48 | // idst += incDst 49 | // } 50 | func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 51 | for i := 0; i < int(n); i++ { 52 | dst[idst] = alpha*x[ix] + y[iy] 53 | ix += incX 54 | iy += incY 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/f64/axpy_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package f64 6 | 7 | import ( 8 | "fmt" 9 | "testing" 10 | ) 11 | 12 | const ( 13 | msgVal = "%v: unexpected value at %v Got: %v Expected: %v" 14 | msgGuard = "%v: Guard violated in %s vector %v %v" 15 | ) 16 | 17 | var axpyTests = []struct { 18 | alpha float64 19 | x []float64 20 | y []float64 21 | want []float64 22 | wantRev []float64 // Result when x is traversed in reverse direction. 
23 | }{ 24 | { 25 | alpha: 0, 26 | x: []float64{}, 27 | y: []float64{}, 28 | want: []float64{}, 29 | wantRev: []float64{}, 30 | }, 31 | { 32 | alpha: 0, 33 | x: []float64{2}, 34 | y: []float64{-3}, 35 | want: []float64{-3}, 36 | wantRev: []float64{-3}, 37 | }, 38 | { 39 | alpha: 1, 40 | x: []float64{2}, 41 | y: []float64{-3}, 42 | want: []float64{-1}, 43 | wantRev: []float64{-1}, 44 | }, 45 | { 46 | alpha: 3, 47 | x: []float64{2}, 48 | y: []float64{-3}, 49 | want: []float64{3}, 50 | wantRev: []float64{3}, 51 | }, 52 | { 53 | alpha: -3, 54 | x: []float64{2}, 55 | y: []float64{-3}, 56 | want: []float64{-9}, 57 | wantRev: []float64{-9}, 58 | }, 59 | { 60 | alpha: 1, 61 | x: []float64{1, 5}, 62 | y: []float64{2, -3}, 63 | want: []float64{3, 2}, 64 | wantRev: []float64{7, -2}, 65 | }, 66 | { 67 | alpha: 1, 68 | x: []float64{2, 3, 4}, 69 | y: []float64{-3, -2, -1}, 70 | want: []float64{-1, 1, 3}, 71 | wantRev: []float64{1, 1, 1}, 72 | }, 73 | { 74 | alpha: 0, 75 | x: []float64{0, 0, 1, 1, 2, -3, -4}, 76 | y: []float64{0, 1, 0, 3, -4, 5, -6}, 77 | want: []float64{0, 1, 0, 3, -4, 5, -6}, 78 | wantRev: []float64{0, 1, 0, 3, -4, 5, -6}, 79 | }, 80 | { 81 | alpha: 1, 82 | x: []float64{0, 0, 1, 1, 2, -3, -4}, 83 | y: []float64{0, 1, 0, 3, -4, 5, -6}, 84 | want: []float64{0, 1, 1, 4, -2, 2, -10}, 85 | wantRev: []float64{-4, -2, 2, 4, -3, 5, -6}, 86 | }, 87 | { 88 | alpha: 3, 89 | x: []float64{0, 0, 1, 1, 2, -3, -4}, 90 | y: []float64{0, 1, 0, 3, -4, 5, -6}, 91 | want: []float64{0, 1, 3, 6, 2, -4, -18}, 92 | wantRev: []float64{-12, -8, 6, 6, -1, 5, -6}, 93 | }, 94 | { 95 | alpha: -3, 96 | x: []float64{0, 0, 1, 1, 2, -3, -4, 0, 0, 1, 1, 2, -3, -4}, 97 | y: []float64{0, 1, 0, 3, -4, 5, -6, 0, 1, 0, 3, -4, 5, -6}, 98 | want: []float64{0, 1, -3, 0, -10, 14, 6, 0, 1, -3, 0, -10, 14, 6}, 99 | wantRev: []float64{12, 10, -6, 0, -7, 5, -6, 12, 10, -6, 0, -7, 5, -6}, 100 | }, 101 | { 102 | alpha: -5, 103 | x: []float64{0, 0, 1, 1, 2, -3, -4, 5, 1, 2, -3, -4, 5}, 104 | y: []float64{0, 1, 0, 3, -4, 5, -6, 7, 3, -4, 5, -6, 7}, 105 | want: []float64{0, 1, -5, -2, -14, 20, 14, -18, -2, -14, 20, 14, -18}, 106 | wantRev: []float64{-25, 21, 15, -7, -9, -20, 14, 22, -7, -9, 0, -6, 7}, 107 | }, 108 | } 109 | 110 | func TestAxpyUnitary(t *testing.T) { 111 | const xGdVal, yGdVal = -1, 0.5 112 | for i, test := range axpyTests { 113 | for _, align := range align2 { 114 | prefix := fmt.Sprintf("Test %v (x:%v y:%v)", i, align.x, align.y) 115 | xgLn, ygLn := 4+align.x, 4+align.y 116 | xg, yg := guardVector(test.x, xGdVal, xgLn), guardVector(test.y, yGdVal, ygLn) 117 | x, y := xg[xgLn:len(xg)-xgLn], yg[ygLn:len(yg)-ygLn] 118 | AxpyUnitary(test.alpha, x, y) 119 | for i := range test.want { 120 | if !same(y[i], test.want[i]) { 121 | t.Errorf(msgVal, prefix, i, y[i], test.want[i]) 122 | } 123 | } 124 | if !isValidGuard(xg, xGdVal, xgLn) { 125 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 126 | } 127 | if !isValidGuard(yg, yGdVal, ygLn) { 128 | t.Errorf(msgGuard, prefix, "y", yg[:ygLn], yg[len(yg)-ygLn:]) 129 | } 130 | if !equalStrided(test.x, x, 1) { 131 | t.Errorf("%v: modified read-only x argument", prefix) 132 | } 133 | } 134 | } 135 | } 136 | 137 | func TestAxpyUnitaryTo(t *testing.T) { 138 | const dstGdVal, xGdVal, yGdVal = 1, -1, 0.5 139 | for i, test := range axpyTests { 140 | for _, align := range align3 { 141 | prefix := fmt.Sprintf("Test %v (x:%v y:%v dst:%v)", i, align.x, align.y, align.dst) 142 | 143 | dgLn, xgLn, ygLn := 4+align.dst, 4+align.x, 4+align.y 144 | dstOrig := make([]float64, len(test.x)) 145 
| xg, yg := guardVector(test.x, xGdVal, xgLn), guardVector(test.y, yGdVal, ygLn) 146 | dstg := guardVector(dstOrig, dstGdVal, dgLn) 147 | x, y := xg[xgLn:len(xg)-xgLn], yg[ygLn:len(yg)-ygLn] 148 | dst := dstg[dgLn : len(dstg)-dgLn] 149 | 150 | AxpyUnitaryTo(dst, test.alpha, x, y) 151 | for i := range test.want { 152 | if !same(dst[i], test.want[i]) { 153 | t.Errorf(msgVal, prefix, i, dst[i], test.want[i]) 154 | } 155 | } 156 | if !isValidGuard(xg, xGdVal, xgLn) { 157 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 158 | } 159 | if !isValidGuard(yg, yGdVal, ygLn) { 160 | t.Errorf(msgGuard, prefix, "y", yg[:ygLn], yg[len(yg)-ygLn:]) 161 | } 162 | if !isValidGuard(dstg, dstGdVal, dgLn) { 163 | t.Errorf(msgGuard, prefix, "dst", dstg[:dgLn], dstg[len(dstg)-dgLn:]) 164 | } 165 | if !equalStrided(test.x, x, 1) { 166 | t.Errorf("%v: modified read-only x argument", prefix) 167 | } 168 | if !equalStrided(test.y, y, 1) { 169 | t.Errorf("%v: modified read-only y argument", prefix) 170 | } 171 | } 172 | } 173 | } 174 | 175 | func TestAxpyInc(t *testing.T) { 176 | const xGdVal, yGdVal = -1, 0.5 177 | gdLn := 4 178 | for i, test := range axpyTests { 179 | n := len(test.x) 180 | for _, inc := range newIncSet(-7, -4, -3, -2, -1, 1, 2, 3, 4, 7) { 181 | var ix, iy int 182 | if inc.x < 0 { 183 | ix = (-n + 1) * inc.x 184 | } 185 | if inc.y < 0 { 186 | iy = (-n + 1) * inc.y 187 | } 188 | prefix := fmt.Sprintf("test %v, inc.x = %v, inc.y = %v", i, inc.x, inc.y) 189 | xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) 190 | yg := guardIncVector(test.y, yGdVal, inc.y, gdLn) 191 | x, y := xg[gdLn:len(xg)-gdLn], yg[gdLn:len(yg)-gdLn] 192 | 193 | AxpyInc(test.alpha, x, y, uintptr(n), 194 | uintptr(inc.x), uintptr(inc.y), uintptr(ix), uintptr(iy)) 195 | 196 | want := test.want 197 | if inc.x*inc.y < 0 { 198 | want = test.wantRev 199 | } 200 | if inc.y < 0 { 201 | inc.y = -inc.y 202 | } 203 | for i := range want { 204 | if !same(y[i*inc.y], want[i]) { 205 | t.Errorf(msgVal, prefix, i, y[iy+i*inc.y], want[i]) 206 | } 207 | } 208 | if !equalStrided(test.x, x, inc.x) { 209 | t.Errorf("%v: modified read-only x argument", prefix) 210 | } 211 | checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) 212 | checkValidIncGuard(t, yg, yGdVal, inc.y, gdLn) 213 | } 214 | } 215 | } 216 | 217 | func TestAxpyIncTo(t *testing.T) { 218 | const dstGdVal, xGdVal, yGdVal = 1, -1, 0.5 219 | var want []float64 220 | gdLn := 4 221 | for i, test := range axpyTests { 222 | n := len(test.x) 223 | for _, inc := range newIncToSet(-7, -4, -3, -2, -1, 1, 2, 3, 4, 7) { 224 | var ix, iy, idst uintptr 225 | if inc.x < 0 { 226 | ix = uintptr((-n + 1) * inc.x) 227 | } 228 | if inc.y < 0 { 229 | iy = uintptr((-n + 1) * inc.y) 230 | } 231 | if inc.dst < 0 { 232 | idst = uintptr((-n + 1) * inc.dst) 233 | } 234 | 235 | prefix := fmt.Sprintf("Test %v: (x: %v, y: %v, dst:%v)", i, inc.x, inc.y, inc.dst) 236 | dstOrig := make([]float64, len(test.want)) 237 | xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) 238 | yg := guardIncVector(test.y, yGdVal, inc.y, gdLn) 239 | dstg := guardIncVector(dstOrig, dstGdVal, inc.dst, gdLn) 240 | x, y := xg[gdLn:len(xg)-gdLn], yg[gdLn:len(yg)-gdLn] 241 | dst := dstg[gdLn : len(dstg)-gdLn] 242 | 243 | AxpyIncTo(dst, uintptr(inc.dst), idst, 244 | test.alpha, x, y, uintptr(n), 245 | uintptr(inc.x), uintptr(inc.y), ix, iy) 246 | want = test.want 247 | if inc.x*inc.y < 0 { 248 | want = test.wantRev 249 | } 250 | var iW, incW int = 0, 1 251 | if inc.y*inc.dst < 0 { 252 | iW, incW = len(want)-1, -1 253 | } 254 | if inc.dst < 0 
{ 255 | inc.dst = -inc.dst 256 | } 257 | for i := range want { 258 | if !same(dst[i*inc.dst], want[iW+i*incW]) { 259 | t.Errorf(msgVal, prefix, i, dst[i*inc.dst], want[iW+i*incW]) 260 | } 261 | } 262 | 263 | checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) 264 | checkValidIncGuard(t, yg, yGdVal, inc.y, gdLn) 265 | checkValidIncGuard(t, dstg, dstGdVal, inc.dst, gdLn) 266 | if !equalStrided(test.x, x, inc.x) { 267 | t.Errorf("%v: modified read-only x argument", prefix) 268 | } 269 | if !equalStrided(test.y, y, inc.y) { 270 | t.Errorf("%v: modified read-only y argument", prefix) 271 | } 272 | } 273 | } 274 | } 275 | -------------------------------------------------------------------------------- /asm/f64/axpyinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
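The ix/iy values computed in the tests above, and consumed by the AxpyInc kernel that follows, use the BLAS start-index convention: a negative increment begins at the far end of the vector. A small sketch of that rule (hypothetical helper):

```go
package f64sketch

// startIndex returns the first logical index for a vector of n elements
// traversed with increment inc; a negative inc starts at the end.
func startIndex(n, inc int) int {
	if inc < 0 {
		return (-n + 1) * inc
	}
	return 0
}

// startIndex(4, -1) == 3, so traversal visits x[3], x[2], x[1], x[0].
```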
36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DI 43 | #define DST_PTR DI 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define INC_X R8 48 | #define INCx3_X R11 49 | #define INC_Y R9 50 | #define INCx3_Y R12 51 | #define INC_DST R9 52 | #define INCx3_DST R12 53 | #define ALPHA X0 54 | #define ALPHA_2 X1 55 | 56 | // func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 57 | TEXT ·AxpyInc(SB), NOSPLIT, $0 58 | MOVQ x_base+8(FP), X_PTR // X_PTR = &x 59 | MOVQ y_base+32(FP), Y_PTR // Y_PTR = &y 60 | MOVQ n+56(FP), LEN // LEN = n 61 | CMPQ LEN, $0 // if LEN == 0 { return } 62 | JE end 63 | 64 | MOVQ ix+80(FP), INC_X 65 | MOVQ iy+88(FP), INC_Y 66 | LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) 67 | LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) 68 | MOVQ Y_PTR, DST_PTR // DST_PTR = Y_PTR // Write pointer 69 | 70 | MOVQ incX+64(FP), INC_X // INC_X = incX * sizeof(float64) 71 | SHLQ $3, INC_X 72 | MOVQ incY+72(FP), INC_Y // INC_Y = incY * sizeof(float64) 73 | SHLQ $3, INC_Y 74 | 75 | MOVSD alpha+0(FP), ALPHA // ALPHA = alpha 76 | MOVQ LEN, TAIL 77 | ANDQ $3, TAIL // TAIL = n % 4 78 | SHRQ $2, LEN // LEN = floor( n / 4 ) 79 | JZ tail_start // if LEN == 0 { goto tail_start } 80 | 81 | MOVAPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 82 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 83 | LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 84 | 85 | loop: // do { // y[i] += alpha * x[i] unrolled 4x. 86 | MOVSD (X_PTR), X2 // X_i = x[i] 87 | MOVSD (X_PTR)(INC_X*1), X3 88 | MOVSD (X_PTR)(INC_X*2), X4 89 | MOVSD (X_PTR)(INCx3_X*1), X5 90 | 91 | MULSD ALPHA, X2 // X_i *= a 92 | MULSD ALPHA_2, X3 93 | MULSD ALPHA, X4 94 | MULSD ALPHA_2, X5 95 | 96 | ADDSD (Y_PTR), X2 // X_i += y[i] 97 | ADDSD (Y_PTR)(INC_Y*1), X3 98 | ADDSD (Y_PTR)(INC_Y*2), X4 99 | ADDSD (Y_PTR)(INCx3_Y*1), X5 100 | 101 | MOVSD X2, (DST_PTR) // y[i] = X_i 102 | MOVSD X3, (DST_PTR)(INC_DST*1) 103 | MOVSD X4, (DST_PTR)(INC_DST*2) 104 | MOVSD X5, (DST_PTR)(INCx3_DST*1) 105 | 106 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 107 | LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) 108 | DECQ LEN 109 | JNZ loop // } while --LEN > 0 110 | CMPQ TAIL, $0 // if TAIL == 0 { return } 111 | JE end 112 | 113 | tail_start: // Reset Loop registers 114 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 115 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 116 | JZ tail_one 117 | 118 | tail_two: 119 | MOVSD (X_PTR), X2 // X_i = x[i] 120 | MOVSD (X_PTR)(INC_X*1), X3 121 | MULSD ALPHA, X2 // X_i *= a 122 | MULSD ALPHA, X3 123 | ADDSD (Y_PTR), X2 // X_i += y[i] 124 | ADDSD (Y_PTR)(INC_Y*1), X3 125 | MOVSD X2, (DST_PTR) // y[i] = X_i 126 | MOVSD X3, (DST_PTR)(INC_DST*1) 127 | 128 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 129 | LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) 130 | 131 | ANDQ $1, TAIL 132 | JZ end // if TAIL == 0 { goto end } 133 | 134 | tail_one: 135 | // y[i] += alpha * x[i] for the last n % 4 iterations. 136 | MOVSD (X_PTR), X2 // X2 = x[i] 137 | MULSD ALPHA, X2 // X2 *= a 138 | ADDSD (Y_PTR), X2 // X2 += y[i] 139 | MOVSD X2, (DST_PTR) // y[i] = X2 140 | 141 | end: 142 | RET 143 | -------------------------------------------------------------------------------- /asm/f64/axpyincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 
2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
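The frame offsets in the kernel below follow from Go's stack-based calling convention: every []float64 argument occupies three 8-byte words (base pointer, len, cap). Sketching that layout for AxpyIncTo's signature (the struct is illustrative, not a declared type in this package):

```go
package f64sketch

// sliceHeader is the 24-byte shape of a slice argument on the frame.
// It is why, for AxpyIncTo(dst, incDst, idst, alpha, x, y, ...), the
// assembly reads dst_base at +0(FP), incDst at +24(FP), idst at
// +32(FP), alpha at +40(FP), x_base at +48(FP), and y_base at +72(FP).
type sliceHeader struct {
	base uintptr // +0
	len  int     // +8
	cap  int     // +16
}
```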
36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DI 43 | #define DST_PTR DX 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define INC_X R8 48 | #define INCx3_X R11 49 | #define INC_Y R9 50 | #define INCx3_Y R12 51 | #define INC_DST R10 52 | #define INCx3_DST R13 53 | #define ALPHA X0 54 | #define ALPHA_2 X1 55 | 56 | // func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 57 | TEXT ·AxpyIncTo(SB), NOSPLIT, $0 58 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst 59 | MOVQ x_base+48(FP), X_PTR // X_PTR := &x 60 | MOVQ y_base+72(FP), Y_PTR // Y_PTR := &y 61 | MOVQ n+96(FP), LEN // LEN := n 62 | CMPQ LEN, $0 // if LEN == 0 { return } 63 | JE end 64 | 65 | MOVQ ix+120(FP), INC_X 66 | LEAQ (X_PTR)(INC_X*8), X_PTR // X_PTR = &(x[ix]) 67 | MOVQ iy+128(FP), INC_Y 68 | LEAQ (Y_PTR)(INC_Y*8), Y_PTR // Y_PTR = &(y[iy]) 69 | MOVQ idst+32(FP), INC_DST 70 | LEAQ (DST_PTR)(INC_DST*8), DST_PTR // DST_PTR = &(dst[idst]) 71 | 72 | MOVQ incX+104(FP), INC_X // INC_X = incX * sizeof(float64) 73 | SHLQ $3, INC_X 74 | MOVQ incY+112(FP), INC_Y // INC_Y = incY * sizeof(float64) 75 | SHLQ $3, INC_Y 76 | MOVQ incDst+24(FP), INC_DST // INC_DST = incDst * sizeof(float64) 77 | SHLQ $3, INC_DST 78 | MOVSD alpha+40(FP), ALPHA 79 | 80 | MOVQ LEN, TAIL 81 | ANDQ $3, TAIL // TAIL = n % 4 82 | SHRQ $2, LEN // LEN = floor( n / 4 ) 83 | JZ tail_start // if LEN == 0 { goto tail_start } 84 | 85 | MOVSD ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 86 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 87 | LEAQ (INC_Y)(INC_Y*2), INCx3_Y // INCx3_Y = INC_Y * 3 88 | LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 89 | 90 | loop: // do { // dst[i] = alpha*x[i] + y[i] unrolled 4x.
91 | MOVSD (X_PTR), X2 // X_i = x[i] 92 | MOVSD (X_PTR)(INC_X*1), X3 93 | MOVSD (X_PTR)(INC_X*2), X4 94 | MOVSD (X_PTR)(INCx3_X*1), X5 95 | 96 | MULSD ALPHA, X2 // X_i *= a 97 | MULSD ALPHA_2, X3 98 | MULSD ALPHA, X4 99 | MULSD ALPHA_2, X5 100 | 101 | ADDSD (Y_PTR), X2 // X_i += y[i] 102 | ADDSD (Y_PTR)(INC_Y*1), X3 103 | ADDSD (Y_PTR)(INC_Y*2), X4 104 | ADDSD (Y_PTR)(INCx3_Y*1), X5 105 | 106 | MOVSD X2, (DST_PTR) // y[i] = X_i 107 | MOVSD X3, (DST_PTR)(INC_DST*1) 108 | MOVSD X4, (DST_PTR)(INC_DST*2) 109 | MOVSD X5, (DST_PTR)(INCx3_DST*1) 110 | 111 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 112 | LEAQ (Y_PTR)(INC_Y*4), Y_PTR // Y_PTR = &(Y_PTR[incY*4]) 113 | LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4] 114 | DECQ LEN 115 | JNZ loop // } while --LEN > 0 116 | CMPQ TAIL, $0 // if TAIL == 0 { return } 117 | JE end 118 | 119 | tail_start: // Reset Loop registers 120 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 121 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 122 | JZ tail_one 123 | 124 | tail_two: 125 | MOVSD (X_PTR), X2 // X_i = x[i] 126 | MOVSD (X_PTR)(INC_X*1), X3 127 | MULSD ALPHA, X2 // X_i *= a 128 | MULSD ALPHA, X3 129 | ADDSD (Y_PTR), X2 // X_i += y[i] 130 | ADDSD (Y_PTR)(INC_Y*1), X3 131 | MOVSD X2, (DST_PTR) // y[i] = X_i 132 | MOVSD X3, (DST_PTR)(INC_DST*1) 133 | 134 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 135 | LEAQ (Y_PTR)(INC_Y*2), Y_PTR // Y_PTR = &(Y_PTR[incY*2]) 136 | LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incY*2] 137 | 138 | ANDQ $1, TAIL 139 | JZ end // if TAIL == 0 { goto end } 140 | 141 | tail_one: 142 | MOVSD (X_PTR), X2 // X2 = x[i] 143 | MULSD ALPHA, X2 // X2 *= a 144 | ADDSD (Y_PTR), X2 // X2 += y[i] 145 | MOVSD X2, (DST_PTR) // y[i] = X2 146 | 147 | end: 148 | RET 149 | -------------------------------------------------------------------------------- /asm/f64/axpyunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DI 43 | #define DST_PTR DI 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define ALPHA X0 48 | #define ALPHA_2 X1 49 | 50 | // func AxpyUnitary(alpha float64, x, y []float64) 51 | TEXT ·AxpyUnitary(SB), NOSPLIT, $0 52 | MOVQ x_base+8(FP), X_PTR // X_PTR := &x 53 | MOVQ y_base+32(FP), Y_PTR // Y_PTR := &y 54 | MOVQ x_len+16(FP), LEN // LEN = min( len(x), len(y) ) 55 | CMPQ y_len+40(FP), LEN 56 | CMOVQLE y_len+40(FP), LEN 57 | CMPQ LEN, $0 // if LEN == 0 { return } 58 | JE end 59 | XORQ IDX, IDX 60 | MOVSD alpha+0(FP), ALPHA // ALPHA := { alpha, alpha } 61 | SHUFPD $0, ALPHA, ALPHA 62 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining 63 | MOVQ Y_PTR, TAIL // Check memory alignment 64 | ANDQ $15, TAIL // TAIL = &y % 16 65 | JZ no_trim // if TAIL == 0 { goto no_trim } 66 | 67 | // Align on 16-byte boundary 68 | MOVSD (X_PTR), X2 // X2 := x[0] 69 | MULSD ALPHA, X2 // X2 *= a 70 | ADDSD (Y_PTR), X2 // X2 += y[0] 71 | MOVSD X2, (DST_PTR) // y[0] = X2 72 | INCQ IDX // i++ 73 | DECQ LEN // LEN-- 74 | JZ end // if LEN == 0 { return } 75 | 76 | no_trim: 77 | MOVQ LEN, TAIL 78 | ANDQ $7, TAIL // TAIL := n % 8 79 | SHRQ $3, LEN // LEN = floor( n / 8 ) 80 | JZ tail_start // if LEN == 0 { goto tail2_start } 81 | 82 | loop: // do { 83 | // y[i] += alpha * x[i] unrolled 8x. 
84 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 85 | MOVUPS 16(X_PTR)(IDX*8), X3 86 | MOVUPS 32(X_PTR)(IDX*8), X4 87 | MOVUPS 48(X_PTR)(IDX*8), X5 88 | 89 | MULPD ALPHA, X2 // X_i *= a 90 | MULPD ALPHA_2, X3 91 | MULPD ALPHA, X4 92 | MULPD ALPHA_2, X5 93 | 94 | ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] 95 | ADDPD 16(Y_PTR)(IDX*8), X3 96 | ADDPD 32(Y_PTR)(IDX*8), X4 97 | ADDPD 48(Y_PTR)(IDX*8), X5 98 | 99 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i 100 | MOVUPS X3, 16(DST_PTR)(IDX*8) 101 | MOVUPS X4, 32(DST_PTR)(IDX*8) 102 | MOVUPS X5, 48(DST_PTR)(IDX*8) 103 | 104 | ADDQ $8, IDX // i += 8 105 | DECQ LEN 106 | JNZ loop // } while --LEN > 0 107 | CMPQ TAIL, $0 // if TAIL == 0 { return } 108 | JE end 109 | 110 | tail_start: // Reset loop registers 111 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 112 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 113 | JZ tail_one // if TAIL == 0 { goto tail } 114 | 115 | tail_two: // do { 116 | MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] 117 | MULPD ALPHA, X2 // X2 *= a 118 | ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] 119 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 120 | ADDQ $2, IDX // i += 2 121 | DECQ LEN 122 | JNZ tail_two // } while --LEN > 0 123 | 124 | ANDQ $1, TAIL 125 | JZ end // if TAIL == 0 { goto end } 126 | 127 | tail_one: 128 | MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] 129 | MULSD ALPHA, X2 // X2 *= a 130 | ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] 131 | MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 132 | 133 | end: 134 | RET 135 | -------------------------------------------------------------------------------- /asm/f64/axpyunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define Y_PTR DX 43 | #define DST_PTR DI 44 | #define IDX AX 45 | #define LEN CX 46 | #define TAIL BX 47 | #define ALPHA X0 48 | #define ALPHA_2 X1 49 | 50 | // func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) 51 | TEXT ·AxpyUnitaryTo(SB), NOSPLIT, $0 52 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR := &dst 53 | MOVQ x_base+32(FP), X_PTR // X_PTR := &x 54 | MOVQ y_base+56(FP), Y_PTR // Y_PTR := &y 55 | MOVQ x_len+40(FP), LEN // LEN = min( len(x), len(y), len(dst) ) 56 | CMPQ y_len+64(FP), LEN 57 | CMOVQLE y_len+64(FP), LEN 58 | CMPQ dst_len+8(FP), LEN 59 | CMOVQLE dst_len+8(FP), LEN 60 | 61 | CMPQ LEN, $0 62 | JE end // if LEN == 0 { return } 63 | 64 | XORQ IDX, IDX // IDX = 0 65 | MOVSD alpha+24(FP), ALPHA 66 | SHUFPD $0, ALPHA, ALPHA // ALPHA := { alpha, alpha } 67 | MOVQ Y_PTR, TAIL // Check memory alignment 68 | ANDQ $15, TAIL // TAIL = &y % 16 69 | JZ no_trim // if TAIL == 0 { goto no_trim } 70 | 71 | // Align on 16-byte boundary 72 | MOVSD (X_PTR), X2 // X2 := x[0] 73 | MULSD ALPHA, X2 // X2 *= a 74 | ADDSD (Y_PTR), X2 // X2 += y[0] 75 | MOVSD X2, (DST_PTR) // y[0] = X2 76 | INCQ IDX // i++ 77 | DECQ LEN // LEN-- 78 | JZ end // if LEN == 0 { return } 79 | 80 | no_trim: 81 | MOVQ LEN, TAIL 82 | ANDQ $7, TAIL // TAIL := n % 8 83 | SHRQ $3, LEN // LEN = floor( n / 8 ) 84 | JZ tail_start // if LEN == 0 { goto tail_start } 85 | 86 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 := ALPHA for pipelining 87 | 88 | loop: // do { 89 | // y[i] += alpha * x[i] unrolled 8x. 
90 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 91 | MOVUPS 16(X_PTR)(IDX*8), X3 92 | MOVUPS 32(X_PTR)(IDX*8), X4 93 | MOVUPS 48(X_PTR)(IDX*8), X5 94 | 95 | MULPD ALPHA, X2 // X_i *= alpha 96 | MULPD ALPHA_2, X3 97 | MULPD ALPHA, X4 98 | MULPD ALPHA_2, X5 99 | 100 | ADDPD (Y_PTR)(IDX*8), X2 // X_i += y[i] 101 | ADDPD 16(Y_PTR)(IDX*8), X3 102 | ADDPD 32(Y_PTR)(IDX*8), X4 103 | ADDPD 48(Y_PTR)(IDX*8), X5 104 | 105 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X_i 106 | MOVUPS X3, 16(DST_PTR)(IDX*8) 107 | MOVUPS X4, 32(DST_PTR)(IDX*8) 108 | MOVUPS X5, 48(DST_PTR)(IDX*8) 109 | 110 | ADDQ $8, IDX // i += 8 111 | DECQ LEN 112 | JNZ loop // } while --LEN > 0 113 | CMPQ TAIL, $0 // if TAIL == 0 { return } 114 | JE end 115 | 116 | tail_start: // Reset loop registers 117 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 118 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 119 | JZ tail_one // if LEN == 0 { goto tail } 120 | 121 | tail_two: // do { 122 | MOVUPS (X_PTR)(IDX*8), X2 // X2 = x[i] 123 | MULPD ALPHA, X2 // X2 *= alpha 124 | ADDPD (Y_PTR)(IDX*8), X2 // X2 += y[i] 125 | MOVUPS X2, (DST_PTR)(IDX*8) // y[i] = X2 126 | ADDQ $2, IDX // i += 2 127 | DECQ LEN 128 | JNZ tail_two // } while --LEN > 0 129 | 130 | ANDQ $1, TAIL 131 | JZ end // if TAIL == 0 { goto end } 132 | 133 | tail_one: 134 | MOVSD (X_PTR)(IDX*8), X2 // X2 = x[i] 135 | MULSD ALPHA, X2 // X2 *= a 136 | ADDSD (Y_PTR)(IDX*8), X2 // X2 += y[i] 137 | MOVSD X2, (DST_PTR)(IDX*8) // y[i] = X2 138 | 139 | end: 140 | RET 141 | -------------------------------------------------------------------------------- /asm/f64/benchAxpy_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
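AxpyUnitary and AxpyUnitaryTo share the tail scheme just shown: the n%8 leftovers drain as floor(tail/2) two-lane steps, then at most one scalar step. In Go terms (sketch; i and tail are assumed already produced by the main loop):

```go
package f64sketch

// drainTail mirrors the tail_two/tail_one labels for dst = alpha*x + y.
func drainTail(dst []float64, alpha float64, x, y []float64, i, tail int) {
	for k := 0; k < tail/2; k++ { // tail_two: one 2-wide step
		dst[i] = alpha*x[i] + y[i]
		dst[i+1] = alpha*x[i+1] + y[i+1]
		i += 2
	}
	if tail&1 == 1 { // tail_one: final odd element
		dst[i] = alpha*x[i] + y[i]
	}
}
```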
4 | 5 | // +build go1.7 6 | 7 | package f64 8 | 9 | import ( 10 | "fmt" 11 | "testing" 12 | ) 13 | 14 | const ( 15 | testLen = 1e5 16 | ) 17 | 18 | var ( 19 | a = 2.0 20 | x = make([]float64, testLen) 21 | y = make([]float64, testLen) 22 | z = make([]float64, testLen) 23 | ) 24 | 25 | func init() { 26 | for n := range x { 27 | x[n] = float64(n) 28 | y[n] = float64(n) 29 | } 30 | } 31 | 32 | func BenchmarkAxpyUnitary(t *testing.B) { 33 | naiveaxpyu := func(a float64, x, y []float64) { 34 | for i, v := range x { 35 | y[i] += a * v 36 | } 37 | } 38 | tests := []struct { 39 | name string 40 | f func(a float64, x, y []float64) 41 | }{ 42 | {"AxpyUnitary", AxpyUnitary}, 43 | {"NaiveAxpyUnitary", naiveaxpyu}, 44 | } 45 | for _, test := range tests { 46 | for _, ln := range []uintptr{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5} { 47 | t.Run(fmt.Sprintf("%s-%d", test.name, ln), func(b *testing.B) { 48 | b.SetBytes(int64(64 * ln)) 49 | x, y := x[:ln], y[:ln] 50 | b.ResetTimer() 51 | for i := 0; i < b.N; i++ { 52 | test.f(a, x, y) 53 | } 54 | }) 55 | } 56 | } 57 | } 58 | 59 | func BenchmarkAxpyUnitaryTo(t *testing.B) { 60 | naiveaxpyut := func(d []float64, a float64, x, y []float64) { 61 | for i, v := range x { 62 | d[i] = y[i] + a*v 63 | } 64 | } 65 | tests := []struct { 66 | name string 67 | f func(z []float64, a float64, x, y []float64) 68 | }{ 69 | {"AxpyUnitaryTo", AxpyUnitaryTo}, 70 | {"NaiveAxpyUnitaryTo", naiveaxpyut}, 71 | } 72 | for _, test := range tests { 73 | for _, ln := range []uintptr{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4, 1e5} { 74 | t.Run(fmt.Sprintf("%s-%d", test.name, ln), func(b *testing.B) { 75 | b.SetBytes(int64(64 * ln)) 76 | x, y, z := x[:ln], y[:ln], z[:ln] 77 | b.ResetTimer() 78 | for i := 0; i < b.N; i++ { 79 | test.f(z, a, x, y) 80 | } 81 | }) 82 | } 83 | } 84 | } 85 | 86 | var incsAxpy = []struct { 87 | len uintptr 88 | inc []int 89 | }{ 90 | {1, []int{1}}, 91 | {2, []int{1, 2, 4, 10}}, 92 | {3, []int{1, 2, 4, 10}}, 93 | {4, []int{1, 2, 4, 10}}, 94 | {5, []int{1, 2, 4, 10}}, 95 | {10, []int{1, 2, 4, 10}}, 96 | {500, []int{1, 2, 4, 10}}, 97 | {1e3, []int{1, 2, 4, 10}}, 98 | {1e4, []int{1, 2, 4, 10, -1, -2, -4, -10}}, 99 | } 100 | 101 | func BenchmarkAxpyInc(t *testing.B) { 102 | naiveaxpyinc := func(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 103 | for i := 0; i < int(n); i++ { 104 | y[iy] += alpha * x[ix] 105 | ix += incX 106 | iy += incY 107 | } 108 | } 109 | tests := []struct { 110 | name string 111 | f func(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 112 | }{ 113 | {"AxpyInc", AxpyInc}, 114 | {"NaiveAxpyInc", naiveaxpyinc}, 115 | } 116 | for _, test := range tests { 117 | for _, tt := range incsAxpy { 118 | for _, inc := range tt.inc { 119 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", test.name, tt.len, inc), func(b *testing.B) { 120 | b.SetBytes(int64(64 * tt.len)) 121 | var idx, tstInc uintptr = 0, uintptr(inc) 122 | if inc < 0 { 123 | idx = uintptr((-int(tt.len) + 1) * inc) 124 | } 125 | for i := 0; i < b.N; i++ { 126 | test.f(a, x, y, uintptr(tt.len), tstInc, tstInc, idx, idx) 127 | } 128 | }) 129 | } 130 | } 131 | } 132 | } 133 | 134 | func BenchmarkAxpyIncTo(t *testing.B) { 135 | naiveaxpyincto := func(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) { 136 | for i := 0; i < int(n); i++ { 137 | dst[idst] = alpha*x[ix] + y[iy] 138 | ix += incX 139 | iy += incY 140 | idst += incDst 141 | } 142 | } 143 | tests := []struct { 144 | name string 145 | f func(dst []float64, incDst, 
idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 146 | }{ 147 | {"AxpyIncTo", AxpyIncTo}, 148 | {"NaiveAxpyIncTo", naiveaxpyincto}, 149 | } 150 | for _, test := range tests { 151 | for _, tt := range incsAxpy { 152 | for _, inc := range tt.inc { 153 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", test.name, tt.len, inc), func(b *testing.B) { 154 | b.SetBytes(int64(64 * tt.len)) 155 | var idx, tstInc uintptr = 0, uintptr(inc) 156 | if inc < 0 { 157 | idx = uintptr((-int(tt.len) + 1) * inc) 158 | } 159 | for i := 0; i < b.N; i++ { 160 | test.f(z, tstInc, idx, a, x, y, uintptr(tt.len), 161 | tstInc, tstInc, idx, idx) 162 | } 163 | }) 164 | } 165 | } 166 | } 167 | } 168 | -------------------------------------------------------------------------------- /asm/f64/benchScal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build go1.7 6 | 7 | package f64 8 | 9 | import ( 10 | "fmt" 11 | "testing" 12 | ) 13 | 14 | var uniScal = []int64{1, 3, 10, 30, 1e2, 3e2, 1e3, 3e3, 1e4, 3e4} 15 | 16 | func BenchmarkScalUnitary(t *testing.B) { 17 | tstName := "ScalUnitary" 18 | for _, ln := range uniScal { 19 | t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) { 20 | b.SetBytes(64 * ln) 21 | x := x[:ln] 22 | b.ResetTimer() 23 | for i := 0; i < b.N; i++ { 24 | ScalUnitary(a, x) 25 | } 26 | }) 27 | } 28 | } 29 | 30 | func BenchmarkScalUnitaryTo(t *testing.B) { 31 | tstName := "ScalUnitaryTo" 32 | for _, ln := range uniScal { 33 | t.Run(fmt.Sprintf("%s-%d", tstName, ln), func(b *testing.B) { 34 | b.SetBytes(int64(64 * ln)) 35 | x, y := x[:ln], y[:ln] 36 | b.ResetTimer() 37 | for i := 0; i < b.N; i++ { 38 | ScalUnitaryTo(y, a, x) 39 | } 40 | }) 41 | } 42 | } 43 | 44 | var incScal = []struct { 45 | len uintptr 46 | inc []int 47 | }{ 48 | {1, []int{1}}, 49 | {3, []int{1, 2, 4, 10}}, 50 | {10, []int{1, 2, 4, 10}}, 51 | {30, []int{1, 2, 4, 10}}, 52 | {1e2, []int{1, 2, 4, 10}}, 53 | {3e2, []int{1, 2, 4, 10}}, 54 | {1e3, []int{1, 2, 4, 10}}, 55 | {3e3, []int{1, 2, 4, 10}}, 56 | {1e4, []int{1, 2, 4, 10}}, 57 | } 58 | 59 | func BenchmarkScalInc(t *testing.B) { 60 | tstName := "ScalInc" 61 | for _, tt := range incScal { 62 | for _, inc := range tt.inc { 63 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) { 64 | b.SetBytes(int64(64 * tt.len)) 65 | tstInc := uintptr(inc) 66 | for i := 0; i < b.N; i++ { 67 | ScalInc(a, x, uintptr(tt.len), tstInc) 68 | } 69 | }) 70 | } 71 | } 72 | } 73 | 74 | func BenchmarkScalIncTo(t *testing.B) { 75 | tstName := "ScalIncTo" 76 | for _, tt := range incScal { 77 | for _, inc := range tt.inc { 78 | t.Run(fmt.Sprintf("%s-%d-inc(%d)", tstName, tt.len, inc), func(b *testing.B) { 79 | b.SetBytes(int64(64 * tt.len)) 80 | tstInc := uintptr(inc) 81 | for i := 0; i < b.N; i++ { 82 | ScalIncTo(z, tstInc, a, x, uintptr(tt.len), tstInc) 83 | } 84 | }) 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /asm/f64/cumprod_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 
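Both benchmark files carry the go1.7 build tag because they rely on testing.B.Run sub-benchmarks keyed by length and stride. A minimal instance of the same pattern (hypothetical benchmark; the 8*n byte count assumes one float64 processed per element):

```go
package f64

import (
	"fmt"
	"testing"
)

func BenchmarkScalExample(b *testing.B) {
	for _, n := range []int{10, 1000} {
		b.Run(fmt.Sprintf("n=%d", n), func(b *testing.B) {
			x := make([]float64, n)
			b.SetBytes(int64(8 * n)) // report MB/s per sub-benchmark
			for i := 0; i < b.N; i++ {
				ScalUnitary(2, x)
			}
		})
	}
}
```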
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | TEXT ·CumProd(SB), NOSPLIT, $0 10 | MOVQ dst_base+0(FP), DI // DI = &dst 11 | MOVQ dst_len+8(FP), CX // CX = len(dst) 12 | MOVQ s_base+24(FP), SI // SI = &s 13 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 14 | CMOVQLE s_len+32(FP), CX 15 | MOVQ CX, ret_len+56(FP) // len(ret) = CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE cp_end 18 | XORQ AX, AX // i = 0 19 | 20 | MOVSD (SI), X5 // p_prod = { s[0], s[0] } 21 | SHUFPD $0, X5, X5 22 | MOVSD X5, (DI) // dst[0] = s[0] 23 | INCQ AX // ++i 24 | DECQ CX // -- CX 25 | JZ cp_end // if CX == 0 { return } 26 | 27 | MOVQ CX, BX 28 | ANDQ $3, BX // BX = CX % 4 29 | SHRQ $2, CX // CX = floor( CX / 4 ) 30 | JZ cp_tail_start // if CX == 0 { goto cp_tail_start } 31 | 32 | cp_loop: // Loop unrolled 4x do { 33 | MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] 34 | MOVUPS 16(SI)(AX*8), X2 35 | MOVAPS X0, X1 // X1 = X0 36 | MOVAPS X2, X3 37 | SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } 38 | SHUFPD $1, X3, X3 39 | MULPD X0, X1 // X1 *= X0 40 | MULPD X2, X3 41 | SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } 42 | SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } 43 | SHUFPD $2, X3, X2 44 | SHUFPD $3, X3, X3 45 | MULPD X5, X0 // X0 *= p_prod 46 | MULPD X1, X5 // p_prod *= X1 47 | MULPD X5, X2 48 | MOVUPS X0, (DI)(AX*8) // dst[i] = X0 49 | MOVUPS X2, 16(DI)(AX*8) 50 | MULPD X3, X5 51 | ADDQ $4, AX // i += 4 52 | LOOP cp_loop // } while --CX > 0 53 | 54 | // if BX == 0 { return } 55 | CMPQ BX, $0 56 | JE cp_end 57 | 58 | cp_tail_start: // Reset loop registers 59 | MOVQ BX, CX // Loop counter: CX = BX 60 | 61 | cp_tail: // do { 62 | MULSD (SI)(AX*8), X5 // p_prod *= s[i] 63 | MOVSD X5, (DI)(AX*8) // dst[i] = p_prod 64 | INCQ AX // ++i 65 | LOOP cp_tail // } while --CX > 0 66 | 67 | cp_end: 68 | MOVQ DI, ret_base+48(FP) // &ret = &dst 69 | MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) 70 | MOVQ SI, ret_cap+64(FP) 71 | RET 72 | -------------------------------------------------------------------------------- /asm/f64/cumsum_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
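CumProd's SHUFPD/MULPD sequence builds within-pair running products and carries the cross-pair product in X5, but the observable semantics are a plain prefix product over min(len(dst), len(s)) elements. In Go (sketch, not the package's registered fallback):

```go
package f64sketch

// cumProd: dst[i] becomes the product of s[0..i]; the clamped prefix
// of dst is returned, aliasing dst as the assembly's ret slot does.
func cumProd(dst, s []float64) []float64 {
	if len(s) < len(dst) {
		dst = dst[:len(s)]
	}
	prod := 1.0
	for i, v := range s[:len(dst)] {
		prod *= v
		dst[i] = prod
	}
	return dst
}
```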
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | TEXT ·CumSum(SB), NOSPLIT, $0 10 | MOVQ dst_base+0(FP), DI // DI = &dst 11 | MOVQ dst_len+8(FP), CX // CX = len(dst) 12 | MOVQ s_base+24(FP), SI // SI = &s 13 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 14 | CMOVQLE s_len+32(FP), CX 15 | MOVQ CX, ret_len+56(FP) // len(ret) = CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE cs_end 18 | XORQ AX, AX // i = 0 19 | PXOR X5, X5 // p_sum = 0 20 | MOVQ CX, BX 21 | ANDQ $3, BX // BX = CX % 4 22 | SHRQ $2, CX // CX = floor( CX / 4 ) 23 | JZ cs_tail_start // if CX == 0 { goto cs_tail_start } 24 | 25 | cs_loop: // Loop unrolled 4x do { 26 | MOVUPS (SI)(AX*8), X0 // X0 = s[i:i+1] 27 | MOVUPS 16(SI)(AX*8), X2 28 | MOVAPS X0, X1 // X1 = X0 29 | MOVAPS X2, X3 30 | SHUFPD $1, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[0] } 31 | SHUFPD $1, X3, X3 32 | ADDPD X0, X1 // X1 += X0 33 | ADDPD X2, X3 34 | SHUFPD $2, X1, X0 // { X0[0], X0[1] } = { X0[0], X1[1] } 35 | SHUFPD $3, X1, X1 // { X1[0], X1[1] } = { X1[1], X1[1] } 36 | SHUFPD $2, X3, X2 37 | SHUFPD $3, X3, X3 38 | ADDPD X5, X0 // X0 += p_sum 39 | ADDPD X1, X5 // p_sum += X1 40 | ADDPD X5, X2 41 | MOVUPS X0, (DI)(AX*8) // dst[i] = X0 42 | MOVUPS X2, 16(DI)(AX*8) 43 | ADDPD X3, X5 44 | ADDQ $4, AX // i += 4 45 | LOOP cs_loop // } while --CX > 0 46 | 47 | // if BX == 0 { return } 48 | CMPQ BX, $0 49 | JE cs_end 50 | 51 | cs_tail_start: // Reset loop registers 52 | MOVQ BX, CX // Loop counter: CX = BX 53 | 54 | cs_tail: // do { 55 | ADDSD (SI)(AX*8), X5 // p_sum += s[i] 56 | MOVSD X5, (DI)(AX*8) // dst[i] = p_sum 57 | INCQ AX // ++i 58 | LOOP cs_tail // } while --CX > 0 59 | 60 | cs_end: 61 | MOVQ DI, ret_base+48(FP) // &ret = &dst 62 | MOVQ dst_cap+16(FP), SI // cap(ret) = cap(dst) 63 | MOVQ SI, ret_cap+64(FP) 64 | RET 65 | -------------------------------------------------------------------------------- /asm/f64/div_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
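The epilogue shared by both cumulative kernels stores dst's pointer, the clamped length, and dst's cap into the return slot, which in Go is just returning a reslice. For CumSum (sketch):

```go
package f64sketch

// cumSum shows the return convention: the result aliases dst[:n] with
// n = min(len(dst), len(s)), matching the ret_base/ret_len/ret_cap stores.
func cumSum(dst, s []float64) []float64 {
	n := len(dst)
	if len(s) < n {
		n = len(s)
	}
	sum := 0.0
	for i := 0; i < n; i++ {
		sum += s[i]
		dst[i] = sum
	}
	return dst[:n]
}
```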
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func Div(dst, s []float64) 10 | TEXT ·Div(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ dst_len+8(FP), CX // CX = len(dst) 13 | MOVQ s_base+24(FP), SI // SI = &s 14 | CMPQ s_len+32(FP), CX // CX = min( CX, len(s) ) 15 | CMOVQLE s_len+32(FP), CX 16 | CMPQ CX, $0 // if CX == 0 { return } 17 | JE div_end 18 | XORQ AX, AX // i = 0 19 | MOVQ SI, BX 20 | ANDQ $15, BX // BX = &s & 15 21 | JZ div_no_trim // if BX == 0 { goto div_no_trim } 22 | 23 | // Align on 16-byte boundary 24 | MOVSD (DI)(AX*8), X0 // X0 = dst[i] 25 | DIVSD (SI)(AX*8), X0 // X0 /= s[i] 26 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 27 | INCQ AX // ++i 28 | DECQ CX // --CX 29 | JZ div_end // if CX == 0 { return } 30 | 31 | div_no_trim: 32 | MOVQ CX, BX 33 | ANDQ $7, BX // BX = len(dst) % 8 34 | SHRQ $3, CX // CX = floor( len(dst) / 8 ) 35 | JZ div_tail_start // if CX == 0 { goto div_tail_start } 36 | 37 | div_loop: // Loop unrolled 8x do { 38 | MOVUPS (DI)(AX*8), X0 // X0 = dst[i:i+1] 39 | MOVUPS 16(DI)(AX*8), X1 40 | MOVUPS 32(DI)(AX*8), X2 41 | MOVUPS 48(DI)(AX*8), X3 42 | DIVPD (SI)(AX*8), X0 // X0 /= s[i:i+1] 43 | DIVPD 16(SI)(AX*8), X1 44 | DIVPD 32(SI)(AX*8), X2 45 | DIVPD 48(SI)(AX*8), X3 46 | MOVUPS X0, (DI)(AX*8) // dst[i] = X0 47 | MOVUPS X1, 16(DI)(AX*8) 48 | MOVUPS X2, 32(DI)(AX*8) 49 | MOVUPS X3, 48(DI)(AX*8) 50 | ADDQ $8, AX // i += 8 51 | LOOP div_loop // } while --CX > 0 52 | CMPQ BX, $0 // if BX == 0 { return } 53 | JE div_end 54 | 55 | div_tail_start: // Reset loop registers 56 | MOVQ BX, CX // Loop counter: CX = BX 57 | 58 | div_tail: // do { 59 | MOVSD (DI)(AX*8), X0 // X0 = dst[i] 60 | DIVSD (SI)(AX*8), X0 // X0 /= s[i] 61 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 62 | INCQ AX // ++i 63 | LOOP div_tail // } while --CX > 0 64 | 65 | div_end: 66 | RET 67 | 68 | -------------------------------------------------------------------------------- /asm/f64/divto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
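Div's one-element peel tests &s rather than &dst: the unrolled body feeds DIVPD its divisor straight from memory, and packed SSE instructions with a memory operand fault unless that operand is 16-byte aligned, while the MOVUPS loads and stores of dst tolerate any alignment. Since float64 data is 8-byte aligned, one scalar element always fixes parity; a sketch of that decision (unsafe used only to inspect the address; s assumed non-empty, as the kernel has already returned on length 0):

```go
package f64sketch

import "unsafe"

// peelCount reports how many scalar iterations are needed before the
// packed loop may use s as an aligned memory operand: 0 or 1.
func peelCount(s []float64) int {
	if uintptr(unsafe.Pointer(&s[0]))&15 != 0 {
		return 1
	}
	return 0
}
```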
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func DivTo(dst, x, y []float64) 10 | TEXT ·DivTo(SB), NOSPLIT, $0 11 | MOVQ dst_base+0(FP), DI // DI = &dst 12 | MOVQ dst_len+8(FP), CX // CX = len(dst) 13 | MOVQ x_base+24(FP), SI // SI = &x 14 | MOVQ y_base+48(FP), DX // DX = &y 15 | CMPQ x_len+32(FP), CX // CX = min( len(dst), len(x), len(y) ) 16 | CMOVQLE x_len+32(FP), CX 17 | CMPQ y_len+56(FP), CX 18 | CMOVQLE y_len+56(FP), CX 19 | MOVQ CX, ret_len+80(FP) // len(ret) = CX 20 | CMPQ CX, $0 // if CX == 0 { return } 21 | JE div_end 22 | XORQ AX, AX // i = 0 23 | MOVQ DX, BX 24 | ANDQ $15, BX // BX = &y & 0xF 25 | JZ div_no_trim // if BX == 0 { goto div_no_trim } 26 | 27 | // Align on 16-byte boundary 28 | MOVSD (SI)(AX*8), X0 // X0 = x[i] 29 | DIVSD (DX)(AX*8), X0 // X0 /= y[i] 30 | MOVSD X0, (DI)(AX*8) // dst[i] = X0 31 | INCQ AX // ++i 32 | DECQ CX // --CX 33 | JZ div_end // if CX == 0 { return } 34 | 35 | div_no_trim: 36 | MOVQ CX, BX 37 | ANDQ $7, BX // BX = len(dst) % 8 38 | SHRQ $3, CX // CX = floor( len(dst) / 8 ) 39 | JZ div_tail_start // if CX == 0 { goto div_tail_start } 40 | 41 | div_loop: // Loop unrolled 8x do { 42 | MOVUPS (SI)(AX*8), X0 // X0 = x[i:i+1] 43 | MOVUPS 16(SI)(AX*8), X1 44 | MOVUPS 32(SI)(AX*8), X2 45 | MOVUPS 48(SI)(AX*8), X3 46 | DIVPD (DX)(AX*8), X0 // X0 /= y[i:i+1] 47 | DIVPD 16(DX)(AX*8), X1 48 | DIVPD 32(DX)(AX*8), X2 49 | DIVPD 48(DX)(AX*8), X3 50 | MOVUPS X0, (DI)(AX*8) // dst[i:i+1] = X0 51 | MOVUPS X1, 16(DI)(AX*8) 52 | MOVUPS X2, 32(DI)(AX*8) 53 | MOVUPS X3, 48(DI)(AX*8) 54 | ADDQ $8, AX // i += 8 55 | LOOP div_loop // } while --CX > 0 56 | CMPQ BX, $0 // if BX == 0 { return } 57 | JE div_end 58 | 59 | div_tail_start: // Reset loop registers 60 | MOVQ BX, CX // Loop counter: CX = BX 61 | 62 | div_tail: // do { 63 | MOVSD (SI)(AX*8), X0 // X0 = x[i] 64 | DIVSD (DX)(AX*8), X0 // X0 /= y[i] 65 | MOVSD X0, (DI)(AX*8) 66 | INCQ AX // ++i 67 | LOOP div_tail // } while --CX > 0 68 | 69 | div_end: 70 | MOVQ DI, ret_base+72(FP) // &ret = &dst 71 | MOVQ dst_cap+16(FP), DI // cap(ret) = cap(dst) 72 | MOVQ DI, ret_cap+88(FP) 73 | RET 74 | -------------------------------------------------------------------------------- /asm/f64/doc.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2017 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // This repository is no longer maintained. 6 | // Development has moved to https://github.com/gonum/gonum. 7 | // 8 | // Package f64 provides float64 vector primitives. 9 | package f64 10 | -------------------------------------------------------------------------------- /asm/f64/dot.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | // DotUnitary is 10 | // for i, v := range x { 11 | // sum += y[i] * v 12 | // } 13 | // return sum 14 | func DotUnitary(x, y []float64) (sum float64) { 15 | for i, v := range x { 16 | sum += y[i] * v 17 | } 18 | return sum 19 | } 20 | 21 | // DotInc is 22 | // for i := 0; i < int(n); i++ { 23 | // sum += y[iy] * x[ix] 24 | // ix += incX 25 | // iy += incY 26 | // } 27 | // return sum 28 | func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) { 29 | for i := 0; i < int(n); i++ { 30 | sum += y[iy] * x[ix] 31 | ix += incX 32 | iy += incY 33 | } 34 | return sum 35 | } 36 | -------------------------------------------------------------------------------- /asm/f64/dot_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | // func DotUnitary(x, y []float64) (sum float64) 42 | // This function assumes len(y) >= len(x). 43 | TEXT ·DotUnitary(SB), NOSPLIT, $0 44 | MOVQ x+0(FP), R8 45 | MOVQ x_len+8(FP), DI // n = len(x) 46 | MOVQ y+24(FP), R9 47 | 48 | MOVSD $(0.0), X7 // sum = 0 49 | MOVSD $(0.0), X8 // sum2 = 0 50 | 51 | MOVQ $0, SI // i = 0 52 | SUBQ $4, DI // n -= 4 53 | JL tail_uni // if n < 0 goto tail_uni 54 | 55 | loop_uni: 56 | // sum += x[i] * y[i] unrolled 4x.
57 | MOVUPD 0(R8)(SI*8), X0 58 | MOVUPD 0(R9)(SI*8), X1 59 | MOVUPD 16(R8)(SI*8), X2 60 | MOVUPD 16(R9)(SI*8), X3 61 | MULPD X1, X0 62 | MULPD X3, X2 63 | ADDPD X0, X7 64 | ADDPD X2, X8 65 | 66 | ADDQ $4, SI // i += 4 67 | SUBQ $4, DI // n -= 4 68 | JGE loop_uni // if n >= 0 goto loop_uni 69 | 70 | tail_uni: 71 | ADDQ $4, DI // n += 4 72 | JLE end_uni // if n <= 0 goto end_uni 73 | 74 | onemore_uni: 75 | // sum += x[i] * y[i] for the remaining 1-3 elements. 76 | MOVSD 0(R8)(SI*8), X0 77 | MOVSD 0(R9)(SI*8), X1 78 | MULSD X1, X0 79 | ADDSD X0, X7 80 | 81 | ADDQ $1, SI // i++ 82 | SUBQ $1, DI // n-- 83 | JNZ onemore_uni // if n != 0 goto onemore_uni 84 | 85 | end_uni: 86 | // Add the four sums together. 87 | ADDPD X8, X7 88 | MOVSD X7, X0 89 | UNPCKHPD X7, X7 90 | ADDSD X0, X7 91 | MOVSD X7, sum+48(FP) // Return final sum. 92 | RET 93 | 94 | // func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) 95 | TEXT ·DotInc(SB), NOSPLIT, $0 96 | MOVQ x+0(FP), R8 97 | MOVQ y+24(FP), R9 98 | MOVQ n+48(FP), CX 99 | MOVQ incX+56(FP), R11 100 | MOVQ incY+64(FP), R12 101 | MOVQ ix+72(FP), R13 102 | MOVQ iy+80(FP), R14 103 | 104 | MOVSD $(0.0), X7 // sum = 0 105 | LEAQ (R8)(R13*8), SI // p = &x[ix] 106 | LEAQ (R9)(R14*8), DI // q = &y[iy] 107 | SHLQ $3, R11 // incX *= sizeof(float64) 108 | SHLQ $3, R12 // incY *= sizeof(float64) 109 | 110 | SUBQ $2, CX // n -= 2 111 | JL tail_inc // if n < 0 goto tail_inc 112 | 113 | loop_inc: 114 | // sum += *p * *q unrolled 2x. 115 | MOVHPD (SI), X0 116 | MOVHPD (DI), X1 117 | ADDQ R11, SI // p += incX 118 | ADDQ R12, DI // q += incY 119 | MOVLPD (SI), X0 120 | MOVLPD (DI), X1 121 | ADDQ R11, SI // p += incX 122 | ADDQ R12, DI // q += incY 123 | 124 | MULPD X1, X0 125 | ADDPD X0, X7 126 | 127 | SUBQ $2, CX // n -= 2 128 | JGE loop_inc // if n >= 0 goto loop_inc 129 | 130 | tail_inc: 131 | ADDQ $2, CX // n += 2 132 | JLE end_inc // if n <= 0 goto end_inc 133 | 134 | // sum += *p * *q for the last iteration if n is odd. 135 | MOVSD (SI), X0 136 | MULSD (DI), X0 137 | ADDSD X0, X7 138 | 139 | end_inc: 140 | // Add the two sums together. 141 | MOVSD X7, X0 142 | UNPCKHPD X7, X7 143 | ADDSD X0, X7 144 | MOVSD X7, sum+88(FP) // Return final sum. 145 | RET 146 | -------------------------------------------------------------------------------- /asm/f64/dot_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
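DotUnitary above accumulates into two vector registers (X7 and X8) so the two ADDPD dependency chains can run in parallel, then folds the four lanes in end_uni. Note that this reassociates the floating-point sum, so the result can differ in the last bits from a strictly sequential loop. A scalar Go sketch of the same idea, with invented names, assuming len(y) >= len(x) as the kernel does:

```go
package f64sketch

// dotSketch mirrors DotUnitary's accumulation scheme in scalar Go:
// four partial sums that are only combined at the end. It sketches the
// dependency-breaking idea, not SIMD itself.
func dotSketch(x, y []float64) float64 {
	var s0, s1, s2, s3 float64
	i := 0
	for ; i <= len(x)-4; i += 4 { // loop_uni: 4 products per pass
		s0 += x[i] * y[i]
		s1 += x[i+1] * y[i+1]
		s2 += x[i+2] * y[i+2]
		s3 += x[i+3] * y[i+3]
	}
	for ; i < len(x); i++ { // onemore_uni: 1-3 leftover elements
		s0 += x[i] * y[i]
	}
	return (s0 + s2) + (s1 + s3) // end_uni: fold the partial sums
}
```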
4 | 5 | package f64 6 | 7 | import ( 8 | "fmt" 9 | "math" 10 | "math/rand" 11 | "testing" 12 | ) 13 | 14 | func TestDotUnitary(t *testing.T) { 15 | for i, test := range []struct { 16 | xData []float64 17 | yData []float64 18 | 19 | want float64 20 | }{ 21 | { 22 | xData: []float64{2}, 23 | yData: []float64{-3}, 24 | want: -6, 25 | }, 26 | { 27 | xData: []float64{2, 3}, 28 | yData: []float64{-3, 4}, 29 | want: 6, 30 | }, 31 | { 32 | xData: []float64{2, 3, -4}, 33 | yData: []float64{-3, 4, 5}, 34 | want: -14, 35 | }, 36 | { 37 | xData: []float64{2, 3, -4, -5}, 38 | yData: []float64{-3, 4, 5, -6}, 39 | want: 16, 40 | }, 41 | { 42 | xData: []float64{0, 2, 3, -4, -5}, 43 | yData: []float64{0, -3, 4, 5, -6}, 44 | want: 16, 45 | }, 46 | { 47 | xData: []float64{0, 0, 2, 3, -4, -5}, 48 | yData: []float64{0, 1, -3, 4, 5, -6}, 49 | want: 16, 50 | }, 51 | { 52 | xData: []float64{0, 0, 1, 1, 2, -3, -4}, 53 | yData: []float64{0, 1, 0, 3, -4, 5, -6}, 54 | want: 4, 55 | }, 56 | { 57 | xData: []float64{0, 0, 1, 1, 2, -3, -4, 5}, 58 | yData: []float64{0, 1, 0, 3, -4, 5, -6, 7}, 59 | want: 39, 60 | }, 61 | } { 62 | const msgGuard = "test %v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" 63 | 64 | x, xFront, xBack := newGuardedVector(test.xData, 1) 65 | y, yFront, yBack := newGuardedVector(test.yData, 1) 66 | got := DotUnitary(x, y) 67 | 68 | if !allNaN(xFront) || !allNaN(xBack) { 69 | t.Errorf(msgGuard, i, "x", xFront, xBack) 70 | } 71 | if !allNaN(yFront) || !allNaN(yBack) { 72 | t.Errorf(msgGuard, i, "y", yFront, yBack) 73 | } 74 | if !equalStrided(test.xData, x, 1) { 75 | t.Errorf("test %v: modified read-only x argument", i) 76 | } 77 | if !equalStrided(test.yData, y, 1) { 78 | t.Errorf("test %v: modified read-only y argument", i) 79 | } 80 | if math.IsNaN(got) { 81 | t.Errorf("test %v: invalid memory read", i) 82 | continue 83 | } 84 | 85 | if got != test.want { 86 | t.Errorf("test %v: unexpected result. want %v, got %v", i, test.want, got) 87 | } 88 | } 89 | } 90 | 91 | func TestDotInc(t *testing.T) { 92 | for i, test := range []struct { 93 | xData []float64 94 | yData []float64 95 | 96 | want float64 97 | wantRev float64 // Result when one of the vectors is reversed. 
98 | }{ 99 | { 100 | xData: []float64{2}, 101 | yData: []float64{-3}, 102 | want: -6, 103 | wantRev: -6, 104 | }, 105 | { 106 | xData: []float64{2, 3}, 107 | yData: []float64{-3, 4}, 108 | want: 6, 109 | wantRev: -1, 110 | }, 111 | { 112 | xData: []float64{2, 3, -4}, 113 | yData: []float64{-3, 4, 5}, 114 | want: -14, 115 | wantRev: 34, 116 | }, 117 | { 118 | xData: []float64{2, 3, -4, -5}, 119 | yData: []float64{-3, 4, 5, -6}, 120 | want: 16, 121 | wantRev: 2, 122 | }, 123 | { 124 | xData: []float64{0, 2, 3, -4, -5}, 125 | yData: []float64{0, -3, 4, 5, -6}, 126 | want: 16, 127 | wantRev: 34, 128 | }, 129 | { 130 | xData: []float64{0, 0, 2, 3, -4, -5}, 131 | yData: []float64{0, 1, -3, 4, 5, -6}, 132 | want: 16, 133 | wantRev: -5, 134 | }, 135 | { 136 | xData: []float64{0, 0, 1, 1, 2, -3, -4}, 137 | yData: []float64{0, 1, 0, 3, -4, 5, -6}, 138 | want: 4, 139 | wantRev: -4, 140 | }, 141 | { 142 | xData: []float64{0, 0, 1, 1, 2, -3, -4, 5}, 143 | yData: []float64{0, 1, 0, 3, -4, 5, -6, 7}, 144 | want: 39, 145 | wantRev: 3, 146 | }, 147 | } { 148 | const msgGuard = "%v: out-of-bounds write to %v argument\nfront guard: %v\nback guard: %v" 149 | 150 | for _, incX := range []int{-7, -3, -2, -1, 1, 2, 3, 7} { 151 | for _, incY := range []int{-7, -3, -2, -1, 1, 2, 3, 7} { 152 | n := len(test.xData) 153 | x, xFront, xBack := newGuardedVector(test.xData, incX) 154 | y, yFront, yBack := newGuardedVector(test.yData, incY) 155 | 156 | var ix, iy int 157 | if incX < 0 { 158 | ix = (-n + 1) * incX 159 | } 160 | if incY < 0 { 161 | iy = (-n + 1) * incY 162 | } 163 | got := DotInc(x, y, uintptr(n), uintptr(incX), uintptr(incY), uintptr(ix), uintptr(iy)) 164 | 165 | prefix := fmt.Sprintf("test %v, incX = %v, incY = %v", i, incX, incY) 166 | if !allNaN(xFront) || !allNaN(xBack) { 167 | t.Errorf(msgGuard, prefix, "x", xFront, xBack) 168 | } 169 | if !allNaN(yFront) || !allNaN(yBack) { 170 | t.Errorf(msgGuard, prefix, "y", yFront, yBack) 171 | } 172 | if nonStridedWrite(x, incX) || !equalStrided(test.xData, x, incX) { 173 | t.Errorf("%v: modified read-only x argument", prefix) 174 | } 175 | if nonStridedWrite(y, incY) || !equalStrided(test.yData, y, incY) { 176 | t.Errorf("%v: modified read-only y argument", prefix) 177 | } 178 | if math.IsNaN(got) { 179 | t.Errorf("%v: invalid memory read", prefix) 180 | continue 181 | } 182 | 183 | want := test.want 184 | if incX*incY < 0 { 185 | want = test.wantRev 186 | } 187 | if got != want { 188 | t.Errorf("%v: unexpected result. 
want %v, got %v", prefix, want, got) 189 | } 190 | } 191 | } 192 | } 193 | } 194 | 195 | func BenchmarkDotUnitaryN1(b *testing.B) { dotUnitaryBenchmark(b, 1) } 196 | func BenchmarkDotUnitaryN2(b *testing.B) { dotUnitaryBenchmark(b, 2) } 197 | func BenchmarkDotUnitaryN3(b *testing.B) { dotUnitaryBenchmark(b, 3) } 198 | func BenchmarkDotUnitaryN4(b *testing.B) { dotUnitaryBenchmark(b, 4) } 199 | func BenchmarkDotUnitaryN10(b *testing.B) { dotUnitaryBenchmark(b, 10) } 200 | func BenchmarkDotUnitaryN100(b *testing.B) { dotUnitaryBenchmark(b, 100) } 201 | func BenchmarkDotUnitaryN1000(b *testing.B) { dotUnitaryBenchmark(b, 1000) } 202 | func BenchmarkDotUnitaryN10000(b *testing.B) { dotUnitaryBenchmark(b, 10000) } 203 | func BenchmarkDotUnitaryN100000(b *testing.B) { dotUnitaryBenchmark(b, 100000) } 204 | 205 | var r float64 206 | 207 | func dotUnitaryBenchmark(b *testing.B, n int) { 208 | x := make([]float64, n) 209 | for i := range x { 210 | x[i] = rand.Float64() 211 | } 212 | y := make([]float64, n) 213 | for i := range y { 214 | y[i] = rand.Float64() 215 | } 216 | b.ResetTimer() 217 | for i := 0; i < b.N; i++ { 218 | r = DotUnitary(x, y) 219 | } 220 | } 221 | 222 | func BenchmarkDotIncN1Inc1(b *testing.B) { dotIncBenchmark(b, 1, 1) } 223 | 224 | func BenchmarkDotIncN2Inc1(b *testing.B) { dotIncBenchmark(b, 2, 1) } 225 | func BenchmarkDotIncN2Inc2(b *testing.B) { dotIncBenchmark(b, 2, 2) } 226 | func BenchmarkDotIncN2Inc4(b *testing.B) { dotIncBenchmark(b, 2, 4) } 227 | func BenchmarkDotIncN2Inc10(b *testing.B) { dotIncBenchmark(b, 2, 10) } 228 | 229 | func BenchmarkDotIncN3Inc1(b *testing.B) { dotIncBenchmark(b, 3, 1) } 230 | func BenchmarkDotIncN3Inc2(b *testing.B) { dotIncBenchmark(b, 3, 2) } 231 | func BenchmarkDotIncN3Inc4(b *testing.B) { dotIncBenchmark(b, 3, 4) } 232 | func BenchmarkDotIncN3Inc10(b *testing.B) { dotIncBenchmark(b, 3, 10) } 233 | 234 | func BenchmarkDotIncN4Inc1(b *testing.B) { dotIncBenchmark(b, 4, 1) } 235 | func BenchmarkDotIncN4Inc2(b *testing.B) { dotIncBenchmark(b, 4, 2) } 236 | func BenchmarkDotIncN4Inc4(b *testing.B) { dotIncBenchmark(b, 4, 4) } 237 | func BenchmarkDotIncN4Inc10(b *testing.B) { dotIncBenchmark(b, 4, 10) } 238 | 239 | func BenchmarkDotIncN10Inc1(b *testing.B) { dotIncBenchmark(b, 10, 1) } 240 | func BenchmarkDotIncN10Inc2(b *testing.B) { dotIncBenchmark(b, 10, 2) } 241 | func BenchmarkDotIncN10Inc4(b *testing.B) { dotIncBenchmark(b, 10, 4) } 242 | func BenchmarkDotIncN10Inc10(b *testing.B) { dotIncBenchmark(b, 10, 10) } 243 | 244 | func BenchmarkDotIncN1000Inc1(b *testing.B) { dotIncBenchmark(b, 1000, 1) } 245 | func BenchmarkDotIncN1000Inc2(b *testing.B) { dotIncBenchmark(b, 1000, 2) } 246 | func BenchmarkDotIncN1000Inc4(b *testing.B) { dotIncBenchmark(b, 1000, 4) } 247 | func BenchmarkDotIncN1000Inc10(b *testing.B) { dotIncBenchmark(b, 1000, 10) } 248 | 249 | func BenchmarkDotIncN100000Inc1(b *testing.B) { dotIncBenchmark(b, 100000, 1) } 250 | func BenchmarkDotIncN100000Inc2(b *testing.B) { dotIncBenchmark(b, 100000, 2) } 251 | func BenchmarkDotIncN100000Inc4(b *testing.B) { dotIncBenchmark(b, 100000, 4) } 252 | func BenchmarkDotIncN100000Inc10(b *testing.B) { dotIncBenchmark(b, 100000, 10) } 253 | 254 | func BenchmarkDotIncN100000IncM1(b *testing.B) { dotIncBenchmark(b, 100000, -1) } 255 | func BenchmarkDotIncN100000IncM2(b *testing.B) { dotIncBenchmark(b, 100000, -2) } 256 | func BenchmarkDotIncN100000IncM4(b *testing.B) { dotIncBenchmark(b, 100000, -4) } 257 | func BenchmarkDotIncN100000IncM10(b *testing.B) { dotIncBenchmark(b, 100000, -10) 
} 258 | 259 | func dotIncBenchmark(b *testing.B, n, inc int) { 260 | absInc := inc 261 | if inc < 0 { 262 | absInc = -inc 263 | } 264 | x := make([]float64, (n-1)*absInc+1) 265 | for i := range x { 266 | x[i] = rand.Float64() 267 | } 268 | y := make([]float64, (n-1)*absInc+1) 269 | for i := range y { 270 | y[i] = rand.Float64() 271 | } 272 | var ini int 273 | if inc < 0 { 274 | ini = (-n + 1) * inc 275 | } 276 | b.ResetTimer() 277 | for i := 0; i < b.N; i++ { 278 | r = DotInc(x, y, uintptr(n), uintptr(inc), uintptr(inc), uintptr(ini), uintptr(ini)) 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /asm/f64/l1norm_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func L1Dist(s, t []float64) float64 10 | TEXT ·L1Dist(SB), NOSPLIT, $0 11 | MOVQ s_base+0(FP), DI // DI = &s 12 | MOVQ t_base+24(FP), SI // SI = &t 13 | MOVQ s_len+8(FP), CX // CX = len(s) 14 | CMPQ t_len+32(FP), CX // CX = min( CX, len(t) ) 15 | CMOVQLE t_len+32(FP), CX 16 | PXOR X3, X3 // norm = 0 17 | CMPQ CX, $0 // if CX == 0 { return 0 } 18 | JE l1_end 19 | XORQ AX, AX // i = 0 20 | MOVQ CX, BX 21 | ANDQ $1, BX // BX = CX % 2 22 | SHRQ $1, CX // CX = floor( CX / 2 ) 23 | JZ l1_tail_start // if CX == 0 { goto l1_tail_start } 24 | 25 | l1_loop: // Loop unrolled 2x do { 26 | MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] 27 | MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] 28 | MOVAPS X0, X2 29 | SUBPD X1, X0 30 | SUBPD X2, X1 31 | MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 32 | ADDPD X0, X3 // norm += X0 33 | ADDQ $2, AX // i += 2 34 | LOOP l1_loop // } while --CX > 0 35 | CMPQ BX, $0 // if BX == 0 { return } 36 | JE l1_end 37 | 38 | l1_tail_start: // Reset loop registers 39 | MOVQ BX, CX // Loop counter: CX = BX 40 | PXOR X0, X0 // reset X0, X1 to break dependencies 41 | PXOR X1, X1 42 | 43 | l1_tail: 44 | MOVSD (SI)(AX*8), X0 // X0 = t[i] 45 | MOVSD (DI)(AX*8), X1 // X1 = s[i] 46 | MOVAPD X0, X2 47 | SUBSD X1, X0 48 | SUBSD X2, X1 49 | MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 50 | ADDSD X0, X3 // norm += X0 51 | 52 | l1_end: 53 | MOVAPS X3, X2 54 | SHUFPD $1, X2, X2 55 | ADDSD X3, X2 // X2 = X3[1] + X3[0] 56 | MOVSD X2, ret+48(FP) // return X2 57 | RET 58 | 59 | -------------------------------------------------------------------------------- /asm/f64/linfnorm_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
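L1Dist above never computes an explicit absolute value: it forms both t[i]-s[i] and s[i]-t[i] and keeps the larger with MAXPD, which equals |t[i]-s[i]| for finite inputs (NaNs propagate differently than math.Abs would, and the kernel does not try to hide that). A one-function Go sketch of the trick, name invented here:

```go
package f64sketch

// absDiff shows the branch-free idea behind L1Dist's inner step: for
// finite a and b, |a-b| == max(a-b, b-a), so two subtractions and a
// max stand in for an explicit absolute value.
func absDiff(a, b float64) float64 {
	d, e := a-b, b-a
	if e > d {
		return e
	}
	return d
}
```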
4 | 5 | // +build !noasm,!appengine 6 | 7 | #include "textflag.h" 8 | 9 | // func LinfDist(s, t []float64) float64 10 | TEXT ·LinfDist(SB), NOSPLIT, $0 11 | MOVQ s_base+0(FP), DI // DI = &s 12 | MOVQ t_base+24(FP), SI // SI = &t 13 | MOVQ s_len+8(FP), CX // CX = len(s) 14 | CMPQ t_len+32(FP), CX // CX = min( CX, len(t) ) 15 | CMOVQLE t_len+32(FP), CX 16 | PXOR X3, X3 // norm = 0 17 | CMPQ CX, $0 // if CX == 0 { return 0 } 18 | JE l1_end 19 | XORQ AX, AX // i = 0 20 | MOVQ CX, BX 21 | ANDQ $1, BX // BX = CX % 2 22 | SHRQ $1, CX // CX = floor( CX / 2 ) 23 | JZ l1_tail_start // if CX == 0 { goto l1_tail_start } 24 | 25 | l1_loop: // Loop unrolled 2x do { 26 | MOVUPS (SI)(AX*8), X0 // X0 = t[i:i+1] 27 | MOVUPS (DI)(AX*8), X1 // X1 = s[i:i+1] 28 | MOVAPS X0, X2 29 | SUBPD X1, X0 30 | SUBPD X2, X1 31 | MAXPD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 32 | MAXPD X0, X3 // norm = max( norm, X0 ) 33 | ADDQ $2, AX // i += 2 34 | LOOP l1_loop // } while --CX > 0 35 | CMPQ BX, $0 // if BX == 0 { return } 36 | JE l1_end 37 | 38 | l1_tail_start: // Reset loop registers 39 | MOVQ BX, CX // Loop counter: CX = BX 40 | PXOR X0, X0 // reset X0, X1 to break dependencies 41 | PXOR X1, X1 42 | 43 | l1_tail: 44 | MOVSD (SI)(AX*8), X0 // X0 = t[i] 45 | MOVSD (DI)(AX*8), X1 // X1 = s[i] 46 | MOVAPD X0, X2 47 | SUBSD X1, X0 48 | SUBSD X2, X1 49 | MAXSD X1, X0 // X0 = max( X0 - X1, X1 - X0 ) 50 | MAXSD X0, X3 // norm = max( norm, X0 ) 51 | 52 | l1_end: 53 | MOVAPS X3, X2 54 | SHUFPD $1, X2, X2 55 | MAXSD X3, X2 // X2 = max( X3[1], X3[0] ) 56 | MOVSD X2, ret+48(FP) // return X2 57 | RET 58 | -------------------------------------------------------------------------------- /asm/f64/scal.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | // ScalUnitary is 10 | // for i := range x { 11 | // x[i] *= alpha 12 | // } 13 | func ScalUnitary(alpha float64, x []float64) { 14 | for i := range x { 15 | x[i] *= alpha 16 | } 17 | } 18 | 19 | // ScalUnitaryTo is 20 | // for i, v := range x { 21 | // dst[i] = alpha * v 22 | // } 23 | func ScalUnitaryTo(dst []float64, alpha float64, x []float64) { 24 | for i, v := range x { 25 | dst[i] = alpha * v 26 | } 27 | } 28 | 29 | // ScalInc is 30 | // var ix uintptr 31 | // for i := 0; i < int(n); i++ { 32 | // x[ix] *= alpha 33 | // ix += incX 34 | // } 35 | func ScalInc(alpha float64, x []float64, n, incX uintptr) { 36 | var ix uintptr 37 | for i := 0; i < int(n); i++ { 38 | x[ix] *= alpha 39 | ix += incX 40 | } 41 | } 42 | 43 | // ScalIncTo is 44 | // var idst, ix uintptr 45 | // for i := 0; i < int(n); i++ { 46 | // dst[idst] = alpha * x[ix] 47 | // ix += incX 48 | // idst += incDst 49 | // } 50 | func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) { 51 | var idst, ix uintptr 52 | for i := 0; i < int(n); i++ { 53 | dst[idst] = alpha * x[ix] 54 | ix += incX 55 | idst += incDst 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /asm/f64/scal_test.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
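Both distance kernels above keep two independent per-lane partial results and fold them only in the epilogue: SHUFPD swaps the lanes, then ADDSD (L1Dist) or MAXSD (LinfDist) combines them. A plain-Go sketch of that two-lane structure for the L1 case, names invented here:

```go
package f64sketch

import "math"

// l1DistSketch mirrors the two-lane structure of the L1Dist kernel:
// even and odd indices accumulate into separate partial norms that are
// only added together at the end (the SHUFPD/ADDSD epilogue).
func l1DistSketch(s, t []float64) float64 {
	n := len(s)
	if len(t) < n {
		n = len(t)
	}
	var norm0, norm1 float64
	i := 0
	for ; i <= n-2; i += 2 { // l1_loop: two lanes per pass
		norm0 += math.Abs(t[i] - s[i])
		norm1 += math.Abs(t[i+1] - s[i+1])
	}
	if i < n { // l1_tail: odd leftover element
		norm0 += math.Abs(t[i] - s[i])
	}
	return norm0 + norm1 // l1_end: fold the lanes
}
```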
4 | 5 | package f64 6 | 7 | import ( 8 | "fmt" 9 | "math/rand" 10 | "testing" 11 | ) 12 | 13 | var scalTests = []struct { 14 | alpha float64 15 | x []float64 16 | want []float64 17 | }{ 18 | { 19 | alpha: 0, 20 | x: []float64{}, 21 | want: []float64{}, 22 | }, 23 | { 24 | alpha: 0, 25 | x: []float64{1}, 26 | want: []float64{0}, 27 | }, 28 | { 29 | alpha: 1, 30 | x: []float64{1}, 31 | want: []float64{1}, 32 | }, 33 | { 34 | alpha: 2, 35 | x: []float64{1, -2}, 36 | want: []float64{2, -4}, 37 | }, 38 | { 39 | alpha: 2, 40 | x: []float64{1, -2, 3}, 41 | want: []float64{2, -4, 6}, 42 | }, 43 | { 44 | alpha: 2, 45 | x: []float64{1, -2, 3, 4}, 46 | want: []float64{2, -4, 6, 8}, 47 | }, 48 | { 49 | alpha: 2, 50 | x: []float64{1, -2, 3, 4, -5}, 51 | want: []float64{2, -4, 6, 8, -10}, 52 | }, 53 | { 54 | alpha: 2, 55 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7}, 56 | want: []float64{0, 2, -4, 6, 8, -10, 12, -14}, 57 | }, 58 | { 59 | alpha: 2, 60 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8}, 61 | want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16}, 62 | }, 63 | { 64 | alpha: 2, 65 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9}, 66 | want: []float64{0, 2, -4, 6, 8, -10, 12, -14, 16, 18}, 67 | }, 68 | { 69 | alpha: 3, 70 | x: []float64{0, 1, -2, 3, 4, -5, 6, -7, 8, 9, 12}, 71 | want: []float64{0, 3, -6, 9, 12, -15, 18, -21, 24, 27, 36}, 72 | }, 73 | } 74 | 75 | func TestScalUnitary(t *testing.T) { 76 | const xGdVal = -0.5 77 | for i, test := range scalTests { 78 | for _, align := range align1 { 79 | prefix := fmt.Sprintf("Test %v (x:%v)", i, align) 80 | xgLn := 4 + align 81 | xg := guardVector(test.x, xGdVal, xgLn) 82 | x := xg[xgLn : len(xg)-xgLn] 83 | 84 | ScalUnitary(test.alpha, x) 85 | 86 | for i := range test.want { 87 | if !same(x[i], test.want[i]) { 88 | t.Errorf(msgVal, prefix, i, x[i], test.want[i]) 89 | } 90 | } 91 | if !isValidGuard(xg, xGdVal, xgLn) { 92 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 93 | } 94 | } 95 | } 96 | } 97 | 98 | func TestScalUnitaryTo(t *testing.T) { 99 | const xGdVal, dstGdVal = -1, 0.5 100 | rng := rand.New(rand.NewSource(42)) 101 | for i, test := range scalTests { 102 | n := len(test.x) 103 | for _, align := range align2 { 104 | prefix := fmt.Sprintf("Test %v (x:%v dst:%v)", i, align.x, align.y) 105 | xgLn, dgLn := 4+align.x, 4+align.y 106 | xg := guardVector(test.x, xGdVal, xgLn) 107 | dg := guardVector(randSlice(n, 1, rng), dstGdVal, dgLn) 108 | x, dst := xg[xgLn:len(xg)-xgLn], dg[dgLn:len(dg)-dgLn] 109 | 110 | ScalUnitaryTo(dst, test.alpha, x) 111 | 112 | for i := range test.want { 113 | if !same(dst[i], test.want[i]) { 114 | t.Errorf(msgVal, prefix, i, dst[i], test.want[i]) 115 | } 116 | } 117 | if !isValidGuard(xg, xGdVal, xgLn) { 118 | t.Errorf(msgGuard, prefix, "x", xg[:xgLn], xg[len(xg)-xgLn:]) 119 | } 120 | if !isValidGuard(dg, dstGdVal, dgLn) { 121 | t.Errorf(msgGuard, prefix, "y", dg[:dgLn], dg[len(dg)-dgLn:]) 122 | } 123 | if !equalStrided(test.x, x, 1) { 124 | t.Errorf("%v: modified read-only x argument", prefix) 125 | } 126 | } 127 | } 128 | } 129 | 130 | func TestScalInc(t *testing.T) { 131 | const xGdVal = -0.5 132 | gdLn := 4 133 | for i, test := range scalTests { 134 | n := len(test.x) 135 | for _, incX := range []int{1, 2, 3, 4, 7, 10} { 136 | prefix := fmt.Sprintf("Test %v (x:%v)", i, incX) 137 | xg := guardIncVector(test.x, xGdVal, incX, gdLn) 138 | x := xg[gdLn : len(xg)-gdLn] 139 | 140 | ScalInc(test.alpha, x, uintptr(n), uintptr(incX)) 141 | 142 | for i := range test.want { 143 | if !same(x[i*incX], test.want[i]) { 144 | 
t.Errorf(msgVal, prefix, i, x[i*incX], test.want[i]) 145 | } 146 | } 147 | checkValidIncGuard(t, xg, xGdVal, incX, gdLn) 148 | } 149 | } 150 | } 151 | 152 | func TestScalIncTo(t *testing.T) { 153 | const xGdVal, dstGdVal = -1, 0.5 154 | gdLn := 4 155 | rng := rand.New(rand.NewSource(42)) 156 | for i, test := range scalTests { 157 | n := len(test.x) 158 | for _, inc := range newIncSet(1, 2, 3, 4, 7, 10) { 159 | prefix := fmt.Sprintf("test %v (x:%v dst:%v)", i, inc.x, inc.y) 160 | xg := guardIncVector(test.x, xGdVal, inc.x, gdLn) 161 | dg := guardIncVector(randSlice(n, 1, rng), dstGdVal, inc.y, gdLn) 162 | x, dst := xg[gdLn:len(xg)-gdLn], dg[gdLn:len(dg)-gdLn] 163 | 164 | ScalIncTo(dst, uintptr(inc.y), test.alpha, x, uintptr(n), uintptr(inc.x)) 165 | 166 | for i := range test.want { 167 | if !same(dst[i*inc.y], test.want[i]) { 168 | t.Errorf(msgVal, prefix, i, dst[i*inc.y], test.want[i]) 169 | } 170 | } 171 | checkValidIncGuard(t, xg, xGdVal, inc.x, gdLn) 172 | checkValidIncGuard(t, dg, dstGdVal, inc.y, gdLn) 173 | if !equalStrided(test.x, x, inc.x) { 174 | t.Errorf("%v: modified read-only x argument", prefix) 175 | } 176 | 177 | } 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /asm/f64/scalinc_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
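The tests above bracket every input in guard regions and re-check them after the call, so an out-of-bounds write by a kernel is caught immediately; dot_test.go uses NaN guards, while scal_test.go uses fixed sentinel constants. The real helpers (guardVector, guardIncVector, isValidGuard, newGuardedVector) live in the package's shared test files and are not shown in this listing; the following is only a sketch of the idea, with all names invented here:

```go
package f64sketch

import "math"

// guardSketch illustrates the guard-vector testing idea: embed the
// data between runs of a sentinel, hand the kernel only the interior,
// then check the sentinels afterwards to detect out-of-bounds writes.
func guardSketch(data []float64, kernel func([]float64)) bool {
	const pad = 4
	buf := make([]float64, len(data)+2*pad)
	for i := 0; i < pad; i++ {
		buf[i] = math.NaN()
		buf[len(buf)-1-i] = math.NaN()
	}
	copy(buf[pad:], data)
	kernel(buf[pad : pad+len(data)])
	for i := 0; i < pad; i++ {
		if !math.IsNaN(buf[i]) || !math.IsNaN(buf[len(buf)-1-i]) {
			return false // kernel wrote outside its slice
		}
	}
	return true
}
```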
36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define LEN CX 43 | #define TAIL BX 44 | #define INC_X R8 45 | #define INCx3_X R9 46 | #define ALPHA X0 47 | #define ALPHA_2 X1 48 | 49 | // func ScalInc(alpha float64, x []float64, n, incX uintptr) 50 | TEXT ·ScalInc(SB), NOSPLIT, $0 51 | MOVSD alpha+0(FP), ALPHA // ALPHA = alpha 52 | MOVQ x_base+8(FP), X_PTR // X_PTR = &x 53 | MOVQ incX+40(FP), INC_X // INC_X = incX 54 | SHLQ $3, INC_X // INC_X *= sizeof(float64) 55 | MOVQ n+32(FP), LEN // LEN = n 56 | CMPQ LEN, $0 57 | JE end // if LEN == 0 { return } 58 | 59 | MOVQ LEN, TAIL 60 | ANDQ $3, TAIL // TAIL = LEN % 4 61 | SHRQ $2, LEN // LEN = floor( LEN / 4 ) 62 | JZ tail_start // if LEN == 0 { goto tail_start } 63 | 64 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 65 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 66 | 67 | loop: // do { // x[i] *= alpha unrolled 4x. 68 | MOVSD (X_PTR), X2 // X_i = x[i] 69 | MOVSD (X_PTR)(INC_X*1), X3 70 | MOVSD (X_PTR)(INC_X*2), X4 71 | MOVSD (X_PTR)(INCx3_X*1), X5 72 | 73 | MULSD ALPHA, X2 // X_i *= a 74 | MULSD ALPHA_2, X3 75 | MULSD ALPHA, X4 76 | MULSD ALPHA_2, X5 77 | 78 | MOVSD X2, (X_PTR) // x[i] = X_i 79 | MOVSD X3, (X_PTR)(INC_X*1) 80 | MOVSD X4, (X_PTR)(INC_X*2) 81 | MOVSD X5, (X_PTR)(INCx3_X*1) 82 | 83 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 84 | DECQ LEN 85 | JNZ loop // } while --LEN > 0 86 | CMPQ TAIL, $0 87 | JE end // if TAIL == 0 { return } 88 | 89 | tail_start: // Reset loop registers 90 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 91 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 92 | JZ tail_one 93 | 94 | tail_two: // do { 95 | MOVSD (X_PTR), X2 // X_i = x[i] 96 | MOVSD (X_PTR)(INC_X*1), X3 97 | MULSD ALPHA, X2 // X_i *= a 98 | MULSD ALPHA, X3 99 | MOVSD X2, (X_PTR) // x[i] = X_i 100 | MOVSD X3, (X_PTR)(INC_X*1) 101 | 102 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 103 | 104 | ANDQ $1, TAIL 105 | JZ end 106 | 107 | tail_one: 108 | MOVSD (X_PTR), X2 // X_i = x[i] 109 | MULSD ALPHA, X2 // X_i *= ALPHA 110 | MOVSD X2, (X_PTR) // x[i] = X_i 111 | 112 | end: 113 | RET 114 | -------------------------------------------------------------------------------- /asm/f64/scalincto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 
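ScalInc above premultiplies incX by sizeof(float64) and advances a pointer through x, handling four strided elements per pass with a second copy of alpha (ALPHA_2, as the source notes, for pipelining), then a pair and at most one single element in the tail. A plain-Go sketch of the traversal, names invented, assuming a positive increment as the kernel does:

```go
package f64sketch

// scalIncSketch mirrors ScalInc's strided traversal: in the assembly a
// byte-scaled pointer advances through x; here an index plays that
// role. Blocks of 4, then a pair, then at most one single, matching
// the kernel's tail handling of n % 4 elements.
func scalIncSketch(alpha float64, x []float64, n, incX int) {
	ix := 0
	i := 0
	for ; i <= n-4; i += 4 { // unrolled 4x
		x[ix] *= alpha
		x[ix+incX] *= alpha
		x[ix+2*incX] *= alpha
		x[ix+3*incX] *= alpha
		ix += 4 * incX
	}
	if n-i >= 2 { // tail_two: one remaining pair
		x[ix] *= alpha
		x[ix+incX] *= alpha
		ix += 2 * incX
		i += 2
	}
	if i < n { // tail_one: last single element
		x[ix] *= alpha
	}
}
```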
24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define X_PTR SI 42 | #define DST_PTR DI 43 | #define LEN CX 44 | #define TAIL BX 45 | #define INC_X R8 46 | #define INCx3_X R9 47 | #define INC_DST R10 48 | #define INCx3_DST R11 49 | #define ALPHA X0 50 | #define ALPHA_2 X1 51 | 52 | // func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) 53 | TEXT ·ScalIncTo(SB), NOSPLIT, $0 54 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst 55 | MOVQ incDst+24(FP), INC_DST // INC_DST = incDst 56 | SHLQ $3, INC_DST // INC_DST *= sizeof(float64) 57 | MOVSD alpha+32(FP), ALPHA // ALPHA = alpha 58 | MOVQ x_base+40(FP), X_PTR // X_PTR = &x 59 | MOVQ n+64(FP), LEN // LEN = n 60 | MOVQ incX+72(FP), INC_X // INC_X = incX 61 | SHLQ $3, INC_X // INC_X *= sizeof(float64) 62 | CMPQ LEN, $0 63 | JE end // if LEN == 0 { return } 64 | 65 | MOVQ LEN, TAIL 66 | ANDQ $3, TAIL // TAIL = LEN % 4 67 | SHRQ $2, LEN // LEN = floor( LEN / 4 ) 68 | JZ tail_start // if LEN == 0 { goto tail_start } 69 | 70 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 71 | LEAQ (INC_X)(INC_X*2), INCx3_X // INCx3_X = INC_X * 3 72 | LEAQ (INC_DST)(INC_DST*2), INCx3_DST // INCx3_DST = INC_DST * 3 73 | 74 | loop: // do { // x[i] *= alpha unrolled 4x. 
75 | MOVSD (X_PTR), X2 // X_i = x[i] 76 | MOVSD (X_PTR)(INC_X*1), X3 77 | MOVSD (X_PTR)(INC_X*2), X4 78 | MOVSD (X_PTR)(INCx3_X*1), X5 79 | 80 | MULSD ALPHA, X2 // X_i *= a 81 | MULSD ALPHA_2, X3 82 | MULSD ALPHA, X4 83 | MULSD ALPHA_2, X5 84 | 85 | MOVSD X2, (DST_PTR) // dst[i] = X_i 86 | MOVSD X3, (DST_PTR)(INC_DST*1) 87 | MOVSD X4, (DST_PTR)(INC_DST*2) 88 | MOVSD X5, (DST_PTR)(INCx3_DST*1) 89 | 90 | LEAQ (X_PTR)(INC_X*4), X_PTR // X_PTR = &(X_PTR[incX*4]) 91 | LEAQ (DST_PTR)(INC_DST*4), DST_PTR // DST_PTR = &(DST_PTR[incDst*4]) 92 | DECQ LEN 93 | JNZ loop // } while --LEN > 0 94 | CMPQ TAIL, $0 95 | JE end // if TAIL == 0 { return } 96 | 97 | tail_start: // Reset loop registers 98 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 99 | SHRQ $1, LEN // LEN = floor( LEN / 2 ) 100 | JZ tail_one 101 | 102 | tail_two: 103 | MOVSD (X_PTR), X2 // X_i = x[i] 104 | MOVSD (X_PTR)(INC_X*1), X3 105 | MULSD ALPHA, X2 // X_i *= a 106 | MULSD ALPHA, X3 107 | MOVSD X2, (DST_PTR) // dst[i] = X_i 108 | MOVSD X3, (DST_PTR)(INC_DST*1) 109 | 110 | LEAQ (X_PTR)(INC_X*2), X_PTR // X_PTR = &(X_PTR[incX*2]) 111 | LEAQ (DST_PTR)(INC_DST*2), DST_PTR // DST_PTR = &(DST_PTR[incDst*2]) 112 | 113 | ANDQ $1, TAIL 114 | JZ end 115 | 116 | tail_one: 117 | MOVSD (X_PTR), X2 // X_i = x[i] 118 | MULSD ALPHA, X2 // X_i *= ALPHA 119 | MOVSD X2, (DST_PTR) // x[i] = X_i 120 | 121 | end: 122 | RET 123 | -------------------------------------------------------------------------------- /asm/f64/scalunitary_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved. 10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x0824 // @ MOVDDUP XMM0, 8[RSP] 42 | 43 | #define X_PTR SI 44 | #define DST_PTR DI 45 | #define IDX AX 46 | #define LEN CX 47 | #define TAIL BX 48 | #define ALPHA X0 49 | #define ALPHA_2 X1 50 | 51 | // func ScalUnitary(alpha float64, x []float64) 52 | TEXT ·ScalUnitary(SB), NOSPLIT, $0 53 | MOVDDUP_ALPHA // ALPHA = { alpha, alpha } 54 | MOVQ x_base+8(FP), X_PTR // X_PTR = &x 55 | MOVQ x_len+16(FP), LEN // LEN = len(x) 56 | CMPQ LEN, $0 57 | JE end // if LEN == 0 { return } 58 | XORQ IDX, IDX // IDX = 0 59 | 60 | MOVQ LEN, TAIL 61 | ANDQ $7, TAIL // TAIL = LEN % 8 62 | SHRQ $3, LEN // LEN = floor( LEN / 8 ) 63 | JZ tail_start // if LEN == 0 { goto tail_start } 64 | 65 | MOVUPS ALPHA, ALPHA_2 66 | 67 | loop: // do { // x[i] *= alpha unrolled 8x. 68 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 69 | MOVUPS 16(X_PTR)(IDX*8), X3 70 | MOVUPS 32(X_PTR)(IDX*8), X4 71 | MOVUPS 48(X_PTR)(IDX*8), X5 72 | 73 | MULPD ALPHA, X2 // X_i *= ALPHA 74 | MULPD ALPHA_2, X3 75 | MULPD ALPHA, X4 76 | MULPD ALPHA_2, X5 77 | 78 | MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i 79 | MOVUPS X3, 16(X_PTR)(IDX*8) 80 | MOVUPS X4, 32(X_PTR)(IDX*8) 81 | MOVUPS X5, 48(X_PTR)(IDX*8) 82 | 83 | ADDQ $8, IDX // i += 8 84 | DECQ LEN 85 | JNZ loop // while --LEN > 0 86 | CMPQ TAIL, $0 87 | JE end // if TAIL == 0 { return } 88 | 89 | tail_start: // Reset loop registers 90 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 91 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 92 | JZ tail_one // if LEN == 0 { goto tail_one } 93 | 94 | tail_two: // do { 95 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 96 | MULPD ALPHA, X2 // X_i *= ALPHA 97 | MOVUPS X2, (X_PTR)(IDX*8) // x[i] = X_i 98 | ADDQ $2, IDX // i += 2 99 | DECQ LEN 100 | JNZ tail_two // while --LEN > 0 101 | 102 | ANDQ $1, TAIL 103 | JZ end // if TAIL == 0 { return } 104 | 105 | tail_one: 106 | // x[i] *= alpha for the remaining element. 107 | MOVSD (X_PTR)(IDX*8), X2 108 | MULSD ALPHA, X2 109 | MOVSD X2, (X_PTR)(IDX*8) 110 | 111 | end: 112 | RET 113 | -------------------------------------------------------------------------------- /asm/f64/scalunitaryto_amd64.s: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | // 5 | // Some of the loop unrolling code is copied from: 6 | // http://golang.org/src/math/big/arith_amd64.s 7 | // which is distributed under these terms: 8 | // 9 | // Copyright (c) 2012 The Go Authors. All rights reserved.
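ScalUnitary above broadcasts alpha into both halves of an XMM register with a hand-encoded MOVDDUP (the LONG/WORD byte sequence in the #define, needed because the assembler of this era lacked the mnemonic), and keeps a second copy in ALPHA_2 so alternating multiplies do not contend for one register. A Go stand-in for what the broadcast buys, purely illustrative, names invented:

```go
package f64sketch

// broadcastSketch shows what MOVDDUP achieves for ScalUnitary: one
// scalar duplicated across a two-wide "register", so each vector
// multiply scales two elements at once. Plain Go stands in for SSE3.
func broadcastSketch(alpha float64, x []float64) {
	lane := [2]float64{alpha, alpha} // MOVDDUP: ALPHA = { alpha, alpha }
	for i := 0; i+1 < len(x); i += 2 {
		x[i] *= lane[0] // one MULPD covers both lanes
		x[i+1] *= lane[1]
	}
	if len(x)%2 == 1 {
		x[len(x)-1] *= alpha // scalar tail (MULSD)
	}
}
```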
10 | // 11 | // Redistribution and use in source and binary forms, with or without 12 | // modification, are permitted provided that the following conditions are 13 | // met: 14 | // 15 | // * Redistributions of source code must retain the above copyright 16 | // notice, this list of conditions and the following disclaimer. 17 | // * Redistributions in binary form must reproduce the above 18 | // copyright notice, this list of conditions and the following disclaimer 19 | // in the documentation and/or other materials provided with the 20 | // distribution. 21 | // * Neither the name of Google Inc. nor the names of its 22 | // contributors may be used to endorse or promote products derived from 23 | // this software without specific prior written permission. 24 | // 25 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 26 | // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 27 | // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 28 | // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 29 | // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 30 | // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 31 | // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 32 | // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 33 | // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 34 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 35 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 36 | 37 | //+build !noasm,!appengine 38 | 39 | #include "textflag.h" 40 | 41 | #define MOVDDUP_ALPHA LONG $0x44120FF2; WORD $0x2024 // @ MOVDDUP 32(SP), X0 /*XMM0, 32[RSP]*/ 42 | 43 | #define X_PTR SI 44 | #define DST_PTR DI 45 | #define IDX AX 46 | #define LEN CX 47 | #define TAIL BX 48 | #define ALPHA X0 49 | #define ALPHA_2 X1 50 | 51 | // func ScalUnitaryTo(dst []float64, alpha float64, x []float64) 52 | // This function assumes len(dst) >= len(x). 53 | TEXT ·ScalUnitaryTo(SB), NOSPLIT, $0 54 | MOVQ x_base+32(FP), X_PTR // X_PTR = &x 55 | MOVQ dst_base+0(FP), DST_PTR // DST_PTR = &dst 56 | MOVDDUP_ALPHA // ALPHA = { alpha, alpha } 57 | MOVQ x_len+40(FP), LEN // LEN = len(x) 58 | CMPQ LEN, $0 59 | JE end // if LEN == 0 { return } 60 | 61 | XORQ IDX, IDX // IDX = 0 62 | MOVQ LEN, TAIL 63 | ANDQ $7, TAIL // TAIL = LEN % 8 64 | SHRQ $3, LEN // LEN = floor( LEN / 8 ) 65 | JZ tail_start // if LEN == 0 { goto tail_start } 66 | 67 | MOVUPS ALPHA, ALPHA_2 // ALPHA_2 = ALPHA for pipelining 68 | 69 | loop: // do { // dst[i] = alpha * x[i] unrolled 8x. 
70 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 71 | MOVUPS 16(X_PTR)(IDX*8), X3 72 | MOVUPS 32(X_PTR)(IDX*8), X4 73 | MOVUPS 48(X_PTR)(IDX*8), X5 74 | 75 | MULPD ALPHA, X2 // X_i *= ALPHA 76 | MULPD ALPHA_2, X3 77 | MULPD ALPHA, X4 78 | MULPD ALPHA_2, X5 79 | 80 | MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i 81 | MOVUPS X3, 16(DST_PTR)(IDX*8) 82 | MOVUPS X4, 32(DST_PTR)(IDX*8) 83 | MOVUPS X5, 48(DST_PTR)(IDX*8) 84 | 85 | ADDQ $8, IDX // i += 8 86 | DECQ LEN 87 | JNZ loop // while --LEN > 0 88 | CMPQ TAIL, $0 89 | JE end // if TAIL == 0 { return } 90 | 91 | tail_start: // Reset loop counters 92 | MOVQ TAIL, LEN // Loop counter: LEN = TAIL 93 | SHRQ $1, LEN // LEN = floor( TAIL / 2 ) 94 | JZ tail_one // if LEN == 0 { goto tail_one } 95 | 96 | tail_two: // do { 97 | MOVUPS (X_PTR)(IDX*8), X2 // X_i = x[i] 98 | MULPD ALPHA, X2 // X_i *= ALPHA 99 | MOVUPS X2, (DST_PTR)(IDX*8) // dst[i] = X_i 100 | ADDQ $2, IDX // i += 2 101 | DECQ LEN 102 | JNZ tail_two // while --LEN > 0 103 | 104 | ANDQ $1, TAIL 105 | JZ end // if TAIL == 0 { return } 106 | 107 | tail_one: 108 | MOVSD (X_PTR)(IDX*8), X2 // X_i = x[i] 109 | MULSD ALPHA, X2 // X_i *= ALPHA 110 | MOVSD X2, (DST_PTR)(IDX*8) // dst[i] = X_i 111 | 112 | end: 113 | RET 114 | -------------------------------------------------------------------------------- /asm/f64/stubs_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2015 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | //+build !noasm,!appengine 6 | 7 | package f64 8 | 9 | // L1Norm is 10 | // for _, v := range x { 11 | // sum += math.Abs(v) 12 | // } 13 | // return sum 14 | func L1Norm(x []float64) (sum float64) 15 | 16 | // L1NormInc is 17 | // for i := 0; i < n*incX; i += incX { 18 | // sum += math.Abs(x[i]) 19 | // } 20 | // return sum 21 | func L1NormInc(x []float64, n, incX int) (sum float64) 22 | 23 | // AddConst is 24 | // for i := range x { 25 | // x[i] += alpha 26 | // } 27 | func AddConst(alpha float64, x []float64) 28 | 29 | // Add is 30 | // for i, v := range s { 31 | // dst[i] += v 32 | // } 33 | func Add(dst, s []float64) 34 | 35 | // AxpyUnitary is 36 | // for i, v := range x { 37 | // y[i] += alpha * v 38 | // } 39 | func AxpyUnitary(alpha float64, x, y []float64) 40 | 41 | // AxpyUnitaryTo is 42 | // for i, v := range x { 43 | // dst[i] = alpha*v + y[i] 44 | // } 45 | func AxpyUnitaryTo(dst []float64, alpha float64, x, y []float64) 46 | 47 | // AxpyInc is 48 | // for i := 0; i < int(n); i++ { 49 | // y[iy] += alpha * x[ix] 50 | // ix += incX 51 | // iy += incY 52 | // } 53 | func AxpyInc(alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 54 | 55 | // AxpyIncTo is 56 | // for i := 0; i < int(n); i++ { 57 | // dst[idst] = alpha*x[ix] + y[iy] 58 | // ix += incX 59 | // iy += incY 60 | // idst += incDst 61 | // } 62 | func AxpyIncTo(dst []float64, incDst, idst uintptr, alpha float64, x, y []float64, n, incX, incY, ix, iy uintptr) 63 | 64 | // CumSum is 65 | // if len(s) == 0 { 66 | // return dst 67 | // } 68 | // dst[0] = s[0] 69 | // for i, v := range s[1:] { 70 | // dst[i+1] = dst[i] + v 71 | // } 72 | // return dst 73 | func CumSum(dst, s []float64) []float64 74 | 75 | // CumProd is 76 | // if len(s) == 0 { 77 | // return dst 78 | // } 79 | // dst[0] = s[0] 80 | // for i, v := range s[1:] { 81 | // dst[i+1] = dst[i] * v 82 | // } 83 | // return dst 84 | func CumProd(dst, s []float64) []float64 85 | 86 | // 
Div is 87 | // for i, v := range s { 88 | // dst[i] /= v 89 | // } 90 | func Div(dst, s []float64) 91 | 92 | // DivTo is 93 | // for i, v := range x { 94 | // dst[i] = v / y[i] 95 | // } 96 | // return dst 97 | func DivTo(dst, x, y []float64) []float64 98 | 99 | // DotUnitary is 100 | // for i, v := range x { 101 | // sum += y[i] * v 102 | // } 103 | // return sum 104 | func DotUnitary(x, y []float64) (sum float64) 105 | 106 | // DotInc is 107 | // for i := 0; i < int(n); i++ { 108 | // sum += y[iy] * x[ix] 109 | // ix += incX 110 | // iy += incY 111 | // } 112 | // return sum 113 | func DotInc(x, y []float64, n, incX, incY, ix, iy uintptr) (sum float64) 114 | 115 | // L1Dist is 116 | // var norm float64 117 | // for i, v := range s { 118 | // norm += math.Abs(t[i] - v) 119 | // } 120 | // return norm 121 | func L1Dist(s, t []float64) float64 122 | 123 | // LinfDist is 124 | // var norm float64 125 | // if len(s) == 0 { 126 | // return 0 127 | // } 128 | // norm = math.Abs(t[0] - s[0]) 129 | // for i, v := range s[1:] { 130 | // absDiff := math.Abs(t[i+1] - v) 131 | // if absDiff > norm || math.IsNaN(norm) { 132 | // norm = absDiff 133 | // } 134 | // } 135 | // return norm 136 | func LinfDist(s, t []float64) float64 137 | 138 | // ScalUnitary is 139 | // for i := range x { 140 | // x[i] *= alpha 141 | // } 142 | func ScalUnitary(alpha float64, x []float64) 143 | 144 | // ScalUnitaryTo is 145 | // for i, v := range x { 146 | // dst[i] = alpha * v 147 | // } 148 | func ScalUnitaryTo(dst []float64, alpha float64, x []float64) 149 | 150 | // ScalInc is 151 | // var ix uintptr 152 | // for i := 0; i < int(n); i++ { 153 | // x[ix] *= alpha 154 | // ix += incX 155 | // } 156 | func ScalInc(alpha float64, x []float64, n, incX uintptr) 157 | 158 | // ScalIncTo is 159 | // var idst, ix uintptr 160 | // for i := 0; i < int(n); i++ { 161 | // dst[idst] = alpha * x[ix] 162 | // ix += incX 163 | // idst += incDst 164 | // } 165 | func ScalIncTo(dst []float64, incDst uintptr, alpha float64, x []float64, n, incX uintptr) 166 | -------------------------------------------------------------------------------- /asm/f64/stubs_noasm.go: -------------------------------------------------------------------------------- 1 | // Copyright ©2016 The gonum Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file.
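stubs_amd64.go above contains only body-less declarations; the linker resolves each one to the TEXT symbol of the same name in the matching .s file, while the noasm and appengine build tags select the pure-Go fallbacks instead. A minimal two-file sketch of the pattern, with a hypothetical package and function:

```go
//+build !amd64 noasm appengine

// Package kernels is a hypothetical sketch of the stub pattern used
// above. This file is the pure-Go fallback. A sibling file guarded by
// //+build !noasm,!appengine would hold only the declaration
//	func Sum(x []float64) float64
// with no body; the linker resolves that symbol to a TEXT ·Sum
// definition in a sum_amd64.s file.
package kernels

// Sum is
//	for _, v := range x {
//		sum += v
//	}
//	return sum
func Sum(x []float64) (sum float64) {
	for _, v := range x {
		sum += v
	}
	return sum
}
```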
4 | 5 | //+build !amd64 noasm appengine 6 | 7 | package f64 8 | 9 | import "math" 10 | 11 | // L1Norm is 12 | // for _, v := range x { 13 | // sum += math.Abs(v) 14 | // } 15 | // return sum 16 | func L1Norm(x []float64) (sum float64) { 17 | for _, v := range x { 18 | sum += math.Abs(v) 19 | } 20 | return sum 21 | } 22 | 23 | // L1NormInc is 24 | // for i := 0; i < n*incX; i += incX { 25 | // sum += math.Abs(x[i]) 26 | // } 27 | // return sum 28 | func L1NormInc(x []float64, n, incX int) (sum float64) { 29 | for i := 0; i < n*incX; i += incX { 30 | sum += math.Abs(x[i]) 31 | } 32 | return sum 33 | } 34 | 35 | // Add is 36 | // for i, v := range s { 37 | // dst[i] += v 38 | // } 39 | func Add(dst, s []float64) { 40 | for i, v := range s { 41 | dst[i] += v 42 | } 43 | } 44 | 45 | // AddConst is 46 | // for i := range x { 47 | // x[i] += alpha 48 | // } 49 | func AddConst(alpha float64, x []float64) { 50 | for i := range x { 51 | x[i] += alpha 52 | } 53 | } 54 | 55 | // CumSum is 56 | // if len(s) == 0 { 57 | // return dst 58 | // } 59 | // dst[0] = s[0] 60 | // for i, v := range s[1:] { 61 | // dst[i+1] = dst[i] + v 62 | // } 63 | // return dst 64 | func CumSum(dst, s []float64) []float64 { 65 | if len(s) == 0 { 66 | return dst 67 | } 68 | dst[0] = s[0] 69 | for i, v := range s[1:] { 70 | dst[i+1] = dst[i] + v 71 | } 72 | return dst 73 | } 74 | 75 | // CumProd is 76 | // if len(s) == 0 { 77 | // return dst 78 | // } 79 | // dst[0] = s[0] 80 | // for i, v := range s[1:] { 81 | // dst[i+1] = dst[i] * v 82 | // } 83 | // return dst 84 | func CumProd(dst, s []float64) []float64 { 85 | if len(s) == 0 { 86 | return dst 87 | } 88 | dst[0] = s[0] 89 | for i, v := range s[1:] { 90 | dst[i+1] = dst[i] * v 91 | } 92 | return dst 93 | } 94 | 95 | // Div is 96 | // for i, v := range s { 97 | // dst[i] /= v 98 | // } 99 | func Div(dst, s []float64) { 100 | for i, v := range s { 101 | dst[i] /= v 102 | } 103 | } 104 | 105 | // DivTo is 106 | // for i, v := range s { 107 | // dst[i] = v / t[i] 108 | // } 109 | // return dst 110 | func DivTo(dst, s, t []float64) []float64 { 111 | for i, v := range s { 112 | dst[i] = v / t[i] 113 | } 114 | return dst 115 | } 116 | 117 | // L1Dist is 118 | // var norm float64 119 | // for i, v := range s { 120 | // norm += math.Abs(t[i] - v) 121 | // } 122 | // return norm 123 | func L1Dist(s, t []float64) float64 { 124 | var norm float64 125 | for i, v := range s { 126 | norm += math.Abs(t[i] - v) 127 | } 128 | return norm 129 | } 130 | 131 | // LinfDist is 132 | // var norm float64 133 | // if len(s) == 0 { 134 | // return 0 135 | // } 136 | // norm = math.Abs(t[0] - s[0]) 137 | // for i, v := range s[1:] { 138 | // absDiff := math.Abs(t[i+1] - v) 139 | // if absDiff > norm || math.IsNaN(norm) { 140 | // norm = absDiff 141 | // } 142 | // } 143 | // return norm 144 | func LinfDist(s, t []float64) float64 { 145 | var norm float64 146 | if len(s) == 0 { 147 | return 0 148 | } 149 | norm = math.Abs(t[0] - s[0]) 150 | for i, v := range s[1:] { 151 | absDiff := math.Abs(t[i+1] - v) 152 | if absDiff > norm || math.IsNaN(norm) { 153 | norm = absDiff 154 | } 155 | } 156 | return norm 157 | } 158 | --------------------------------------------------------------------------------
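One subtlety in the pure-Go LinfDist above is the math.IsNaN(norm) test: every comparison against a NaN norm is false, so without it a NaN absorbed from the first pair could never be displaced by later finite differences. A self-contained example, copying the fallback body so it runs standalone:

```go
package main

import (
	"fmt"
	"math"
)

// linfDist duplicates the pure-Go fallback above so this example is
// self-contained; it demonstrates why the math.IsNaN(norm) test matters.
func linfDist(s, t []float64) float64 {
	var norm float64
	if len(s) == 0 {
		return 0
	}
	norm = math.Abs(t[0] - s[0])
	for i, v := range s[1:] {
		absDiff := math.Abs(t[i+1] - v)
		if absDiff > norm || math.IsNaN(norm) {
			norm = absDiff
		}
	}
	return norm
}

func main() {
	nan := math.NaN()
	// The first pair yields a NaN difference; the IsNaN test lets the
	// later finite difference |4-1| = 3 replace it.
	fmt.Println(linfDist([]float64{nan, 1}, []float64{0, 4})) // prints 3
}
```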