├── LICENSE ├── Makefile ├── README.md ├── go.mod ├── go.sum ├── init_amd64.go ├── init_arm64.go ├── internal ├── avx │ ├── AddFloat32_amd64.s │ ├── AddFloat64_amd64.s │ ├── DivFloat32_amd64.s │ ├── DivFloat64_amd64.s │ ├── MaxFloat32_amd64.s │ ├── MaxFloat64_amd64.s │ ├── MinFloat32_amd64.s │ ├── MinFloat64_amd64.s │ ├── MulFloat32_amd64.s │ ├── MulFloat64_amd64.s │ ├── SubFloat32_amd64.s │ ├── SubFloat64_amd64.s │ ├── avx_amd64.go │ └── avx_amd64_test.go ├── avx2 │ ├── AddInt32_amd64.s │ ├── AddInt64_amd64.s │ ├── AndInt32_amd64.s │ ├── AndInt64_amd64.s │ ├── MaxInt32_amd64.s │ ├── MinInt32_amd64.s │ ├── MulInt32_amd64.s │ ├── OrInt32_amd64.s │ ├── OrInt64_amd64.s │ ├── SubInt32_amd64.s │ ├── SubInt64_amd64.s │ ├── XorInt32_amd64.s │ ├── XorInt64_amd64.s │ ├── avx2_amd64.go │ └── avx2_amd64_test.go ├── avx512vl │ ├── AddFloat32_amd64.s │ ├── AddFloat64_amd64.s │ ├── AddInt32_amd64.s │ ├── AddInt64_amd64.s │ ├── AndInt32_amd64.s │ ├── AndInt64_amd64.s │ ├── DivFloat32_amd64.s │ ├── DivFloat64_amd64.s │ ├── MaxFloat32_amd64.s │ ├── MaxFloat64_amd64.s │ ├── MaxInt32_amd64.s │ ├── MaxInt64_amd64.s │ ├── MinFloat32_amd64.s │ ├── MinFloat64_amd64.s │ ├── MinInt32_amd64.s │ ├── MinInt64_amd64.s │ ├── MulFloat32_amd64.s │ ├── MulFloat64_amd64.s │ ├── MulInt32_amd64.s │ ├── MulInt64_amd64.s │ ├── OrInt32_amd64.s │ ├── OrInt64_amd64.s │ ├── SubFloat32_amd64.s │ ├── SubFloat64_amd64.s │ ├── SubInt32_amd64.s │ ├── SubInt64_amd64.s │ ├── XorInt32_amd64.s │ ├── XorInt64_amd64.s │ ├── avx512vl_amd64.go │ └── avx512vl_amd64_test.go ├── data │ └── types.go ├── fallback │ └── fallback.go ├── neon │ ├── AddFloat32_arm64.s │ ├── AddFloat64_arm64.s │ ├── AddInt32_arm64.s │ ├── AddInt64_arm64.s │ ├── AndInt32_arm64.s │ ├── AndInt64_arm64.s │ ├── MulFloat32_arm64.s │ ├── MulFloat64_arm64.s │ ├── MulInt32_arm64.s │ ├── OrInt32_arm64.s │ ├── OrInt64_arm64.s │ ├── SubFloat32_arm64.s │ ├── SubFloat64_arm64.s │ ├── SubInt32_arm64.s │ ├── SubInt64_arm64.s │ ├── neon_arm64.go │ └── 
neon_arm64_test.go ├── sse │ ├── AddFloat32_amd64.s │ ├── DivFloat32_amd64.s │ ├── MaxFloat32_amd64.s │ ├── MinFloat32_amd64.s │ ├── MulFloat32_amd64.s │ ├── SubFloat32_amd64.s │ ├── sse_amd64.go │ └── sse_amd64_test.go ├── sse2 │ ├── AddFloat64_amd64.s │ ├── AddInt32_amd64.s │ ├── AddInt64_amd64.s │ ├── AndInt32_amd64.s │ ├── AndInt64_amd64.s │ ├── DivFloat64_amd64.s │ ├── MaxFloat64_amd64.s │ ├── MinFloat64_amd64.s │ ├── MulFloat64_amd64.s │ ├── OrInt32_amd64.s │ ├── OrInt64_amd64.s │ ├── SubFloat64_amd64.s │ ├── SubInt32_amd64.s │ ├── SubInt64_amd64.s │ ├── XorInt32_amd64.s │ ├── XorInt64_amd64.s │ ├── sse2_amd64.go │ └── sse2_amd64_test.go ├── sse41 │ ├── MaxInt32_amd64.s │ ├── MinInt32_amd64.s │ ├── MulInt32_amd64.s │ ├── sse41_amd64.go │ └── sse41_amd64_test.go └── test │ └── utils.go ├── logo ├── 150x150.png ├── 15x15.png ├── 300x300.png └── 600x600.png ├── simd.go └── simd_test.go /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 pehringer 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | test: 2 | go test -v ./... 3 | 4 | test_amd64: 5 | GOOS=linux GOARCH=amd64 go test -exec="qemu-x86_64" ./... -test.v 6 | 7 | test_arm64: 8 | GOOS=linux GOARCH=arm64 go test -exec="qemu-aarch64" ./... -test.v 9 | 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![SIMD](logo/300x300.png) 2 | # SIMD (Single Instruction, Multiple Data) 3 | SIMD support via Go assembly for arithmetic, bitwise, maximum, and minimum operations. 4 | Allowing for parallel element-wise computations. 5 | Resulting in a ***100% to 400%*** speedup. 6 | Currently AMD64 (x86_64) and ARM64 processors are supported. 7 | ## Function Documentation 8 | - [pkg.go.dev/github.com/pehringer/simd](https://pkg.go.dev/github.com/pehringer/simd). 
9 | - ```simd.go``` (GoDoc comments) 10 | - ```simd_test.go``` (GoDoc examples) 11 | ## SIMD Support 12 | | |AMD64 (x86_64) |ARM64| 13 | |----------|------------------------|-----| 14 | |AddFloat32|SSE / AVX / AVX512VL |NEON | 15 | |AddFloat64|SSE2 / AVX / AVX512VL |NEON | 16 | |AddInt32 |SSE2 / AVX2 / AVX512VL |NEON | 17 | |AddInt64 |SSE2 / AVX2 / AVX512VL |NEON | 18 | |AndInt32 |SSE2 / AVX2 / AVX512VL |NEON | 19 | |AndInt64 |SSE2 / AVX2 / AVX512VL |NEON | 20 | |DivFloat32|SSE / AVX / AVX512VL | | 21 | |DivFloat64|SSE2 / AVX / AVX512VL | | 22 | |DivInt32 | | | 23 | |DivInt64 | | | 24 | |MaxFloat32|SSE / AVX / AVX512VL | | 25 | |MaxFloat64|SSE2 / AVX / AVX512VL | | 26 | |MaxInt32 |SSE4.1 / AVX2 / AVX512VL| | 27 | |MaxInt64 |AVX512VL | | 28 | |MinFloat32|SSE / AVX / AVX512VL | | 29 | |MinFloat64|SSE2 / AVX / AVX512VL | | 30 | |MinInt32 |SSE4.1 / AVX2 / AVX512VL| | 31 | |MinInt64 |AVX512VL | | 32 | |MulFloat32|SSE / AVX / AVX512VL |NEON | 33 | |MulFloat64|SSE2 / AVX / AVX512VL |NEON | 34 | |MulInt32 |SSE4.1 / AVX2 / AVX512VL|NEON | 35 | |MulInt64 |AVX512VL | | 36 | |OrInt32 |SSE2 / AVX2 / AVX512VL |NEON | 37 | |OrInt64 |SSE2 / AVX2 / AVX512VL |NEON | 38 | |SubFloat32|SSE / AVX / AVX512VL |NEON | 39 | |SubFloat64|SSE2 / AVX / AVX512VL |NEON | 40 | |SubInt32 |SSE2 / AVX2 / AVX512VL |NEON | 41 | |SubInt64 |SSE2 / AVX2 / AVX512VL |NEON | 42 | |XorInt32 |SSE2 / AVX2 / AVX512VL | | 43 | |XorInt64 |SSE2 / AVX2 / AVX512VL | | 44 | ## Make Targets 45 | #### Tests 46 | |Command |Description | 47 | |---------------------|----------------------------------------------------------------------| 48 | |```make test``` |Compiles and runs tests natively on hardware. | 49 | |```make test_amd64```|Cross compiles for amd64 and runs tests via QEMU (```qemu-x86_64```). 
| 50 | |```make test_arm64```|Cross compiles for arm64 and runs tests via QEMU (```qemu-aarch64```).| 51 | ## AMD64 Performance (AMD Ryzen 7 7840U / DDR5 SO-DIMM) 52 | |Elements |Go ns/op|SIMD ns/op|Performance x| 53 | |--------------|--------|----------|-------------| 54 | |Small Vectors | | | | 55 | |100 |38.33 |7.580 |5.056 | 56 | |200 |79.59 |12.80 |6.217 | 57 | |300 |117.0 |18.45 |9.593 | 58 | |400 |154.5 |16.20 |9.537 | 59 | |500 |191.5 |20.38 |9.396 | 60 | |600 |228.6 |26.37 |8.668 | 61 | |700 |265.6 |33.70 |7.881 | 62 | |800 |303.1 |29.38 |10.31 | 63 | |900 |340.3 |33.54 |10.14 | 64 | |Medium Vectors| | | | 65 | |1000 |377.4 |39.60 |9.530 | 66 | |2000 |751.2 |69.45 |10.81 | 67 | |3000 |1153 |148.3 |7.774 | 68 | |4000 |1499 |325.1 |4.610 | 69 | |5000 |1871 |431.6 |4.335 | 70 | |6000 |2243 |523.6 |4.283 | 71 | |7000 |2614 |614.1 |4.256 | 72 | |8000 |2987 |701.6 |4.257 | 73 | |9000 |3360 |792.5 |4.239 | 74 | |Large Vectors | | | | 75 | |10000 |3725 |878.5 |4.240 | 76 | |20000 |7458 |1754 |4.251 | 77 | |30000 |11187 |2631 |4.251 | 78 | |40000 |14908 |3509 |4.248 | 79 | |50000 |18677 |4373 |4.270 | 80 | |60000 |22363 |5276 |4.238 | 81 | |70000 |26107 |6319 |4.131 | 82 | |80000 |29854 |7820 |3.817 | 83 | |90000 |33613 |9222 |3.644 | 84 | ## ARM64 Performance (Apple M1 Pro / LPDDR5 SDRAM) 85 | |Elements |Go ns/op|SIMD ns/op|Performance x| 86 | |--------------|--------|----------|-------------| 87 | |Small Vectors | | | | 88 | |100 |51.81 |13.68 |3.787 | 89 | |200 |102.2 |24.24 |4.216 | 90 | |300 |152.8 |35.93 |4.252 | 91 | |400 |209.0 |47.71 |4.380 | 92 | |500 |258.7 |64.88 |3.987 | 93 | |600 |309.8 |73.42 |4.219 | 94 | |700 |359.6 |89.01 |4.039 | 95 | |800 |410.6 |101.9 |4.029 | 96 | |900 |460.3 |112.5 |4.091 | 97 | |Medium Vectors| | | | 98 | |1000 |511.5 |124.3 |4.115 | 99 | |2000 |1015 |241.0 |4.211 | 100 | |3000 |1520 |356.9 |4.258 | 101 | |4000 |2024 |473.1 |4.278 | 102 | |5000 |2527 |589.9 |4.283 | 103 | |6000 |3032 |706.1 |4.294 | 104 | |7000 |3535 |822.5 
|4.297 | 105 | |8000 |4039 |939.2 |4.300 | 106 | |9000 |4543 |1056 |4.302 | 107 | |Large Vectors | | | | 108 | |10000 |5046 |1172 |4.305 | 109 | |20000 |10107 |2394 |4.221 | 110 | |30000 |15139 |3599 |4.206 | 111 | |40000 |20178 |4957 |4.070 | 112 | |50000 |25218 |6190 |4.073 | 113 | |60000 |30253 |7277 |4.157 | 114 | |70000 |35285 |8707 |4.052 | 115 | |80000 |40346 |9924 |4.065 | 116 | |90000 |45378 |11189 |4.055 | 117 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/pehringer/simd 2 | 3 | go 1.23.1 4 | 5 | require golang.org/x/sys v0.27.0 // indirect 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s= 2 | golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 3 | -------------------------------------------------------------------------------- /init_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package simd 5 | 6 | import ( 7 | "golang.org/x/sys/cpu" 8 | "github.com/pehringer/simd/internal/avx" 9 | "github.com/pehringer/simd/internal/avx2" 10 | "github.com/pehringer/simd/internal/avx512vl" 11 | "github.com/pehringer/simd/internal/sse" 12 | "github.com/pehringer/simd/internal/sse2" 13 | "github.com/pehringer/simd/internal/sse41" 14 | ) 15 | 16 | func init() { 17 | if cpu.X86.HasSSE2 { 18 | addFloat32 = sse.AddFloat32 19 | divFloat32 = sse.DivFloat32 20 | maxFloat32 = sse.MaxFloat32 21 | minFloat32 = sse.MinFloat32 22 | mulFloat32 = sse.MulFloat32 23 | subFloat32 = sse.SubFloat32 24 | addFloat64 = sse2.AddFloat64 25 | addInt32 = sse2.AddInt32 26 | addInt64 = sse2.AddInt64 27 | andInt32 = sse2.AndInt32 28 | 
andInt64 = sse2.AndInt64 29 | divFloat64 = sse2.DivFloat64 30 | maxFloat64 = sse2.MaxFloat64 31 | minFloat64 = sse2.MinFloat64 32 | mulFloat64 = sse2.MulFloat64 33 | orInt32 = sse2.OrInt32 34 | orInt64 = sse2.OrInt64 35 | subFloat64 = sse2.SubFloat64 36 | subInt32 = sse2.SubInt32 37 | subInt64 = sse2.SubInt64 38 | xorInt32 = sse2.XorInt32 39 | xorInt64 = sse2.XorInt64 40 | } 41 | if cpu.X86.HasSSE41 { 42 | maxInt32 = sse41.MaxInt32 43 | minInt32 = sse41.MinInt32 44 | mulInt32 = sse41.MulInt32 45 | } 46 | if cpu.X86.HasAVX { 47 | addFloat32 = avx.AddFloat32 48 | addFloat64 = avx.AddFloat64 49 | divFloat32 = avx.DivFloat32 50 | divFloat64 = avx.DivFloat64 51 | maxFloat32 = avx.MaxFloat32 52 | maxFloat64 = avx.MaxFloat64 53 | minFloat32 = avx.MinFloat32 54 | minFloat64 = avx.MinFloat64 55 | mulFloat32 = avx.MulFloat32 56 | mulFloat64 = avx.MulFloat64 57 | subFloat32 = avx.SubFloat32 58 | subFloat64 = avx.SubFloat64 59 | } 60 | if cpu.X86.HasAVX2 { 61 | addInt32 = avx2.AddInt32 62 | addInt64 = avx2.AddInt64 63 | andInt32 = avx2.AndInt32 64 | andInt64 = avx2.AndInt64 65 | maxInt32 = avx2.MaxInt32 66 | minInt32 = avx2.MinInt32 67 | mulInt32 = avx2.MulInt32 68 | orInt32 = avx2.OrInt32 69 | orInt64 = avx2.OrInt64 70 | subInt32 = avx2.SubInt32 71 | subInt64 = avx2.SubInt64 72 | xorInt32 = avx2.XorInt32 73 | xorInt64 = avx2.XorInt64 74 | } 75 | if cpu.X86.HasAVX512VL { 76 | addFloat32 = avx512vl.AddFloat32 77 | addFloat64 = avx512vl.AddFloat64 78 | addInt32 = avx512vl.AddInt32 79 | addInt64 = avx512vl.AddInt64 80 | andInt32 = avx512vl.AndInt32 81 | andInt64 = avx512vl.AndInt64 82 | divFloat32 = avx512vl.DivFloat32 83 | divFloat64 = avx512vl.DivFloat64 84 | maxFloat32 = avx512vl.MaxFloat32 85 | maxFloat64 = avx512vl.MaxFloat64 86 | maxInt32 = avx512vl.MaxInt32 87 | maxInt64 = avx512vl.MaxInt64 88 | minFloat32 = avx512vl.MinFloat32 89 | minFloat64 = avx512vl.MinFloat64 90 | minInt32 = avx512vl.MinInt32 91 | minInt64 = avx512vl.MinInt64 92 | mulFloat32 = avx512vl.MulFloat32 93 
| mulFloat64 = avx512vl.MulFloat64 94 | mulInt32 = avx512vl.MulInt32 95 | mulInt64 = avx512vl.MulInt64 96 | orInt32 = avx512vl.OrInt32 97 | orInt64 = avx512vl.OrInt64 98 | subFloat32 = avx512vl.SubFloat32 99 | subFloat64 = avx512vl.SubFloat64 100 | subInt32 = avx512vl.SubInt32 101 | subInt64 = avx512vl.SubInt64 102 | xorInt32 = avx512vl.XorInt32 103 | xorInt64 = avx512vl.XorInt64 104 | } 105 | } 106 | -------------------------------------------------------------------------------- /init_arm64.go: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | package simd 5 | 6 | import ( 7 | "golang.org/x/sys/cpu" 8 | "github.com/pehringer/simd/internal/neon" 9 | ) 10 | 11 | func init() { 12 | if cpu.ARM64.HasASIMD { 13 | addFloat32 = neon.AddFloat32 14 | addFloat64 = neon.AddFloat64 15 | addInt32 = neon.AddInt32 16 | addInt64 = neon.AddInt64 17 | andInt32 = neon.AndInt32 18 | andInt64 = neon.AndInt64 19 | mulFloat32 = neon.MulFloat32 20 | mulFloat64 = neon.MulFloat64 21 | mulInt32 = neon.MulInt32 22 | orInt32 = neon.OrInt32 23 | orInt64 = neon.OrInt64 24 | subFloat32 = neon.SubFloat32 25 | subFloat64 = neon.SubFloat64 26 | subInt32 = neon.SubInt32 27 | subInt64 = neon.SubInt64 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /internal/avx/AddFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddFloat32(left, right, result []float32) int 5 | TEXT ·AddFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 
20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Add eight float32 values. 27 | VMOVUPS (SI)(AX*4), Y0 28 | VMOVUPS (DX)(AX*4), Y1 29 | VADDPS Y1, Y0, Y2 30 | VMOVUPS Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | ADDSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/AddFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddFloat64(left, right, result []float64) int 5 | TEXT ·AddFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Add four float64 values. 27 | VMOVUPD (SI)(AX*8), Y0 28 | VMOVUPD (DX)(AX*8), Y1 29 | VADDPD Y1, Y0, Y2 30 | VMOVUPD Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | ADDSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/DivFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func DivFloat32(left, right, result []float32) int 5 | TEXT ·DivFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Div eight float32 values. 27 | VMOVUPS (SI)(AX*4), Y0 28 | VMOVUPS (DX)(AX*4), Y1 29 | VDIVPS Y1, Y0, Y2 30 | VMOVUPS Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Div one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | DIVSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/DivFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func DivFloat64(left, right, result []float64) int 5 | TEXT ·DivFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Div four float64 values. 27 | VMOVUPD (SI)(AX*8), Y0 28 | VMOVUPD (DX)(AX*8), Y1 29 | VDIVPD Y1, Y0, Y2 30 | VMOVUPD Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Div one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | DIVSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/MaxFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxFloat32(left, right, result []float32) int 5 | TEXT ·MaxFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Max eight float32 values. 27 | VMOVUPS (SI)(AX*4), Y0 28 | VMOVUPS (DX)(AX*4), Y1 29 | VMAXPS Y1, Y0, Y2 30 | VMOVUPS Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MAXSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/MaxFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxFloat64(left, right, result []float64) int 5 | TEXT ·MaxFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Max four float64 values. 27 | VMOVUPD (SI)(AX*8), Y0 28 | VMOVUPD (DX)(AX*8), Y1 29 | VMAXPD Y1, Y0, Y2 30 | VMOVUPD Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MAXSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/MinFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinFloat32(left, right, result []float32) int 5 | TEXT ·MinFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Min eight float32 values. 27 | VMOVUPS (SI)(AX*4), Y0 28 | VMOVUPS (DX)(AX*4), Y1 29 | VMINPS Y1, Y0, Y2 30 | VMOVUPS Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MINSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/MinFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinFloat64(left, right, result []float64) int 5 | TEXT ·MinFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Min four float64 values. 27 | VMOVUPD (SI)(AX*8), Y0 28 | VMOVUPD (DX)(AX*8), Y1 29 | VMINPD Y1, Y0, Y2 30 | VMOVUPD Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MINSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/MulFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulFloat32(left, right, result []float32) int 5 | TEXT ·MulFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Mul eight float32 values. 27 | VMOVUPS (SI)(AX*4), Y0 28 | VMOVUPS (DX)(AX*4), Y1 29 | VMULPS Y1, Y0, Y2 30 | VMOVUPS Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MULSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/MulFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulFloat64(left, right, result []float64) int 5 | TEXT ·MulFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Mul four float64 values. 27 | VMOVUPD (SI)(AX*8), Y0 28 | VMOVUPD (DX)(AX*8), Y1 29 | VMULPD Y1, Y0, Y2 30 | VMOVUPD Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MULSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/SubFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubFloat32(left, right, result []float32) int 5 | TEXT ·SubFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Sub eight float32 values. 27 | VMOVUPS (SI)(AX*4), Y0 28 | VMOVUPS (DX)(AX*4), Y1 29 | VSUBPS Y1, Y0, Y2 30 | VMOVUPS Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | SUBSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/SubFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubFloat64(left, right, result []float64) int 5 | TEXT ·SubFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Sub four float64 values. 27 | VMOVUPD (SI)(AX*8), Y0 28 | VMOVUPD (DX)(AX*8), Y1 29 | VSUBPD Y1, Y0, Y2 30 | VMOVUPD Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | SUBSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx/avx_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package avx 5 | 6 | func AddFloat32(left, right, result []float32) int 7 | 8 | func AddFloat64(left, right, result []float64) int 9 | 10 | func DivFloat32(left, right, result []float32) int 11 | 12 | func DivFloat64(left, right, result []float64) int 13 | 14 | func MaxFloat32(left, right, result []float32) int 15 | 16 | func MaxFloat64(left, right, result []float64) int 17 | 18 | func MinFloat32(left, right, result []float32) int 19 | 20 | func MinFloat64(left, right, result []float64) int 21 | 22 | func MulFloat32(left, right, result []float32) int 23 | 24 | func MulFloat64(left, right, result []float64) int 25 | 26 | func SubFloat32(left, right, result []float32) int 27 | 28 | func SubFloat64(left, right, result []float64) int 29 | -------------------------------------------------------------------------------- /internal/avx/avx_amd64_test.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package avx 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/pehringer/simd/internal/fallback" 10 | "github.com/pehringer/simd/internal/test" 11 | "golang.org/x/sys/cpu" 12 | ) 13 | 14 | func TestAvx(t *testing.T) { 15 | if !cpu.X86.HasAVX { 16 | t.Skip("avx not supported") 17 | return 18 | } 19 | t.Run("AddFloat32", func(t *testing.T) { test.Universal(t, AddFloat32, fallback.Add) }) 20 | t.Run("AddFloat64", func(t *testing.T) { test.Universal(t, AddFloat64, fallback.Add) }) 21 | t.Run("DivFloat32", func(t *testing.T) { test.Universal(t, DivFloat32, fallback.Div) }) 22 | 
t.Run("DivFloat64", func(t *testing.T) { test.Universal(t, DivFloat64, fallback.Div) }) 23 | t.Run("MaxFloat32", func(t *testing.T) { test.Universal(t, MaxFloat32, fallback.Max) }) 24 | t.Run("MaxFloat64", func(t *testing.T) { test.Universal(t, MaxFloat64, fallback.Max) }) 25 | t.Run("MinFloat32", func(t *testing.T) { test.Universal(t, MinFloat32, fallback.Min) }) 26 | t.Run("MinFloat64", func(t *testing.T) { test.Universal(t, MinFloat64, fallback.Min) }) 27 | t.Run("MulFloat32", func(t *testing.T) { test.Universal(t, MulFloat32, fallback.Mul) }) 28 | t.Run("MulFloat64", func(t *testing.T) { test.Universal(t, MulFloat64, fallback.Mul) }) 29 | t.Run("SubFloat32", func(t *testing.T) { test.Universal(t, SubFloat32, fallback.Sub) }) 30 | t.Run("SubFloat64", func(t *testing.T) { test.Universal(t, SubFloat64, fallback.Sub) }) 31 | } 32 | -------------------------------------------------------------------------------- /internal/avx2/AddInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddInt32(left, right, result []int32) int 5 | TEXT ·AddInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Add eight int32 values. 27 | VMOVDQU (SI)(AX*4), Y0 28 | VMOVDQU (DX)(AX*4), Y1 29 | VPADDD Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one int32 value. 
37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ADDL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/AddInt64_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func AddInt64(left, right, result []int64) int
//
// Writes left[i] + right[i] to result[i] for every i below the shortest of
// the three slice lengths, then returns that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·AddInt64(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than four elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $4
	JL vectorDone
	// Add four int64 values.
	VMOVDQU (SI)(AX*8), Y0
	VMOVDQU (DX)(AX*8), Y1
	VPADDQ Y1, Y0, Y2
	VMOVDQU Y2, (DI)(AX*8)
	ADDQ $4, AX
	JMP multipleDataLoop
vectorDone:
	// Clear the upper YMM halves so SSE code that runs after this function
	// returns does not pay AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Add one int64 value.
	MOVQ (SI)(AX*8), R8
	MOVQ (DX)(AX*8), R9
	ADDQ R9, R8
	MOVQ R8, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx2/AndInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AndInt32(left, right, result []int32) int 5 | TEXT ·AndInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //And eight int32 values. 27 | VMOVDQU (SI)(AX*4), Y0 28 | VMOVDQU (DX)(AX*4), Y1 29 | VPAND Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //And one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ANDL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/AndInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AndInt64(left, right, result []int64) int 5 | TEXT ·AndInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //And four int64 values. 27 | VMOVDQU (SI)(AX*8), Y0 28 | VMOVDQU (DX)(AX*8), Y1 29 | VPAND Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //And one int64 value. 
37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ANDQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/MaxInt32_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func MaxInt32(left, right, result []int32) int
//
// Writes the larger of left[i] and right[i] (signed comparison) to result[i]
// for every i below the shortest of the three slice lengths, then returns
// that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·MaxInt32(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL vectorDone
	// Max eight int32 values.
	VMOVDQU (SI)(AX*4), Y0
	VMOVDQU (DX)(AX*4), Y1
	VPMAXSD Y1, Y0, Y2
	VMOVDQU Y2, (DI)(AX*4)
	ADDQ $8, AX
	JMP multipleDataLoop
vectorDone:
	// Clear the upper YMM halves so SSE code that runs after this function
	// returns does not pay AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Max one int32 value: R8 (left) is replaced only when R9 (right) is greater.
	MOVL (SI)(AX*4), R8
	MOVL (DX)(AX*4), R9
	CMPL R9, R8
	CMOVLGT R9, R8
	MOVL R8, (DI)(AX*4)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx2/MinInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinInt32(left, right, result []int32) int 5 | TEXT ·MinInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Min eight int32 values. 27 | VMOVDQU (SI)(AX*4), Y0 28 | VMOVDQU (DX)(AX*4), Y1 29 | VPMINSD Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | CMPL R9, R8 40 | CMOVLLT R9, R8 41 | MOVL R8, (DI)(AX*4) 42 | INCQ AX 43 | JMP singleDataLoop 44 | returnLength: 45 | MOVQ CX, int+72(FP) 46 | RET 47 | -------------------------------------------------------------------------------- /internal/avx2/MulInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulInt32(left, right, result []int32) int 5 | TEXT ·MulInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Mul eight int32 values. 27 | VMOVDQU (SI)(AX*4), Y0 28 | VMOVDQU (DX)(AX*4), Y1 29 | VPMULLD Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one int32 value. 
37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | IMULL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/OrInt32_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func OrInt32(left, right, result []int32) int
//
// Writes left[i] | right[i] to result[i] for every i below the shortest of
// the three slice lengths, then returns that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·OrInt32(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL vectorDone
	// Or eight int32 values.
	VMOVDQU (SI)(AX*4), Y0
	VMOVDQU (DX)(AX*4), Y1
	VPOR Y1, Y0, Y2
	VMOVDQU Y2, (DI)(AX*4)
	ADDQ $8, AX
	JMP multipleDataLoop
vectorDone:
	// Clear the upper YMM halves so SSE code that runs after this function
	// returns does not pay AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Or one int32 value.
	MOVL (SI)(AX*4), R8
	MOVL (DX)(AX*4), R9
	ORL R9, R8
	MOVL R8, (DI)(AX*4)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx2/OrInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func OrInt64(left, right, result []int64) int 5 | TEXT ·OrInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Or four int64 values. 27 | VMOVDQU (SI)(AX*8), Y0 28 | VMOVDQU (DX)(AX*8), Y1 29 | VPOR Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Or one int64 value. 37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ORQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/SubInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubInt32(left, right, result []int32) int 5 | TEXT ·SubInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Sub eight int32 values. 27 | VMOVDQU (SI)(AX*4), Y0 28 | VMOVDQU (DX)(AX*4), Y1 29 | VPSUBD Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one int32 value. 
37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | SUBL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/SubInt64_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func SubInt64(left, right, result []int64) int
//
// Writes left[i] - right[i] to result[i] for every i below the shortest of
// the three slice lengths, then returns that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·SubInt64(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than four elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $4
	JL vectorDone
	// Sub four int64 values (Y2 = Y0 - Y1, i.e. left - right).
	VMOVDQU (SI)(AX*8), Y0
	VMOVDQU (DX)(AX*8), Y1
	VPSUBQ Y1, Y0, Y2
	VMOVDQU Y2, (DI)(AX*8)
	ADDQ $4, AX
	JMP multipleDataLoop
vectorDone:
	// Clear the upper YMM halves so SSE code that runs after this function
	// returns does not pay AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Sub one int64 value (R8 = left - right).
	MOVQ (SI)(AX*8), R8
	MOVQ (DX)(AX*8), R9
	SUBQ R9, R8
	MOVQ R8, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx2/XorInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func XorInt32(left, right, result []int32) int 5 | TEXT ·XorInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Xor eight int32 values. 27 | VMOVDQU (SI)(AX*4), Y0 28 | VMOVDQU (DX)(AX*4), Y1 29 | VPXOR Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*4) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Xor one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | XORL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/XorInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func XorInt64(left, right, result []int64) int 5 | TEXT ·XorInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Xor four int64 values. 27 | VMOVDQU (SI)(AX*8), Y0 28 | VMOVDQU (DX)(AX*8), Y1 29 | VPXOR Y1, Y0, Y2 30 | VMOVDQU Y2, (DI)(AX*8) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Xor one int64 value. 
37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | XORQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx2/avx2_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package avx2 5 | 6 | func AddInt32(left, right, result []int32) int 7 | 8 | func AddInt64(left, right, result []int64) int 9 | 10 | func AndInt32(left, right, result []int32) int 11 | 12 | func AndInt64(left, right, result []int64) int 13 | 14 | func MaxInt32(left, right, result []int32) int 15 | 16 | func MinInt32(left, right, result []int32) int 17 | 18 | func MulInt32(left, right, result []int32) int 19 | 20 | func OrInt32(left, right, result []int32) int 21 | 22 | func OrInt64(left, right, result []int64) int 23 | 24 | func SubInt32(left, right, result []int32) int 25 | 26 | func SubInt64(left, right, result []int64) int 27 | 28 | func XorInt32(left, right, result []int32) int 29 | 30 | func XorInt64(left, right, result []int64) int 31 | -------------------------------------------------------------------------------- /internal/avx2/avx2_amd64_test.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package avx2 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/pehringer/simd/internal/fallback" 10 | "github.com/pehringer/simd/internal/test" 11 | "golang.org/x/sys/cpu" 12 | ) 13 | 14 | func TestAvx2(t *testing.T) { 15 | if !cpu.X86.HasAVX2 { 16 | t.Skip("avx2 not supported") 17 | return 18 | } 19 | t.Run("AddInt32", func(t *testing.T) { test.Universal(t, AddInt32, fallback.Add) }) 20 | t.Run("AddInt64", func(t *testing.T) { test.Universal(t, AddInt64, fallback.Add) }) 21 | t.Run("AndInt32", func(t *testing.T) { test.Universal(t, AndInt32, fallback.And) }) 22 | 
t.Run("AndInt64", func(t *testing.T) { test.Universal(t, AndInt64, fallback.And) }) 23 | t.Run("MaxInt32", func(t *testing.T) { test.Universal(t, MaxInt32, fallback.Max) }) 24 | t.Run("MinInt32", func(t *testing.T) { test.Universal(t, MinInt32, fallback.Min) }) 25 | t.Run("MulInt32", func(t *testing.T) { test.Universal(t, MulInt32, fallback.Mul) }) 26 | t.Run("OrInt32", func(t *testing.T) { test.Universal(t, OrInt32, fallback.Or) }) 27 | t.Run("OrInt64", func(t *testing.T) { test.Universal(t, OrInt64, fallback.Or) }) 28 | t.Run("SubInt32", func(t *testing.T) { test.Universal(t, SubInt32, fallback.Sub) }) 29 | t.Run("SubInt64", func(t *testing.T) { test.Universal(t, SubInt64, fallback.Sub) }) 30 | t.Run("XorInt32", func(t *testing.T) { test.Universal(t, XorInt32, fallback.Xor) }) 31 | t.Run("XorInt64", func(t *testing.T) { test.Universal(t, XorInt64, fallback.Xor) }) 32 | } 33 | -------------------------------------------------------------------------------- /internal/avx512vl/AddFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddFloat32(left, right, result []float32) int 5 | TEXT ·AddFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Add sixteen float32 values. 27 | VMOVUPS (SI)(AX*4), Z0 28 | VMOVUPS (DX)(AX*4), Z1 29 | VADDPS Z1, Z0, Z2 30 | VMOVUPS Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | ADDSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/AddFloat64_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func AddFloat64(left, right, result []float64) int
//
// Writes left[i] + right[i] to result[i] for every i below the shortest of
// the three slice lengths, then returns that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·AddFloat64(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL vectorDone
	// Add eight float64 values.
	VMOVUPD (SI)(AX*8), Z0
	VMOVUPD (DX)(AX*8), Z1
	VADDPD Z1, Z0, Z2
	VMOVUPD Z2, (DI)(AX*8)
	ADDQ $8, AX
	JMP multipleDataLoop
vectorDone:
	// The scalar tail below uses legacy SSE instructions. Clear the upper
	// ZMM/YMM state first so neither the tail nor later SSE code pays
	// AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Add one float64 value.
	MOVSD (SI)(AX*8), X0
	MOVSD (DX)(AX*8), X1
	ADDSD X1, X0
	MOVSD X0, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx512vl/AddInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddInt32(left, right, result []int32) int 5 | TEXT ·AddInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Add sixteen int32 values. 27 | VMOVDQU32 (SI)(AX*4), Z0 28 | VMOVDQU32 (DX)(AX*4), Z1 29 | VPADDD Z1, Z0, Z2 30 | VMOVDQU32 Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ADDL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/AddInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddInt64(left, right, result []int64) int 5 | TEXT ·AddInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Add eight int64 values. 27 | VMOVDQU64 (SI)(AX*8), Z0 28 | VMOVDQU64 (DX)(AX*8), Z1 29 | VPADDQ Z1, Z0, Z2 30 | VMOVDQU64 Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one int64 value. 
37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ADDQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/AndInt32_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func AndInt32(left, right, result []int32) int
//
// Writes left[i] & right[i] to result[i] for every i below the shortest of
// the three slice lengths, then returns that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·AndInt32(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than sixteen elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $16
	JL vectorDone
	// And sixteen int32 values.
	VMOVDQU32 (SI)(AX*4), Z0
	VMOVDQU32 (DX)(AX*4), Z1
	VPANDD Z1, Z0, Z2
	VMOVDQU32 Z2, (DI)(AX*4)
	ADDQ $16, AX
	JMP multipleDataLoop
vectorDone:
	// Clear the upper ZMM/YMM state so SSE code that runs after this
	// function returns does not pay AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// And one int32 value.
	MOVL (SI)(AX*4), R8
	MOVL (DX)(AX*4), R9
	ANDL R9, R8
	MOVL R8, (DI)(AX*4)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx512vl/AndInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AndInt64(left, right, result []int64) int 5 | TEXT ·AndInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //And eight int64 values. 27 | VMOVDQU64 (SI)(AX*8), Z0 28 | VMOVDQU64 (DX)(AX*8), Z1 29 | VPANDQ Z1, Z0, Z2 30 | VMOVDQU64 Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //And one int64 value. 37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ANDQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/DivFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func DivFloat32(left, right, result []float32) int 5 | TEXT ·DivFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Div sixteen float32 values. 27 | VMOVUPS (SI)(AX*4), Z0 28 | VMOVUPS (DX)(AX*4), Z1 29 | VDIVPS Z1, Z0, Z2 30 | VMOVUPS Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Div one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | DIVSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/DivFloat64_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func DivFloat64(left, right, result []float64) int
//
// Writes left[i] / right[i] to result[i] for every i below the shortest of
// the three slice lengths, then returns that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·DivFloat64(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL vectorDone
	// Div eight float64 values (Z2 = Z0 / Z1, i.e. left / right).
	VMOVUPD (SI)(AX*8), Z0
	VMOVUPD (DX)(AX*8), Z1
	VDIVPD Z1, Z0, Z2
	VMOVUPD Z2, (DI)(AX*8)
	ADDQ $8, AX
	JMP multipleDataLoop
vectorDone:
	// The scalar tail below uses legacy SSE instructions. Clear the upper
	// ZMM/YMM state first so neither the tail nor later SSE code pays
	// AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Div one float64 value (X0 = left / right).
	MOVSD (SI)(AX*8), X0
	MOVSD (DX)(AX*8), X1
	DIVSD X1, X0
	MOVSD X0, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx512vl/MaxFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxFloat32(left, right, result []float32) int 5 | TEXT ·MaxFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Max sixteen float32 values. 27 | VMOVUPS (SI)(AX*4), Z0 28 | VMOVUPS (DX)(AX*4), Z1 29 | VMAXPS Z1, Z0, Z2 30 | VMOVUPS Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MAXSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/MaxFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxFloat64(left, right, result []float64) int 5 | TEXT ·MaxFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Max eight float64 values. 27 | VMOVUPD (SI)(AX*8), Z0 28 | VMOVUPD (DX)(AX*8), Z1 29 | VMAXPD Z1, Z0, Z2 30 | VMOVUPD Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MAXSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/MaxInt32_amd64.s: --------------------------------------------------------------------------------
//go:build amd64
// +build amd64

#include "textflag.h"

// func MaxInt32(left, right, result []int32) int
//
// Writes the larger of left[i] and right[i] (signed comparison) to result[i]
// for every i below the shortest of the three slice lengths, then returns
// that shortest length.
// Argument names follow the go vet asmdecl convention (x_base, x_len, ret).
TEXT ·MaxInt32(SB), NOSPLIT, $0-80
	// Load slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index shared by both loops.
	MOVQ $0, AX
multipleDataLoop:
	// Leave the vector loop once fewer than sixteen elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $16
	JL vectorDone
	// Max sixteen int32 values.
	VMOVDQU32 (SI)(AX*4), Z0
	VMOVDQU32 (DX)(AX*4), Z1
	VPMAXSD Z1, Z0, Z2
	VMOVDQU32 Z2, (DI)(AX*4)
	ADDQ $16, AX
	JMP multipleDataLoop
vectorDone:
	// Clear the upper ZMM/YMM state so SSE code that runs after this
	// function returns does not pay AVX-SSE transition penalties.
	VZEROUPPER
singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Max one int32 value: R8 (left) is replaced only when R9 (right) is greater.
	MOVL (SI)(AX*4), R8
	MOVL (DX)(AX*4), R9
	CMPL R9, R8
	CMOVLGT R9, R8
	MOVL R8, (DI)(AX*4)
	INCQ AX
	JMP singleDataLoop
returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET

-------------------------------------------------------------------------------- /internal/avx512vl/MaxInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxInt64(left, right, result []int64) int 5 | TEXT ·MaxInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Max eight int64 values. 27 | VMOVDQU64 (SI)(AX*8), Z0 28 | VMOVDQU64 (DX)(AX*8), Z1 29 | VPMAXSQ Z1, Z0, Z2 30 | VMOVDQU64 Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one int64 value. 37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | CMPQ R9, R8 40 | CMOVQGT R9, R8 41 | MOVQ R8, (DI)(AX*8) 42 | INCQ AX 43 | JMP singleDataLoop 44 | returnLength: 45 | MOVQ CX, int+72(FP) 46 | RET 47 | -------------------------------------------------------------------------------- /internal/avx512vl/MinFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinFloat32(left, right, result []float32) int 5 | TEXT ·MinFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Min sixteen float32 values. 27 | VMOVUPS (SI)(AX*4), Z0 28 | VMOVUPS (DX)(AX*4), Z1 29 | VMINPS Z1, Z0, Z2 30 | VMOVUPS Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one float32 value. 
//go:build amd64
// +build amd64

#include "textflag.h"

// func MinFloat64(left, right, result []float64) int
//
// Stores the minimum of left[i] and right[i] in result[i] for every index
// below the shortest of the three slice lengths and returns that length.
//
// NOTE: VMINPD/MINSD follow the x86 rule: when either input is NaN, or both
// inputs are zeros of any sign, the right-hand element is returned. That is
// not the same as Go's builtin min, which the fallback package uses.
TEXT ·MinFloat64(SB), NOSPLIT, $0-80
	// Load the slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load the slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index.
	MOVQ $0, AX

multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL multipleDataDone
	// Min eight float64 values.
	VMOVUPD (SI)(AX*8), Z0
	VMOVUPD (DX)(AX*8), Z1
	VMINPD Z1, Z0, Z2
	VMOVUPD Z2, (DI)(AX*8)
	ADDQ $8, AX
	JMP multipleDataLoop

multipleDataDone:
	// The scalar tail uses legacy SSE instructions; clear the upper vector
	// state first to avoid AVX/SSE transition penalties.
	VZEROUPPER

singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Min one float64 value.
	MOVSD (SI)(AX*8), X0
	MOVSD (DX)(AX*8), X1
	MINSD X1, X0
	MOVSD X0, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop

returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Min sixteen int32 values. 27 | VMOVDQU32 (SI)(AX*4), Z0 28 | VMOVDQU32 (DX)(AX*4), Z1 29 | VPMINSD Z1, Z0, Z2 30 | VMOVDQU32 Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | CMPL R9, R8 40 | CMOVLLT R9, R8 41 | MOVL R8, (DI)(AX*4) 42 | INCQ AX 43 | JMP singleDataLoop 44 | returnLength: 45 | MOVQ CX, int+72(FP) 46 | RET 47 | -------------------------------------------------------------------------------- /internal/avx512vl/MinInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinInt64(left, right, result []int64) int 5 | TEXT ·MinInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Min eight int64 values. 27 | VMOVDQU64 (SI)(AX*8), Z0 28 | VMOVDQU64 (DX)(AX*8), Z1 29 | VPMINSQ Z1, Z0, Z2 30 | VMOVDQU64 Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one int64 value. 
//go:build amd64
// +build amd64

#include "textflag.h"

// func MulFloat32(left, right, result []float32) int
//
// Stores left[i] * right[i] in result[i] for every index below the shortest
// of the three slice lengths and returns that length.
TEXT ·MulFloat32(SB), NOSPLIT, $0-80
	// Load the slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load the slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index.
	MOVQ $0, AX

multipleDataLoop:
	// Leave the vector loop once fewer than sixteen elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $16
	JL multipleDataDone
	// Mul sixteen float32 values.
	VMOVUPS (SI)(AX*4), Z0
	VMOVUPS (DX)(AX*4), Z1
	VMULPS Z1, Z0, Z2
	VMOVUPS Z2, (DI)(AX*4)
	ADDQ $16, AX
	JMP multipleDataLoop

multipleDataDone:
	// The scalar tail uses legacy SSE instructions; clear the upper vector
	// state first to avoid AVX/SSE transition penalties.
	VZEROUPPER

singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Mul one float32 value.
	MOVSS (SI)(AX*4), X0
	MOVSS (DX)(AX*4), X1
	MULSS X1, X0
	MOVSS X0, (DI)(AX*4)
	INCQ AX
	JMP singleDataLoop

returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Mul eight float64 values. 27 | VMOVUPD (SI)(AX*8), Z0 28 | VMOVUPD (DX)(AX*8), Z1 29 | VMULPD Z1, Z0, Z2 30 | VMOVUPD Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MULSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/MulInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulInt32(left, right, result []int32) int 5 | TEXT ·MulInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Mul sixteen int32 values. 27 | VMOVDQU32 (SI)(AX*4), Z0 28 | VMOVDQU32 (DX)(AX*4), Z1 29 | VPMULLD Z1, Z0, Z2 30 | VMOVDQU32 Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one int32 value. 
//go:build amd64
// +build amd64

#include "textflag.h"

// func MulInt64(left, right, result []int64) int
//
// Stores the low 64 bits of left[i] * right[i] in result[i] for every index
// below the shortest of the three slice lengths and returns that length.
//
// NOTE: VPMULLQ belongs to the AVX-512DQ extension, so this routine must only
// be dispatched to when the CPU reports AVX-512DQ in addition to AVX-512F.
TEXT ·MulInt64(SB), NOSPLIT, $0-80
	// Load the slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load the slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index.
	MOVQ $0, AX

multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL multipleDataDone
	// Mul eight int64 values.
	VMOVDQU64 (SI)(AX*8), Z0
	VMOVDQU64 (DX)(AX*8), Z1
	VPMULLQ Z1, Z0, Z2
	VMOVDQU64 Z2, (DI)(AX*8)
	ADDQ $8, AX
	JMP multipleDataLoop

multipleDataDone:
	// Clear the upper vector state so that SSE code executed after this
	// point (here or in the Go caller) does not pay AVX/SSE transition costs.
	VZEROUPPER

singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Mul one int64 value.
	MOVQ (SI)(AX*8), R8
	MOVQ (DX)(AX*8), R9
	IMULQ R9, R8
	MOVQ R8, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop

returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Or sixteen int32 values. 27 | VMOVDQU32 (SI)(AX*4), Z0 28 | VMOVDQU32 (DX)(AX*4), Z1 29 | VPORD Z1, Z0, Z2 30 | VMOVDQU32 Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Or one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ORL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/OrInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func OrInt64(left, right, result []int64) int 5 | TEXT ·OrInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Or eight int64 values. 27 | VMOVDQU64 (SI)(AX*8), Z0 28 | VMOVDQU64 (DX)(AX*8), Z1 29 | VPORQ Z1, Z0, Z2 30 | VMOVDQU64 Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Or one int64 value. 
//go:build amd64
// +build amd64

#include "textflag.h"

// func SubFloat32(left, right, result []float32) int
//
// Stores left[i] - right[i] in result[i] for every index below the shortest
// of the three slice lengths and returns that length.
TEXT ·SubFloat32(SB), NOSPLIT, $0-80
	// Load the slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load the slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index.
	MOVQ $0, AX

multipleDataLoop:
	// Leave the vector loop once fewer than sixteen elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $16
	JL multipleDataDone
	// Sub sixteen float32 values (Z2 = Z0 - Z1).
	VMOVUPS (SI)(AX*4), Z0
	VMOVUPS (DX)(AX*4), Z1
	VSUBPS Z1, Z0, Z2
	VMOVUPS Z2, (DI)(AX*4)
	ADDQ $16, AX
	JMP multipleDataLoop

multipleDataDone:
	// The scalar tail uses legacy SSE instructions; clear the upper vector
	// state first to avoid AVX/SSE transition penalties.
	VZEROUPPER

singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Sub one float32 value (X0 = X0 - X1).
	MOVSS (SI)(AX*4), X0
	MOVSS (DX)(AX*4), X1
	SUBSS X1, X0
	MOVSS X0, (DI)(AX*4)
	INCQ AX
	JMP singleDataLoop

returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Sub eight float64 values. 27 | VMOVUPD (SI)(AX*8), Z0 28 | VMOVUPD (DX)(AX*8), Z1 29 | VSUBPD Z1, Z0, Z2 30 | VMOVUPD Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | SUBSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/SubInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubInt32(left, right, result []int32) int 5 | TEXT ·SubInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Sub sixteen int32 values. 27 | VMOVDQU32 (SI)(AX*4), Z0 28 | VMOVDQU32 (DX)(AX*4), Z1 29 | VPSUBD Z1, Z0, Z2 30 | VMOVDQU32 Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one int32 value. 
//go:build amd64
// +build amd64

#include "textflag.h"

// func SubInt64(left, right, result []int64) int
//
// Stores left[i] - right[i] in result[i] for every index below the shortest
// of the three slice lengths and returns that length.
TEXT ·SubInt64(SB), NOSPLIT, $0-80
	// Load the slice lengths.
	MOVQ left_len+8(FP), AX
	MOVQ right_len+32(FP), BX
	MOVQ result_len+56(FP), CX
	// CX = min(len(left), len(right), len(result)).
	CMPQ AX, CX
	CMOVQLT AX, CX
	CMPQ BX, CX
	CMOVQLT BX, CX
	// Load the slice data pointers.
	MOVQ left_base+0(FP), SI
	MOVQ right_base+24(FP), DX
	MOVQ result_base+48(FP), DI
	// AX is the element index.
	MOVQ $0, AX

multipleDataLoop:
	// Leave the vector loop once fewer than eight elements remain.
	MOVQ CX, BX
	SUBQ AX, BX
	CMPQ BX, $8
	JL multipleDataDone
	// Sub eight int64 values (Z2 = Z0 - Z1).
	VMOVDQU64 (SI)(AX*8), Z0
	VMOVDQU64 (DX)(AX*8), Z1
	VPSUBQ Z1, Z0, Z2
	VMOVDQU64 Z2, (DI)(AX*8)
	ADDQ $8, AX
	JMP multipleDataLoop

multipleDataDone:
	// Clear the upper vector state so that SSE code executed after this
	// point (here or in the Go caller) does not pay AVX/SSE transition costs.
	VZEROUPPER

singleDataLoop:
	CMPQ AX, CX
	JGE returnLength
	// Sub one int64 value (R8 = R8 - R9).
	MOVQ (SI)(AX*8), R8
	MOVQ (DX)(AX*8), R9
	SUBQ R9, R8
	MOVQ R8, (DI)(AX*8)
	INCQ AX
	JMP singleDataLoop

returnLength:
	// Return the number of elements processed.
	MOVQ CX, ret+72(FP)
	RET
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $16 25 | JL singleDataLoop 26 | //Xor sixteen int32 values. 27 | VMOVDQU32 (SI)(AX*4), Z0 28 | VMOVDQU32 (DX)(AX*4), Z1 29 | VPXORD Z1, Z0, Z2 30 | VMOVDQU32 Z2, (DI)(AX*4) 31 | ADDQ $16, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Xor one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | XORL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/avx512vl/XorInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func XorInt64(left, right, result []int64) int 5 | TEXT ·XorInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $8 25 | JL singleDataLoop 26 | //Xor eight int64 values. 27 | VMOVDQU64 (SI)(AX*8), Z0 28 | VMOVDQU64 (DX)(AX*8), Z1 29 | VPXORQ Z1, Z0, Z2 30 | VMOVDQU64 Z2, (DI)(AX*8) 31 | ADDQ $8, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Xor one int64 value. 
//go:build amd64
// +build amd64

// Package avx512vl declares element-wise slice kernels whose bodies live in
// the per-function *_amd64.s assembly files of this directory.
//
// The kernels whose assembly accompanies this file (Max*, Min*, Mul*, Or*,
// Sub*, Xor*) share one contract: they process the first n elements of left,
// right and result, where n is the smallest of the three slice lengths, write
// the outcome to result and return n. They operate on 512-bit Z registers for
// full groups of elements and fall back to scalar instructions for the rest.
//
// NOTE(review): the floating-point Max/Min kernels use the x86 MAX*/MIN*
// instructions, whose NaN and signed-zero results follow the hardware rule
// rather than Go's builtin max/min — confirm callers are fine with this.
package avx512vl

// AddFloat32 presumably stores left[i] + right[i] in result[i] (the package
// test pairs it with fallback.Add) — TODO confirm against AddFloat32_amd64.s.
func AddFloat32(left, right, result []float32) int

// AddFloat64 presumably stores left[i] + right[i] in result[i] (the package
// test pairs it with fallback.Add) — TODO confirm against AddFloat64_amd64.s.
func AddFloat64(left, right, result []float64) int

// AddInt32 presumably stores left[i] + right[i] in result[i] (the package
// test pairs it with fallback.Add) — TODO confirm against AddInt32_amd64.s.
func AddInt32(left, right, result []int32) int

// AddInt64 presumably stores left[i] + right[i] in result[i] (the package
// test pairs it with fallback.Add) — TODO confirm against AddInt64_amd64.s.
func AddInt64(left, right, result []int64) int

// AndInt32 presumably stores left[i] & right[i] in result[i] (the package
// test pairs it with fallback.And) — TODO confirm against AndInt32_amd64.s.
func AndInt32(left, right, result []int32) int

// AndInt64 presumably stores left[i] & right[i] in result[i] (the package
// test pairs it with fallback.And) — TODO confirm against AndInt64_amd64.s.
func AndInt64(left, right, result []int64) int

// DivFloat32 presumably stores left[i] / right[i] in result[i] (the package
// test pairs it with fallback.Div) — TODO confirm against DivFloat32_amd64.s.
func DivFloat32(left, right, result []float32) int

// DivFloat64 presumably stores left[i] / right[i] in result[i] (the package
// test pairs it with fallback.Div) — TODO confirm against DivFloat64_amd64.s.
func DivFloat64(left, right, result []float64) int

// MaxFloat32 stores the larger of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MaxFloat32(left, right, result []float32) int

// MaxFloat64 stores the larger of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MaxFloat64(left, right, result []float64) int

// MaxInt32 stores the signed maximum of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MaxInt32(left, right, result []int32) int

// MaxInt64 stores the signed maximum of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MaxInt64(left, right, result []int64) int

// MinFloat32 stores the smaller of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MinFloat32(left, right, result []float32) int

// MinFloat64 stores the smaller of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MinFloat64(left, right, result []float64) int

// MinInt32 stores the signed minimum of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MinInt32(left, right, result []int32) int

// MinInt64 stores the signed minimum of left[i] and right[i] in result[i] and
// returns the number of elements processed.
func MinInt64(left, right, result []int64) int

// MulFloat32 stores left[i] * right[i] in result[i] and returns the number of
// elements processed.
func MulFloat32(left, right, result []float32) int

// MulFloat64 stores left[i] * right[i] in result[i] and returns the number of
// elements processed.
func MulFloat64(left, right, result []float64) int

// MulInt32 stores the low 32 bits of left[i] * right[i] in result[i] and
// returns the number of elements processed.
func MulInt32(left, right, result []int32) int

// MulInt64 stores the low 64 bits of left[i] * right[i] in result[i] and
// returns the number of elements processed. Its vector path uses VPMULLQ.
func MulInt64(left, right, result []int64) int

// OrInt32 stores left[i] | right[i] in result[i] and returns the number of
// elements processed.
func OrInt32(left, right, result []int32) int

// OrInt64 stores left[i] | right[i] in result[i] and returns the number of
// elements processed.
func OrInt64(left, right, result []int64) int

// SubFloat32 stores left[i] - right[i] in result[i] and returns the number of
// elements processed.
func SubFloat32(left, right, result []float32) int

// SubFloat64 stores left[i] - right[i] in result[i] and returns the number of
// elements processed.
func SubFloat64(left, right, result []float64) int

// SubInt32 stores left[i] - right[i] in result[i] and returns the number of
// elements processed.
func SubInt32(left, right, result []int32) int

// SubInt64 stores left[i] - right[i] in result[i] and returns the number of
// elements processed.
func SubInt64(left, right, result []int64) int

// XorInt32 stores left[i] ^ right[i] in result[i] and returns the number of
// elements processed.
func XorInt32(left, right, result []int32) int

// XorInt64 stores left[i] ^ right[i] in result[i] and returns the number of
// elements processed.
func XorInt64(left, right, result []int64) int
// Floating is the set of floating-point element types the kernels accept.
type Floating interface {
	float32 | float64
}

// Integer is the set of integer element types the kernels accept.
type Integer interface {
	int32 | int64
}

// Operation is the shape shared by every element-wise kernel: it reads left
// and right, writes result and returns an element count.
type Operation[T Floating | Integer] func(left, right, result []T) int
len(right), len(result)) 9 | i := 0 10 | for ; length-i >= 4; i += 4 { 11 | result[i] = left[i] + right[i] 12 | result[i+1] = left[i+1] + right[i+1] 13 | result[i+2] = left[i+2] + right[i+2] 14 | result[i+3] = left[i+3] + right[i+3] 15 | } 16 | for ; i < length; i++ { 17 | result[i] = left[i] + right[i] 18 | } 19 | return length 20 | } 21 | 22 | func And[T data.Integer](left, right, result []T) int { 23 | length := min(len(left), len(right), len(result)) 24 | i := 0 25 | for ; length-i >= 4; i += 4 { 26 | result[i] = left[i] & right[i] 27 | result[i+1] = left[i+1] & right[i+1] 28 | result[i+2] = left[i+2] & right[i+2] 29 | result[i+3] = left[i+3] & right[i+3] 30 | } 31 | for ; i < length; i++ { 32 | result[i] = left[i] & right[i] 33 | } 34 | return length 35 | } 36 | 37 | func Div[T data.Floating | data.Integer](left, right, result []T) int { 38 | length := min(len(left), len(right), len(result)) 39 | i := 0 40 | for ; length-i >= 4; i += 4 { 41 | result[i] = left[i] / right[i] 42 | result[i+1] = left[i+1] / right[i+1] 43 | result[i+2] = left[i+2] / right[i+2] 44 | result[i+3] = left[i+3] / right[i+3] 45 | } 46 | for ; i < length; i++ { 47 | result[i] = left[i] / right[i] 48 | } 49 | return length 50 | } 51 | 52 | func Max[T data.Floating | data.Integer](left, right, result []T) int { 53 | length := min(len(left), len(right), len(result)) 54 | i := 0 55 | for ; length-i >= 4; i += 4 { 56 | result[i] = max(left[i], right[i]) 57 | result[i+1] = max(left[i+1], right[i+1]) 58 | result[i+2] = max(left[i+2], right[i+2]) 59 | result[i+3] = max(left[i+3], right[i+3]) 60 | } 61 | for ; i < length; i++ { 62 | result[i] = max(left[i], right[i]) 63 | } 64 | return length 65 | } 66 | 67 | func Min[T data.Floating | data.Integer](left, right, result []T) int { 68 | length := min(len(left), len(right), len(result)) 69 | i := 0 70 | for ; length-i >= 4; i += 4 { 71 | result[i] = min(left[i], right[i]) 72 | result[i+1] = min(left[i+1], right[i+1]) 73 | result[i+2] = min(left[i+2], 
right[i+2]) 74 | result[i+3] = min(left[i+3], right[i+3]) 75 | } 76 | for ; i < length; i++ { 77 | result[i] = min(left[i], right[i]) 78 | } 79 | return length 80 | } 81 | 82 | func Mul[T data.Floating | data.Integer](left, right, result []T) int { 83 | length := min(len(left), len(right), len(result)) 84 | i := 0 85 | for ; length-i >= 4; i += 4 { 86 | result[i] = left[i] * right[i] 87 | result[i+1] = left[i+1] * right[i+1] 88 | result[i+2] = left[i+2] * right[i+2] 89 | result[i+3] = left[i+3] * right[i+3] 90 | } 91 | for ; i < length; i++ { 92 | result[i] = left[i] * right[i] 93 | } 94 | return length 95 | } 96 | 97 | func Or[T data.Integer](left, right, result []T) int { 98 | length := min(len(left), len(right), len(result)) 99 | i := 0 100 | for ; length-i >= 4; i += 4 { 101 | result[i] = left[i] | right[i] 102 | result[i+1] = left[i+1] | right[i+1] 103 | result[i+2] = left[i+2] | right[i+2] 104 | result[i+3] = left[i+3] | right[i+3] 105 | } 106 | for ; i < length; i++ { 107 | result[i] = left[i] | right[i] 108 | } 109 | return length 110 | } 111 | 112 | func Sub[T data.Floating | data.Integer](left, right, result []T) int { 113 | length := min(len(left), len(right), len(result)) 114 | i := 0 115 | for ; length-i >= 4; i += 4 { 116 | result[i] = left[i] - right[i] 117 | result[i+1] = left[i+1] - right[i+1] 118 | result[i+2] = left[i+2] - right[i+2] 119 | result[i+3] = left[i+3] - right[i+3] 120 | } 121 | for ; i < length; i++ { 122 | result[i] = left[i] - right[i] 123 | } 124 | return length 125 | } 126 | 127 | func Xor[T data.Integer](left, right, result []T) int { 128 | length := min(len(left), len(right), len(result)) 129 | i := 0 130 | for ; length-i >= 4; i += 4 { 131 | result[i] = left[i] ^ right[i] 132 | result[i+1] = left[i+1] ^ right[i+1] 133 | result[i+2] = left[i+2] ^ right[i+2] 134 | result[i+3] = left[i+3] ^ right[i+3] 135 | } 136 | for ; i < length; i++ { 137 | result[i] = left[i] ^ right[i] 138 | } 139 | return length 140 | } 141 | 
//go:build arm64
// +build arm64

#include "textflag.h"

// func AddFloat32(left, right, result []float32) int
//
// Stores left[i] + right[i] in result[i] for every index below the shortest
// of the three slice lengths and returns that length.
// Frame layout: three 24-byte slice headers (72 bytes) plus the 8-byte result.
TEXT ·AddFloat32(SB), NOSPLIT, $0-80
	// Load the slice lengths.
	MOVD left_len+8(FP), R0
	MOVD right_len+32(FP), R1
	MOVD result_len+56(FP), R2
	// R2 = min(len(left), len(right), len(result)).
	CMP R0, R2
	CSEL LT, R2, R0, R2
	CMP R1, R2
	CSEL LT, R2, R1, R2
	// Load the slice data pointers; the post-indexed loads and stores below
	// advance them as elements are consumed.
	MOVD left_base+0(FP), R3
	MOVD right_base+24(FP), R4
	MOVD result_base+48(FP), R5
	// R0 counts the elements processed so far.
	MOVD $0, R0

multipleDataLoop:
	// Leave the vector loop once fewer than four elements remain.
	SUB R0, R2, R1
	CMP $4, R1
	BLT singleDataLoop
	// Add four float32 values.
	VLD1.P 16(R3), [V0.S4]
	VLD1.P 16(R4), [V1.S4]
	// The vector FADD is emitted as a raw instruction word.
	WORD $0x4E21D402 //VFADD V1.S4, V0.S4, V2.S4
	VST1.P [V2.S4], 16(R5)
	ADD $4, R0, R0
	B multipleDataLoop

singleDataLoop:
	CMP R2, R0
	BGE returnLength
	// Add one float32 value.
	FMOVS.P 4(R3), F0
	FMOVS.P 4(R4), F1
	FADDS F1, F0, F2
	FMOVS.P F2, 4(R5)
	ADD $1, R0, R0
	B singleDataLoop

returnLength:
	// Return the number of elements processed.
	MOVD R2, ret+72(FP)
	RET
16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //Add two float64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | WORD $0x4E61D402 //VFADD V1.D2, V0.D2, V2.D2 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Add one float64 value. 36 | FMOVD.P 8(R3), F0 37 | FMOVD.P 8(R4), F1 38 | FADDD F1, F0, F2 39 | FMOVD.P F2, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/AddInt32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func AddInt32(left, right, result []int32) int 5 | TEXT ·AddInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //Add four int32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | VADD V1.S4, V0.S4, V2.S4 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Add one int32 value. 
36 | MOVW.P 4(R3), R6 37 | MOVW.P 4(R4), R7 38 | ADD R7, R6, R8 39 | MOVW.P R8, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/AddInt64_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func AddInt64(left, right, result []int64) int 5 | TEXT ·AddInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //Add two int64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | VADD V1.D2, V0.D2, V2.D2 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Add one int64 value. 36 | MOVD.P 8(R3), R6 37 | MOVD.P 8(R4), R7 38 | ADD R7, R6, R8 39 | MOVD.P R8, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/AndInt32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func AndInt32(left, right, result []int32) int 5 | TEXT ·AndInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 
11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //And four int32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | WORD $0x4E211C02 //VAND V1.S4, V0.S4, V2.S4 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //And one int32 value. 36 | MOVW.P 4(R3), R6 37 | MOVW.P 4(R4), R7 38 | AND R7, R6, R8 39 | MOVW.P R8, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/AndInt64_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func AndInt64(left, right, result []int64) int 5 | TEXT ·AndInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //And two int64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | WORD $0x4E211C02 //VAND V1.D2, V0.D2, V2.D2 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //And one int64 value. 
36 | MOVD.P 8(R3), R6 37 | MOVD.P 8(R4), R7 38 | AND R7, R6, R8 39 | MOVD.P R8, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/MulFloat32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func MulFloat32(left, right, result []float32) int -- result[i] = left[i] * right[i] for i below the shortest slice length; returns that length. 5 | TEXT ·MulFloat32(SB), 4, $0 6 | //Load slice lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length (R2 = min(R0, R1, R2)). 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slice data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //Mul four float32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | WORD $0x6E21DC02 //VFMUL V1.S4, V0.S4, V2.S4 (FMUL needs the U bit set; 0x4E21DC02 is FMULX, which gives 2.0 instead of NaN for 0*Inf and disagrees with FMULS below) 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Mul one float32 value. 36 | FMOVS.P 4(R3), F0 37 | FMOVS.P 4(R4), F1 38 | FMULS F1, F0, F2 39 | FMOVS.P F2, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/MulFloat64_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func MulFloat64(left, right, result []float64) int 5 | TEXT ·MulFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length.
11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slice data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //Mul two float64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | WORD $0x6E61DC02 //VFMUL V1.D2, V0.D2, V2.D2 (FMUL needs the U bit set; 0x4E61DC02 is FMULX, which gives 2.0 instead of NaN for 0*Inf and disagrees with FMULD below) 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Mul one float64 value. 36 | FMOVD.P 8(R3), F0 37 | FMOVD.P 8(R4), F1 38 | FMULD F1, F0, F2 39 | FMOVD.P F2, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/MulInt32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func MulInt32(left, right, result []int32) int 5 | TEXT ·MulInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //Mul four int32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | WORD $0x4EA19C02 //VMUL V1.S4, V0.S4, V2.S4 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Mul one int32 value.
36 | MOVW.P 4(R3), R6 37 | MOVW.P 4(R4), R7 38 | MUL R7, R6, R8 39 | MOVW.P R8, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/OrInt32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func OrInt32(left, right, result []int32) int 5 | TEXT ·OrInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //Or four int32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | WORD $0x4EA11C02 //VORR V1.S4, V0.S4, V2.S4 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Or one int32 value. 36 | MOVW.P 4(R3), R6 37 | MOVW.P 4(R4), R7 38 | ORR R7, R6, R8 39 | MOVW.P R8, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/OrInt64_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func OrInt64(left, right, result []int64) int 5 | TEXT ·OrInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 
11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //Or two int64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | WORD $0x4EA11C02 //VORR V1.D2, V0.D2, V2.D2 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Or one int64 value. 36 | MOVD.P 8(R3), R6 37 | MOVD.P 8(R4), R7 38 | ORR R7, R6, R8 39 | MOVD.P R8, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/SubFloat32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func SubFloat32(left, right, result []float32) int 5 | TEXT ·SubFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //Sub four float32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | WORD $0x4EA1D402 //VFSUB V1.S4, V0.S4, V2.S4 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Sub one float32 value. 
36 | FMOVS.P 4(R3), F0 37 | FMOVS.P 4(R4), F1 38 | FSUBS F1, F0, F2 39 | FMOVS.P F2, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/SubFloat64_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func SubFloat64(left, right, result []float64) int 5 | TEXT ·SubFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //Sub two float64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | WORD $0x4EE1D402 //VFSUB V1.D2, V0.D2, V2.D2 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Sub one float64 value. 36 | FMOVD.P 8(R3), F0 37 | FMOVD.P 8(R4), F1 38 | FSUBD F1, F0, F2 39 | FMOVD.P F2, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/SubInt32_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func SubInt32(left, right, result []int32) int 5 | TEXT ·SubInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 
11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $4, R1 24 | BLT singleDataLoop 25 | //Sub four int32 values. 26 | VLD1.P 16(R3), [V0.S4] 27 | VLD1.P 16(R4), [V1.S4] 28 | VSUB V1.S4, V0.S4, V2.S4 29 | VST1.P [V2.S4], 16(R5) 30 | ADD $4, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Sub one int32 value. 36 | MOVW.P 4(R3), R6 37 | MOVW.P 4(R4), R7 38 | SUB R7, R6, R8 39 | MOVW.P R8, 4(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/SubInt64_arm64.s: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | // func SubInt64(left, right, result []int64) int 5 | TEXT ·SubInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVD leftLen+8(FP), R0 8 | MOVD rightLen+32(FP), R1 9 | MOVD resultLen+56(FP), R2 10 | //Get minimum length. 11 | CMP R0, R2 12 | CSEL LT, R2, R0, R2 13 | CMP R1, R2 14 | CSEL LT, R2, R1, R2 15 | //Load slices data pointers. 16 | MOVD leftData+0(FP), R3 17 | MOVD rightData+24(FP), R4 18 | MOVD resultData+48(FP), R5 19 | //Initialize loop index. 20 | MOVD $0, R0 21 | multipleDataLoop: 22 | SUB R0, R2, R1 23 | CMP $2, R1 24 | BLT singleDataLoop 25 | //Sub two int64 values. 26 | VLD1.P 16(R3), [V0.D2] 27 | VLD1.P 16(R4), [V1.D2] 28 | VSUB V1.D2, V0.D2, V2.D2 29 | VST1.P [V2.D2], 16(R5) 30 | ADD $2, R0, R0 31 | B multipleDataLoop 32 | singleDataLoop: 33 | CMP R2, R0 34 | BGE returnLength 35 | //Sub one int64 value. 
36 | MOVD.P 8(R3), R6 37 | MOVD.P 8(R4), R7 38 | SUB R7, R6, R8 39 | MOVD.P R8, 8(R5) 40 | ADD $1, R0, R0 41 | B singleDataLoop 42 | returnLength: 43 | MOVD R2, int+72(FP) 44 | RET 45 | -------------------------------------------------------------------------------- /internal/neon/neon_arm64.go: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | package neon 5 | 6 | // AddFloat32 stores left[i] + right[i] in result[i] for every i below the shortest slice length and returns that length. 7 | // 8 | //go:noescape 9 | func AddFloat32(left, right, result []float32) int 10 | 11 | // AddFloat64 stores left[i] + right[i] in result[i] for every i below the shortest slice length and returns that length. 12 | // 13 | //go:noescape 14 | func AddFloat64(left, right, result []float64) int 15 | 16 | // AddInt32 stores left[i] + right[i] in result[i] for every i below the shortest slice length and returns that length. 17 | // 18 | //go:noescape 19 | func AddInt32(left, right, result []int32) int 20 | 21 | // AddInt64 stores left[i] + right[i] in result[i] for every i below the shortest slice length and returns that length. 22 | // 23 | //go:noescape 24 | func AddInt64(left, right, result []int64) int 25 | 26 | // AndInt32 stores left[i] & right[i] in result[i] for every i below the shortest slice length and returns that length. 27 | // 28 | //go:noescape 29 | func AndInt32(left, right, result []int32) int 30 | 31 | // AndInt64 stores left[i] & right[i] in result[i] for every i below the shortest slice length and returns that length. 32 | // 33 | //go:noescape 34 | func AndInt64(left, right, result []int64) int 35 | 36 | // MulFloat32 stores left[i] * right[i] in result[i] for every i below the shortest slice length and returns that length. 37 | // 38 | //go:noescape 39 | func MulFloat32(left, right, result []float32) int 40 | 41 | // MulFloat64 stores left[i] * right[i] in result[i] for every i below the shortest slice length and returns that length. 42 | // 43 | //go:noescape 44 | func MulFloat64(left, right, result []float64) int 45 | 46 | // MulInt32 stores left[i] * right[i] in result[i] for every i below the shortest slice length and returns that length. 47 | // 48 | //go:noescape 49 | func MulInt32(left, right, result []int32) int 50 | 51 | // OrInt32 stores left[i] | right[i] in result[i] for every i below the shortest slice length and returns that length. 52 | // 53 | //go:noescape 54 | func OrInt32(left, right, result []int32) int 55 | 56 | // OrInt64 stores left[i] | right[i] in result[i] for every i below the shortest slice length and returns that length. 57 | // 58 | //go:noescape 59 | func OrInt64(left, right, result []int64) int 60 | 61 | // SubFloat32 stores left[i] - right[i] in result[i] for every i below the shortest slice length and returns that length. 62 | // 63 | //go:noescape 64 | func SubFloat32(left, right, result []float32) int 65 | 66 | // SubFloat64 stores left[i] - right[i] in result[i] for every i below the shortest slice length and returns that length. 67 | // 68 | //go:noescape 69 | func SubFloat64(left, right, result []float64) int 70 | 71 | // SubInt32 stores left[i] - right[i] in result[i] for every i below the shortest slice length and returns that length. 72 | // 73 | //go:noescape 74 | func SubInt32(left, right, result []int32) int 75 | 76 | // SubInt64 stores left[i] - right[i] in result[i] for every i below the shortest slice length and returns that length. 77 | // 78 | //go:noescape 79 | func SubInt64(left, right, result []int64) int 80 | -------------------------------------------------------------------------------- /internal/neon/neon_arm64_test.go: -------------------------------------------------------------------------------- 1 | //go:build arm64 2 | // +build arm64 3 | 4 | package neon 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/pehringer/simd/internal/fallback" 10 | "github.com/pehringer/simd/internal/test" 11 | "golang.org/x/sys/cpu" 12 | ) 13 | 14 | func TestNeon(t *testing.T) { 15 | if !cpu.ARM64.HasASIMD { 16 | t.Skip("neon not supported") 17 | return 18 | } 19 | t.Run("AddFloat32", func(t *testing.T) { test.Universal(t, AddFloat32, fallback.Add) }) 20 | t.Run("AddFloat64", func(t
*testing.T) { test.Universal(t, AddFloat64, fallback.Add) }) 21 | t.Run("AddInt32", func(t *testing.T) { test.Universal(t, AddInt32, fallback.Add) }) 22 | t.Run("AddInt64", func(t *testing.T) { test.Universal(t, AddInt64, fallback.Add) }) 23 | t.Run("AndInt32", func(t *testing.T) { test.Universal(t, AndInt32, fallback.And) }) 24 | t.Run("AndInt64", func(t *testing.T) { test.Universal(t, AndInt64, fallback.And) }) 25 | t.Run("MulFloat32", func(t *testing.T) { test.Universal(t, MulFloat32, fallback.Mul) }) 26 | t.Run("MulFloat64", func(t *testing.T) { test.Universal(t, MulFloat64, fallback.Mul) }) 27 | t.Run("MulInt32", func(t *testing.T) { test.Universal(t, MulInt32, fallback.Mul) }) 28 | t.Run("OrInt32", func(t *testing.T) { test.Universal(t, OrInt32, fallback.Or) }) 29 | t.Run("OrInt64", func(t *testing.T) { test.Universal(t, OrInt64, fallback.Or) }) 30 | t.Run("SubFloat32", func(t *testing.T) { test.Universal(t, SubFloat32, fallback.Sub) }) 31 | t.Run("SubFloat64", func(t *testing.T) { test.Universal(t, SubFloat64, fallback.Sub) }) 32 | t.Run("SubInt32", func(t *testing.T) { test.Universal(t, SubInt32, fallback.Sub) }) 33 | t.Run("SubInt64", func(t *testing.T) { test.Universal(t, SubInt64, fallback.Sub) }) 34 | } 35 | -------------------------------------------------------------------------------- /internal/sse/AddFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddFloat32(left, right, result []float32) int 5 | TEXT ·AddFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 
20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Add four float32 values. 27 | MOVUPS (SI)(AX*4), X0 28 | MOVUPS (DX)(AX*4), X1 29 | ADDPS X1, X0 30 | MOVUPS X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | ADDSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse/DivFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func DivFloat32(left, right, result []float32) int 5 | TEXT ·DivFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Div four float32 values. 27 | MOVUPS (SI)(AX*4), X0 28 | MOVUPS (DX)(AX*4), X1 29 | DIVPS X1, X0 30 | MOVUPS X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Div one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | DIVSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse/MaxFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxFloat32(left, right, result []float32) int 5 | TEXT ·MaxFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Max four float32 values. 27 | MOVUPS (SI)(AX*4), X0 28 | MOVUPS (DX)(AX*4), X1 29 | MAXPS X1, X0 30 | MOVUPS X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MAXSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse/MinFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinFloat32(left, right, result []float32) int 5 | TEXT ·MinFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Min four float32 values. 27 | MOVUPS (SI)(AX*4), X0 28 | MOVUPS (DX)(AX*4), X1 29 | MINPS X1, X0 30 | MOVUPS X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one float32 value. 37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MINSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse/MulFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulFloat32(left, right, result []float32) int 5 | TEXT ·MulFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Mul four float32 values. 27 | MOVUPS (SI)(AX*4), X0 28 | MOVUPS (DX)(AX*4), X1 29 | MULPS X1, X0 30 | MOVUPS X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | MULSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse/SubFloat32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubFloat32(left, right, result []float32) int 5 | TEXT ·SubFloat32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Sub four float32 values. 27 | MOVUPS (SI)(AX*4), X0 28 | MOVUPS (DX)(AX*4), X1 29 | SUBPS X1, X0 30 | MOVUPS X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one float32 value. 
37 | MOVSS (SI)(AX*4), X0 38 | MOVSS (DX)(AX*4), X1 39 | SUBSS X1, X0 40 | MOVSS X0, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse/sse_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package sse 5 | 6 | // AddFloat32 stores left[i] + right[i] in result[i] for every i below the shortest slice length and returns that length. 7 | // 8 | //go:noescape 9 | func AddFloat32(left, right, result []float32) int 10 | 11 | // DivFloat32 stores left[i] / right[i] in result[i] for every i below the shortest slice length and returns that length. 12 | // 13 | //go:noescape 14 | func DivFloat32(left, right, result []float32) int 15 | 16 | // MaxFloat32 stores the larger of left[i] and right[i], as computed by MAXPS/MAXSS, in result[i] for every i below the shortest slice length and returns that length. 17 | // 18 | //go:noescape 19 | func MaxFloat32(left, right, result []float32) int 20 | 21 | // MinFloat32 stores the smaller of left[i] and right[i], as computed by MINPS/MINSS, in result[i] for every i below the shortest slice length and returns that length. 22 | // 23 | //go:noescape 24 | func MinFloat32(left, right, result []float32) int 25 | 26 | // MulFloat32 stores left[i] * right[i] in result[i] for every i below the shortest slice length and returns that length. 27 | // 28 | //go:noescape 29 | func MulFloat32(left, right, result []float32) int 30 | 31 | // SubFloat32 stores left[i] - right[i] in result[i] for every i below the shortest slice length and returns that length. 32 | // 33 | //go:noescape 34 | func SubFloat32(left, right, result []float32) int 35 | -------------------------------------------------------------------------------- /internal/sse/sse_amd64_test.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package sse 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/pehringer/simd/internal/fallback" 10 | "github.com/pehringer/simd/internal/test" 11 | "golang.org/x/sys/cpu" 12 | ) 13 | 14 | func TestSse(t *testing.T) { 15 | if !cpu.X86.HasSSE2 { 16 | t.Skip("sse not supported") 17 | return 18 | } 19 | t.Run("AddFloat32", func(t *testing.T) { test.Universal(t, AddFloat32, fallback.Add) }) 20 | t.Run("DivFloat32", func(t *testing.T) { test.Universal(t, DivFloat32, fallback.Div) }) 21 | t.Run("MaxFloat32", func(t *testing.T) { test.Universal(t, MaxFloat32, fallback.Max) }) 22 | t.Run("MinFloat32", func(t *testing.T) { test.Universal(t, MinFloat32, fallback.Min) }) 23 | t.Run("MulFloat32", func(t *testing.T) { test.Universal(t, MulFloat32, fallback.Mul) }) 24 | t.Run("SubFloat32", func(t *testing.T) { test.Universal(t, SubFloat32, fallback.Sub) }) 25 | } 26 | --------------------------------------------------------------------------------
/internal/sse2/AddFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddFloat64(left, right, result []float64) int 5 | TEXT ·AddFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Add two float64 values. 27 | MOVUPD (SI)(AX*8), X0 28 | MOVUPD (DX)(AX*8), X1 29 | ADDPD X1, X0 30 | MOVUPD X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | ADDSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/AddInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddInt32(left, right, result []int32) int 5 | TEXT ·AddInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 
20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Add four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PADDL X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ADDL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/AddInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AddInt64(left, right, result []int64) int 5 | TEXT ·AddInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Add two int64 values. 27 | MOVOU (SI)(AX*8), X0 28 | MOVOU (DX)(AX*8), X1 29 | PADDQ X1, X0 30 | MOVOU X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Add one int64 value. 
37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ADDQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/AndInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AndInt32(left, right, result []int32) int 5 | TEXT ·AndInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //And four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PAND X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //And one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ANDL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/AndInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func AndInt64(left, right, result []int64) int 5 | TEXT ·AndInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //And two int64 values. 27 | MOVOU (SI)(AX*8), X0 28 | MOVOU (DX)(AX*8), X1 29 | PAND X1, X0 30 | MOVOU X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //And one int64 value. 37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ANDQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/DivFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func DivFloat64(left, right, result []float64) int 5 | TEXT ·DivFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Div two float64 values. 27 | MOVUPD (SI)(AX*8), X0 28 | MOVUPD (DX)(AX*8), X1 29 | DIVPD X1, X0 30 | MOVUPD X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Div one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | DIVSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/MaxFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxFloat64(left, right, result []float64) int 5 | TEXT ·MaxFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Max two float64 values. 27 | MOVUPD (SI)(AX*8), X0 28 | MOVUPD (DX)(AX*8), X1 29 | MAXPD X1, X0 30 | MOVUPD X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MAXSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/MinFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinFloat64(left, right, result []float64) int 5 | TEXT ·MinFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Min two float64 values. 27 | MOVUPD (SI)(AX*8), X0 28 | MOVUPD (DX)(AX*8), X1 29 | MINPD X1, X0 30 | MOVUPD X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one float64 value. 37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MINSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/MulFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulFloat64(left, right, result []float64) int 5 | TEXT ·MulFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Mul two float64 values. 27 | MOVUPD (SI)(AX*8), X0 28 | MOVUPD (DX)(AX*8), X1 29 | MULPD X1, X0 30 | MOVUPD X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | MULSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/OrInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func OrInt32(left, right, result []int32) int 5 | TEXT ·OrInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Or four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | POR X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Or one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | ORL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/OrInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func OrInt64(left, right, result []int64) int 5 | TEXT ·OrInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Or two int64 values. 27 | MOVOU (SI)(AX*8), X0 28 | MOVOU (DX)(AX*8), X1 29 | POR X1, X0 30 | MOVOU X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Or one int64 value. 37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | ORQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/SubFloat64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubFloat64(left, right, result []float64) int 5 | TEXT ·SubFloat64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Sub two float64 values. 27 | MOVUPD (SI)(AX*8), X0 28 | MOVUPD (DX)(AX*8), X1 29 | SUBPD X1, X0 30 | MOVUPD X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one float64 value. 
37 | MOVSD (SI)(AX*8), X0 38 | MOVSD (DX)(AX*8), X1 39 | SUBSD X1, X0 40 | MOVSD X0, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/SubInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubInt32(left, right, result []int32) int 5 | TEXT ·SubInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Sub four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PSUBL X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | SUBL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/SubInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func SubInt64(left, right, result []int64) int 5 | TEXT ·SubInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 
11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Sub two int64 values. 27 | MOVOU (SI)(AX*8), X0 28 | MOVOU (DX)(AX*8), X1 29 | PSUBQ X1, X0 30 | MOVOU X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Sub one int64 value. 37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | SUBQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/XorInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func XorInt32(left, right, result []int32) int 5 | TEXT ·XorInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Xor four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PXOR X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Xor one int32 value. 
37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | XORL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/XorInt64_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func XorInt64(left, right, result []int64) int 5 | TEXT ·XorInt64(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $2 25 | JL singleDataLoop 26 | //Xor two int64 values. 27 | MOVOU (SI)(AX*8), X0 28 | MOVOU (DX)(AX*8), X1 29 | PXOR X1, X0 30 | MOVOU X0, (DI)(AX*8) 31 | ADDQ $2, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Xor one int64 value. 
37 | MOVQ (SI)(AX*8), R8 38 | MOVQ (DX)(AX*8), R9 39 | XORQ R9, R8 40 | MOVQ R8, (DI)(AX*8) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse2/sse2_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package sse2 5 | 6 | func AddFloat64(left, right, result []float64) int 7 | 8 | func AddInt32(left, right, result []int32) int 9 | 10 | func AddInt64(left, right, result []int64) int 11 | 12 | func AndInt32(left, right, result []int32) int 13 | 14 | func AndInt64(left, right, result []int64) int 15 | 16 | func DivFloat64(left, right, result []float64) int 17 | 18 | func MaxFloat64(left, right, result []float64) int 19 | 20 | func MinFloat64(left, right, result []float64) int 21 | 22 | func MulFloat64(left, right, result []float64) int 23 | 24 | func OrInt32(left, right, result []int32) int 25 | 26 | func OrInt64(left, right, result []int64) int 27 | 28 | func SubFloat64(left, right, result []float64) int 29 | 30 | func SubInt32(left, right, result []int32) int 31 | 32 | func SubInt64(left, right, result []int64) int 33 | 34 | func XorInt32(left, right, result []int32) int 35 | 36 | func XorInt64(left, right, result []int64) int 37 | -------------------------------------------------------------------------------- /internal/sse2/sse2_amd64_test.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package sse2 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/pehringer/simd/internal/fallback" 10 | "github.com/pehringer/simd/internal/test" 11 | "golang.org/x/sys/cpu" 12 | ) 13 | 14 | func TestSse2(t *testing.T) { 15 | if !cpu.X86.HasSSE2 { 16 | t.Skip("sse2 not supported") 17 | return 18 | } 19 | t.Run("AddFloat64", func(t *testing.T) { test.Universal(t, AddFloat64, 
fallback.Add) }) 20 | t.Run("AddInt32", func(t *testing.T) { test.Universal(t, AddInt32, fallback.Add) }) 21 | t.Run("AddInt64", func(t *testing.T) { test.Universal(t, AddInt64, fallback.Add) }) 22 | t.Run("AndInt32", func(t *testing.T) { test.Universal(t, AndInt32, fallback.And) }) 23 | t.Run("AndInt64", func(t *testing.T) { test.Universal(t, AndInt64, fallback.And) }) 24 | t.Run("DivFloat64", func(t *testing.T) { test.Universal(t, DivFloat64, fallback.Div) }) 25 | t.Run("MaxFloat64", func(t *testing.T) { test.Universal(t, MaxFloat64, fallback.Max) }) 26 | t.Run("MinFloat64", func(t *testing.T) { test.Universal(t, MinFloat64, fallback.Min) }) 27 | t.Run("MulFloat64", func(t *testing.T) { test.Universal(t, MulFloat64, fallback.Mul) }) 28 | t.Run("OrInt32", func(t *testing.T) { test.Universal(t, OrInt32, fallback.Or) }) 29 | t.Run("OrInt64", func(t *testing.T) { test.Universal(t, OrInt64, fallback.Or) }) 30 | t.Run("SubFloat64", func(t *testing.T) { test.Universal(t, SubFloat64, fallback.Sub) }) 31 | t.Run("SubInt32", func(t *testing.T) { test.Universal(t, SubInt32, fallback.Sub) }) 32 | t.Run("SubInt64", func(t *testing.T) { test.Universal(t, SubInt64, fallback.Sub) }) 33 | t.Run("XorInt32", func(t *testing.T) { test.Universal(t, XorInt32, fallback.Xor) }) 34 | t.Run("XorInt64", func(t *testing.T) { test.Universal(t, XorInt64, fallback.Xor) }) 35 | } 36 | -------------------------------------------------------------------------------- /internal/sse41/MaxInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MaxInt32(left, right, result []int32) int 5 | TEXT ·MaxInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 
16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Max four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PMAXSD X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Max one int32 value. 37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | CMPL R9, R8 40 | CMOVLGT R9, R8 41 | MOVL R8, (DI)(AX*4) 42 | INCQ AX 43 | JMP singleDataLoop 44 | returnLength: 45 | MOVQ CX, int+72(FP) 46 | RET 47 | -------------------------------------------------------------------------------- /internal/sse41/MinInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MinInt32(left, right, result []int32) int 5 | TEXT ·MinInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Min four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PMINSD X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Min one int32 value. 
37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | CMPL R9, R8 40 | CMOVLLT R9, R8 41 | MOVL R8, (DI)(AX*4) 42 | INCQ AX 43 | JMP singleDataLoop 44 | returnLength: 45 | MOVQ CX, int+72(FP) 46 | RET 47 | -------------------------------------------------------------------------------- /internal/sse41/MulInt32_amd64.s: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | // func MulInt32(left, right, result []int32) int 5 | TEXT ·MulInt32(SB), 4, $0 6 | //Load slices lengths. 7 | MOVQ leftLen+8(FP), AX 8 | MOVQ rightLen+32(FP), BX 9 | MOVQ resultLen+56(FP), CX 10 | //Get minimum length. 11 | CMPQ AX, CX 12 | CMOVQLT AX, CX 13 | CMPQ BX, CX 14 | CMOVQLT BX, CX 15 | //Load slices data pointers. 16 | MOVQ leftData+0(FP), SI 17 | MOVQ rightData+24(FP), DX 18 | MOVQ resultData+48(FP), DI 19 | //Initialize loop index. 20 | MOVQ $0, AX 21 | multipleDataLoop: 22 | MOVQ CX, BX 23 | SUBQ AX, BX 24 | CMPQ BX, $4 25 | JL singleDataLoop 26 | //Mul four int32 values. 27 | MOVOU (SI)(AX*4), X0 28 | MOVOU (DX)(AX*4), X1 29 | PMULLD X1, X0 30 | MOVOU X0, (DI)(AX*4) 31 | ADDQ $4, AX 32 | JMP multipleDataLoop 33 | singleDataLoop: 34 | CMPQ AX, CX 35 | JGE returnLength 36 | //Mul one int32 value. 
37 | MOVL (SI)(AX*4), R8 38 | MOVL (DX)(AX*4), R9 39 | IMULL R9, R8 40 | MOVL R8, (DI)(AX*4) 41 | INCQ AX 42 | JMP singleDataLoop 43 | returnLength: 44 | MOVQ CX, int+72(FP) 45 | RET 46 | -------------------------------------------------------------------------------- /internal/sse41/sse41_amd64.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package sse41 5 | 6 | func MaxInt32(left, right, result []int32) int 7 | 8 | func MinInt32(left, right, result []int32) int 9 | 10 | func MulInt32(left, right, result []int32) int 11 | -------------------------------------------------------------------------------- /internal/sse41/sse41_amd64_test.go: -------------------------------------------------------------------------------- 1 | //go:build amd64 2 | // +build amd64 3 | 4 | package sse41 5 | 6 | import ( 7 | "testing" 8 | 9 | "github.com/pehringer/simd/internal/fallback" 10 | "github.com/pehringer/simd/internal/test" 11 | "golang.org/x/sys/cpu" 12 | ) 13 | 14 | func TestSse41(t *testing.T) { 15 | if !cpu.X86.HasSSE41 { 16 | t.Skip("sse4.1 not supported") 17 | return 18 | } 19 | t.Run("MaxInt32", func(t *testing.T) { test.Universal(t, MaxInt32, fallback.Max) }) 20 | t.Run("MinInt32", func(t *testing.T) { test.Universal(t, MinInt32, fallback.Min) }) 21 | t.Run("MulInt32", func(t *testing.T) { test.Universal(t, MulInt32, fallback.Mul) }) 22 | } 23 | -------------------------------------------------------------------------------- /internal/test/utils.go: -------------------------------------------------------------------------------- 1 | package test 2 | 3 | import ( 4 | "math" 5 | "testing" 6 | 7 | "github.com/pehringer/simd/internal/data" 8 | ) 9 | 10 | func checkSlice[T data.Floating | data.Integer](t *testing.T, test, control []T) bool { 11 | if len(test) != len(control) { 12 | t.Errorf("lengths not equal") 13 | return false 14 | } 15 | if cap(test) != cap(control) { 16 | 
t.Errorf("capacities not equal") 17 | return false 18 | } 19 | for i := range len(control) { 20 | if test[i] != control[i] { 21 | t.Errorf("elements not equal") 22 | return false 23 | } 24 | } 25 | return true 26 | } 27 | 28 | func checkOperation[T data.Floating | data.Integer](t *testing.T, test, control data.Operation[T], left, right, result []T) bool { 29 | testLeft := make([]T, len(left), cap(left)) 30 | copy(testLeft, left) 31 | testRight := make([]T, len(right), cap(right)) 32 | copy(testRight, right) 33 | testResult := make([]T, len(result), cap(result)) 34 | copy(testResult, result) 35 | if test(testLeft, testRight, testResult) != control(left, right, result) { 36 | t.Errorf("operation returned incorrect length") 37 | return false 38 | } 39 | if !checkSlice(t, testLeft, left) { 40 | return false 41 | } 42 | if !checkSlice(t, testRight, right) { 43 | return false 44 | } 45 | if !checkSlice(t, testResult, result) { 46 | return false 47 | } 48 | return true 49 | } 50 | 51 | var ( 52 | increment int64 = 1 53 | decrement int64 = math.MaxInt64 54 | ) 55 | 56 | func large[T data.Floating | data.Integer](length int) []T { 57 | elements := make([]T, length) 58 | for i := range length { 59 | elements[i] = T(decrement) 60 | decrement-- 61 | } 62 | return elements 63 | } 64 | 65 | func small[T data.Floating | data.Integer](length int) []T { 66 | elements := make([]T, length) 67 | for i := range length { 68 | elements[i] = T(increment) 69 | increment++ 70 | } 71 | return elements 72 | } 73 | 74 | func Universal[T data.Floating | data.Integer](t *testing.T, test, control data.Operation[T]) { 75 | checkOperation(t, test, control, []T{}, []T{}, []T{}) 76 | checkOperation(t, test, control, small[T](11), small[T](13), small[T](17)) 77 | checkOperation(t, test, control, small[T](29), small[T](19), large[T](23)) 78 | checkOperation(t, test, control, small[T](37), large[T](41), small[T](31)) 79 | checkOperation(t, test, control, small[T](43), large[T](47), large[T](53)) 80 | 
checkOperation(t, test, control, large[T](67), small[T](59), small[T](61)) 81 | checkOperation(t, test, control, large[T](73), small[T](79), large[T](71)) 82 | } 83 | -------------------------------------------------------------------------------- /logo/150x150.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pehringer/simd/80b2f767e42d8e93cfa7b4f6d0ad62dd28d665b5/logo/150x150.png -------------------------------------------------------------------------------- /logo/15x15.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pehringer/simd/80b2f767e42d8e93cfa7b4f6d0ad62dd28d665b5/logo/15x15.png -------------------------------------------------------------------------------- /logo/300x300.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pehringer/simd/80b2f767e42d8e93cfa7b4f6d0ad62dd28d665b5/logo/300x300.png -------------------------------------------------------------------------------- /logo/600x600.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/pehringer/simd/80b2f767e42d8e93cfa7b4f6d0ad62dd28d665b5/logo/600x600.png -------------------------------------------------------------------------------- /simd.go: -------------------------------------------------------------------------------- 1 | // SIMD support via Go assembly for arithmetic and bitwise operations. 2 | // Allowing for parallel element-wise computations. 
3 | package simd 4 | 5 | import ( 6 | "github.com/pehringer/simd/internal/data" 7 | "github.com/pehringer/simd/internal/fallback" 8 | ) 9 | 10 | var ( 11 | addFloat32 data.Operation[float32] = fallback.Add[float32] 12 | addFloat64 data.Operation[float64] = fallback.Add[float64] 13 | addInt32 data.Operation[int32] = fallback.Add[int32] 14 | addInt64 data.Operation[int64] = fallback.Add[int64] 15 | andInt32 data.Operation[int32] = fallback.And[int32] 16 | andInt64 data.Operation[int64] = fallback.And[int64] 17 | divFloat32 data.Operation[float32] = fallback.Div[float32] 18 | divFloat64 data.Operation[float64] = fallback.Div[float64] 19 | divInt32 data.Operation[int32] = fallback.Div[int32] 20 | divInt64 data.Operation[int64] = fallback.Div[int64] 21 | maxFloat32 data.Operation[float32] = fallback.Max[float32] 22 | maxFloat64 data.Operation[float64] = fallback.Max[float64] 23 | maxInt32 data.Operation[int32] = fallback.Max[int32] 24 | maxInt64 data.Operation[int64] = fallback.Max[int64] 25 | minFloat32 data.Operation[float32] = fallback.Min[float32] 26 | minFloat64 data.Operation[float64] = fallback.Min[float64] 27 | minInt32 data.Operation[int32] = fallback.Min[int32] 28 | minInt64 data.Operation[int64] = fallback.Min[int64] 29 | mulFloat32 data.Operation[float32] = fallback.Mul[float32] 30 | mulFloat64 data.Operation[float64] = fallback.Mul[float64] 31 | mulInt32 data.Operation[int32] = fallback.Mul[int32] 32 | mulInt64 data.Operation[int64] = fallback.Mul[int64] 33 | orInt32 data.Operation[int32] = fallback.Or[int32] 34 | orInt64 data.Operation[int64] = fallback.Or[int64] 35 | subFloat32 data.Operation[float32] = fallback.Sub[float32] 36 | subFloat64 data.Operation[float64] = fallback.Sub[float64] 37 | subInt32 data.Operation[int32] = fallback.Sub[int32] 38 | subInt64 data.Operation[int64] = fallback.Sub[int64] 39 | xorInt32 data.Operation[int32] = fallback.Xor[int32] 40 | xorInt64 data.Operation[int64] = fallback.Xor[int64] 41 | ) 42 | 43 | // AddFloat32 performs 
element-wise addition on left and right, storing the sums in result. 44 | // The operation is performed up to the shortest length of left, right, and result. 45 | // Returns the number of operations performed. 46 | func AddFloat32(left, right, result []float32) int { 47 | return addFloat32(left, right, result) 48 | } 49 | 50 | // AddFloat64 performs element-wise addition on left and right, storing the sums in result. 51 | // The operation is performed up to the shortest length of left, right, and result. 52 | // Returns the number of operations performed. 53 | func AddFloat64(left, right, result []float64) int { 54 | return addFloat64(left, right, result) 55 | } 56 | 57 | // AddInt32 performs element-wise addition on left and right, storing the sums in result. 58 | // The operation is performed up to the shortest length of left, right, and result. 59 | // Returns the number of operations performed. 60 | func AddInt32(left, right, result []int32) int { 61 | return addInt32(left, right, result) 62 | } 63 | 64 | // AddInt64 performs element-wise addition on left and right, storing the sums in result. 65 | // The operation is performed up to the shortest length of left, right, and result. 66 | // Returns the number of operations performed. 67 | func AddInt64(left, right, result []int64) int { 68 | return addInt64(left, right, result) 69 | } 70 | 71 | // AndInt32 performs element-wise AND on left and right, storing the results in result. 72 | // The operation is performed up to the shortest length of left, right, and result. 73 | // Returns the number of operations performed. 74 | func AndInt32(left, right, result []int32) int { 75 | return andInt32(left, right, result) 76 | } 77 | 78 | // AndInt64 performs element-wise AND on left and right, storing the results in result. 79 | // The operation is performed up to the shortest length of left, right, and result. 80 | // Returns the number of operations performed. 
81 | func AndInt64(left, right, result []int64) int { 82 | return andInt64(left, right, result) 83 | } 84 | 85 | // DivFloat32 performs element-wise division on left and right, storing the quotients in result. 86 | // The operation is performed up to the shortest length of left, right, and result. 87 | // Returns the number of operations performed. 88 | func DivFloat32(left, right, result []float32) int { 89 | return divFloat32(left, right, result) 90 | } 91 | 92 | // DivFloat64 performs element-wise division on left and right, storing the quotients in result. 93 | // The operation is performed up to the shortest length of left, right, and result. 94 | // Returns the number of operations performed. 95 | func DivFloat64(left, right, result []float64) int { 96 | return divFloat64(left, right, result) 97 | } 98 | 99 | // DivInt32 performs element-wise division on left and right, storing the quotients in result. 100 | // The operation is performed up to the shortest length of left, right, and result. 101 | // Returns the number of operations performed. 102 | func DivInt32(left, right, result []int32) int { 103 | return divInt32(left, right, result) 104 | } 105 | 106 | // DivInt64 performs element-wise division on left and right, storing the quotients in result. 107 | // The operation is performed up to the shortest length of left, right, and result. 108 | // Returns the number of operations performed. 109 | func DivInt64(left, right, result []int64) int { 110 | return divInt64(left, right, result) 111 | } 112 | 113 | // MaxFloat32 performs element-wise maximum on left and right, storing the maxes in result. 114 | // The operation is performed up to the shortest length of left, right, and result. 115 | // Returns the number of operations performed. 116 | func MaxFloat32(left, right, result []float32) int { 117 | return maxFloat32(left, right, result) 118 | } 119 | 120 | // MaxFloat64 performs element-wise maximum on left and right, storing the maxes in result. 
121 | // The operation is performed up to the shortest length of left, right, and result. 122 | // Returns the number of operations performed. 123 | func MaxFloat64(left, right, result []float64) int { 124 | return maxFloat64(left, right, result) 125 | } 126 | 127 | // MaxInt32 performs element-wise maximum on left and right, storing the maxes in result. 128 | // The operation is performed up to the shortest length of left, right, and result. 129 | // Returns the number of operations performed. 130 | func MaxInt32(left, right, result []int32) int { 131 | return maxInt32(left, right, result) 132 | } 133 | 134 | // MaxInt64 performs element-wise maximum on left and right, storing the maxes in result. 135 | // The operation is performed up to the shortest length of left, right, and result. 136 | // Returns the number of operations performed. 137 | func MaxInt64(left, right, result []int64) int { 138 | return maxInt64(left, right, result) 139 | } 140 | 141 | // MinFloat32 performs element-wise minimum on left and right, storing the mins in result. 142 | // The operation is performed up to the shortest length of left, right, and result. 143 | // Returns the number of operations performed. 144 | func MinFloat32(left, right, result []float32) int { 145 | return minFloat32(left, right, result) 146 | } 147 | 148 | // MinFloat64 performs element-wise minimum on left and right, storing the mins in result. 149 | // The operation is performed up to the shortest length of left, right, and result. 150 | // Returns the number of operations performed. 151 | func MinFloat64(left, right, result []float64) int { 152 | return minFloat64(left, right, result) 153 | } 154 | 155 | // MinInt32 performs element-wise minimum on left and right, storing the mins in result. 156 | // The operation is performed up to the shortest length of left, right, and result. 157 | // Returns the number of operations performed. 
158 | func MinInt32(left, right, result []int32) int { 159 | return minInt32(left, right, result) 160 | } 161 | 162 | // MinInt64 performs element-wise minimum on left and right, storing the mins in result. 163 | // The operation is performed up to the shortest length of left, right, and result. 164 | // Returns the number of operations performed. 165 | func MinInt64(left, right, result []int64) int { 166 | return minInt64(left, right, result) 167 | } 168 | 169 | // MulFloat32 performs element-wise multiplication on left and right, storing the products in result. 170 | // The operation is performed up to the shortest length of left, right, and result. 171 | // Returns the number of operations performed. 172 | func MulFloat32(left, right, result []float32) int { 173 | return mulFloat32(left, right, result) 174 | } 175 | 176 | // MulFloat64 performs element-wise multiplication on left and right, storing the products in result. 177 | // The operation is performed up to the shortest length of left, right, and result. 178 | // Returns the number of operations performed. 179 | func MulFloat64(left, right, result []float64) int { 180 | return mulFloat64(left, right, result) 181 | } 182 | 183 | // MulInt32 performs element-wise multiplication on left and right, storing the products in result. 184 | // The operation is performed up to the shortest length of left, right, and result. 185 | // Returns the number of operations performed. 186 | func MulInt32(left, right, result []int32) int { 187 | return mulInt32(left, right, result) 188 | } 189 | 190 | // MulInt64 performs element-wise multiplication on left and right, storing the products in result. 191 | // The operation is performed up to the shortest length of left, right, and result. 192 | // Returns the number of operations performed. 
193 | func MulInt64(left, right, result []int64) int { 194 | return mulInt64(left, right, result) 195 | } 196 | 197 | // OrInt32 performs element-wise OR on left and right, storing the results in result. 198 | // The operation is performed up to the shortest length of left, right, and result. 199 | // Returns the number of operations performed. 200 | func OrInt32(left, right, result []int32) int { 201 | return orInt32(left, right, result) 202 | } 203 | 204 | // OrInt64 performs element-wise OR on left and right, storing the results in result. 205 | // The operation is performed up to the shortest length of left, right, and result. 206 | // Returns the number of operations performed. 207 | func OrInt64(left, right, result []int64) int { 208 | return orInt64(left, right, result) 209 | } 210 | 211 | // SubFloat32 performs element-wise subtraction on left and right, storing the differences in result. 212 | // The operation is performed up to the shortest length of left, right, and result. 213 | // Returns the number of operations performed. 214 | func SubFloat32(left, right, result []float32) int { 215 | return subFloat32(left, right, result) 216 | } 217 | 218 | // SubFloat64 performs element-wise subtraction on left and right, storing the differences in result. 219 | // The operation is performed up to the shortest length of left, right, and result. 220 | // Returns the number of operations performed. 221 | func SubFloat64(left, right, result []float64) int { 222 | return subFloat64(left, right, result) 223 | } 224 | 225 | // SubInt32 performs element-wise subtraction on left and right, storing the differences in result. 226 | // The operation is performed up to the shortest length of left, right, and result. 227 | // Returns the number of operations performed. 
228 | func SubInt32(left, right, result []int32) int { 229 | return subInt32(left, right, result) 230 | } 231 | 232 | // SubInt64 performs element-wise subtraction on left and right, storing the differences in result. 233 | // The operation is performed up to the shortest length of left, right, and result. 234 | // Returns the number of operations performed. 235 | func SubInt64(left, right, result []int64) int { 236 | return subInt64(left, right, result) 237 | } 238 | 239 | // XorInt32 performs element-wise XOR on left and right, storing the results in result. 240 | // The operation is performed up to the shortest length of left, right, and result. 241 | // Returns the number of operations performed. 242 | func XorInt32(left, right, result []int32) int { 243 | return xorInt32(left, right, result) 244 | } 245 | 246 | // XorInt64 performs element-wise XOR on left and right, storing the results in result. 247 | // The operation is performed up to the shortest length of left, right, and result. 248 | // Returns the number of operations performed. 
249 | func XorInt64(left, right, result []int64) int { 250 | return xorInt64(left, right, result) 251 | } 252 | -------------------------------------------------------------------------------- /simd_test.go: -------------------------------------------------------------------------------- 1 | package simd 2 | 3 | import ( 4 | "fmt" 5 | ) 6 | 7 | func ExampleAddFloat32() { 8 | left := []float32{1, 9, 2, 8} 9 | right := []float32{3, 7, 4, 6, 5} 10 | result := []float32{0, 0, 0, 0, 0, 0} 11 | length := AddFloat32(left, right, result) 12 | fmt.Print(length, result) 13 | // Output: 4 [4 16 6 14 0 0] 14 | } 15 | 16 | func ExampleAddFloat64() { 17 | left := []float64{1, 9, 2, 8} 18 | right := []float64{3, 7, 4, 6, 5} 19 | result := []float64{0, 0, 0, 0, 0, 0} 20 | length := AddFloat64(left, right, result) 21 | fmt.Print(length, result) 22 | // Output: 4 [4 16 6 14 0 0] 23 | } 24 | 25 | func ExampleAddInt32() { 26 | left := []int32{1, 9, 2, 8} 27 | right := []int32{3, 7, 4, 6, 5} 28 | result := []int32{0, 0, 0, 0, 0, 0} 29 | length := AddInt32(left, right, result) 30 | fmt.Print(length, result) 31 | // Output: 4 [4 16 6 14 0 0] 32 | } 33 | 34 | func ExampleAddInt64() { 35 | left := []int64{1, 9, 2, 8} 36 | right := []int64{3, 7, 4, 6, 5} 37 | result := []int64{0, 0, 0, 0, 0, 0} 38 | length := AddInt64(left, right, result) 39 | fmt.Print(length, result) 40 | // Output: 4 [4 16 6 14 0 0] 41 | } 42 | 43 | func ExampleAndInt32() { 44 | left := []int32{1, 9, 2, 8} 45 | right := []int32{3, 7, 4, 6, 5} 46 | result := []int32{0, 0, 0, 0, 0, 0} 47 | length := AndInt32(left, right, result) 48 | fmt.Print(length, result) 49 | // Output: 4 [1 1 0 0 0 0] 50 | } 51 | 52 | func ExampleAndInt64() { 53 | left := []int64{1, 9, 2, 8} 54 | right := []int64{3, 7, 4, 6, 5} 55 | result := []int64{0, 0, 0, 0, 0, 0} 56 | length := AndInt64(left, right, result) 57 | fmt.Print(length, result) 58 | // Output: 4 [1 1 0 0 0 0] 59 | } 60 | 61 | func ExampleDivFloat32() { 62 | left := []float32{1, 9, 2, 
8} 63 | right := []float32{3, 7, 4, 6, 5} 64 | result := []float32{0, 0, 0, 0, 0, 0} 65 | length := DivFloat32(left, right, result) 66 | fmt.Print(length, result) 67 | // Output: 4 [0.33333334 1.2857143 0.5 1.3333334 0 0] 68 | } 69 | 70 | func ExampleDivFloat64() { 71 | left := []float64{1, 9, 2, 8} 72 | right := []float64{3, 7, 4, 6, 5} 73 | result := []float64{0, 0, 0, 0, 0, 0} 74 | length := DivFloat64(left, right, result) 75 | fmt.Print(length, result) 76 | // Output: 4 [0.3333333333333333 1.2857142857142858 0.5 1.3333333333333333 0 0] 77 | } 78 | 79 | func ExampleDivInt32() { 80 | left := []int32{1, 9, 2, 8} 81 | right := []int32{3, 7, 4, 6, 5} 82 | result := []int32{0, 0, 0, 0, 0, 0} 83 | length := DivInt32(left, right, result) 84 | fmt.Print(length, result) 85 | // Output: 4 [0 1 0 1 0 0] 86 | } 87 | 88 | func ExampleDivInt64() { 89 | left := []int64{1, 9, 2, 8} 90 | right := []int64{3, 7, 4, 6, 5} 91 | result := []int64{0, 0, 0, 0, 0, 0} 92 | length := DivInt64(left, right, result) 93 | fmt.Print(length, result) 94 | // Output: 4 [0 1 0 1 0 0] 95 | } 96 | 97 | func ExampleMaxFloat32() { 98 | left := []float32{1, 9, 2, 8} 99 | right := []float32{3, 7, 4, 6, 5} 100 | result := []float32{0, 0, 0, 0, 0, 0} 101 | length := MaxFloat32(left, right, result) 102 | fmt.Print(length, result) 103 | // Output: 4 [3 9 4 8 0 0] 104 | } 105 | 106 | func ExampleMaxFloat64() { 107 | left := []float64{1, 9, 2, 8} 108 | right := []float64{3, 7, 4, 6, 5} 109 | result := []float64{0, 0, 0, 0, 0, 0} 110 | length := MaxFloat64(left, right, result) 111 | fmt.Print(length, result) 112 | // Output: 4 [3 9 4 8 0 0] 113 | } 114 | 115 | func ExampleMaxInt32() { 116 | left := []int32{1, 9, 2, 8} 117 | right := []int32{3, 7, 4, 6, 5} 118 | result := []int32{0, 0, 0, 0, 0, 0} 119 | length := MaxInt32(left, right, result) 120 | fmt.Print(length, result) 121 | // Output: 4 [3 9 4 8 0 0] 122 | } 123 | 124 | func ExampleMaxInt64() { 125 | left := []int64{1, 9, 2, 8} 126 | right := []int64{3, 
7, 4, 6, 5} 127 | result := []int64{0, 0, 0, 0, 0, 0} 128 | length := MaxInt64(left, right, result) 129 | fmt.Print(length, result) 130 | // Output: 4 [3 9 4 8 0 0] 131 | } 132 | 133 | func ExampleMinFloat32() { 134 | left := []float32{1, 9, 2, 8} 135 | right := []float32{3, 7, 4, 6, 5} 136 | result := []float32{0, 0, 0, 0, 0, 0} 137 | length := MinFloat32(left, right, result) 138 | fmt.Print(length, result) 139 | // Output: 4 [1 7 2 6 0 0] 140 | } 141 | 142 | func ExampleMinFloat64() { 143 | left := []float64{1, 9, 2, 8} 144 | right := []float64{3, 7, 4, 6, 5} 145 | result := []float64{0, 0, 0, 0, 0, 0} 146 | length := MinFloat64(left, right, result) 147 | fmt.Print(length, result) 148 | // Output: 4 [1 7 2 6 0 0] 149 | } 150 | 151 | func ExampleMinInt32() { 152 | left := []int32{1, 9, 2, 8} 153 | right := []int32{3, 7, 4, 6, 5} 154 | result := []int32{0, 0, 0, 0, 0, 0} 155 | length := MinInt32(left, right, result) 156 | fmt.Print(length, result) 157 | // Output: 4 [1 7 2 6 0 0] 158 | } 159 | 160 | func ExampleMinInt64() { 161 | left := []int64{1, 9, 2, 8} 162 | right := []int64{3, 7, 4, 6, 5} 163 | result := []int64{0, 0, 0, 0, 0, 0} 164 | length := MinInt64(left, right, result) 165 | fmt.Print(length, result) 166 | // Output: 4 [1 7 2 6 0 0] 167 | } 168 | 169 | func ExampleMulFloat32() { 170 | left := []float32{1, 9, 2, 8} 171 | right := []float32{3, 7, 4, 6, 5} 172 | result := []float32{0, 0, 0, 0, 0, 0} 173 | length := MulFloat32(left, right, result) 174 | fmt.Print(length, result) 175 | // Output: 4 [3 63 8 48 0 0] 176 | } 177 | 178 | func ExampleMulFloat64() { 179 | left := []float64{1, 9, 2, 8} 180 | right := []float64{3, 7, 4, 6, 5} 181 | result := []float64{0, 0, 0, 0, 0, 0} 182 | length := MulFloat64(left, right, result) 183 | fmt.Print(length, result) 184 | // Output: 4 [3 63 8 48 0 0] 185 | } 186 | 187 | func ExampleMulInt32() { 188 | left := []int32{1, 9, 2, 8} 189 | right := []int32{3, 7, 4, 6, 5} 190 | result := []int32{0, 0, 0, 0, 0, 0} 191 | 
length := MulInt32(left, right, result) 192 | fmt.Print(length, result) 193 | // Output: 4 [3 63 8 48 0 0] 194 | } 195 | 196 | func ExampleMulInt64() { 197 | left := []int64{1, 9, 2, 8} 198 | right := []int64{3, 7, 4, 6, 5} 199 | result := []int64{0, 0, 0, 0, 0, 0} 200 | length := MulInt64(left, right, result) 201 | fmt.Print(length, result) 202 | // Output: 4 [3 63 8 48 0 0] 203 | } 204 | 205 | func ExampleOrInt32() { 206 | left := []int32{1, 9, 2, 8} 207 | right := []int32{3, 7, 4, 6, 5} 208 | result := []int32{0, 0, 0, 0, 0, 0} 209 | length := OrInt32(left, right, result) 210 | fmt.Print(length, result) 211 | // Output: 4 [3 15 6 14 0 0] 212 | } 213 | 214 | func ExampleOrInt64() { 215 | left := []int64{1, 9, 2, 8} 216 | right := []int64{3, 7, 4, 6, 5} 217 | result := []int64{0, 0, 0, 0, 0, 0} 218 | length := OrInt64(left, right, result) 219 | fmt.Print(length, result) 220 | // Output: 4 [3 15 6 14 0 0] 221 | } 222 | 223 | func ExampleSubFloat32() { 224 | left := []float32{1, 9, 2, 8} 225 | right := []float32{3, 7, 4, 6, 5} 226 | result := []float32{0, 0, 0, 0, 0, 0} 227 | length := SubFloat32(left, right, result) 228 | fmt.Print(length, result) 229 | // Output: 4 [-2 2 -2 2 0 0] 230 | } 231 | 232 | func ExampleSubFloat64() { 233 | left := []float64{1, 9, 2, 8} 234 | right := []float64{3, 7, 4, 6, 5} 235 | result := []float64{0, 0, 0, 0, 0, 0} 236 | length := SubFloat64(left, right, result) 237 | fmt.Print(length, result) 238 | // Output: 4 [-2 2 -2 2 0 0] 239 | } 240 | 241 | func ExampleSubInt32() { 242 | left := []int32{1, 9, 2, 8} 243 | right := []int32{3, 7, 4, 6, 5} 244 | result := []int32{0, 0, 0, 0, 0, 0} 245 | length := SubInt32(left, right, result) 246 | fmt.Print(length, result) 247 | // Output: 4 [-2 2 -2 2 0 0] 248 | } 249 | 250 | func ExampleSubInt64() { 251 | left := []int64{1, 9, 2, 8} 252 | right := []int64{3, 7, 4, 6, 5} 253 | result := []int64{0, 0, 0, 0, 0, 0} 254 | length := SubInt64(left, right, result) 255 | fmt.Print(length, result) 256 | 
// Output: 4 [-2 2 -2 2 0 0] 257 | } 258 | 259 | func ExampleXorInt32() { 260 | left := []int32{1, 9, 2, 8} 261 | right := []int32{3, 7, 4, 6, 5} 262 | result := []int32{0, 0, 0, 0, 0, 0} 263 | length := XorInt32(left, right, result) 264 | fmt.Print(length, result) 265 | // Output: 4 [2 14 6 14 0 0] 266 | } 267 | 268 | func ExampleXorInt64() { 269 | left := []int64{1, 9, 2, 8} 270 | right := []int64{3, 7, 4, 6, 5} 271 | result := []int64{0, 0, 0, 0, 0, 0} 272 | length := XorInt64(left, right, result) 273 | fmt.Print(length, result) 274 | // Output: 4 [2 14 6 14 0 0] 275 | } 276 | --------------------------------------------------------------------------------