├── COPYING ├── README ├── bench_test.go ├── count_test.go ├── countavx2_386.s ├── countavx2_amd64.s ├── countavx512_amd64.s ├── countneon_arm64.s ├── countsse2_386.s ├── countsse2_amd64.s ├── dispatch.go ├── example_test.go ├── generic.go ├── go.mod ├── go.sum ├── minimize_test.go ├── overflow_test.go ├── overread_test.go ├── safe.go ├── select_386.go ├── select_amd64.go ├── select_arm64.go └── select_generic.go /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020--2024 Robert Clausecker 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 17 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 18 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 20 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | High-performance vectorised positional popcount routines for Go 2 | =============================================================== 3 | 4 | This repository contains implementations of the positional population 5 | count functions for Go. Details on the algorithms used will be 6 | published in a future research paper. 7 | 8 | To use this library, import it as follows: 9 | 10 | import "github.com/clausecker/pospop" 11 | 12 | You can then count populations using the Count8, Count16, Count32, 13 | Count64, and CountString functions: 14 | 15 | var counts [8]int 16 | pospop.Count8(&counts, buf) 17 | 18 | The positional population count for buf is added to the contents of 19 | counts. 20 | 21 | Supported Platforms 22 | ------------------- 23 | 24 | The kernels work on a block size of 240 or 480 bytes (depending on 25 | whether AVX2 is available or not). A buffer size that is a multiple 26 | of 480 bytes and at least 10 kB is recommended. 27 | 28 | Implementations are provided for the following SIMD extensions: 29 | 30 | * AVX-512 F/BW (amd64) 31 | * AVX2 (amd64, 386) 32 | * SSE2 (amd64, 386) 33 | * NEON (arm64) 34 | * generic kernel (all architectures) 35 | 36 | The three kernels for amd64 correspond to the v4, v3, and v1 values 37 | of the GOAMD64 environment variable. However, all kernels are 38 | compiled in regardless of what value GOAMD64 is set to.
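
For reference, the usage described above fits into a complete program as follows. This is only a sketch: the buffer length and contents are arbitrary, but the import path and the functions are the ones named in this README.

    package main

    import (
            "fmt"

            "github.com/clausecker/pospop"
    )

    func main() {
            // Arbitrary test data; a length that is a multiple of 480 bytes
            // keeps the SIMD kernels on their fast path.
            buf := make([]uint8, 32*480)
            for i := range buf {
                    buf[i] = uint8(3 * i)
            }

            // counts[i] accumulates how many elements of buf have bit i set.
            var counts [8]int
            pospop.Count8(&counts, buf)

            // Results are added to the existing contents of counts, so the
            // same array can be reused across several inputs.
            pospop.CountString(&counts, "hello, world")

            fmt.Println(counts)
    }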
39 | 40 | The library automatically chooses the fastest available kernel for 41 | the system it is running on. 42 | 43 | Performance 44 | ----------- 45 | 46 | As all functions (Count8, Count16, Count32, Count64, CountString) of 47 | one set are based on the same kernel with a different accumulation 48 | function, they all perform equally well. This does not apply to the 49 | generic implementations whose performance is therefore given for every 50 | function individually. 51 | 52 | The following performance table is grouped by the instruction set used 53 | and the architecture it runs on. A buffer size of 100 kB was used to 54 | find these results. 55 | 56 | 57 | amd64 386 arm64 arm 58 | avx512 82.1 GB/s --- --- --- 59 | avx2 34.8 GB/s 31.6 GB/s --- --- 60 | sse2 16.0 GB/s 15.6 GB/s --- --- 61 | neon --- --- 36.9 GB/s --- 62 | generic8 1.02 GB/s 297 MB/s 1.68 GB/s 49.0 MB/s 63 | generic16 1.71 GB/s 1.36 GB/s 3.03 GB/s 67.1 MB/s 64 | generic32 2.66 GB/s 2.21 GB/s 3.83 GB/s 105 MB/s 65 | generic64 3.43 GB/s 1.89 GB/s 6.56 GB/s 82.9 MB/s 66 | 67 | The following systems were used for benchmarks, all using Go 1.16: 68 | 69 | * amd64, 386: Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz 70 | * arm64: Apple M1 71 | * arm: ARM Cortex-A72 r0p3 (Raspberry Pi 4B) 72 | 73 | Remaining Work 74 | -------------- 75 | 76 | * provide assembly kernels for arm, ppcle, and others 77 | (hardware donations appreciated for further targets) 78 | * provide variants of Count16, Count32, and Count64 working on byte 79 | arrays 80 | 81 | (c) 2020--2024 Robert Clausecker . All Rights Reserved. 82 | 83 | This code is published under a 2-clause BSD license. See the file 84 | COPYING for details. 85 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2021 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "math/rand" 6 | import "testing" 7 | import "strconv" 8 | 9 | // sizes to benchmark 10 | var benchmarkLengths = []int{ 11 | 1, 10, 100, 1000, 10 * 1000, 100 * 1000, 1000 * 1000, 10 * 1000 * 1000, 100 * 1000 * 1000, 12 | } 13 | 14 | // sizes to benchmark in a short benchmark 15 | var benchmarkLengthsShort = []int{100 * 1000} 16 | 17 | // benchmark a count8 implementation 18 | func benchmarkCount8(b *testing.B, buf []uint8, lengths []int, count8 func(*[8]int, []uint8)) { 19 | for _, l := range lengths { 20 | b.Run(strconv.Itoa(l)+"B", func(b *testing.B) { 21 | var counts [8]int 22 | testbuf := buf[:l] 23 | b.SetBytes(int64(l) * 1) 24 | for i := 0; i < b.N; i++ { 25 | count8(&counts, testbuf) 26 | } 27 | }) 28 | } 29 | } 30 | 31 | // benchmark all Count8 implementations 32 | func BenchmarkCount8(b *testing.B) { 33 | funcs := count8funcs 34 | lengths := benchmarkLengths 35 | 36 | // short benchmark: only test the implementation 37 | // actually used and keep it to one size 38 | if testing.Short() { 39 | funcs = []count8impl{{Count8, "dispatch", true}} 40 | lengths = benchmarkLengthsShort 41 | } 42 | 43 | maxlen := lengths[len(lengths)-1] 44 | buf := make([]uint8, maxlen) 45 | rand.Read(buf) 46 | 47 | for _, impl := range funcs { 48 | b.Run(impl.name, func(bb *testing.B) { 49 | if !impl.available { 50 | bb.SkipNow() 51 | } 52 | 53 | benchmarkCount8(bb, buf, lengths, impl.count8) 54 | }) 55 | } 56 | } 57 | 58 | // benchmark a count16 implementation 59 | func benchmarkCount16(b *testing.B, buf []uint16, lengths []int, count16 func(*[16]int, []uint16)) { 60 | for _, l := range 
lengths { 61 | b.Run(strconv.Itoa(l), func(b *testing.B) { 62 | var counts [16]int 63 | testbuf := buf[:l/2] 64 | b.SetBytes(int64(l)) 65 | for i := 0; i < b.N; i++ { 66 | count16(&counts, testbuf) 67 | } 68 | }) 69 | } 70 | } 71 | 72 | // benchmark all Count16 implementations 73 | func BenchmarkCount16(b *testing.B) { 74 | funcs := count16funcs 75 | lengths := benchmarkLengths 76 | 77 | // short benchmark: only test the implementation 78 | // actually used and keep it to one size 79 | if testing.Short() { 80 | funcs = []count16impl{{Count16, "dispatch", true}} 81 | lengths = benchmarkLengthsShort 82 | } 83 | 84 | maxlen := lengths[len(lengths)-1] / 2 85 | buf := make([]uint16, maxlen) 86 | for i := range buf { 87 | buf[i] = uint16(rand.Int63()) 88 | } 89 | 90 | for _, impl := range funcs { 91 | b.Run(impl.name, func(bb *testing.B) { 92 | if !impl.available { 93 | bb.SkipNow() 94 | } 95 | 96 | benchmarkCount16(bb, buf, lengths, impl.count16) 97 | }) 98 | } 99 | } 100 | 101 | // benchmark a count32 implementation 102 | func benchmarkCount32(b *testing.B, buf []uint32, lengths []int, count32 func(*[32]int, []uint32)) { 103 | for _, l := range lengths { 104 | b.Run(strconv.Itoa(l), func(b *testing.B) { 105 | var counts [32]int 106 | testbuf := buf[:l/4] 107 | b.SetBytes(int64(l)) 108 | for i := 0; i < b.N; i++ { 109 | count32(&counts, testbuf) 110 | } 111 | }) 112 | } 113 | } 114 | 115 | // benchmark all Count32 implementations 116 | func BenchmarkCount32(b *testing.B) { 117 | funcs := count32funcs 118 | lengths := benchmarkLengths 119 | 120 | // short benchmark: only test the implementation 121 | // actually used and keep it to one size 122 | if testing.Short() { 123 | funcs = []count32impl{{Count32, "dispatch", true}} 124 | lengths = benchmarkLengthsShort 125 | } 126 | 127 | maxlen := lengths[len(lengths)-1] / 4 128 | buf := make([]uint32, maxlen) 129 | for i := range buf { 130 | buf[i] = uint32(rand.Int63()) 131 | } 132 | 133 | for _, impl := range funcs { 134 | b.Run(impl.name, func(bb *testing.B) { 135 | if !impl.available { 136 | bb.SkipNow() 137 | } 138 | 139 | benchmarkCount32(bb, buf, lengths, impl.count32) 140 | }) 141 | } 142 | } 143 | 144 | // benchmark a count64 implementation 145 | func benchmarkCount64(b *testing.B, buf []uint64, lengths []int, count64 func(*[64]int, []uint64)) { 146 | for _, l := range lengths { 147 | b.Run(strconv.Itoa(l), func(b *testing.B) { 148 | var counts [64]int 149 | testbuf := buf[:l/8] 150 | b.SetBytes(int64(l)) 151 | for i := 0; i < b.N; i++ { 152 | count64(&counts, testbuf) 153 | } 154 | }) 155 | } 156 | } 157 | 158 | // benchmark all Count64 implementations 159 | func BenchmarkCount64(b *testing.B) { 160 | funcs := count64funcs 161 | lengths := benchmarkLengths 162 | 163 | // short benchmark: only test the implementation 164 | // actually used and keep it to one size 165 | if testing.Short() { 166 | funcs = []count64impl{{Count64, "dispatch", true}} 167 | lengths = benchmarkLengthsShort 168 | } 169 | 170 | maxlen := lengths[len(lengths)-1] / 8 171 | buf := make([]uint64, maxlen) 172 | for i := range buf { 173 | buf[i] = rand.Uint64() 174 | } 175 | 176 | for _, impl := range funcs { 177 | b.Run(impl.name, func(bb *testing.B) { 178 | if !impl.available { 179 | bb.SkipNow() 180 | } 181 | 182 | benchmarkCount64(bb, buf, lengths, impl.count64) 183 | }) 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /count_test.go: -------------------------------------------------------------------------------- 1 | 
// Copyright (c) 2020--2022, 2024 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import ( 6 | "math/rand" 7 | "testing" 8 | ) 9 | 10 | // standard test lengths to try 11 | var testLengths = []int{ 12 | 0, 1, 2, 3, 13 | 4, 5, 6, 7, 14 | 8, 9, 10, 11, 15 | 12, 13, 14, 15, 16 | 16, 17, 18, 19, 17 | 31, 32, 33, 18 | 63, 64, 65, 19 | 95, 97, 98, 20 | 119, 120, 121, 21 | 239, 240, 241, 22 | 2*240 - 1, 2 * 240, 2*240 + 1, 23 | 4*240 - 1, 4 * 240, 4*240 + 1, 24 | 1023, 1024, 1025, 25 | (15 + 16) * 8, (15 + 16) * 16, (15 + 16) * 32, (15 + 16) * 64, 26 | 27 | // long length to trigger counter overflow 28 | (255*16 + 15) * 64, 29 | } 30 | 31 | // minimizing the failure causes timeout for long test cases 32 | const minimizationThreshold = (15 + 16) * 64 33 | 34 | // fill counts with random integers 35 | func randomCounts(counts []int) { 36 | for i := range counts { 37 | counts[i] = rand.Int() 38 | } 39 | } 40 | 41 | // compute the difference in length between two equally long integers slices. 42 | func countDiff(a []int, b []int) []int { 43 | res := make([]int, len(a)) 44 | 45 | for i := range a { 46 | res[i] = b[i] - a[i] 47 | } 48 | 49 | return res 50 | } 51 | 52 | // test the correctness of a count8 implementation 53 | func testCount8(t *testing.T, count8 func(*[8]int, []uint8)) { 54 | for _, len := range testLengths { 55 | buf := make([]uint8, len+1) 56 | buf = buf[1 : len+1] // ensure misalignment 57 | for i := range buf { 58 | buf[i] = uint8(rand.Int63()) 59 | } 60 | 61 | var counts [8]int 62 | randomCounts(counts[:]) 63 | refCounts := counts 64 | 65 | count8(&counts, buf) 66 | count8safe(&refCounts, buf) 67 | 68 | if counts != refCounts { 69 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 70 | } 71 | } 72 | } 73 | 74 | // test the correctness of a count16 implementation 75 | func testCount16(t *testing.T, count16 func(*[16]int, []uint16)) { 76 | for _, len := range testLengths { 77 | buf := make([]uint16, len+1) 78 | buf = buf[1 : len+1] // ensure misalignment 79 | for i := range buf { 80 | buf[i] = uint16(rand.Int63()) 81 | } 82 | 83 | var counts [16]int 84 | randomCounts(counts[:]) 85 | refCounts := counts 86 | 87 | count16(&counts, buf) 88 | count16safe(&refCounts, buf) 89 | 90 | if counts != refCounts { 91 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 92 | } 93 | } 94 | } 95 | 96 | // test the correctness of a count32 implementation 97 | func testCount32(t *testing.T, count32 func(*[32]int, []uint32)) { 98 | for _, len := range testLengths { 99 | buf := make([]uint32, len+1) 100 | buf = buf[1 : len+1] // ensure misalignment 101 | for i := range buf { 102 | buf[i] = rand.Uint32() 103 | } 104 | 105 | var counts [32]int 106 | randomCounts(counts[:]) 107 | refCounts := counts 108 | 109 | count32(&counts, buf) 110 | count32safe(&refCounts, buf) 111 | 112 | if counts != refCounts { 113 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 114 | } 115 | } 116 | } 117 | 118 | // test the correctness of a count64 implementation 119 | func testCount64(t *testing.T, count64 func(*[64]int, []uint64)) { 120 | for _, len := range testLengths { 121 | buf := make([]uint64, len+1) 122 | buf = buf[1 : len+1] // ensure misalignment 123 | for i := range buf { 124 | buf[i] = rand.Uint64() 125 | } 126 | 127 | var counts [64]int 128 | randomCounts(counts[:]) 129 | refCounts := counts 130 | 131 | count64(&counts, buf) 132 | count64safe(&refCounts, buf) 133 | 134 | if counts != refCounts { 
135 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 136 | 137 | if len > minimizationThreshold { 138 | continue 139 | } 140 | 141 | min := minimizeTestcase64(count64, buf) 142 | tcstr := testcaseString64(min) 143 | if tcstr != "" { 144 | t.Log("minimized test case:\n", tcstr) 145 | } 146 | } 147 | } 148 | } 149 | 150 | // test the correctness of CountString 151 | func TestCountString(t *testing.T) { 152 | testCount8(t, func(counts *[8]int, buf []uint8) { CountString(counts, string(buf)) }) 153 | } 154 | 155 | // test the correctness of all Count8 implementations 156 | func TestCount8(t *testing.T) { 157 | t.Run("dispatch", func(tt *testing.T) { testCount8(tt, Count8) }) 158 | 159 | for i := range count8funcs { 160 | t.Run(count8funcs[i].name, func(tt *testing.T) { 161 | if !count8funcs[i].available { 162 | tt.SkipNow() 163 | } 164 | 165 | testCount8(tt, count8funcs[i].count8) 166 | }) 167 | } 168 | } 169 | 170 | // test the correctness of Count16 171 | func TestCount16(t *testing.T) { 172 | t.Run("dispatch", func(tt *testing.T) { testCount16(tt, Count16) }) 173 | 174 | for i := range count16funcs { 175 | t.Run(count16funcs[i].name, func(tt *testing.T) { 176 | if !count16funcs[i].available { 177 | tt.SkipNow() 178 | } 179 | 180 | testCount16(tt, count16funcs[i].count16) 181 | }) 182 | } 183 | } 184 | 185 | // test the correctness of Count32 186 | func TestCount32(t *testing.T) { 187 | t.Run("dispatch", func(tt *testing.T) { testCount32(tt, Count32) }) 188 | 189 | for i := range count32funcs { 190 | t.Run(count32funcs[i].name, func(tt *testing.T) { 191 | if !count32funcs[i].available { 192 | tt.SkipNow() 193 | } 194 | 195 | testCount32(tt, count32funcs[i].count32) 196 | }) 197 | } 198 | } 199 | 200 | // test the correctness of Count64 201 | func TestCount64(t *testing.T) { 202 | t.Run("dispatch", func(tt *testing.T) { testCount64(tt, Count64) }) 203 | 204 | for i := range count64funcs { 205 | t.Run(count64funcs[i].name, func(tt *testing.T) { 206 | if !count64funcs[i].available { 207 | tt.SkipNow() 208 | } 209 | 210 | testCount64(tt, count64funcs[i].count64) 211 | }) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /countavx2_386.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // AVX2 based kernels for the positional population count operation. 4 | // All these kernels have the same backbone based on a 15-fold CSA 5 | // reduction to first reduce 480 byte into 4x32 byte, followed by a 6 | // bunch of shuffles to group the positional registers into nibbles. 7 | // These are then summed up using a width-specific summation function. 8 | // Required CPU extension: AVX2. 9 | 10 | // magic transposition constants, comparison constants 11 | DATA magic<>+ 0(SB)/8, $0x0000000000000000 12 | DATA magic<>+ 8(SB)/8, $0x0101010101010101 13 | DATA magic<>+16(SB)/8, $0x0202020202020202 14 | DATA magic<>+24(SB)/8, $0x0303030303030303 15 | DATA magic<>+32(SB)/8, $0x8040201008040201 16 | DATA magic<>+40(SB)/4, $0x55555555 17 | DATA magic<>+44(SB)/4, $0x33333333 18 | DATA magic<>+48(SB)/4, $0x0f0f0f0f 19 | GLOBL magic<>(SB), RODATA|NOPTR, $52 20 | 21 | // sliding window for head/tail loads. Unfortunately, there doesn't seem to be 22 | // a good way to do this with less memory wasted. 
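// The table is 32 zero bytes followed by 32 0xff bytes: a load at a variable
// offset into it produces a byte mask made of a run of 0x00 followed by a run
// of 0xff, with the split point determined by the offset.  The head and tail
// code below uses such masks (inverted where necessary) to blank out the bytes
// of a full-width load that lie outside the buffer.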
23 | DATA window<>+ 0(SB)/8, $0x0000000000000000 24 | DATA window<>+ 8(SB)/8, $0x0000000000000000 25 | DATA window<>+16(SB)/8, $0x0000000000000000 26 | DATA window<>+24(SB)/8, $0x0000000000000000 27 | DATA window<>+32(SB)/8, $0xffffffffffffffff 28 | DATA window<>+40(SB)/8, $0xffffffffffffffff 29 | DATA window<>+48(SB)/8, $0xffffffffffffffff 30 | DATA window<>+56(SB)/8, $0xffffffffffffffff 31 | GLOBL window<>(SB), RODATA|NOPTR, $64 32 | 33 | // B:A = A+B+C, D used for scratch space 34 | #define CSA(A, B, C, D) \ 35 | VPAND A, B, D \ 36 | VPXOR A, B, A \ 37 | VPAND A, C, B \ 38 | VPXOR A, C, A \ 39 | VPOR B, D, B 40 | 41 | // Generic kernel. This function expects a pointer to a width-specific 42 | // accumulation function in BX, a possibly unaligned input buffer in SI, 43 | // counters in DI and a remaining length in BP. 44 | TEXT countavx<>(SB), NOSPLIT, $160-0 45 | TESTL BP, BP // any data to process at all? 46 | CMOVLEQ BP, SI // if not, avoid loading head 47 | 48 | // constants for processing the head 49 | VPBROADCASTQ magic<>+32(SB), Y6 // bit position mask 50 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 51 | VPXOR Y0, Y0, Y0 // lower counter register 52 | VPXOR Y1, Y1, Y1 // upper counter register 53 | 54 | // load head into scratch space (until alignment/end is reached) 55 | MOVL SI, DX 56 | ANDL $31, DX // offset of the buffer start from 32 byte alignment 57 | JEQ nohead // if source buffer is aligned, skip head processing 58 | MOVL $32, AX 59 | SUBL DX, AX // number of bytes til alignment is reached (head length) 60 | VMOVDQA -32(SI)(AX*1), Y7 // load head 61 | MOVL $window<>(SB), DX // load window mask base pointer 62 | VMOVDQU (DX)(AX*1), Y5 // load mask of the bytes that are part of the head 63 | VPAND Y5, Y7, Y7 // and mask out those bytes that are not 64 | CMPL AX, BP // is the head shorter than the buffer? 65 | JLT norunt // if yes, perform special processing 66 | 67 | // buffer is short and does not cross a 32 byte boundary 68 | SUBL BP, AX // number of bytes by which we overshoot the buffer 69 | VMOVDQU (DX)(AX*1), Y5 // load mask of bytes that overshoot the buffer 70 | VPANDN Y7, Y5, Y7 // and clear them in Y4 71 | MOVL BP, AX // set up the true prefix length 72 | 73 | norunt: VMOVDQU Y7, scratch-160(SP) // copy to scratch space 74 | SUBL AX, BP // mark head as accounted for 75 | MOVL SI, DX // keep a copy of the head pointer 76 | ADDL AX, SI // and advance past head 77 | 78 | ANDL $31, DX // compute misalignment again 79 | SHRL $3, DX // misalignment in qwords (rounded down) 80 | ANDL $3, DX // and reduced to range 0--3 81 | 82 | // process head, 8 bytes at a time (up to 4 times) 83 | head: VPBROADCASTD scratch-160+0(SP)(DX*8), Y4 84 | // Y4 = 3210:3210:3210:3210:3210:3210:3210:3210 85 | VPBROADCASTD scratch-160+4(SP)(DX*8), Y5 86 | VPSHUFB Y3, Y4, Y4 // Y4 = 3333:3333:2222:2222:1111:1111:0000:0000 87 | VPSHUFB Y3, Y5, Y5 88 | VPAND Y6, Y4, Y4 // mask out one bit in each copy of the bytes 89 | VPAND Y6, Y5, Y5 90 | VPCMPEQB Y6, Y4, Y4 // set bytes to -1 if the bits were set 91 | VPCMPEQB Y6, Y5, Y5 // or to 0 otherwise 92 | VPSUBB Y4, Y0, Y0 // add 1/0 (subtract -1/0) to counters 93 | VPSUBB Y5, Y1, Y1 94 | ADDL $1, DX 95 | CMPL DX, $4 // have we processed the full head? 
96 | JLT head 97 | 98 | // produce 16 byte aligned point to counter vector in DX 99 | nohead: MOVL $counts-160+31(SP), DX 100 | ANDL $~31, DX // align to 32 bytes 101 | 102 | // initialise counters to what we have 103 | VPXOR Y7, Y7, Y7 // zero register 104 | VPUNPCKLBW Y7, Y0, Y4 // 0-7, 16-23 105 | VMOVDQA Y4, 0*32(DX) 106 | VPUNPCKHBW Y7, Y0, Y5 // 8-15, 24-31 107 | VMOVDQA Y5, 1*32(DX) 108 | VPUNPCKLBW Y7, Y1, Y6 // 32-39, 48-55 109 | VMOVDQA Y6, 2*32(DX) 110 | VPUNPCKHBW Y7, Y1, Y7 // 40-47, 56-63 111 | VMOVDQA Y7, 3*32(DX) 112 | 113 | SUBL $15*32, BP // enough data left to process? 114 | JLT endvec // also, pre-subtract 115 | 116 | MOVL $65535-4, AX // space left til overflow could occur in Y8--Y11 117 | 118 | vec: VMOVDQU 0*32(SI), Y0 // load 480 bytes from buf 119 | VMOVDQU 1*32(SI), Y1 // and sum them into Y3:Y2:Y1:Y0 120 | VMOVDQU 2*32(SI), Y4 121 | VMOVDQU 3*32(SI), Y2 122 | VMOVDQU 4*32(SI), Y3 123 | VMOVDQU 5*32(SI), Y5 124 | VMOVDQU 6*32(SI), Y6 125 | CSA(Y0, Y1, Y4, Y7) 126 | VMOVDQU 7*32(SI), Y4 127 | CSA(Y3, Y2, Y5, Y7) 128 | VMOVDQU 8*32(SI), Y5 129 | CSA(Y0, Y3, Y6, Y7) 130 | VMOVDQU 9*32(SI), Y6 131 | CSA(Y1, Y2, Y3, Y7) 132 | VMOVDQU 10*32(SI), Y3 133 | CSA(Y0, Y4, Y5, Y7) 134 | VMOVDQU 11*32(SI), Y5 135 | CSA(Y0, Y3, Y6, Y7) 136 | VMOVDQU 12*32(SI), Y6 137 | CSA(Y1, Y3, Y4, Y7) 138 | VMOVDQU 13*32(SI), Y4 139 | CSA(Y0, Y5, Y6, Y7) 140 | VMOVDQU 14*32(SI), Y6 141 | CSA(Y0, Y4, Y6, Y7) 142 | CSA(Y1, Y4, Y5, Y7) 143 | CSA(Y2, Y3, Y4, Y7) 144 | 145 | // load magic constants 146 | VPBROADCASTD magic<>+40(SB), Y7 // 0x55555555 147 | VPADDD Y7, Y7, Y6 // 0xaaaaaaaa 148 | 149 | ADDL $15*32, SI 150 | 151 | // group Y0--Y3 into nibbles in the same registers 152 | VPAND Y0, Y6, Y5 153 | VPSRLD $1, Y5, Y5 154 | VPAND Y1, Y7, Y4 155 | VPADDD Y4, Y4, Y4 156 | VPAND Y0, Y7, Y0 157 | VPAND Y1, Y6, Y1 158 | VPOR Y0, Y4, Y0 // Y0 = eca86420 (low crumbs) 159 | VPOR Y1, Y5, Y1 // Y1 = fdb97531 (low crumbs) 160 | 161 | VPAND Y2, Y6, Y5 162 | VPSRLD $1, Y5, Y5 163 | VPAND Y3, Y7, Y4 164 | VPADDD Y4, Y4, Y4 165 | VPAND Y2, Y7, Y2 166 | VPBROADCASTD magic<>+44(SB), Y7 // 0x33333333 167 | VPAND Y3, Y6, Y3 168 | VPSLLD $2, Y7, Y6 // 0xcccccccc 169 | VPOR Y2, Y4, Y2 // Y2 = eca86420 (high crumbs) 170 | VPOR Y3, Y5, Y3 // Y3 = fdb97531 (high crumbs) 171 | 172 | VPAND Y0, Y6, Y5 173 | VPSRLD $2, Y5, Y5 174 | VPAND Y2, Y7, Y4 175 | VPSLLD $2, Y4, Y4 176 | VPAND Y0, Y7, Y0 177 | VPAND Y2, Y6, Y2 178 | VPOR Y0, Y4, Y0 // Y0 = c840 179 | VPOR Y2, Y5, Y2 // Y2 = ea62 180 | 181 | VPAND Y1, Y6, Y5 182 | VPSRLD $2, Y5, Y5 183 | VPAND Y3, Y7, Y4 184 | VPSLLD $2, Y4, Y4 185 | VPAND Y1, Y7, Y1 186 | VPAND Y3, Y6, Y3 187 | VPOR Y1, Y4, Y1 // Y1 = d951 188 | VPOR Y3, Y5, Y3 // Y3 = fb73 189 | 190 | VPBROADCASTD magic<>+48(SB), Y7 // 0x0f0f0f0f for deinterleaving nibbles 191 | 192 | // pre-shuffle nibbles 193 | VPUNPCKLBW Y1, Y0, Y4 // Y4 = d9c85140 (3:2:1:0) 194 | VPUNPCKHBW Y1, Y0, Y5 // Y5 = d9c85140 (7:6:5:4) 195 | VPUNPCKLBW Y3, Y2, Y6 // Y6 = fbea7362 (3:2:1:0) 196 | VPUNPCKHBW Y3, Y2, Y3 // Y3 = fbea7362 (7:6:5:4) 197 | VPUNPCKLWD Y6, Y4, Y0 // Y0 = fbead9c873625140 (1:0) 198 | VPUNPCKHWD Y6, Y4, Y1 // Y1 = fbead9c873625140 (3:2) 199 | VPUNPCKLWD Y3, Y5, Y2 // Y2 = fbead9c873625140 (5:4) 200 | VPUNPCKHWD Y3, Y5, Y3 // Y3 = fbead9c873625140 (7:6) 201 | 202 | // pull out high and low nibbles and reduce once 203 | VPAND Y0, Y7, Y4 204 | VPSRLD $4, Y0, Y0 205 | VPAND Y0, Y7, Y5 206 | VPAND Y2, Y7, Y6 207 | VPSRLD $4, Y2, Y2 208 | VPADDB Y4, Y6, Y0 // Y0 = ba98:3210:ba98:3210 (1:0) 209 | VPAND Y2, Y7, Y2 210 
| VPADDB Y2, Y5, Y2 // Y2 = fedc:7654:fedc:7654 (1:0) 211 | 212 | VPAND Y1, Y7, Y4 213 | VPSRLD $4, Y1, Y1 214 | VPAND Y1, Y7, Y5 215 | VPAND Y3, Y7, Y6 216 | VPSRLD $4, Y3, Y3 217 | VPADDB Y4, Y6, Y1 // Y1 = ba98:3210:ba98:3210 (3:2) 218 | VPAND Y3, Y7, Y3 219 | VPADDB Y3, Y5, Y3 // Y3 = fedc:7654:fedc:7654 (3:2) 220 | 221 | // shuffle dwords and group them 222 | VPUNPCKLDQ Y2, Y0, Y4 223 | VPUNPCKHDQ Y2, Y0, Y5 224 | VPUNPCKLDQ Y3, Y1, Y6 225 | VPUNPCKHDQ Y3, Y1, Y7 226 | // VPERM2I128 $0x20, Y5, Y4, Y0 227 | BYTE $0xc4 228 | BYTE $0xe3 229 | BYTE $0x5d 230 | BYTE $0x46 231 | BYTE $0xc5 232 | BYTE $0x20 233 | // VPERM2I128 $0x31, Y5, Y4, Y2 234 | BYTE $0xc4 235 | BYTE $0xe3 236 | BYTE $0x5d 237 | BYTE $0x46 238 | BYTE $0xd5 239 | BYTE $0x31 240 | // VPERM2I128 $0x20, Y7, Y6, Y1 241 | BYTE $0xc4 242 | BYTE $0xe3 243 | BYTE $0x4d 244 | BYTE $0x46 245 | BYTE $0xcf 246 | BYTE $0x20 247 | // VPERM2I128 $0x31, Y7, Y6, Y3 248 | BYTE $0xc4 249 | BYTE $0xe3 250 | BYTE $0x4d 251 | BYTE $0x46 252 | BYTE $0xdf 253 | BYTE $0x31 254 | VPADDB Y2, Y0, Y0 // Y0 = fedc:ba98:7654:3210 (1:0) 255 | VPADDB Y3, Y1, Y1 // Y1 = fedc:ba98:7654:3210 (3:2) 256 | 257 | 258 | // zero-extend and add to Y8--Y11 259 | VPXOR Y7, Y7, Y7 260 | VPUNPCKLBW Y7, Y0, Y4 261 | VPUNPCKHBW Y7, Y0, Y5 262 | VPUNPCKLBW Y7, Y1, Y6 263 | VPUNPCKHBW Y7, Y1, Y1 264 | 265 | VPADDW 0*32(DX), Y4, Y4 266 | VPADDW 1*32(DX), Y5, Y5 267 | VPADDW 2*32(DX), Y6, Y6 268 | VPADDW 3*32(DX), Y1, Y1 269 | 270 | // write back to counters 271 | VMOVDQA Y4, 0*32(DX) 272 | VMOVDQA Y5, 1*32(DX) 273 | VMOVDQA Y6, 2*32(DX) 274 | VMOVDQA Y1, 3*32(DX) 275 | 276 | SUBL $15*4, AX // account for possible overflow 277 | CMPL AX, $15*4 // enough space left in the counters? 278 | JGE have_space 279 | 280 | // flush accumulators into counters 281 | CALL *BX // call accumulation function 282 | VPXOR Y7, Y7, Y7 283 | VMOVDQA Y7, 0*32(DX) 284 | VMOVDQA Y7, 1*32(DX) 285 | VMOVDQA Y7, 2*32(DX) 286 | VMOVDQA Y7, 3*32(DX) 287 | 288 | MOVL $65535, AX // space left til overflow could occur 289 | 290 | have_space: 291 | SUBL $15*32, BP // account for bytes consumed 292 | JGE vec 293 | 294 | endvec: VPBROADCASTQ magic<>+32(SB), Y2 // byte mask 295 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 296 | VPXOR Y0, Y0, Y0 // lower counter register 297 | VPXOR Y1, Y1, Y1 // upper counter register 298 | 299 | // process tail, 8 bytes at a time 300 | SUBL $8-15*32, BP // 8 bytes left to process? 301 | JLT tail1 302 | 303 | tail8: VPBROADCASTD 0(SI), Y4 304 | VPBROADCASTD 4(SI), Y5 305 | ADDL $8, SI 306 | VPSHUFB Y3, Y4, Y4 307 | VPSHUFB Y3, Y5, Y5 308 | VPAND Y2, Y4, Y4 309 | VPAND Y2, Y5, Y5 310 | VPCMPEQB Y2, Y4, Y4 311 | VPCMPEQB Y2, Y5, Y5 312 | VPSUBB Y4, Y0, Y0 313 | VPSUBB Y5, Y1, Y1 314 | SUBL $8, BP 315 | JGE tail8 316 | 317 | // process remaining 0--7 byte 318 | tail1: SUBL $-8, BP // anything left to process? 319 | JLE end 320 | 321 | // VMOVQ (SI), X5 // load 8 byte from buffer. 
This is ok 322 | // as buffer is aligned to 8 byte here 323 | BYTE $0xc5 324 | BYTE $0xfa 325 | BYTE $0x7e 326 | BYTE $0x2e 327 | MOVL $window<>+32(SB), AX // load window address 328 | SUBL BP, AX // adjust mask pointer 329 | VMOVQ (AX), X6 // load window mask 330 | VPANDN X5, X6, X5 // and mask out the desired bytes 331 | 332 | VPBROADCASTD X5, Y4 333 | VPSRLDQ $4, X5, X5 334 | VPBROADCASTD X5, Y5 335 | VPSHUFB Y3, Y4, Y4 336 | VPSHUFB Y3, Y5, Y5 337 | VPAND Y2, Y4, Y4 338 | VPAND Y2, Y5, Y5 339 | VPCMPEQB Y2, Y4, Y4 340 | VPCMPEQB Y2, Y5, Y5 341 | VPSUBB Y4, Y0, Y0 342 | VPSUBB Y5, Y1, Y1 343 | 344 | // add tail to counters 345 | end: VPXOR Y7, Y7, Y7 346 | VPUNPCKLBW Y7, Y0, Y4 347 | VPUNPCKHBW Y7, Y0, Y5 348 | VPUNPCKLBW Y7, Y1, Y6 349 | VPUNPCKHBW Y7, Y1, Y1 350 | 351 | VPADDW 0*32(DX), Y4, Y4 352 | VPADDW 1*32(DX), Y5, Y5 353 | VPADDW 2*32(DX), Y6, Y6 354 | VPADDW 3*32(DX), Y1, Y1 355 | 356 | // write back to counters 357 | VMOVDQA Y4, 0*32(DX) 358 | VMOVDQA Y5, 1*32(DX) 359 | VMOVDQA Y6, 2*32(DX) 360 | VMOVDQA Y1, 3*32(DX) 361 | 362 | // and perform a final accumulation 363 | CALL *BX 364 | VZEROUPPER 365 | RET 366 | 367 | // Count8 accumulation function. Accumulates words 368 | // into 8 dword counters at (DI). Trashes Y0--Y7. 369 | TEXT accum8<>(SB), NOSPLIT, $0-0 370 | VPMOVZXWD 0*16(DX), Y0 371 | VPMOVZXWD 1*16(DX), Y2 372 | VPMOVZXWD 2*16(DX), Y1 373 | VPMOVZXWD 3*16(DX), Y3 374 | VPMOVZXWD 4*16(DX), Y4 375 | VPMOVZXWD 5*16(DX), Y6 376 | VPMOVZXWD 6*16(DX), Y5 377 | VPMOVZXWD 7*16(DX), Y7 378 | VPADDD Y0, Y4, Y0 379 | VPADDD Y1, Y5, Y1 380 | VPADDD Y2, Y6, Y2 381 | VPADDD Y3, Y7, Y3 382 | VPADDD Y0, Y2, Y0 383 | VPADDD Y1, Y3, Y1 384 | VPADDD Y1, Y0, Y0 385 | VPADDD 0*32(DI), Y0, Y0 386 | VMOVDQU Y0, 0*32(DI) 387 | RET 388 | 389 | // Count16 accumulation function. Accumulates words 390 | // into 16 dword counters at (DI). Trashes Y0--Y7. 391 | TEXT accum16<>(SB), NOSPLIT, $0-0 392 | VPMOVZXWD 0*16(DX), Y0 393 | VPMOVZXWD 1*16(DX), Y2 394 | VPMOVZXWD 2*16(DX), Y1 395 | VPMOVZXWD 3*16(DX), Y3 396 | VPMOVZXWD 4*16(DX), Y4 397 | VPMOVZXWD 5*16(DX), Y6 398 | VPMOVZXWD 6*16(DX), Y5 399 | VPMOVZXWD 7*16(DX), Y7 400 | VPADDD Y0, Y4, Y0 401 | VPADDD Y1, Y5, Y1 402 | VPADDD Y2, Y6, Y2 403 | VPADDD Y3, Y7, Y3 404 | VPADDD Y0, Y2, Y0 405 | VPADDD Y1, Y3, Y1 406 | VPADDD 0*32(DI), Y0, Y0 407 | VPADDD 1*32(DI), Y1, Y1 408 | VMOVDQU Y0, 0*32(DI) 409 | VMOVDQU Y1, 1*32(DI) 410 | RET 411 | 412 | // Count32 accumulation function. Accumulates words 413 | // into 32 dword counters at (DI). Trashes Y0--Y7. 414 | TEXT accum32<>(SB), NOSPLIT, $0-0 415 | VPMOVZXWD 0*16(DX), Y0 416 | VPMOVZXWD 1*16(DX), Y2 417 | VPMOVZXWD 2*16(DX), Y1 418 | VPMOVZXWD 3*16(DX), Y3 419 | VPMOVZXWD 4*16(DX), Y4 420 | VPMOVZXWD 5*16(DX), Y6 421 | VPMOVZXWD 6*16(DX), Y5 422 | VPMOVZXWD 7*16(DX), Y7 423 | VPADDD Y0, Y4, Y0 424 | VPADDD Y1, Y5, Y1 425 | VPADDD Y2, Y6, Y2 426 | VPADDD Y3, Y7, Y3 427 | VPADDD 0*32(DI), Y0, Y0 428 | VPADDD 1*32(DI), Y1, Y1 429 | VPADDD 2*32(DI), Y2, Y2 430 | VPADDD 3*32(DI), Y3, Y3 431 | VMOVDQU Y0, 0*32(DI) 432 | VMOVDQU Y1, 1*32(DI) 433 | VMOVDQU Y2, 2*32(DI) 434 | VMOVDQU Y3, 3*32(DI) 435 | RET 436 | 437 | // Count64 accumulation function. Accumulates words 438 | // into 64 dword counters at (DI). Trashes Y0--Y3. 
439 | TEXT accum64<>(SB), NOSPLIT, $0-0 440 | VPMOVZXWD 0*16(DX), Y0 441 | VPMOVZXWD 1*16(DX), Y2 442 | VPMOVZXWD 2*16(DX), Y1 443 | VPMOVZXWD 3*16(DX), Y3 444 | VPADDD 0*32(DI), Y0, Y0 445 | VPADDD 1*32(DI), Y1, Y1 446 | VPADDD 2*32(DI), Y2, Y2 447 | VPADDD 3*32(DI), Y3, Y3 448 | VMOVDQU Y0, 0*32(DI) 449 | VMOVDQU Y1, 1*32(DI) 450 | VMOVDQU Y2, 2*32(DI) 451 | VMOVDQU Y3, 3*32(DI) 452 | VPMOVZXWD 4*16(DX), Y0 453 | VPMOVZXWD 5*16(DX), Y2 454 | VPMOVZXWD 6*16(DX), Y1 455 | VPMOVZXWD 7*16(DX), Y3 456 | VPADDD 4*32(DI), Y0, Y0 457 | VPADDD 5*32(DI), Y1, Y1 458 | VPADDD 6*32(DI), Y2, Y2 459 | VPADDD 7*32(DI), Y3, Y3 460 | VMOVDQU Y0, 4*32(DI) 461 | VMOVDQU Y1, 5*32(DI) 462 | VMOVDQU Y2, 6*32(DI) 463 | VMOVDQU Y3, 7*32(DI) 464 | RET 465 | 466 | // func count8avx2(counts *[8]int, buf []uint8) 467 | TEXT ·count8avx2(SB), 0, $0-16 468 | MOVL counts+0(FP), DI 469 | MOVL buf_base+4(FP), SI // SI = &buf[0] 470 | MOVL buf_len+8(FP), BP // BP = len(buf) 471 | MOVL $accum8<>(SB), BX 472 | CALL countavx<>(SB) 473 | RET 474 | 475 | // func count16avx2(counts *[16]int, buf []uint16) 476 | TEXT ·count16avx2(SB), 0, $0-16 477 | MOVL counts+0(FP), DI 478 | MOVL buf_base+4(FP), SI // SI = &buf[0] 479 | MOVL buf_len+8(FP), BP // BP = len(buf) 480 | MOVL $accum16<>(SB), BX 481 | SHLL $1, BP // count in bytes 482 | CALL countavx<>(SB) 483 | RET 484 | 485 | // func count32avx2(counts *[32]int, buf []uint32) 486 | TEXT ·count32avx2(SB), 0, $0-16 487 | MOVL counts+0(FP), DI 488 | MOVL buf_base+4(FP), SI // SI = &buf[0] 489 | MOVL buf_len+8(FP), BP // BP = len(buf) 490 | MOVL $accum32<>(SB), BX 491 | SHLL $2, BP // count in bytes 492 | CALL countavx<>(SB) 493 | RET 494 | 495 | // func count64avx2(counts *[64]int, buf []uint64) 496 | TEXT ·count64avx2(SB), 0, $0-16 497 | MOVL counts+0(FP), DI 498 | MOVL buf_base+4(FP), SI // SI = &buf[0] 499 | MOVL buf_len+8(FP), BP // BP = len(buf) 500 | MOVL $accum64<>(SB), BX 501 | SHLL $3, BP // count in bytes 502 | CALL countavx<>(SB) 503 | RET 504 | -------------------------------------------------------------------------------- /countavx2_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // An AVX2 based kernel first doing a 15-fold CSA reduction and then 4 | // a 16-fold CSA reduction, carrying over place-value vectors between 5 | // iterations. 6 | // Required CPU extension: AVX2, BMI2. 7 | 8 | // magic transposition constants, comparison constants 9 | DATA magic<>+ 0(SB)/8, $0x0000000000000000 10 | DATA magic<>+ 8(SB)/8, $0x0101010101010101 11 | DATA magic<>+16(SB)/8, $0x0202020202020202 12 | DATA magic<>+24(SB)/8, $0x0303030303030303 13 | DATA magic<>+32(SB)/8, $0x0404040404040404 14 | DATA magic<>+40(SB)/8, $0x0505050505050505 15 | DATA magic<>+48(SB)/8, $0x0606060606060606 16 | DATA magic<>+56(SB)/8, $0x0707070707070707 17 | DATA magic<>+64(SB)/8, $0x8040201008040201 18 | DATA magic<>+72(SB)/4, $0x55555555 19 | DATA magic<>+76(SB)/4, $0x33333333 20 | DATA magic<>+80(SB)/4, $0x0f0f0f0f 21 | DATA magic<>+84(SB)/4, $0x00ff00ff 22 | GLOBL magic<>(SB), RODATA|NOPTR, $88 23 | 24 | // sliding window for head/tail loads. Unfortunately, there doesn't seem to be 25 | // a good way to do this with less memory wasted. 
26 | DATA window<>+ 0(SB)/8, $0x0000000000000000 27 | DATA window<>+ 8(SB)/8, $0x0000000000000000 28 | DATA window<>+16(SB)/8, $0x0000000000000000 29 | DATA window<>+24(SB)/8, $0x0000000000000000 30 | DATA window<>+32(SB)/8, $0xffffffffffffffff 31 | DATA window<>+40(SB)/8, $0xffffffffffffffff 32 | DATA window<>+48(SB)/8, $0xffffffffffffffff 33 | DATA window<>+56(SB)/8, $0xffffffffffffffff 34 | GLOBL window<>(SB), RODATA|NOPTR, $64 35 | 36 | // B:A = A+B+C, D used for scratch space 37 | #define CSA(A, B, C, D) \ 38 | VPAND A, B, D \ 39 | VPXOR A, B, A \ 40 | VPAND A, C, B \ 41 | VPXOR A, C, A \ 42 | VPOR B, D, B 43 | 44 | // count 8 bytes from L into Y0 and Y1, 45 | // using Y4, and Y5 for scratch space 46 | #define COUNT8(L) \ 47 | VPBROADCASTQ L, Y4 \ // Y4 = 7654:3210:7654:3210:7654:3210:7654:3210 48 | VPSHUFB Y7, Y4, Y5 \ // Y5 = 7777:7777:6666:6666:5555:5555:4444:4444 49 | VPSHUFB Y3, Y4, Y4 \ // Y4 = 3333:3333:2222:2222:1111:1111:0000:0000 50 | VPAND Y2, Y5, Y5 \ 51 | VPAND Y2, Y4, Y4 \ // mask out one bit in each copy of the bytes 52 | VPCMPEQB Y2, Y5, Y5 \ // set bytes to -1 if the bits were set 53 | VPCMPEQB Y2, Y4, Y4 \ // or to 0 otherwise 54 | VPSUBB Y5, Y1, Y1 \ 55 | VPSUBB Y4, Y0, Y0 // add 1/0 (subtract -1/0) to counters 56 | 57 | 58 | // Generic kernel. This function expects a pointer to a width-specific 59 | // accumulation function in BX, a possibly unaligned input buffer in SI, 60 | // counters in DI and a remaining length in CX. 61 | TEXT countavx2<>(SB), NOSPLIT, $0-0 62 | CMPQ CX, $15*32 // is the CSA kernel worth using? 63 | JLT runt 64 | 65 | // load head until alignment/end is reached 66 | MOVL SI, DX 67 | ANDL $31, DX // offset of the buffer start from 32 byte alignment 68 | MOVL $32, AX 69 | SUBL DX, AX // number of bytes til alignment is reached (head length) 70 | SUBQ DX, SI // align source to 32 bytes 71 | VMOVDQA (SI), Y0 // load head 72 | ADDQ DX, CX // and account for head length 73 | LEAQ window<>(SB), DX // load window mask base pointer 74 | VPAND (DX)(AX*1), Y0, Y0 // mask out bytes not in head 75 | 76 | VMOVDQA 1*32(SI), Y1 // load 480 (-32) bytes from buf 77 | VMOVDQA 2*32(SI), Y4 // and sum them into Y3:Y2:Y1:Y0 78 | VMOVDQA 3*32(SI), Y2 79 | VMOVDQA 4*32(SI), Y3 80 | VMOVDQA 5*32(SI), Y5 81 | VMOVDQA 6*32(SI), Y6 82 | CSA(Y0, Y1, Y4, Y7) 83 | VMOVDQA 7*32(SI), Y4 84 | CSA(Y3, Y2, Y5, Y7) 85 | VMOVDQA 8*32(SI), Y5 86 | CSA(Y0, Y3, Y6, Y7) 87 | VMOVDQA 9*32(SI), Y6 88 | CSA(Y1, Y2, Y3, Y7) 89 | VMOVDQA 10*32(SI), Y3 90 | CSA(Y0, Y4, Y5, Y7) 91 | VMOVDQA 11*32(SI), Y5 92 | CSA(Y0, Y3, Y6, Y7) 93 | VMOVDQA 12*32(SI), Y6 94 | CSA(Y1, Y3, Y4, Y7) 95 | VMOVDQA 13*32(SI), Y4 96 | CSA(Y0, Y5, Y6, Y7) 97 | VMOVDQA 14*32(SI), Y6 98 | VPBROADCASTD magic<>+72(SB), Y15 // 0x55555555 99 | VPBROADCASTD magic<>+76(SB), Y13 // 0x33333333 100 | CSA(Y0, Y4, Y6, Y7) 101 | VPXOR Y8, Y8, Y8 // initialise counters 102 | VPXOR Y9, Y9, Y9 103 | CSA(Y1, Y4, Y5, Y7) 104 | VPXOR Y10, Y10, Y10 105 | VPXOR Y11, Y11, Y11 106 | CSA(Y2, Y3, Y4, Y7) 107 | 108 | ADDQ $15*32, SI 109 | SUBQ $(15+16)*32, CX // enough data left to process? 
110 | JLT post 111 | 112 | MOVL $65535, AX // space left til overflow could occur in Y8--Y11 113 | 114 | // load 512 bytes from buf, add them to Y0..Y3 into Y0..Y4 115 | vec: VMOVDQA 0*32(SI), Y4 116 | VMOVDQA 1*32(SI), Y5 117 | VMOVDQA 2*32(SI), Y6 118 | VMOVDQA 3*32(SI), Y12 119 | VMOVDQA 4*32(SI), Y14 120 | CSA(Y0, Y4, Y5, Y7) 121 | VMOVDQA 5*32(SI), Y5 122 | CSA(Y6, Y12, Y14, Y7) 123 | VMOVDQA 6*32(SI), Y14 124 | CSA(Y1, Y4, Y12, Y7) 125 | VMOVDQA 7*32(SI), Y12 126 | CSA(Y0, Y5, Y6, Y7) 127 | VMOVDQA 8*32(SI), Y6 128 | CSA(Y6, Y12, Y14, Y7) 129 | VMOVDQA 9*32(SI), Y14 130 | CSA(Y1, Y5, Y12, Y7) 131 | VMOVDQA 10*32(SI), Y12 132 | CSA(Y0, Y12, Y14, Y7) 133 | VMOVDQA 11*32(SI), Y14 134 | CSA(Y2, Y4, Y5, Y7) 135 | VMOVDQA 12*32(SI), Y5 136 | CSA(Y0, Y6, Y14, Y7) 137 | VMOVDQA 13*32(SI), Y14 138 | CSA(Y1, Y6, Y12, Y7) 139 | VMOVDQA 14*32(SI), Y12 140 | CSA(Y5, Y12, Y14, Y7) 141 | VMOVDQA 15*32(SI), Y14 142 | CSA(Y0, Y5, Y14, Y7) 143 | ADDQ $16*32, SI 144 | PREFETCHT0 0(SI) 145 | PREFETCHT0 32(SI) 146 | CSA(Y1, Y5, Y12, Y7) 147 | CSA(Y2, Y5, Y6, Y7) 148 | CSA(Y3, Y4, Y5, Y7) 149 | 150 | 151 | VPBROADCASTD magic<>+84(SB), Y12 // 0x00ff00ff 152 | VPBROADCASTD magic<>+80(SB), Y14 // 0x0f0f0f0f 153 | 154 | // now Y0..Y4 hold counters; preserve Y0..Y4 for the next round 155 | // and add Y4 to the counters. 156 | 157 | // split into even/odd and reduce into crumbs 158 | VPAND Y4, Y15, Y5 // Y5 = 02468ace x16 159 | VPANDN Y4, Y15, Y6 // Y6 = 13579bdf x16 160 | VPSRLD $1, Y6, Y6 161 | VPERM2I128 $0x20, Y6, Y5, Y4 162 | VPERM2I128 $0x31, Y6, Y5, Y5 163 | VPADDD Y5, Y4, Y4 // Y4 = 02468ace x8 13579bdf x8 164 | 165 | // split again and reduce into nibbles 166 | VPAND Y4, Y13, Y5 // Y5 = 048c x8 159d x8 167 | VPANDN Y4, Y13, Y6 // Y6 = 26ae x8 37bf x8 168 | VPSRLD $2, Y6, Y6 169 | VPUNPCKLQDQ Y6, Y5, Y4 170 | VPUNPCKHQDQ Y6, Y5, Y5 171 | VPADDD Y5, Y4, Y4 // Y4 = 048c x4 26ae x4 159d x4 37bf x4 172 | 173 | // split again into bytes and shuffle into order 174 | VPAND Y4, Y14, Y5 // Y5 = 08 x4 2a x4 19 x4 3b x4 175 | VPANDN Y4, Y14, Y6 // Y4 = 4c x4 6e x4 5d x4 7f x4 176 | VPSLLD $4, Y5, Y5 177 | VPERM2I128 $0x20, Y6, Y5, Y4 // Y4 = 08 x4 2a x4 4c x4 6e x4 178 | VPERM2I128 $0x31, Y6, Y5, Y5 // Y5 = 19 x4 3b x4 5d x4 7f x4 179 | VPUNPCKLWD Y5, Y4, Y6 // Y6 = 0819 x4 4c5d x4 180 | VPUNPCKHWD Y5, Y4, Y7 // Y7 = 2a3b x4 6e7f x4 181 | VPUNPCKLDQ Y7, Y6, Y4 // Y4 = 08192a3b[0:1] 4c5d6e7f[0:1] 182 | VPUNPCKHDQ Y7, Y6, Y5 // Y5 = 08192a3b[2:3] 4c5d6e7f[2:3] 183 | VPERMQ $0xd8, Y4, Y4 // Y4 = 08192a3b4c5d6e7f[0:1] 184 | VPERMQ $0xd8, Y5, Y5 // Y5 = 08192a3b4c5d6e7f[2:3] 185 | 186 | // split again into words and add to counters 187 | VPAND Y4, Y12, Y6 // Y6 = 01234567[0:1] 188 | VPAND Y5, Y12, Y7 // Y7 = 01234567[2:3] 189 | VPADDW Y6, Y8, Y8 190 | VPADDW Y7, Y10, Y10 191 | VPSRLW $8, Y4, Y4 // Y4 = 89abcdef[0:1] 192 | VPSRLW $8, Y5, Y5 // Y5 = 89abcdef[2:3] 193 | VPADDW Y4, Y9, Y9 194 | VPADDW Y5, Y11, Y11 195 | 196 | SUBL $16*4, AX // account for possible overflow 197 | CMPL AX, $(15+15)*4 // enough space left in the counters? 
198 | JGE have_space 199 | 200 | // flush accumulators into counters 201 | VPXOR Y7, Y7, Y7 202 | CALL *BX // call accumulation function 203 | VPXOR Y8, Y8, Y8 // clear accumulators for next round 204 | VPXOR Y9, Y9, Y9 205 | VPXOR Y10, Y10, Y10 206 | VPXOR Y11, Y11, Y11 207 | 208 | MOVL $65535, AX // space left til overflow could occur 209 | 210 | have_space: 211 | SUBQ $16*32, CX // account for bytes consumed 212 | JGE vec 213 | 214 | // group nibbles in Y0, Y1, Y2, and Y3 into Y4, Y5, Y6, and Y7 215 | post: VPBROADCASTD magic<>+80(SB), Y14 // 0x0f0f0f0f 216 | 217 | VPAND Y1, Y15, Y5 218 | VPADDD Y5, Y5, Y5 219 | VPAND Y3, Y15, Y7 220 | VPADDD Y7, Y7, Y7 221 | VPAND Y0, Y15, Y4 222 | VPAND Y2, Y15, Y6 223 | VPOR Y4, Y5, Y4 // Y4 = eca86420 (low crumbs) 224 | VPOR Y6, Y7, Y5 // Y5 = eca86420 (high crumbs) 225 | 226 | VPANDN Y0, Y15, Y0 227 | VPSRLD $1, Y0, Y0 228 | VPANDN Y2, Y15, Y2 229 | VPSRLD $1, Y2, Y2 230 | VPANDN Y1, Y15, Y1 231 | VPANDN Y3, Y15, Y3 232 | VPOR Y0, Y1, Y6 // Y6 = fdb97531 (low crumbs) 233 | VPOR Y2, Y3, Y7 // Y7 = fdb97531 (high crumbs) 234 | 235 | VPAND Y5, Y13, Y1 236 | VPSLLD $2, Y1, Y1 237 | VPAND Y7, Y13, Y3 238 | VPSLLD $2, Y3, Y3 239 | VPAND Y4, Y13, Y0 240 | VPAND Y6, Y13, Y2 241 | VPOR Y0, Y1, Y0 // Y0 = c840 242 | VPOR Y2, Y3, Y1 // Y1 = d951 243 | 244 | VPANDN Y4, Y13, Y4 245 | VPSRLD $2, Y4, Y4 246 | VPANDN Y6, Y13, Y6 247 | VPSRLD $2, Y6, Y6 248 | VPANDN Y5, Y13, Y5 249 | VPANDN Y7, Y13, Y7 250 | VPOR Y4, Y5, Y2 // Y2 = ea62 251 | VPOR Y6, Y7, Y3 // Y3 = fb73 252 | 253 | // pre-shuffle nibbles 254 | VPUNPCKLBW Y1, Y0, Y5 // Y5 = d9c85140 (3:2:1:0) 255 | VPUNPCKHBW Y1, Y0, Y0 // Y0 = d9c85140 (7:6:5:4) 256 | VPUNPCKLBW Y3, Y2, Y6 // Y6 = fbea7362 (3:2:1:0) 257 | VPUNPCKHBW Y3, Y2, Y1 // Y1 = fbea7362 (3:2:1:0) 258 | VPUNPCKLWD Y6, Y5, Y4 // Y4 = fbead9c873625140 (1:0) 259 | VPUNPCKHWD Y6, Y5, Y5 // Y5 = fbead9c873625140 (3:2) 260 | VPUNPCKLWD Y1, Y0, Y6 // Y6 = fbead9c873624150 (5:4) 261 | VPUNPCKHWD Y1, Y0, Y7 // Y7 = fbead9c873624150 (7:6) 262 | 263 | // pull out high and low nibbles 264 | VPAND Y4, Y14, Y0 265 | VPSRLD $4, Y4, Y4 266 | VPAND Y4, Y14, Y4 267 | VPAND Y5, Y14, Y1 268 | VPSRLD $4, Y5, Y5 269 | VPAND Y5, Y14, Y5 270 | VPAND Y6, Y14, Y2 271 | VPSRLD $4, Y6, Y6 272 | VPAND Y6, Y14, Y6 273 | VPAND Y7, Y14, Y3 274 | VPSRLD $4, Y7, Y7 275 | VPAND Y7, Y14, Y7 276 | 277 | // reduce common values 278 | VPADDB Y2, Y0, Y0 // Y0 = ba98:3210:ba98:3210 (1:0) 279 | VPADDB Y3, Y1, Y1 // Y1 = ba98:3210:ba98:3210 (3:2) 280 | VPADDB Y6, Y4, Y2 // Y2 = fedc:7654:fedc:7654 (1:0) 281 | VPADDB Y7, Y5, Y3 // Y3 = fedc:7654:fedc:7654 (3:2) 282 | 283 | // shuffle dwords and group them 284 | VPUNPCKLDQ Y2, Y0, Y4 285 | VPUNPCKHDQ Y2, Y0, Y5 286 | VPUNPCKLDQ Y3, Y1, Y6 287 | VPUNPCKHDQ Y3, Y1, Y7 288 | VPERM2I128 $0x20, Y5, Y4, Y0 289 | VPERM2I128 $0x31, Y5, Y4, Y2 290 | VPERM2I128 $0x20, Y7, Y6, Y1 291 | VPERM2I128 $0x31, Y7, Y6, Y3 292 | VPADDB Y2, Y0, Y0 // Y0 = fedc:ba98:7654:3210 (1:0) 293 | VPADDB Y3, Y1, Y1 // Y1 = fedc:ba98:7654:3210 (3:2) 294 | 295 | // zero-extend and add to Y8--Y11 296 | VPXOR Y7, Y7, Y7 297 | VPUNPCKLBW Y7, Y0, Y4 298 | VPUNPCKHBW Y7, Y0, Y5 299 | VPUNPCKLBW Y7, Y1, Y6 300 | VPUNPCKHBW Y7, Y1, Y1 301 | 302 | VPADDW Y4, Y8, Y8 303 | VPADDW Y5, Y9, Y9 304 | VPADDW Y6, Y10, Y10 305 | VPADDW Y1, Y11, Y11 306 | 307 | endvec: CMPL CX, $-16*32 // no bytes left to process? 
308 | JE end 309 | 310 | VPBROADCASTQ magic<>+64(SB), Y2 // byte mask 311 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 312 | VMOVDQU magic<>+32(SB), Y7 313 | VPXOR Y0, Y0, Y0 // lower counter register 314 | VPXOR Y1, Y1, Y1 // upper counter register 315 | 316 | // process tail, 8 bytes at a time 317 | SUBL $8-16*32, CX // 8 bytes left to process? 318 | JLE tail1 319 | 320 | tail8: COUNT8((SI)) 321 | ADDQ $8, SI 322 | SUBL $8, CX 323 | JGT tail8 324 | 325 | // process remaining 1--8 bytes 326 | tail1: MOVL $8*8(CX*8), CX 327 | BZHIQ CX, (SI), AX // load tail into AX (will never fault) 328 | VMOVQ AX, X6 329 | COUNT8(X6) 330 | 331 | // add tail to counters 332 | VPXOR Y7, Y7, Y7 333 | VPUNPCKLBW Y7, Y0, Y4 334 | VPUNPCKHBW Y7, Y0, Y5 335 | VPUNPCKLBW Y7, Y1, Y6 336 | VPUNPCKHBW Y7, Y1, Y7 337 | 338 | VPADDW Y4, Y8, Y8 339 | VPADDW Y5, Y9, Y9 340 | VPADDW Y6, Y10, Y10 341 | VPADDW Y7, Y11, Y11 342 | 343 | // and perform a final accumulation 344 | end: VPXOR Y7, Y7, Y7 345 | CALL *BX 346 | VZEROUPPER 347 | RET 348 | 349 | // buffer is short, do just head/tail processing 350 | runt: VPBROADCASTQ magic<>+64(SB), Y2 // bit position mask 351 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 352 | VMOVDQU magic<>+32(SB), Y7 353 | VPXOR Y0, Y0, Y0 // lower counter register 354 | VPXOR Y1, Y1, Y1 // upper counter register 355 | SUBL $8, CX // 8 byte left to process? 356 | JLT runt1 357 | 358 | // process runt, 8 bytes at a time 359 | runt8: COUNT8((SI)) 360 | ADDQ $8, SI 361 | SUBL $8, CX 362 | JGE runt8 363 | 364 | // process remaining 0--7 byte 365 | // while making sure we don't get a page fault 366 | runt1: CMPL CX, $-8 // anything left to process? 367 | JLE runt_accum 368 | 369 | LEAL 7(SI)(CX*1), DX // last address of buffer 370 | XORL SI, DX // which bits changed? 371 | LEAL 8*8(CX*8), CX // CX scaled to a bit length 372 | TESTL $8, DX // did we cross an alignment boundary? 373 | JNE crossrunt1 // if yes, we can safely load directly 374 | 375 | LEAL (SI*8), AX 376 | ANDQ $~7, SI // align buffer to 8 bytes 377 | MOVQ (SI), R8 // and load 8 bytes from buffer 378 | SHRXQ AX, R8, R8 // buffer starting at the beginning 379 | BZHIQ CX, R8, R8 // mask out bytes past the buffer 380 | JMP dorunt1 381 | 382 | crossrunt1: 383 | BZHIQ CX, (SI), R8 // load 8 bytes from unaligned buffer 384 | 385 | dorunt1:VMOVQ R8, X6 386 | COUNT8(X6) 387 | 388 | // move tail to counters and perform final accumulation 389 | runt_accum: 390 | VPXOR Y7, Y7, Y7 391 | VPUNPCKLBW Y7, Y0, Y8 392 | VPUNPCKHBW Y7, Y0, Y9 393 | VPUNPCKLBW Y7, Y1, Y10 394 | VPUNPCKHBW Y7, Y1, Y11 395 | CALL *BX 396 | VZEROUPPER 397 | RET 398 | 399 | // zero extend Y8--Y11 into dwords and fold the upper 32 counters 400 | // over the lower 32 counters, leaving the registers with 401 | // Y12 contains 0- 3, 16-19 402 | // Y8 contains 4- 7, 20-23 403 | // Y14 contains 8-11, 24-27 404 | // Y9 contains 12-15, 28-31 405 | // Assumes Y7 == 0. 406 | #define FOLD32 \ 407 | VPUNPCKLWD Y7, Y8, Y12 \ 408 | VPUNPCKHWD Y7, Y8, Y8 \ 409 | VPUNPCKLWD Y7, Y9, Y14 \ 410 | VPUNPCKHWD Y7, Y9, Y9 \ 411 | VPUNPCKLWD Y7, Y10, Y4 \ 412 | VPUNPCKHWD Y7, Y10, Y10 \ 413 | VPUNPCKLWD Y7, Y11, Y5 \ 414 | VPUNPCKHWD Y7, Y11, Y11 \ 415 | VPADDD Y12, Y4, Y12 \ 416 | VPADDD Y8, Y10, Y8 \ 417 | VPADDD Y14, Y5, Y14 \ 418 | VPADDD Y9, Y11, Y9 419 | 420 | // zero-extend dwords in Y trashing Y and Z. Add the low 421 | // half dwords to a*8(DI) and the high half to b*8(DI). 
422 | // Assumes Y7 == 0 423 | #define ACCUM(a, b, Y, Z) \ 424 | VPERMQ $0xd8, Y, Y \ 425 | VPUNPCKHDQ Y7, Y, Z \ 426 | VPUNPCKLDQ Y7, Y, Y \ 427 | VPADDQ (a)*8(DI), Y, Y \ 428 | VPADDQ (b)*8(DI), Z, Z \ 429 | VMOVDQU Y, (a)*8(DI) \ 430 | VMOVDQU Z, (b)*8(DI) 431 | 432 | // Count8 accumulation function. Accumulates words Y8--Y11 433 | // into 8 qword counters at (DI). Trashes Y0--Y12. 434 | TEXT accum8<>(SB), NOSPLIT, $0-0 435 | FOLD32 436 | 437 | VPADDD Y14, Y12, Y12 // 0- 3, 0- 3 438 | VPADDD Y9, Y8, Y8 // 4- 7, 4- 7 439 | VPERM2I128 $0x20, Y8, Y12, Y14 440 | VPERM2I128 $0x31, Y8, Y12, Y4 441 | VPADDD Y4, Y14, Y12 // 0- 3, 4- 7 442 | ACCUM(0, 4, Y12, Y14) 443 | RET 444 | 445 | // Count16 accumulation function. Accumulates words Y8--Y11 446 | // into 16 qword counters at (DI). Trashes Y0--Y12. 447 | TEXT accum16<>(SB), NOSPLIT, $0-0 448 | FOLD32 449 | 450 | // fold over upper 16 bit over lower 32 counters 451 | VPERM2I128 $0x20, Y8, Y12, Y4 // 0- 3, 4- 7 452 | VPERM2I128 $0x31, Y8, Y12, Y10 // 16-19, 20-23 453 | VPADDD Y4, Y10, Y12 // 0- 7 454 | VPERM2I128 $0x20, Y9, Y14, Y5 // 8-11, 12-15 455 | VPERM2I128 $0x31, Y9, Y14, Y11 // 24-27, 29-31 456 | VPADDD Y5, Y11, Y4 // 8-15 457 | 458 | // zero extend into qwords and add to counters 459 | ACCUM(0, 4, Y12, Y14) 460 | ACCUM(8, 12, Y4, Y5) 461 | 462 | RET 463 | 464 | // Count32 accumulation function. Accumulates words Y8--Y11 465 | // int 32 qword counters at (DI). Trashes Y0--Y12 466 | TEXT accum32<>(SB), NOSPLIT, $0-0 467 | FOLD32 468 | 469 | ACCUM( 0, 16, Y12, Y4) 470 | ACCUM( 4, 20, Y8, Y4) 471 | ACCUM( 8, 24, Y14, Y4) 472 | ACCUM(12, 28, Y9, Y4) 473 | 474 | RET 475 | 476 | // accumulate the 16 counters in Y into k*8(DI) to (k+15)*8(DI) 477 | // trashes Y0--Y3. Assumes Y12 == 0 478 | #define ACCUM64(k, Y) \ 479 | VPUNPCKLWD Y7, Y, Y12 \ 480 | VPUNPCKHWD Y7, Y, Y14 \ 481 | ACCUM(k, k+16, Y12, Y4) \ 482 | ACCUM(k+4, k+20, Y14, Y4) 483 | 484 | // Count64 accumulation function. Accumulates words Y8--Y11 485 | // into 64 qword counters at (DI). Trashes Y0--Y12. 
486 | TEXT accum64<>(SB), NOSPLIT, $0-0 487 | ACCUM64(0, Y8) 488 | ACCUM64(8, Y9) 489 | ACCUM64(32, Y10) 490 | ACCUM64(40, Y11) 491 | RET 492 | 493 | // func count8avx2(counts *[8]int, buf []uint8) 494 | TEXT ·count8avx2(SB), 0, $0-32 495 | MOVQ counts+0(FP), DI 496 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 497 | MOVQ buf_len+16(FP), CX // CX = len(buf) 498 | MOVQ $accum8<>(SB), BX 499 | CALL countavx2<>(SB) 500 | RET 501 | 502 | // func count16avx2(counts *[16]int, buf []uint16) 503 | TEXT ·count16avx2(SB), 0, $0-32 504 | MOVQ counts+0(FP), DI 505 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 506 | MOVQ buf_len+16(FP), CX // CX = len(buf) 507 | MOVQ $accum16<>(SB), BX 508 | SHLQ $1, CX // count in bytes 509 | CALL countavx2<>(SB) 510 | RET 511 | 512 | // func count32avx2(counts *[32]int, buf []uint32) 513 | TEXT ·count32avx2(SB), 0, $0-32 514 | MOVQ counts+0(FP), DI 515 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 516 | MOVQ buf_len+16(FP), CX // CX = len(buf) 517 | MOVQ $accum32<>(SB), BX 518 | SHLQ $2, CX // count in bytes 519 | CALL countavx2<>(SB) 520 | RET 521 | 522 | // func count64avx2(counts *[64]int, buf []uint64) 523 | TEXT ·count64avx2(SB), 0, $0-32 524 | MOVQ counts+0(FP), DI 525 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 526 | MOVQ buf_len+16(FP), CX // CX = len(buf) 527 | MOVQ $accum64<>(SB), BX 528 | SHLQ $3, CX // count in bytes 529 | CALL countavx2<>(SB) 530 | RET 531 | -------------------------------------------------------------------------------- /countavx512_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // An AVX512 based kernel first doing a 15-fold CSA reduction 4 | // and then a 16-fold CSA reduction, carrying over place-value 5 | // vectors between iterations. 6 | // Required CPU extensions: BMI2, AVX-512 -F, -BW. 7 | 8 | // magic constants 9 | DATA magic<>+ 0(SB)/4, $0x55555555 10 | DATA magic<>+ 4(SB)/4, $0x33333333 11 | DATA magic<>+ 8(SB)/4, $0x0f0f0f0f 12 | DATA magic<>+12(SB)/4, $0x00ff00ff 13 | 14 | // permutation vectors for the last permutation step of the vec loop 15 | // permutes words 16 | // A = 0000 1111 2222 3333 4444 5555 6666 7777 17 | // B = 8888 9999 AAAA BBBB CCCC DDDD EEEE FFFF 18 | // into the order used by the counters: 19 | // Q1 = 0123 4567 0123 4567 0123 4567 0123 4567 20 | // Q2 = 89AB CDEF 89AB CDEF 89AB CDEF 89AB CDEF 21 | DATA magic<>+16(SB)/8, $0x1c1814100c080400 22 | DATA magic<>+24(SB)/8, $0x1d1915110d090501 23 | DATA magic<>+32(SB)/8, $0x1e1a16120e0a0602 24 | DATA magic<>+40(SB)/8, $0x1f1b17130f0b0703 25 | GLOBL magic<>(SB), RODATA|NOPTR, $48 26 | 27 | // B:A = A+B+C, D used as scratch space 28 | #define CSA(A, B, C, D) \ 29 | VMOVDQA64 A, D \ 30 | VPTERNLOGD $0x96, C, B, A \ 31 | VPTERNLOGD $0xe8, C, D, B 32 | 33 | // Generic kernel. This function expects a pointer to a width-specific 34 | // accumulation function in BX, a possibly unaligned input buffer in SI, 35 | // counters in DI and an array length in CX. 36 | TEXT countavx512<>(SB), NOSPLIT, $0-0 37 | // head and tail constants, counter registers 38 | VPTERNLOGD $0xff, Z30, Z30, Z30 // ffffffff 39 | VPXORD Y25, Y25, Y25 // zero register 40 | 41 | CMPQ CX, $15*64 // is the CSA kernel worth using? 
42 | JLT runt 43 | 44 | // compute misalignment mask 45 | MOVQ $-1, AX 46 | SHLXQ SI, AX, AX // mask out the head of the load 47 | KMOVQ AX, K1 // prepare mask register 48 | ADDQ SI, CX 49 | ANDQ $~63, SI // align source to 64 byte 50 | SUBQ SI, CX // account for head length in CX 51 | 52 | VMOVDQU8.Z 0*64(SI), K1, Z0 // load 960 bytes from buf 53 | VMOVDQA64 1*64(SI), Z1 // and sum them into Z3:Z2:Z1:Z0 54 | VMOVDQA64 2*64(SI), Z4 55 | VPXOR Y8, Y8, Y8 // initialise counters 56 | VPXOR Y9, Y9, Y9 57 | VMOVDQA64 3*64(SI), Z2 58 | VMOVDQA64 4*64(SI), Z3 59 | VMOVDQA64 5*64(SI), Z5 60 | CSA(Z0, Z1, Z4, Z22) 61 | VMOVDQA64 6*64(SI), Z6 62 | VMOVDQA64 7*64(SI), Z7 63 | VMOVDQA64 8*64(SI), Z10 64 | CSA(Z2, Z3, Z5, Z22) 65 | VMOVDQA64 9*64(SI), Z11 66 | VMOVDQA64 10*64(SI), Z12 67 | VMOVDQA64 11*64(SI), Z13 68 | CSA(Z6, Z7, Z10, Z22) 69 | VMOVDQA64 12*64(SI), Z4 70 | VMOVDQA64 13*64(SI), Z5 71 | VMOVDQA64 14*64(SI), Z10 72 | CSA(Z11, Z12, Z13, Z22) 73 | VPBROADCASTD magic<>+0(SB), Z28 // 0x55555555 for transposition 74 | VPBROADCASTD magic<>+4(SB), Z27 // 0x33333333 for transposition 75 | VPBROADCASTD magic<>+8(SB), Z26 // 0x0f0f0f0f for transposition 76 | CSA(Z4, Z5, Z10, Z22) 77 | CSA(Z0, Z2, Z6, Z22) 78 | CSA(Z1, Z3, Z7, Z22) 79 | CSA(Z0, Z11, Z4, Z22) 80 | CSA(Z2, Z12, Z5, Z22) 81 | CSA(Z1, Z2, Z11, Z22) 82 | CSA(Z2, Z3, Z12, Z22) 83 | 84 | ADDQ $15*64, SI 85 | SUBQ $(15+16)*64, CX // enough data left to process? 86 | JLT post 87 | 88 | VPBROADCASTD magic<>+12(SB), Z24 // 0x00ff00ff 89 | VPMOVZXBW magic<>+16(SB), Z23 // transposition vector 90 | MOVL $65535, AX // space left til overflow could occur in Z8, Z9 91 | 92 | // load 1024 bytes from buf, add them to Z0..Z3 into Z0..Z4 93 | vec: VMOVDQA64 0*64(SI), Z4 94 | VMOVDQA64 1*64(SI), Z5 95 | VMOVDQA64 2*64(SI), Z6 96 | VMOVDQA64 3*64(SI), Z7 97 | VMOVDQA64 4*64(SI), Z10 98 | CSA(Z0, Z4, Z5, Z22) 99 | VMOVDQA64 5*64(SI), Z5 100 | VMOVDQA64 6*64(SI), Z11 101 | VMOVDQA64 7*64(SI), Z12 102 | CSA(Z6, Z7, Z10, Z22) 103 | VMOVDQA64 8*64(SI), Z10 104 | VMOVDQA64 9*64(SI), Z13 105 | VMOVDQA64 10*64(SI), Z14 106 | CSA(Z5, Z11, Z12, Z22) 107 | VMOVDQA64 11*64(SI), Z12 108 | VMOVDQA64 12*64(SI), Z15 109 | VMOVDQA64 13*64(SI), Z16 110 | CSA(Z10, Z13, Z14, Z22) 111 | VMOVDQA64 14*64(SI), Z14 112 | VMOVDQA64 15*64(SI), Z17 113 | CSA(Z12, Z15, Z16, Z22) 114 | ADDQ $16*64, SI 115 | PREFETCHT0 (SI) 116 | CSA(Z0, Z5, Z6, Z22) 117 | PREFETCHT0 64(SI) 118 | CSA(Z1, Z4, Z7, Z22) 119 | CSA(Z10, Z12, Z14, Z22) 120 | CSA(Z11, Z13, Z15, Z22) 121 | CSA(Z0, Z10, Z17, Z22) 122 | CSA(Z1, Z5, Z11, Z22) 123 | CSA(Z2, Z4, Z13, Z22) 124 | CSA(Z1, Z10, Z12, Z22) 125 | CSA(Z2, Z5, Z10, Z22) 126 | CSA(Z3, Z4, Z5, Z22) 127 | 128 | // now Z0..Z4 hold counters; preserve Z0..Z3 for next round and 129 | // add Z4 to counters. 130 | 131 | // split into even/odd and reduce into crumbs 132 | VPANDD Z4, Z28, Z5 // Z5 = bits 02468ace x32 133 | VPANDND Z4, Z28, Z6 // Z6 = bits 13579bdf x32 134 | VPSRLD $1, Z6, Z6 135 | VSHUFI64X2 $0x44, Z6, Z5, Z10 136 | VSHUFI64X2 $0xee, Z6, Z5, Z11 137 | VPADDD Z10, Z11, Z4 // Z4 = 02468ace x16 ... 13579bdf x16 138 | 139 | // split again and reduce into nibbles 140 | VPANDD Z4, Z27, Z5 // Z5 = 048c x16 ... 159d x16 141 | VPANDND Z4, Z27, Z6 // Z6 = 26ae x16 ... 
37bf x16 142 | VPSRLD $2, Z6, Z6 143 | VSHUFI64X2 $0x88, Z6, Z5, Z10 144 | VSHUFI64X2 $0xdd, Z6, Z5, Z11 145 | VPADDD Z10, Z11, Z4 // Z4 = 048c x8 159d x8 26ae x8 37bf x8 146 | 147 | // split again and reduce into bytes (shifted left by 4) 148 | VPANDD Z4, Z26, Z5 // Z5 = 08 x8 19 x8 2a x8 3b x8 149 | VPANDND Z4, Z26, Z6 // Z6 = 4c x8 5d x8 6e x8 7f x8 150 | VPSLLD $4, Z5, Z5 151 | VPERMQ $0xd8, Z5, Z5 // Z5 = 08x4 19x4 08x4 19x4 2ax4 3bx4 2ax4 3bx4 152 | VPERMQ $0xd8, Z6, Z6 // Z6 = 4cx4 5dx4 4cx4 5dx4 6ex4 7fx4 6ex4 7fx4 153 | VSHUFI64X2 $0x88, Z6, Z5, Z10 154 | VSHUFI64X2 $0xdd, Z6, Z5, Z11 155 | VPADDD Z10, Z11, Z4 // Z4 = 08x4 19x4 2ax4 3bx4 4cx4 5dx4 6ex4 7fx4 156 | 157 | // split again into 16 bit counters 158 | VPSRLW $8, Z4, Z6 // Z6 = 8888 9999 aaaa bbbb cccc dddd eeee ffff 159 | VPANDD Z4, Z24, Z5 // Z5 = 0000 1111 2222 3333 4444 5555 6666 7777 160 | 161 | // accumulate in permuted order 162 | VPADDW Z5, Z8, Z8 163 | VPADDW Z6, Z9, Z9 164 | 165 | SUBL $16*8, AX // account for possible overflow 166 | CMPL AX, $(15+15)*8 // enough space left in the counters? 167 | JGE have_space 168 | 169 | // fix permutation and flush into counters 170 | VPERMW Z8, Z23, Z8 // Z5 = 0123 4567 0123 4567 0123 4567 0123 4567 171 | VPERMW Z9, Z23, Z9 // Z6 = 89ab cdef 89ab cdef 89ab cdef 89ab cdef 172 | CALL *BX // call accumulation function 173 | VPXOR Y8, Y8, Y8 // clear accumulators for next round 174 | VPXOR Y9, Y9, Y9 175 | MOVL $65535, AX // space left til overflow could occur 176 | 177 | have_space: 178 | SUBQ $16*64, CX // account for bytes consumed 179 | JGE vec 180 | 181 | // fix permutation for final step 182 | VPERMW Z8, Z23, Z8 // Z5 = 0123 4567 0123 4567 0123 4567 0123 4567 183 | VPERMW Z9, Z23, Z9 // Z6 = 89ab cdef 89ab cdef 89ab cdef 89ab cdef 184 | 185 | // sum up Z0..Z3 into the counter registers 186 | post: VPSRLD $1, Z0, Z4 // group nibbles in Z0--Z3 into Z4--Z7 187 | VPADDD Z1, Z1, Z5 188 | VPSRLD $1, Z2, Z6 189 | VPADDD Z3, Z3, Z7 190 | VPTERNLOGD $0xe4, Z28, Z5, Z0 // Z0 = eca86420 (low crumbs) 191 | VPTERNLOGD $0xd8, Z28, Z4, Z1 // Z1 = fdb97531 (high crumbs) 192 | VPTERNLOGD $0xe4, Z28, Z7, Z2 // Z2 = eca86420 (low crumbs) 193 | VPTERNLOGD $0xd8, Z28, Z6, Z3 // Z3 = fdb97531 (high crumbs) 194 | 195 | VPSRLD $2, Z0, Z4 196 | VPSRLD $2, Z1, Z6 197 | VPSLLD $2, Z2, Z5 198 | VPSLLD $2, Z3, Z7 199 | VPTERNLOGD $0xd8, Z27, Z4, Z2 // Z2 = ea63 200 | VPTERNLOGD $0xd8, Z27, Z6, Z3 // Z3 = fb73 201 | VPTERNLOGD $0xe4, Z27, Z5, Z0 // Z0 = c840 202 | VPTERNLOGD $0xe4, Z27, Z7, Z1 // Z1 = d951 203 | 204 | // pre-shuffle nibbles (within 128 bit lanes)! 
205 | VPUNPCKLBW Z3, Z2, Z6 // Z6 = fbea7362 (3:2:1:0) 206 | VPUNPCKHBW Z3, Z2, Z3 // Z3 = fbea7362 (7:6:5:4) 207 | VPUNPCKLBW Z1, Z0, Z5 // Z5 = d9c85140 (3:2:1:0) 208 | VPUNPCKHBW Z1, Z0, Z2 // Z2 = d9c85140 (7:6:5:4) 209 | VPUNPCKLWD Z6, Z5, Z4 // Z4 = fbead9c873625140 (1:0) 210 | VPUNPCKHWD Z6, Z5, Z5 // Z5 = fbead9c873625140 (3:2) 211 | VPUNPCKLWD Z3, Z2, Z6 // Z6 = fbead9c873625140 (5:4) 212 | VPUNPCKHWD Z3, Z2, Z7 // Z7 = fbead9c873625140 (7:6) 213 | 214 | // pull out high and low nibbles 215 | VPANDD Z26, Z4, Z0 216 | VPSRLD $4, Z4, Z4 217 | VPANDD Z26, Z4, Z4 218 | 219 | VPANDD Z26, Z5, Z1 220 | VPSRLD $4, Z5, Z5 221 | VPANDD Z26, Z5, Z5 222 | 223 | VPANDD Z26, Z6, Z2 224 | VPSRLD $4, Z6, Z6 225 | VPANDD Z26, Z6, Z6 226 | 227 | VPANDD Z26, Z7, Z3 228 | VPSRLD $4, Z7, Z7 229 | VPANDD Z26, Z7, Z7 230 | 231 | // reduce once 232 | VPADDB Z2, Z0, Z0 // Z0 = ba983210 (1:0) 233 | VPADDB Z3, Z1, Z1 // Z1 = ba983210 (3:2) 234 | VPADDB Z6, Z4, Z2 // Z2 = fedc7654 (1:0) 235 | VPADDB Z7, Z5, Z3 // Z3 = fedc7654 (3:2) 236 | 237 | // shuffle again to form ordered groups of 16 counters in each lane 238 | VPUNPCKLDQ Z2, Z0, Z4 // Z4 = fedcba9876543210 (0) 239 | VPUNPCKHDQ Z2, Z0, Z5 // Z5 = fedcba9876543210 (1) 240 | VPUNPCKLDQ Z3, Z1, Z6 // Z6 = fedcba9876543210 (2) 241 | VPUNPCKHDQ Z3, Z1, Z7 // Z7 = fedcba9876543210 (3) 242 | 243 | // reduce lanes once (4x1 lane -> 2x2 lanes) 244 | VSHUFI64X2 $0x44, Z5, Z4, Z0 // Z0 = fedcba9876543210 (1:1:0:0) 245 | VSHUFI64X2 $0xee, Z5, Z4, Z1 // Z1 = fedcba9876543210 (1:1:0:0) 246 | VSHUFI64X2 $0x44, Z7, Z6, Z2 // Z2 = fedcba9876543210 (3:3:2:2) 247 | VSHUFI64X2 $0xee, Z7, Z6, Z3 // Z2 = fedcba9876543210 (3:3:2:2) 248 | VPADDB Z1, Z0, Z0 249 | VPADDB Z3, Z2, Z2 250 | 251 | // reduce lanes again (2x2 lanes -> 1x4 lane) 252 | VSHUFI64X2 $0x88, Z2, Z0, Z1 // Z1 = fedcba9876543210 (3:2:1:0) 253 | VSHUFI64X2 $0xdd, Z2, Z0, Z0 // Z0 = fedcba9876543210 (3:2:1:0) 254 | VPADDB Z1, Z0, Z0 255 | 256 | // Zero extend and add to Z8, Z9 257 | VPUNPCKLBW Z25, Z0, Z1 // Z1 = 76543210 (3:2:1:0) 258 | VPUNPCKHBW Z25, Z0, Z2 // Z2 = fedcba98 (3:2:1:0) 259 | VPADDW Z1, Z8, Z8 260 | VPADDW Z2, Z9, Z9 261 | 262 | endvec: VPXOR Y0, Y0, Y0 // counter register 263 | 264 | // process tail, 8 bytes at a time 265 | CMPL CX, $-16*64 // no bytes left to process? 266 | JE end 267 | SUBL $8-16*64, CX // 8 bytes left to process? 268 | JLE tail1 269 | 270 | tail8: KMOVQ (SI), K1 271 | ADDQ $8, SI 272 | VPSUBB Z30, Z0, K1, Z0 273 | SUBL $8, CX 274 | JGT tail8 275 | 276 | // process remaining 1--8 bytes 277 | tail1: MOVL $8*8(CX*8), CX 278 | BZHIQ CX, (SI), AX // load tail into AX (will never fault) 279 | KMOVQ AX, K1 280 | VPSUBB Z30, Z0, K1, Z0 281 | 282 | // add tail to counters 283 | VPUNPCKLBW Z25, Z0, Z1 284 | VPUNPCKHBW Z25, Z0, Z2 285 | VPADDW Z1, Z8, Z8 286 | VPADDW Z2, Z9, Z9 287 | 288 | // and perform a final accumulation 289 | end: CALL *BX 290 | VZEROUPPER 291 | RET 292 | 293 | // special processing for when the data is less than 294 | // one iteration of the kernel 295 | runt: VPXOR Y0, Y0, Y0 // counter register 296 | SUBL $8, CX // 8 bytes left to process? 297 | JLE runtrunt // input of 0--8 bytes? 
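 | // otherwise count whole 8-byte chunks first: each chunk becomes a 64-bit
 | // mask in K1 selecting which of the 64 byte counters in Z0 to update,
 | // just as in the tail8 loop above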
298 | 299 | runt8: KMOVQ (SI), K1 300 | ADDQ $8, SI 301 | VPSUBB Z30, Z0, K1, Z0 302 | SUBQ $8, CX 303 | JGT runt8 304 | 305 | // process last 1--7 bytes 306 | // as SI has no particular alignment, we cannot savely overread 307 | // instead overlap previous chunk and shift out junk 308 | MOVL $(CX*8), DX 309 | NEGL DX // number of bits to be masked out 310 | SHRXQ DX, (SI)(CX*1), AX 311 | KMOVQ AX, K1 312 | VPSUBB Z30, Z0, K1, Z0 313 | 314 | // populate counters and accumulate 315 | VPUNPCKLBW Z25, Z0, Z8 316 | VPUNPCKHBW Z25, Z0, Z9 317 | CALL *BX 318 | VZEROUPPER 319 | RET 320 | 321 | // process runt of 0--8 bytes 322 | runtrunt: 323 | ADDL $8, CX 324 | XORL AX, AX 325 | BTSL CX, AX // 1 << CX 326 | DECL AX // mask of CX ones 327 | KMOVD AX, K1 328 | VMOVDQU8.Z (SI), K1, X4 // just the runt bytes 329 | VMOVQ X4, AX 330 | KMOVQ AX, K1 331 | VPSUBB Z30, Z0, K1, Z0 332 | 333 | // populate counters and accumulate 334 | VPUNPCKLBW Z25, Z0, Z8 335 | VPUNPCKHBW Z25, Z0, Z9 336 | CALL *BX 337 | VZEROUPPER 338 | RET 339 | 340 | TEXT accum8<>(SB), NOSPLIT, $0-0 341 | // unpack and zero-extend 342 | VPMOVZXWQ X8, Z10 343 | VEXTRACTI128 $1, Y8, X11 344 | VPMOVZXWQ X11, Z11 345 | VEXTRACTI64X2 $2, Z8, X12 346 | VPMOVZXWQ X12, Z12 347 | VEXTRACTI64X2 $3, Z8, X13 348 | VPMOVZXWQ X13, Z13 349 | VPMOVZXWQ X9, Z14 350 | VEXTRACTI128 $1, Y9, X15 351 | VPMOVZXWQ X15, Z15 352 | VEXTRACTI64X2 $2, Z9, X16 353 | VPMOVZXWQ X16, Z16 354 | VEXTRACTI64X2 $3, Z9, X17 355 | VPMOVZXWQ X17, Z17 356 | 357 | // fold over thrice 358 | VPADDQ Z12, Z10, Z10 359 | VPADDQ Z13, Z11, Z11 360 | VPADDQ Z16, Z14, Z14 361 | VPADDQ Z17, Z15, Z15 362 | VPADDQ Z11, Z10, Z10 363 | VPADDQ Z15, Z14, Z14 364 | VPADDQ Z14, Z10, Z10 365 | 366 | // add to counters 367 | VPADDQ 0*64(DI), Z10, Z10 368 | VMOVDQU64 Z10, 0*64(DI) 369 | 370 | RET 371 | 372 | TEXT accum16<>(SB), NOSPLIT, $0-0 373 | // unpack and zero-extend 374 | VPMOVZXWQ X8, Z10 375 | VEXTRACTI128 $1, Y8, X11 376 | VPMOVZXWQ X11, Z11 377 | VEXTRACTI64X2 $2, Z8, X12 378 | VPMOVZXWQ X12, Z12 379 | VEXTRACTI64X2 $3, Z8, X13 380 | VPMOVZXWQ X13, Z13 381 | VPMOVZXWQ X9, Z14 382 | VEXTRACTI128 $1, Y9, X15 383 | VPMOVZXWQ X15, Z15 384 | VEXTRACTI64X2 $2, Z9, X16 385 | VPMOVZXWQ X16, Z16 386 | VEXTRACTI64X2 $3, Z9, X17 387 | VPMOVZXWQ X17, Z17 388 | 389 | // fold over twice 390 | VPADDQ Z12, Z10, Z10 391 | VPADDQ Z13, Z11, Z11 392 | VPADDQ Z16, Z14, Z14 393 | VPADDQ Z17, Z15, Z15 394 | VPADDQ Z11, Z10, Z10 395 | VPADDQ Z15, Z14, Z14 396 | 397 | // add to counters 398 | VPADDQ 0*64(DI), Z10, Z10 399 | VPADDQ 1*64(DI), Z14, Z14 400 | VMOVDQU64 Z10, 0*64(DI) 401 | VMOVDQU64 Z14, 1*64(DI) 402 | 403 | RET 404 | 405 | TEXT accum32<>(SB), NOSPLIT, $0-0 406 | // fold high half over low half and reduce 407 | VEXTRACTI64X2 $2, Z8, X12 408 | VEXTRACTI64X2 $2, Z9, X13 409 | VPMOVZXWQ X8, Z10 410 | VPMOVZXWQ X9, Z11 411 | VPMOVZXWQ X12, Z12 412 | VPMOVZXWQ X13, Z13 413 | VPADDQ Z12, Z10, Z10 414 | VPADDQ Z13, Z11, Z11 415 | VPADDQ 0*64(DI), Z10, Z10 416 | VPADDQ 1*64(DI), Z11, Z11 417 | VMOVDQU64 Z10, 0*64(DI) 418 | VMOVDQU64 Z11, 1*64(DI) 419 | 420 | VEXTRACTI128 $1, Y8, X10 421 | VEXTRACTI128 $1, Y9, X11 422 | VEXTRACTI64X2 $3, Z8, X12 423 | VEXTRACTI64X2 $3, Z9, X13 424 | VPMOVZXWQ X10, Z10 425 | VPMOVZXWQ X11, Z11 426 | VPMOVZXWQ X12, Z12 427 | VPMOVZXWQ X13, Z13 428 | VPADDQ Z12, Z10, Z10 429 | VPADDQ Z13, Z11, Z11 430 | VPADDQ 2*64(DI), Z10, Z10 431 | VPADDQ 3*64(DI), Z11, Z11 432 | VMOVDQU64 Z10, 2*64(DI) 433 | VMOVDQU64 Z11, 3*64(DI) 434 | 435 | RET 436 | 437 | TEXT accum64<>(SB), NOSPLIT, 
$0-0 438 | VPMOVZXWQ X8, Z13 439 | VPMOVZXWQ X9, Z14 440 | VPADDQ 0*64(DI), Z13, Z13 441 | VPADDQ 1*64(DI), Z14, Z14 442 | VMOVDQU64 Z13, 0*64(DI) 443 | VMOVDQU64 Z14, 1*64(DI) 444 | 445 | VEXTRACTI128 $1, Y8, X13 446 | VEXTRACTI128 $1, Y9, X14 447 | VPMOVZXWQ X13, Z13 448 | VPMOVZXWQ X14, Z14 449 | VPADDQ 2*64(DI), Z13, Z13 450 | VPADDQ 3*64(DI), Z14, Z14 451 | VMOVDQU64 Z13, 2*64(DI) 452 | VMOVDQU64 Z14, 3*64(DI) 453 | 454 | VEXTRACTI64X2 $2, Z8, X13 455 | VEXTRACTI64X2 $2, Z9, X14 456 | VPMOVZXWQ X13, Z13 457 | VPMOVZXWQ X14, Z14 458 | VPADDQ 4*64(DI), Z13, Z13 459 | VPADDQ 5*64(DI), Z14, Z14 460 | VMOVDQU64 Z13, 4*64(DI) 461 | VMOVDQU64 Z14, 5*64(DI) 462 | 463 | VEXTRACTI64X2 $3, Z8, X13 464 | VEXTRACTI64X2 $3, Z9, X14 465 | VPMOVZXWQ X13, Z13 466 | VPMOVZXWQ X14, Z14 467 | VPADDQ 6*64(DI), Z13, Z13 468 | VPADDQ 7*64(DI), Z14, Z14 469 | VMOVDQU64 Z13, 6*64(DI) 470 | VMOVDQU64 Z14, 7*64(DI) 471 | 472 | RET 473 | 474 | // func count8avx512(counts *[8]int, buf []uint8) 475 | TEXT ·count8avx512(SB), 0, $0-32 476 | MOVQ counts+0(FP), DI 477 | MOVQ buf_base+8(FP), SI 478 | MOVQ buf_len+16(FP), CX 479 | MOVQ $accum8<>(SB), BX 480 | CALL countavx512<>(SB) 481 | RET 482 | 483 | // func count16avx512(counts *[16]int, buf []uint16) 484 | TEXT ·count16avx512(SB), 0, $0-32 485 | MOVQ counts+0(FP), DI 486 | MOVQ buf_base+8(FP), SI 487 | MOVQ buf_len+16(FP), CX 488 | MOVQ $accum16<>(SB), BX 489 | SHLQ $1, CX 490 | CALL countavx512<>(SB) 491 | RET 492 | 493 | // func count32avx512(counts *[32]int, buf []uint32) 494 | TEXT ·count32avx512(SB), 0, $0-32 495 | MOVQ counts+0(FP), DI 496 | MOVQ buf_base+8(FP), SI 497 | MOVQ buf_len+16(FP), CX 498 | MOVQ $accum32<>(SB), BX 499 | SHLQ $2, CX 500 | CALL countavx512<>(SB) 501 | RET 502 | 503 | // func count64avx512(counts *[64]int, buf []uint64) 504 | TEXT ·count64avx512(SB), 0, $0-32 505 | MOVQ counts+0(FP), DI 506 | MOVQ buf_base+8(FP), SI 507 | MOVQ buf_len+16(FP), CX 508 | MOVQ $accum64<>(SB), BX 509 | SHLQ $3, CX 510 | CALL countavx512<>(SB) 511 | RET 512 | -------------------------------------------------------------------------------- /countneon_arm64.s: -------------------------------------------------------------------------------- 1 | //+build arm64,go1.16 2 | 3 | #include "textflag.h" 4 | 5 | // A NEON based kernel first doing a 15-fold CSA reduction and then a 6 | // 16-fold CSA reduction, carrying over place-value vectors between 7 | // iterations. 8 | 9 | // magic transposition constants, sliding window 10 | DATA magic<>+ 0(SB)/8, $0x8040201008040201 11 | DATA magic<>+ 8(SB)/8, $0x0000000000000000 12 | DATA magic<>+16(SB)/8, $0x0000000000000000 13 | DATA magic<>+24(SB)/8, $0xffffffffffffffff 14 | DATA magic<>+32(SB)/8, $0xffffffffffffffff 15 | GLOBL magic<>(SB), RODATA|NOPTR, $40 16 | 17 | // B:A = A+B+C, V31 used for scratch space 18 | #define CSA(A, B, C) \ 19 | VEOR B.B16, A.B16, V31.B16 \ 20 | VEOR C.B16, V31.B16, A.B16 \ 21 | VBIT V31.B16, C.B16, B.B16 22 | 23 | // D:A = A+B+C 24 | #define CSAC(A, B, C, D) \ 25 | VEOR A.B16, B.B16, D.B16 \ 26 | VEOR D.B16, C.B16, A.B16 \ 27 | VBSL B.B16, C.B16, D.B16 28 | 29 | // Process 4 bytes from S. Add low word counts to L, high to H. 30 | // Assumes masks loaded into V28, V29, and V30. Trashes V4, V5. 
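 | // VTBL broadcasts each input byte across eight lanes, VCMTST tests those
 | // copies against the bit-position mask in V28 (yielding all-ones bytes where
 | // a bit is set), and the VSUBs turn each all-ones byte into +1 on the counters.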
31 | #define COUNT4(L, H, S) \ 32 | VTBL V30.B16, [S.B16], V4.B16 \ // V4 = 0000:0000:1111:1111 33 | VTBL V29.B16, [S.B16], V5.B16 \ // V5 = 2222:2222:3333:3333 34 | VCMTST V28.B16, V4.B16, V4.B16 \ 35 | VCMTST V28.B16, V5.B16, V5.B16 \ 36 | VSUB V4.B16, L.B16, L.B16 \ 37 | VSUB V5.B16, H.B16, H.B16 38 | 39 | // Generic kernel. This function expects a pointer to a width-specific 40 | // accumulation function in R0, a possibly unaligned input buffer in R1, 41 | // counters in R2 and a remaining length in R3. 42 | TEXT countneon<>(SB), NOSPLIT, $0-0 43 | // constant for processing the head 44 | MOVD $magic<>(SB), R4 45 | VLD1R.P 8(R4), [V28.D2] // 80402010080402018040201008040201 46 | VMOVI $1, V30.B8 // 00000000000000000101010101010101 47 | VMOVI $2, V29.B16 // 02020202020202020202020202020202 48 | VADD V30.B16, V29.B16, V29.B16 // 02020202020202020303030303030303 49 | VMOVI $0, V8.B16 // counter registers 50 | VMOVI $0, V10.B16 51 | VMOVI $0, V12.B16 52 | VMOVI $0, V14.B16 53 | 54 | CMP $15*16, R3 // is the CSA kernel worth using? 55 | BLT runt 56 | 57 | // load head until alignment/end is reached 58 | AND $15, R1, R6 // offset of the buffer start from 16 byte alignment 59 | AND $~15, R1, R1 // align the source buffer pointer 60 | SUB $16, R6, R5 // negated number of bytes til alignment is reached 61 | ADD R6, R3, R3 // account for head length in CX 62 | NEG R5, R5 // number of bytes til alignment is reached 63 | VLD1.P 16(R1), [V3.B16] // load head, advance past it 64 | // VMOVQ (R4)(R5), V5 // load mask of bytes that are part of the head 65 | WORD $0x3ce56885 66 | VAND V5.B16, V3.B16, V0.B16 // and mask out those bytes that are not 67 | 68 | // load 15 registers worth of data and accumulate into V3--V0 69 | VLD1.P 2*16(R1), [V1.B16, V2.B16] 70 | VLD1.P 4*16(R1), [V3.B16, V4.B16, V5.B16, V6.B16] 71 | VLD1.P 4*16(R1), [V16.B16, V17.B16, V18.B16, V19.B16] 72 | CSA(V0, V1, V2) 73 | VMOVI $0x55, V27.B16 // 55555555 for transposition 74 | CSAC(V0, V3, V4, V2) 75 | VMOVI $0x33, V26.B16 // 33333333 for transposition 76 | CSAC(V0, V5, V6, V3) 77 | VLD1.P 4*16(R1), [V4.B16, V5.B16, V6.B16, V7.B16] 78 | CSA(V1, V2, V3) 79 | CSA(V0, V16, V17) 80 | VMOVI $0x0f, V25.B16 // 0f0f0f0f for extracting nibbles 81 | CSA(V0, V18, V19) 82 | MOVD $65535, R6 // space left til overflow could occur in V8--V15 83 | CSAC(V1, V16, V18, V3) 84 | VMOVI $0, V9.B16 85 | CSA(V0, V4, V5) 86 | VMOVI $0, V11.B16 87 | CSA(V0, V6, V7) 88 | VMOVI $0, V13.B16 89 | CSA(V1, V4, V6) 90 | VMOVI $0, V15.B16 91 | CSA(V2, V3, V4) 92 | 93 | SUBS $(15+16)*16, R3, R3 // enough data left to process? 94 | BLT post 95 | 96 | // load 16 registers worth of data and accumulate into V4--V0 97 | vec: VLD1.P 4*16(R1), [V4.B16, V5.B16, V6.B16, V7.B16] 98 | VLD1.P 4*16(R1), [V16.B16, V17.B16, V18.B16, V19.B16] 99 | VLD1.P 4*16(R1), [V20.B16, V21.B16, V22.B16, V23.B16] 100 | CSA(V4, V5, V6) 101 | CSA(V0, V17, V19) 102 | CSA(V7, V16, V18) 103 | CSA(V21, V22, V20) 104 | CSA(V1, V5, V17) 105 | VLD1.P 4*16(R1), [V17.B16, V18.B16, V19.B16, V20.B16] 106 | CSA(V0, V4, V7) 107 | CSA(V17, V18, V23) 108 | CSA(V19, V20, V21) 109 | CSA(V16, V18, V22) 110 | CSA(V1, V4, V20) 111 | CSA(V0, V17, V19) 112 | CSA(V2, V5, V18) 113 | CSA(V1, V16, V17) 114 | CSA(V2, V4, V16) 115 | CSA(V3, V4, V5) 116 | 117 | // now V0..V4 hold counters; preserve V0..V3 for the next round and 118 | // add V4 to counters. 
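 | // V0--V3 carry the CSA place values 1, 2, 4, and 8 between iterations; V4 is
 | // the weight-16 output of this round. The transposition below also scales its
 | // counts by 16 (the shift left by four) before adding them to the 16-bit
 | // counters in V8--V15.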
119 | 120 | // split into even/odd and reduce into crumbs 121 | VAND V27.B16, V4.B16, V5.B16 // V5 = bits 02468ace x8 122 | // VBIC V27.B16, V4.B16, V6.B16 // V6 = bits 13579bdf x8 123 | WORD $0x4e7b1c86 124 | VUSHR $1, V6.B16, V6.B16 125 | VZIP1 V6.D2, V5.D2, V4.D2 126 | VZIP2 V6.D2, V5.D2, V5.D2 127 | VADD V5.B16, V4.B16, V4.B16 // V4 = 02468ace x4 13579bdf x4 128 | 129 | // split again into nibbles 130 | VAND V26.B16, V4.B16, V5.B16 // V5 = 048c x4 159d x4 131 | // VBIC V26.B16, V4.B16, V6.B16 // V6 = 26ae x4 37bf x4 132 | WORD $0x4e7a1c86 133 | VUSHR $2, V6.B16, V6.B16 134 | 135 | // split again into bytes and shuffle into order (also scale) 136 | VAND V25.B16, V5.B16, V4.B16 // V4 = 08 x4 19 x4 137 | // VBIC V25.B16, V5.B16, V5.B16 // V5 = 4c x4 5d x4 138 | WORD $0x4e791ca5 139 | // VBIC V25.B16, V6.B16, V7.B16 // V7 = 6e x4 7f x4 140 | WORD $0x4e791cc7 141 | VAND V25.B16, V6.B16, V6.B16 // V6 = 2a x4 3b x4 142 | VSHL $4, V4.B16, V4.B16 143 | VSHL $4, V6.B16, V6.B16 144 | 145 | VZIP1 V6.B16, V4.B16, V16.B16 // V16 = 028a x4 146 | VZIP2 V6.B16, V4.B16, V17.B16 // V17 = 139b x4 147 | VZIP1 V7.B16, V5.B16, V18.B16 // V18 = 46ce x4 148 | VZIP2 V7.B16, V5.B16, V19.B16 // V19 = 57df x4 149 | 150 | VZIP1 V17.B16, V16.B16, V4.B16 // V4 = 012389ab[0:1] 151 | VZIP2 V17.B16, V16.B16, V5.B16 // V5 = 012389ab[2:3] 152 | VZIP1 V19.B16, V18.B16, V6.B16 // V6 = 4567cdef[0:1] 153 | VZIP2 V19.B16, V18.B16, V7.B16 // V7 = 4567cdef[2:3] 154 | 155 | VZIP1 V6.S4, V4.S4, V16.S4 // V16 = 01234567[0:1] 156 | VZIP2 V6.S4, V4.S4, V17.S4 // V17 = 89abcdef[0:1] 157 | VZIP1 V7.S4, V5.S4, V18.S4 // V18 = 01234567[2:3] 158 | VZIP2 V7.S4, V5.S4, V19.S4 // V19 = 89abcdef[2:3] 159 | 160 | // add to counters 161 | VUADDW V16.B8, V8.H8, V8.H8 162 | VUADDW2 V16.B16, V9.H8, V9.H8 163 | VUADDW V17.B8, V10.H8, V10.H8 164 | VUADDW2 V17.B16, V11.H8, V11.H8 165 | VUADDW V18.B8, V12.H8, V12.H8 166 | VUADDW2 V18.B16, V13.H8, V13.H8 167 | VUADDW V19.B8, V14.H8, V14.H8 168 | VUADDW2 V19.B16, V15.H8, V15.H8 169 | 170 | SUB $16*2, R6, R6 // account for possible overflow 171 | CMP $(15+15)*2, R6 // enough space left in the counters? 
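 | // if not, flush the 16-bit counters into the 64-bit counts via the
 | // accumulation function and start them over from zero; R6 tracks the
 | // remaining headroom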
172 | 173 | BGE have_space 174 | 175 | CALL *R0 // call accumulation function 176 | VMOVI $0, V8.B16 // clear counters for next round 177 | VMOVI $0, V9.B16 178 | VMOVI $0, V10.B16 179 | VMOVI $0, V11.B16 180 | VMOVI $0, V12.B16 181 | VMOVI $0, V13.B16 182 | VMOVI $0, V14.B16 183 | VMOVI $0, V15.B16 184 | 185 | MOVD $65535, R6 // space left til overflow could occur 186 | 187 | have_space: 188 | SUBS $16*16, R3, R3 // account for bytes consumed 189 | BGE vec 190 | 191 | // group V0--V3 into nibbles in the same register 192 | post: VUSHR $1, V0.B16, V4.B16 193 | VADD V1.B16, V1.B16, V5.B16 194 | VUSHR $1, V2.B16, V6.B16 195 | VADD V3.B16, V3.B16, V7.B16 196 | VBIF V27.B16, V5.B16, V0.B16 // V0 = eca86420 (low crumbs) 197 | VBIT V27.B16, V4.B16, V1.B16 // V1 = fdb97531 (high crumbs) 198 | VBIF V27.B16, V7.B16, V2.B16 // V2 = eca86420 (low crumbs) 199 | VBIT V27.B16, V6.B16, V3.B16 // V3 = fdb97531 (high crumbs) 200 | 201 | VUSHR $2, V0.B16, V4.B16 202 | VUSHR $2, V1.B16, V6.B16 203 | VSHL $2, V2.B16, V5.B16 204 | VSHL $2, V3.B16, V7.B16 205 | VBIT V26.B16, V4.B16, V2.B16 // V2 = ea62 206 | VBIT V26.B16, V6.B16, V3.B16 // V3 = fb73 207 | VBIF V26.B16, V5.B16, V0.B16 // V0 = c840 208 | VBIF V26.B16, V7.B16, V1.B16 // V1 = d951 209 | 210 | // pre-shuffle nibbles 211 | VZIP1 V3.B16, V2.B16, V6.B16 // V6 = fbea7362 (3:2:1:0) 212 | VZIP2 V3.B16, V2.B16, V3.B16 // V3 = fbea7362 (7:6:5:4) 213 | VZIP1 V1.B16, V0.B16, V5.B16 // V5 = d9c85140 (3:2:1:0) 214 | VZIP2 V1.B16, V0.B16, V2.B16 // V2 = d9c85140 (7:6:5:4) 215 | VZIP1 V6.H8, V5.H8, V4.H8 // V4 = fbead9c873625140 (1:0) 216 | VZIP2 V6.H8, V5.H8, V5.H8 // V5 = fbead9c873625140 (3:2) 217 | VZIP1 V3.H8, V2.H8, V6.H8 // V6 = fbead9c873625150 (5:4) 218 | VZIP2 V3.H8, V2.H8, V7.H8 // V7 = fbead9c873625150 (7:6) 219 | 220 | // pull out high and low nibbles and reduce once 221 | VAND V4.B16, V25.B16, V0.B16 222 | VUSHR $4, V4.B16, V4.B16 223 | VAND V5.B16, V25.B16, V1.B16 224 | VUSHR $4, V5.B16, V5.B16 225 | VAND V6.B16, V25.B16, V2.B16 226 | VADD V0.B16, V2.B16, V0.B16 // V0 = ba983210 (1:0) 227 | VUSRA $4, V6.B16, V4.B16 // V4 = fedc7654 (1:0) 228 | VAND V7.B16, V25.B16, V3.B16 229 | VADD V1.B16, V3.B16, V1.B16 // V1 = ba983210 (3:2) 230 | VUSRA $4, V7.B16, V5.B16 // V5 = fedc7654 (3:2) 231 | 232 | // shuffle one last time 233 | VZIP1 V4.S4, V0.S4, V2.S4 // V2 = fedcba987654 (0) 234 | VZIP2 V4.S4, V0.S4, V3.S4 // V3 = fedcba987654 (1) 235 | VZIP1 V5.S4, V1.S4, V6.S4 // V6 = fedcba987654 (2) 236 | VZIP2 V5.S4, V1.S4, V7.S4 // V7 = fedcba987654 (3) 237 | 238 | // add to counters 239 | VUADDW V2.B8, V8.H8, V8.H8 240 | VUADDW2 V2.B16, V9.H8, V9.H8 241 | VUADDW V3.B8, V10.H8, V10.H8 242 | VUADDW2 V3.B16, V11.H8, V11.H8 243 | VUADDW V6.B8, V12.H8, V12.H8 244 | VUADDW2 V6.B16, V13.H8, V13.H8 245 | VUADDW V7.B8, V14.H8, V14.H8 246 | VUADDW2 V7.B16, V15.H8, V15.H8 247 | 248 | endvec: VMOVI $0, V0.B16 // counter registers 249 | VMOVI $0, V1.B16 250 | VMOVI $0, V2.B16 251 | VMOVI $0, V3.B16 252 | 253 | // process tail, 8 bytes at a time 254 | ADDS $16*16-8, R3, R3 // 8 bytes left to process? 255 | BLT tail1 256 | 257 | tail8: SUBS $8, R3 258 | FMOVS.P 4(R1), F6 259 | FMOVS.P 4(R1), F7 260 | COUNT4(V0, V1, V6) 261 | COUNT4(V2, V3, V7) 262 | BGE tail8 263 | 264 | // process remaining 0--7 bytes 265 | tail1: ADDS $8, R3 // anything left to process? 
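 | // if so: the buffer pointer is 8-byte aligned at this point, so the full
 | // 8-byte load below cannot cross a page boundary; the window mask then
 | // clears the bytes that lie past the end of the buffer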
266 | BLE end 267 | 268 | FMOVD (R1), F6 // load 8 bytes from buffer 269 | SUB R3, R4, R6 // shifted window address 270 | FMOVQ 16(R6), F5 // load window mask 271 | // VBIC V5.B16, V6.B16, V6.B16 // mask out the desired bytes 272 | WORD $0x4e651cc6 273 | 274 | // process tail 275 | VEXT $4, V6.B16, V6.B16, V7.B16 276 | COUNT4(V0, V1, V6) 277 | COUNT4(V2, V3, V7) 278 | 279 | // add tail to counters 280 | end: VUADDW V0.B8, V9.H8, V9.H8 281 | VUADDW2 V0.B16, V8.H8, V8.H8 282 | VUADDW V1.B8, V11.H8, V11.H8 283 | VUADDW2 V1.B16, V10.H8, V10.H8 284 | VUADDW V2.B8, V13.H8, V13.H8 285 | VUADDW2 V2.B16, V12.H8, V12.H8 286 | VUADDW V3.B8, V15.H8, V15.H8 287 | VUADDW2 V3.B16, V14.H8, V14.H8 288 | 289 | CALL *R0 290 | RET 291 | 292 | // very short input, use tail routine only 293 | runt: SUBS $8, R3 // 8 bytes left to process? 294 | BLT runt1 295 | 296 | // process runt, 8 bytes at a time 297 | runt8: SUBS $8, R3 298 | FMOVS.P 4(R1), F6 299 | FMOVS.P 4(R1), F7 300 | COUNT4(V8, V10, V6) 301 | COUNT4(V12, V14, V7) 302 | BGE runt8 303 | 304 | // process remaining 0--7 bytes 305 | // while making sure we don't get a page fault 306 | runt1: ADDS $8, R3 // anything left to process? 307 | BLE runt_accum 308 | 309 | AND $7, R1, R5 // offset from 8 byte alignment 310 | ADD R5, R3, R8 // length of buffer including alignment 311 | LSL $3, R3, R3 // remaining length in bits 312 | MOVD $-1, R7 313 | LSL R3, R7, R7 // mask of bits where R6 is out of range 314 | CMP $8, R8 // if this exceeds an alignment boundary 315 | BGT crossrunt1 // we can safely load directly 316 | 317 | AND $~7, R1, R1 // align buffer to 8 bytes 318 | MOVD (R1), R6 // and load 8 bytes from buffer 319 | LSL $3, R5, R5 // offset from 8 byte alignment in bits 320 | LSR R5, R6, R6 // buffer starting at the beginning 321 | B dorunt1 322 | 323 | crossrunt1: 324 | MOVD (R1), R6 // load 8 bytes from unaligned buffer 325 | 326 | dorunt1: 327 | BIC R7, R6, R6 // clear out of range bits 328 | FMOVD R6, F6 // move buffer to SIMD unit 329 | VEXT $4, V6.B16, V6.B16, V7.B16 330 | COUNT4(V8, V10, V6) 331 | COUNT4(V12, V14, V7) 332 | 333 | // initialise counters with tail 334 | runt_accum: 335 | VUXTL V8.B8, V9.H8 // 8--15 89abcdef[0] 336 | VUXTL2 V8.B16, V8.H8 // 0-- 7 01234567[0] 337 | VUXTL V10.B8, V11.H8 // 24--31 89abcdef[1] 338 | VUXTL2 V10.B16, V10.H8 // 16--23 01234567[1] 339 | VUXTL V12.B8, V13.H8 // 40--47 89abcdef[2] 340 | VUXTL2 V12.B16, V12.H8 // 32--39 01234567[2] 341 | VUXTL V14.B8, V15.H8 // 56--63 89abcdef[3] 342 | VUXTL2 V14.B16, V14.H8 // 48--55 01234567[3] 343 | 344 | CALL *R0 345 | RET 346 | 347 | TEXT accum8<>(SB), NOSPLIT, $0-0 348 | // load counts registers 349 | VLD1 (R2), [V4.D2, V5.D2, V6.D2, V7.D2] 350 | 351 | // zero extend into dwords and fold 352 | // VUADDL V8.H4, V10.H4, V16.S4 353 | // VUADDL2 V8.H8, V10.H8, V17.S4 354 | // VUADDL V9.H4, V11.H4, V18.S4 355 | // VUADDL2 V9.H8, V11.H8, V19.S4 356 | // VUADDL V12.H4, V14.H4, V20.S4 357 | // VUADDL2 V12.H8, V14.H8, V21.S4 358 | // VUADDL V13.H4, V15.H4, V22.S4 359 | // VUADDL2 V13.H8, V15.H8, V23.S4 360 | WORD $0x2e680150 361 | WORD $0x6e680151 362 | WORD $0x2e690172 363 | WORD $0x6e690173 364 | WORD $0x2e6c01d4 365 | WORD $0x6e6c01d5 366 | WORD $0x2e6d01f6 367 | WORD $0x6e6d01f7 368 | 369 | // reduce integer pairs 370 | VADD V18.S4, V16.S4, V16.S4 371 | VADD V19.S4, V17.S4, V17.S4 372 | VADD V22.S4, V20.S4, V20.S4 373 | VADD V23.S4, V21.S4, V21.S4 374 | VADD V20.S4, V16.S4, V16.S4 375 | VADD V21.S4, V17.S4, V17.S4 376 | 377 | // accumulate 378 | VUADDW V16.S2, V4.D2, V4.D2 379 | 
VUADDW2 V16.S4, V5.D2, V5.D2 380 | VUADDW V17.S2, V6.D2, V6.D2 381 | VUADDW2 V17.S4, V7.D2, V7.D2 382 | 383 | // write back counts registers 384 | VST1 [V4.D2, V5.D2, V6.D2, V7.D2], (R2) 385 | RET 386 | 387 | TEXT accum16<>(SB), NOSPLIT, $0-0 388 | // load first half of the counts 389 | VLD1.P 4*16(R2), [V4.D2, V5.D2, V6.D2, V7.D2] 390 | 391 | // zero extend into dwords and fold 392 | // VUADDL V8.H4, V10.H4, V16.S4 393 | // VUADDL2 V8.H8, V10.H8, V17.S4 394 | // VUADDL V9.H4, V11.H4, V18.S4 395 | // VUADDL2 V9.H8, V11.H8, V19.S4 396 | // VUADDL V12.H4, V14.H4, V20.S4 397 | // VUADDL2 V12.H8, V14.H8, V21.S4 398 | // VUADDL V13.H4, V15.H4, V22.S4 399 | // VUADDL2 V13.H8, V15.H8, V23.S4 400 | WORD $0x2e680150 401 | WORD $0x6e680151 402 | WORD $0x2e690172 403 | WORD $0x6e690173 404 | WORD $0x2e6c01d4 405 | WORD $0x6e6c01d5 406 | WORD $0x2e6d01f6 407 | WORD $0x6e6d01f7 408 | 409 | // reduce integer pairs 410 | VADD V20.S4, V16.S4, V16.S4 411 | VADD V21.S4, V17.S4, V17.S4 412 | VADD V22.S4, V18.S4, V18.S4 413 | VADD V23.S4, V19.S4, V19.S4 414 | 415 | // load second half of the counts 416 | VLD1 (R2), [V20.D2, V21.D2, V22.D2, V23.D2] 417 | SUB $4*16, R2, R2 // move R2 back to the beginning 418 | 419 | // accumulate 420 | VUADDW V16.S2, V4.D2, V4.D2 421 | VUADDW2 V16.S4, V5.D2, V5.D2 422 | VUADDW V17.S2, V6.D2, V6.D2 423 | VUADDW2 V17.S4, V7.D2, V7.D2 424 | VUADDW V18.S2, V20.D2, V20.D2 425 | VUADDW2 V18.S4, V21.D2, V21.D2 426 | VUADDW V19.S2, V22.D2, V22.D2 427 | VUADDW2 V19.S4, V23.D2, V23.D2 428 | 429 | // write back 430 | VST1.P [V4.D2, V5.D2, V6.D2, V7.D2], 4*16(R2) 431 | VST1 [V20.D2, V21.D2, V22.D2, V23.D2], (R2) 432 | SUB $4*16, R2, R2 // restore R2 433 | 434 | RET 435 | 436 | TEXT accum32<>(SB), NOSPLIT, $0-0 437 | MOVD R2, R7 // source register 438 | MOVD R2, R8 // destination register 439 | MOVD $2, R9 // counter 440 | 441 | // load counts registers 442 | loop: VLD1.P 4*16(R7), [V20.D2, V21.D2, V22.D2, V23.D2] 443 | VLD1.P 4*16(R7), [V4.D2, V5.D2, V6.D2, V7.D2] 444 | 445 | SUB $1, R9, R9 446 | 447 | // zero extend into dwords and fold 448 | // VUADDL V8.H4, V12.H4, V16.S4 449 | // VUADDL2 V8.H8, V12.H8, V17.S4 450 | // VUADDL V9.H4, V13.H4, V18.S4 451 | // VUADDL2 V9.H8, V13.H8, V19.S4 452 | WORD $0x2e680190 453 | WORD $0x6e680191 454 | WORD $0x2e6901b2 455 | WORD $0x6e6901b3 456 | 457 | // shift remaining counters forwards 458 | // can't use the VMOV alias because the assembler 459 | // doesn't support it. 
VORR does the trick though 460 | VORR V10.B16, V10.B16, V8.B16 461 | VORR V11.B16, V11.B16, V9.B16 462 | VORR V14.B16, V14.B16, V12.B16 463 | VORR V15.B16, V15.B16, V13.B16 464 | 465 | // accumulate 466 | VUADDW V16.S2, V20.D2, V20.D2 467 | VUADDW2 V16.S4, V21.D2, V21.D2 468 | VUADDW V17.S2, V22.D2, V22.D2 469 | VUADDW2 V17.S4, V23.D2, V23.D2 470 | VUADDW V18.S2, V4.D2, V4.D2 471 | VUADDW2 V18.S4, V5.D2, V5.D2 472 | VUADDW V19.S2, V6.D2, V6.D2 473 | VUADDW2 V19.S4, V7.D2, V7.D2 474 | 475 | // write back 476 | VST1.P [V20.D2, V21.D2, V22.D2, V23.D2], 4*16(R8) 477 | VST1.P [V4.D2, V5.D2, V6.D2, V7.D2], 4*16(R8) 478 | 479 | CBNZ R9, loop 480 | 481 | RET 482 | 483 | TEXT accum64<>(SB), NOSPLIT, $0-0 484 | MOVD R2, R7 // source register 485 | MOVD R2, R8 // destination register 486 | MOVD $4, R9 // counter 487 | 488 | // load counts registers 489 | loop: VLD1.P 4*16(R7), [V20.D2, V21.D2, V22.D2, V23.D2] 490 | VLD1.P 4*16(R7), [V4.D2, V5.D2, V6.D2, V7.D2] 491 | 492 | SUB $1, R9, R9 493 | 494 | // zero extend into dwords 495 | VUXTL V8.H4, V16.S4 496 | VUXTL2 V8.H8, V17.S4 497 | VUXTL V9.H4, V18.S4 498 | VUXTL2 V9.H8, V19.S4 499 | 500 | // shift remaining counters forwards 501 | // can't use the VMOV alias because the assembler 502 | // doesn't support it. VORR does the trick though 503 | VORR V10.B16, V10.B16, V8.B16 504 | VORR V11.B16, V11.B16, V9.B16 505 | VORR V12.B16, V12.B16, V10.B16 506 | VORR V13.B16, V13.B16, V11.B16 507 | VORR V14.B16, V14.B16, V12.B16 508 | VORR V15.B16, V15.B16, V13.B16 509 | 510 | // accumulate 511 | VUADDW V16.S2, V20.D2, V20.D2 512 | VUADDW2 V16.S4, V21.D2, V21.D2 513 | VUADDW V17.S2, V22.D2, V22.D2 514 | VUADDW2 V17.S4, V23.D2, V23.D2 515 | VUADDW V18.S2, V4.D2, V4.D2 516 | VUADDW2 V18.S4, V5.D2, V5.D2 517 | VUADDW V19.S2, V6.D2, V6.D2 518 | VUADDW2 V19.S4, V7.D2, V7.D2 519 | 520 | // write back 521 | VST1.P [V20.D2, V21.D2, V22.D2, V23.D2], 4*16(R8) 522 | VST1.P [V4.D2, V5.D2, V6.D2, V7.D2], 4*16(R8) 523 | 524 | CBNZ R9, loop 525 | 526 | RET 527 | 528 | TEXT ·count8neon(SB), 0, $0-32 529 | LDP counts+0(FP), (R2, R1) 530 | MOVD buf_len+16(FP), R3 531 | MOVD $accum8<>(SB), R0 532 | CALL countneon<>(SB) 533 | RET 534 | 535 | TEXT ·count16neon(SB), 0, $0-32 536 | LDP counts+0(FP), (R2, R1) 537 | MOVD buf_len+16(FP), R3 538 | MOVD $accum16<>(SB), R0 539 | LSL $1, R3, R3 // count in bytes 540 | CALL countneon<>(SB) 541 | RET 542 | 543 | TEXT ·count32neon(SB), 0, $0-32 544 | LDP counts+0(FP), (R2, R1) 545 | MOVD buf_len+16(FP), R3 546 | MOVD $accum32<>(SB), R0 547 | LSL $2, R3, R3 // count in bytes 548 | CALL countneon<>(SB) 549 | RET 550 | 551 | TEXT ·count64neon(SB), 0, $0-32 552 | LDP counts+0(FP), (R2, R1) 553 | MOVD buf_len+16(FP), R3 554 | MOVD $accum64<>(SB), R0 555 | LSL $3, R3, R3 // count in bytes 556 | CALL countneon<>(SB) 557 | RET 558 | -------------------------------------------------------------------------------- /countsse2_386.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // SSE2 based kernels for the positional population count operation. 4 | // All these kernels have the same backbone based on a 15-fold CSA 5 | // reduction to first reduce 240 byte into 4x16 byte, followed by a 6 | // bunch of shuffles to group the positional registers into nibbles. 7 | // These are then summed up using a width-specific summation function. 8 | // Required CPU extension: SSE2. 
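 | // Each CSA (carry-save adder) step computes A + B + C = A' + 2*B' bitwise,
 | // with A' = A XOR B XOR C as the sum and B' = majority(A, B, C) as the carry;
 | // chaining these steps is what collapses 15 input vectors into four vectors
 | // of place value 1, 2, 4, and 8.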
9 | 10 | // magic transposition constants 11 | DATA magic<> +0(SB)/8, $0x8040201008040201 12 | DATA magic<>+ 8(SB)/8, $0xaaaaaaaa55555555 13 | DATA magic<>+16(SB)/8, $0xcccccccc33333333 14 | DATA magic<>+24(SB)/4, $0x0f0f0f0f 15 | GLOBL magic<>(SB), RODATA|NOPTR, $28 16 | 17 | // sliding window for head/tail loads. Unfortunately, there doesn't 18 | // seem to be a good way to do this with less memory wasted. 19 | DATA window<> +0(SB)/8, $0x0000000000000000 20 | DATA window<> +8(SB)/8, $0x0000000000000000 21 | DATA window<>+16(SB)/8, $0xffffffffffffffff 22 | DATA window<>+24(SB)/8, $0xffffffffffffffff 23 | GLOBL window<>(SB), RODATA|NOPTR, $32 24 | 25 | // B:A = A+B+C, D used for scratch space 26 | #define CSA(A, B, C, D) \ 27 | MOVOA A, D \ 28 | PAND B, D \ 29 | PXOR B, A \ 30 | MOVOA A, B \ 31 | PAND C, B \ 32 | PXOR C, A \ 33 | POR D, B 34 | 35 | // Process 4 bytes from X4. Add low word counts to L, high to H 36 | // assumes mask loaded into X2. Trashes X4, X5. 37 | #define COUNT4(L, H) \ // X4 = ----:----:----:3210 38 | PUNPCKLBW X4, X4 \ // X4 = ----:----:3322:1100 39 | PUNPCKLWL X4, X4 \ // X4 = 3333:2222:1111:0000 40 | PSHUFD $0xfa, X4, X5 \ // X5 = 3333:3333:2222:2222 41 | PUNPCKLLQ X4, X4 \ // X5 = 1111:1111:0000:0000 42 | PAND X6, X4 \ 43 | PAND X6, X5 \ 44 | PCMPEQB X6, X4 \ 45 | PCMPEQB X6, X5 \ 46 | PSUBB X4, L \ 47 | PSUBB X5, H 48 | 49 | // zero extend X from bytes into words and add to the counter vectors 50 | // S1 and S2. X7 is expected to be a zero register, X6 and X are trashed. 51 | #define ACCUM(S1, S2, X) \ 52 | MOVOA X, X6 \ 53 | PUNPCKLBW X7, X \ 54 | PUNPCKHBW X7, X6 \ 55 | PADDW S1, X \ 56 | PADDW S2, X6 \ 57 | MOVOA X, S1 \ 58 | MOVOA X6, S2 59 | 60 | // Generic kernel. This function expects a pointer to a width-specific 61 | // accumulation funciton in BX, a possibly unaligned input buffer in SI, 62 | // counters in DI and a remaining length in BP. 63 | TEXT countsse<>(SB), NOSPLIT, $144-0 64 | TESTL BP, BP // any data to process at all? 65 | CMOVLEQ BP, SI // if not, avoid loading head 66 | 67 | // constants for processing the head 68 | MOVQ magic<>+0(SB), X6 // bit position mask 69 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 70 | PXOR X0, X0 // counter registers 71 | PXOR X1, X1 72 | PXOR X2, X2 73 | PXOR X3, X3 74 | 75 | // load head into scratch space (until alignment/end is reached) 76 | MOVL SI, DX 77 | ANDL $15, DX // offset of the buffer start from 16 byte alignment 78 | JEQ nohead // if source buffer is aligned, skip head processing 79 | MOVL $16, AX 80 | SUBL DX, AX // number of bytes til alignment is reached (head length) 81 | MOVL $window<>(SB), DX 82 | MOVOA -16(SI)(AX*1), X7 // load head 83 | MOVOU (DX)(AX*1), X5 // load mask of the bytes that are part of the head 84 | PAND X5, X7 // and mask out those bytes that are not 85 | CMPL AX, BP // is the head shorter than the buffer? 
86 | JLT norunt 87 | 88 | // buffer is short and does not cross a 16 byte boundary 89 | SUBL BP, AX // number of bytes by which we overshoot the buffer 90 | MOVOU (DX)(AX*1), X5 // load mask of bytes that overshoot the buffer 91 | PANDN X7, X5 // and clear them 92 | MOVOA X5, X7 // move head buffer back to X4 93 | MOVL BP, AX // set up true prefix length 94 | 95 | norunt: SUBL AX, BP // mark head as accounted for 96 | ADDL AX, SI // and advance past the head 97 | 98 | // process head in four increments of 4 bytes 99 | MOVOA X7, X4 100 | PSRLO $4, X7 101 | COUNT4(X0, X1) 102 | MOVOA X7, X4 103 | PSRLO $4, X7 104 | COUNT4(X2, X3) 105 | MOVOA X7, X4 106 | PSRLO $4, X7 107 | COUNT4(X0, X1) 108 | MOVOA X7, X4 109 | COUNT4(X2, X3) 110 | 111 | // produce 16 byte aligned pointer to counter vector in DX 112 | nohead: MOVL $counts-144+15(SP), DX 113 | ANDL $~15, DX // align to 16 bytes 114 | 115 | // initialise counters in (DX) to what we have 116 | PXOR X7, X7 // zero register 117 | MOVOA X0, X4 118 | PUNPCKLBW X7, X0 119 | PUNPCKHBW X7, X4 120 | MOVOA X0, 0*16(DX) 121 | MOVOA X4, 1*16(DX) 122 | MOVOA X1, X4 123 | PUNPCKLBW X7, X1 124 | PUNPCKHBW X7, X4 125 | MOVOA X1, 2*16(DX) 126 | MOVOA X4, 3*16(DX) 127 | MOVOA X2, X4 128 | PUNPCKLBW X7, X2 129 | PUNPCKHBW X7, X4 130 | MOVOA X2, 4*16(DX) 131 | MOVOA X4, 5*16(DX) 132 | MOVOA X3, X4 133 | PUNPCKLBW X7, X3 134 | PUNPCKHBW X7, X4 135 | MOVOA X3, 6*16(DX) 136 | MOVOA X4, 7*16(DX) 137 | 138 | SUBL $15*16, BP // enough data left to process? 139 | JLT endvec // also, pre-subtract 140 | 141 | MOVL $65535-4, AX // space left til overflow could occur in Y8--Y11 142 | 143 | vec: MOVOA 0*16(SI), X0 // load 240 bytes from buf 144 | MOVOA 1*16(SI), X1 // and sum them into Y3:Y2:Y1:Y0 145 | MOVOA 2*16(SI), X4 146 | MOVOA 3*16(SI), X2 147 | MOVOA 4*16(SI), X3 148 | MOVOA 5*16(SI), X5 149 | MOVOA 6*16(SI), X6 150 | CSA(X0, X1, X4, X7) 151 | MOVOA 7*16(SI), X4 152 | CSA(X3, X2, X5, X7) 153 | MOVOA 8*16(SI), X5 154 | CSA(X0, X3, X6, X7) 155 | MOVOA 9*16(SI), X6 156 | CSA(X1, X2, X3, X7) 157 | MOVOA 10*16(SI), X3 158 | CSA(X0, X4, X5, X7) 159 | MOVOA 11*16(SI), X5 160 | CSA(X0, X3, X6, X7) 161 | MOVOA 12*16(SI), X6 162 | CSA(X1, X3, X4, X7) 163 | MOVOA 13*16(SI), X4 164 | CSA(X0, X5, X6, X7) 165 | MOVOA 14*16(SI), X6 166 | CSA(X0, X4, X6, X7) 167 | CSA(X1, X4, X5, X7) 168 | CSA(X2, X3, X4, X7) 169 | 170 | // load magic constants 171 | MOVQ magic<>+8(SB), X7 172 | PSHUFD $0x55, X7, X6 // 0xaaaaaaaa 173 | PSHUFD $0x00, X7, X7 // 0x55555555 174 | 175 | ADDL $15*16, SI 176 | 177 | // group X0--X3 into nibbles in the same register 178 | MOVOA X0, X5 179 | PAND X6, X5 180 | PSRLL $1, X5 181 | MOVOA X1, X4 182 | PAND X7, X4 183 | PADDL X4, X4 184 | PAND X7, X0 185 | PAND X6, X1 186 | POR X4, X0 // X0 = eca86420 (low crumbs) 187 | POR X5, X1 // X1 = fdb97531 (high crumbs) 188 | 189 | MOVOA X2, X5 190 | PAND X6, X5 191 | PSRLL $1, X5 192 | MOVOA X3, X4 193 | PAND X7, X4 194 | PADDL X4, X4 195 | PAND X7, X2 196 | PAND X6, X3 197 | POR X4, X2 // X0 = eca86420 (low crumbs) 198 | POR X5, X3 // X1 = fdb97531 (high crumbs) 199 | 200 | MOVQ magic<>+16(SB), X7 201 | PSHUFD $0x55, X7, X6 // 0xcccccccc 202 | PSHUFD $0x00, X7, X7 // 0x33333333 203 | 204 | MOVOA X0, X5 205 | PAND X6, X5 206 | PSRLL $2, X5 207 | MOVOA X2, X4 208 | PAND X7, X4 209 | PSLLL $2, X4 210 | PAND X7, X0 211 | PAND X6, X2 212 | POR X4, X0 // X0 = c840 213 | POR X5, X2 // X2 = ea62 214 | 215 | MOVOA X1, X5 216 | PAND X6, X5 217 | PSRLL $2, X5 218 | MOVOA X3, X4 219 | PAND X7, X4 220 | PSLLL $2, X4 221 | PAND X7, 
X1 222 | PAND X6, X3 223 | POR X4, X1 // X1 = d951 224 | POR X5, X3 // X3 = fb73 225 | 226 | MOVD magic<>+24(SB), X7 227 | PSHUFD $0x00, X7, X7 // 0x0f0f0f0f 228 | 229 | // pre-shuffle nibbles 230 | MOVOA X2, X5 231 | PUNPCKLBW X3, X2 // X2 = fbea7362 (3:2:1:0) 232 | PUNPCKHBW X3, X5 // X5 = fbea7362 (7:6:5:4) 233 | MOVOA X0, X3 234 | PUNPCKLBW X1, X0 // X0 = d9c85140 (3:2:1:0) 235 | PUNPCKHBW X1, X3 // X4 = d9c85140 (7:6:5:4) 236 | MOVOA X0, X1 237 | PUNPCKLWL X2, X0 // X0 = fbead9c873625140 (1:0) 238 | PUNPCKHWL X2, X1 // X1 = fbead9c873625140 (3:2) 239 | MOVOA X3, X2 240 | PUNPCKLWL X5, X2 // X2 = fbead9c873625140 (5:4) 241 | PUNPCKHWL X5, X3 // X3 = fbead9c873625140 (7:6) 242 | 243 | // pull high and low nibbles and reduce once 244 | MOVOA X0, X4 245 | PSRLL $4, X4 246 | PAND X7, X0 // X0 = ba983210 (1:0) 247 | PAND X7, X4 // X4 = fedc7654 (1:0) 248 | 249 | MOVOA X2, X6 250 | PSRLL $4, X2 251 | PAND X7, X6 // X6 = ba983210 (5:4) 252 | PAND X7, X2 // X2 = fedc7654 (5:4) 253 | 254 | PADDB X6, X0 // X0 = ba983210 (1:0) 255 | PADDB X4, X2 // X2 = fedc7654 (1:0) 256 | 257 | MOVOA X1, X4 258 | PSRLL $4, X4 259 | PAND X7, X1 // X1 = ba983210 (3:2) 260 | PAND X7, X4 // X4 = fedc7654 (3:2) 261 | 262 | MOVOA X3, X6 263 | PSRLL $4, X3 264 | PAND X7, X6 // X6 = ba983210 (7:6) 265 | PAND X7, X3 // X3 = fedc7654 (7:6) 266 | 267 | PADDB X6, X1 // X1 = ba983210 (3:2) 268 | PADDB X4, X3 // X3 = fedc7654 (3:2) 269 | 270 | // unpack one last time 271 | MOVOA X0, X4 272 | PUNPCKLLQ X2, X0 // X0 = fedcba9876543210 (0) 273 | PUNPCKHLQ X2, X4 // X4 = fedcba9876543210 (1) 274 | MOVOA X1, X5 275 | PUNPCKLLQ X3, X1 // X1 = fedcba9876543210 (2) 276 | PUNPCKHLQ X3, X5 // X5 = fedcba9876543210 (3) 277 | 278 | // add to counters 279 | PXOR X7, X7 // zero register 280 | ACCUM(0*16(DX), 1*16(DX), X0) 281 | ACCUM(2*16(DX), 3*16(DX), X4) 282 | ACCUM(4*16(DX), 5*16(DX), X1) 283 | ACCUM(6*16(DX), 7*16(DX), X5) 284 | 285 | SUBL $15*2, AX // account for possible overflow 286 | CMPL AX, $15*2 // enough space left in the counters? 287 | JGE have_space 288 | 289 | CALL *BX // call accumulation function 290 | 291 | // clear counts for next round 292 | PXOR X7, X7 293 | MOVOA X7, 0*16(DX) 294 | MOVOA X7, 1*16(DX) 295 | MOVOA X7, 2*16(DX) 296 | MOVOA X7, 3*16(DX) 297 | MOVOA X7, 4*16(DX) 298 | MOVOA X7, 5*16(DX) 299 | MOVOA X7, 6*16(DX) 300 | MOVOA X7, 7*16(DX) 301 | 302 | MOVL $65535, AX // space left til overflow could occur 303 | 304 | have_space: 305 | SUBL $15*16, BP // account for bytes consumed 306 | JGE vec 307 | 308 | // constants for processing the tail 309 | endvec: MOVQ magic<>+0(SB), X6 // bit position mask 310 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 311 | PXOR X0, X0 // counter registers 312 | PXOR X1, X1 313 | PXOR X2, X2 314 | PXOR X3, X3 315 | 316 | // process tail, 4 bytes at a time 317 | SUBL $8-15*16, BP // 8 bytes left to process? 318 | JLT tail1 319 | 320 | tail8: MOVL (SI), X4 321 | COUNT4(X0, X1) 322 | MOVL 4(SI), X4 323 | COUNT4(X2, X3) 324 | ADDL $8, SI 325 | SUBL $8, BP 326 | JGE tail8 327 | 328 | // process remaining 0--7 byte 329 | tail1: SUBL $-8, BP // anything left to process? 330 | JLE end 331 | 332 | MOVQ (SI), X5 // load 8 bytes from buffer. 
Note that 333 | // buffer is aligned to 8 byte here 334 | MOVL $window<>+16(SB), AX // load window address 335 | SUBL BP, AX // adjust mask pointer 336 | MOVQ (AX), X7 // load window mask 337 | PANDN X5, X7 // and mask out the desired bytes 338 | 339 | // process rest 340 | MOVOA X7, X4 341 | PSRLO $4, X7 342 | COUNT4(X0, X1) 343 | MOVOA X7, X4 344 | COUNT4(X2, X3) 345 | 346 | // add tail to counters 347 | end: PXOR X7, X7 // zero register 348 | ACCUM(0*16(DX), 1*16(DX), X0) 349 | ACCUM(2*16(DX), 3*16(DX), X1) 350 | ACCUM(4*16(DX), 5*16(DX), X2) 351 | ACCUM(6*16(DX), 7*16(DX), X3) 352 | 353 | CALL *BX 354 | RET 355 | 356 | // zero-extend words in X and Y to dwords, sum them, and move the 357 | // halves back into X and Y. Assumes X7 == 0. Trashes X2 and X3. 358 | #define FOLDW(X, Y) \ 359 | MOVOA X, X2 \ 360 | PUNPCKLWL X7, X \ 361 | PUNPCKHWL X7, X2 \ 362 | MOVOA Y, X3 \ 363 | PUNPCKLWL X7, X3 \ 364 | PUNPCKHWL X7, Y \ 365 | PADDL X3, X \ 366 | PADDL X2, Y 367 | 368 | // add dwords in X to (a)*4(DI), trashing X2. 369 | #define ACCUMQ(a, X) \ 370 | MOVOU (a)*4(DI), X2 \ 371 | PADDL X, X2 \ 372 | MOVOU X2, (a)*4(DI) 373 | 374 | // zero-extend words in s*16(DX) to dwords and add to a*4(DI) to (a+7)*4(DI). 375 | // Assumes X7 == 0 and trashes X0, X1, and X2. 376 | #define ACCUMO(a, s) \ 377 | MOVOA (s)*16(DX), X0 \ 378 | MOVOA X0, X1 \ 379 | PUNPCKLWL X7, X0 \ 380 | PUNPCKHWL X7, X1 \ 381 | ACCUMQ(a, X0) \ 382 | ACCUMQ(a+4, X1) 383 | 384 | // Count8 accumulation function. Accumulates words into 385 | // 8 dword counters at (DI). Trashes X0--X7. 386 | TEXT accum8<>(SB), NOSPLIT, $0-0 387 | MOVOA 0*16(DX), X0 388 | MOVOA 4*16(DX), X1 389 | MOVOA 2*16(DX), X4 390 | MOVOA 6*16(DX), X5 391 | FOLDW(X0, X1) 392 | FOLDW(X4, X5) 393 | PADDL X4, X0 394 | PADDL X5, X1 395 | ACCUMQ(0, X0) 396 | ACCUMQ(4, X1) 397 | MOVOA 1*16(DX), X0 398 | MOVOA 5*16(DX), X1 399 | MOVOA 3*16(DX), X4 400 | MOVOA 7*16(DX), X5 401 | FOLDW(X0, X1) 402 | FOLDW(X4, X5) 403 | PADDL X4, X0 404 | PADDL X5, X1 405 | ACCUMQ(0, X0) 406 | ACCUMQ(4, X1) 407 | RET 408 | 409 | // Count16 accumulation function. Accumulates words into 410 | // 16 dword counters at (DI). Trashes X0--X7. 411 | TEXT accum16<>(SB), NOSPLIT, $0-0 412 | MOVOA 0*16(DX), X0 413 | MOVOA 4*16(DX), X1 414 | MOVOA 2*16(DX), X4 415 | MOVOA 6*16(DX), X5 416 | FOLDW(X0, X1) 417 | FOLDW(X4, X5) 418 | PADDL X4, X0 419 | PADDL X5, X1 420 | ACCUMQ(0, X0) 421 | ACCUMQ(4, X1) 422 | MOVOA 1*16(DX), X0 423 | MOVOA 5*16(DX), X1 424 | MOVOA 3*16(DX), X4 425 | MOVOA 7*16(DX), X5 426 | FOLDW(X0, X1) 427 | FOLDW(X4, X5) 428 | PADDL X4, X0 429 | PADDL X5, X1 430 | ACCUMQ(8, X0) 431 | ACCUMQ(12, X1) 432 | RET 433 | 434 | // Count32 accumulation function. Accumulates words into 435 | // 32 dword counters at (DI). Trashes X0--X7. 436 | TEXT accum32<>(SB), NOSPLIT, $0-0 437 | MOVOA 0*16(DX), X0 438 | MOVOA 4*16(DX), X1 439 | FOLDW(X0, X1) 440 | ACCUMQ(0, X0) 441 | ACCUMQ(4, X1) 442 | MOVOA 1*16(DX), X0 443 | MOVOA 5*16(DX), X1 444 | FOLDW(X0, X1) 445 | ACCUMQ(8, X0) 446 | ACCUMQ(12, X1) 447 | MOVOA 2*16(DX), X0 448 | MOVOA 6*16(DX), X1 449 | FOLDW(X0, X1) 450 | ACCUMQ(16, X0) 451 | ACCUMQ(20, X1) 452 | MOVOA 3*16(DX), X0 453 | MOVOA 7*16(DX), X1 454 | FOLDW(X0, X1) 455 | ACCUMQ(24, X0) 456 | ACCUMQ(28, X1) 457 | RET 458 | 459 | // Count64 accumulation function. Accumulates words into 460 | // 64 dword counters at (DI). Trashes X0, X1, and X7. 
461 | TEXT accum64<>(SB), NOSPLIT, $0-0 462 | ACCUMO( 0, 0) 463 | ACCUMO( 8, 1) 464 | ACCUMO(16, 2) 465 | ACCUMO(24, 3) 466 | ACCUMO(32, 4) 467 | ACCUMO(40, 5) 468 | ACCUMO(48, 6) 469 | ACCUMO(56, 7) 470 | RET 471 | 472 | // func count8sse2(counts *[8]int, buf []uint8) 473 | TEXT ·count8sse2(SB), 0, $0-16 474 | MOVL counts+0(FP), DI 475 | MOVL buf_base+4(FP), SI // SI = &buf[0] 476 | MOVL buf_len+8(FP), BP // BP = len(buf) 477 | MOVL $accum8<>(SB), BX 478 | CALL countsse<>(SB) 479 | RET 480 | 481 | // func count16sse2(counts *[16]int, buf []uint16) 482 | TEXT ·count16sse2(SB), 0, $0-16 483 | MOVL counts+0(FP), DI 484 | MOVL buf_base+4(FP), SI // SI = &buf[0] 485 | MOVL buf_len+8(FP), BP // BP = len(buf) 486 | MOVL $accum16<>(SB), BX 487 | SHLL $1, BP // count in bytes 488 | CALL countsse<>(SB) 489 | RET 490 | 491 | // func count32sse2(counts *[32]int, buf []uint32) 492 | TEXT ·count32sse2(SB), 0, $0-16 493 | MOVL counts+0(FP), DI 494 | MOVL buf_base+4(FP), SI // SI = &buf[0] 495 | MOVL buf_len+8(FP), BP // BP = len(buf) 496 | MOVL $accum32<>(SB), BX 497 | SHLL $2, BP // count in bytes 498 | CALL countsse<>(SB) 499 | RET 500 | 501 | 502 | // func count64sse2(counts *[64]int, buf []uint64) 503 | TEXT ·count64sse2(SB), 0, $0-16 504 | MOVL counts+0(FP), DI 505 | MOVL buf_base+4(FP), SI // SI = &buf[0] 506 | MOVL buf_len+8(FP), BP // BP = len(buf) 507 | MOVL $accum64<>(SB), BX 508 | SHLL $3, BP // count in bytes 509 | CALL countsse<>(SB) 510 | RET 511 | -------------------------------------------------------------------------------- /countsse2_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // An SSE2 based kernel first doing a 15-fold CSA reduction and then 4 | // a 16-fold CSA reduction, carrying over place-value vectors between 5 | // iterations. Required CPU extension: SSE2. 6 | 7 | // magic transposition constants 8 | DATA magic<> +0(SB)/8, $0x8040201008040201 9 | DATA magic<>+ 8(SB)/8, $0xaaaaaaaa55555555 10 | DATA magic<>+16(SB)/8, $0xcccccccc33333333 11 | DATA magic<>+24(SB)/8, $0x00ff00ff0f0f0f0f 12 | GLOBL magic<>(SB), RODATA|NOPTR, $32 13 | 14 | // sliding window for head/tail loads. Unfortunately, there doesn't 15 | // seem to be a good way to do this with less memory wasted. 16 | DATA window<> +0(SB)/8, $0x0000000000000000 17 | DATA window<> +8(SB)/8, $0x0000000000000000 18 | DATA window<>+16(SB)/8, $0xffffffffffffffff 19 | DATA window<>+24(SB)/8, $0xffffffffffffffff 20 | GLOBL window<>(SB), RODATA|NOPTR, $32 21 | 22 | // B:A = A+B+C 23 | #define CSA(A, B, C) \ 24 | PXOR C, B \ 25 | PXOR A, C \ 26 | PXOR B, A \ 27 | POR C, B \ 28 | PXOR A, B 29 | 30 | // Process 4 bytes from X4. Add low word counts to L, high to H 31 | // assumes mask loaded into X2. Trashes X4, X5. 32 | #define COUNT4(L, H) \ // X4 = ----:----:----:3210 33 | PUNPCKLBW X4, X4 \ // X4 = ----:----:3322:1100 34 | PUNPCKLWL X4, X4 \ // X4 = 3333:2222:1111:0000 35 | PSHUFD $0xfa, X4, X5 \ // X5 = 3333:3333:2222:2222 36 | PUNPCKLLQ X4, X4 \ // X5 = 1111:1111:0000:0000 37 | PAND X6, X4 \ 38 | PAND X6, X5 \ 39 | PCMPEQB X6, X4 \ 40 | PCMPEQB X6, X5 \ 41 | PSUBB X4, L \ 42 | PSUBB X5, H 43 | 44 | // zero extend X from bytes into words and add to the counter vectors 45 | // S1 and S2. X7 is expected to be a zero register, X6 and X are trashed. 46 | #define ACCUM(S1, S2, X) \ 47 | MOVOA X, X6 \ 48 | PUNPCKLBW X7, X \ 49 | PUNPCKHBW X7, X6 \ 50 | PADDW X, S1 \ 51 | PADDW X6, S2 52 | 53 | // Generic kernel. 
This function expects a pointer to a width-specific 54 | // accumulation funciton in BX, a possibly unaligned input buffer in SI, 55 | // counters in DI and a remaining length in CX. 56 | TEXT countsse2<>(SB), NOSPLIT, $32-0 57 | // constants for processing the head 58 | MOVQ magic<>+0(SB), X6 // bit position mask 59 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 60 | PXOR X7, X7 // zero register 61 | PXOR X8, X8 // counter registers 62 | PXOR X10, X10 63 | PXOR X12, X12 64 | PXOR X14, X14 65 | 66 | CMPQ CX, $15*16 // is the CSA kernel worth using? 67 | JLT runt 68 | 69 | // load head until alignment/end is reached 70 | MOVL SI, DX 71 | ANDL $15, DX // offset of the buffer start from 16 byte alignment 72 | MOVL $16, AX 73 | SUBL DX, AX // number of bytes til alignment is reached (head length) 74 | SUBQ DX, SI // align source to 16 bytes 75 | ADDQ DX, CX // and account for head length 76 | MOVQ $window<>(SB), DX // load window mask base pointer 77 | MOVOU (DX)(AX*1), X2 // load mask of the bytes that are part of the head 78 | PAND (SI), X2 // load head and mask out bytes that are not in the head 79 | 80 | // load 240 - 16 bytes from buf and sum them into X3:X2:X1:X0 81 | MOVOA 1*16(SI), X1 82 | MOVOA 2*16(SI), X0 83 | MOVOA 3*16(SI), X5 84 | MOVOA 4*16(SI), X4 85 | MOVOA 5*16(SI), X3 86 | CSA(X0, X1, X2) 87 | MOVOA 6*16(SI), X7 88 | MOVOA 7*16(SI), X6 89 | MOVOA 8*16(SI), X2 90 | CSA(X3, X4, X5) 91 | MOVOA 9*16(SI), X5 92 | CSA(X2, X6, X7) 93 | MOVOA 10*16(SI), X7 94 | CSA(X0, X5, X3) 95 | MOVOA 11*16(SI), X3 96 | CSA(X1, X4, X6) 97 | MOVOA 12*16(SI), X6 98 | CSA(X0, X2, X7) 99 | MOVOA 13*16(SI), X7 100 | PXOR X9, X9 // initialise remaining counters 101 | PXOR X11, X11 102 | CSA(X3, X7, X6) 103 | MOVOA 14*16(SI), X6 104 | CSA(X1, X2, X5) 105 | ADDQ $15*16, SI 106 | CSA(X0, X3, X6) 107 | MOVL $65535, AX // space left til overflow could occur in Y8--Y11 108 | CSA(X1, X3, X7) 109 | PXOR X13, X13 110 | PXOR X15, X15 111 | CSA(X2, X3, X4) 112 | 113 | SUBQ $(15+16)*16, CX // enough data left to process? 114 | JLT post 115 | 116 | // load 256 bytes from buf, add them to X0..X3 into X0..X4 117 | vec: MOVOA 0*16(SI), X4 118 | MOVOA 1*16(SI), X5 119 | MOVOU X8, X8save-32(SP) // stash some counters to give us 120 | MOVOU X9, X9save-16(SP) // more registers to play with 121 | MOVOA 2*16(SI), X6 122 | MOVOA 3*16(SI), X7 123 | MOVOA 4*16(SI), X8 124 | MOVOA 5*16(SI), X9 125 | CSA(X0, X5, X4) 126 | MOVOA 6*16(SI), X4 127 | CSA(X6, X8, X7) 128 | MOVOA 7*16(SI), X7 129 | CSA(X1, X8, X5) 130 | MOVOA 8*16(SI), X5 131 | CSA(X0, X6, X9) 132 | MOVOA 9*16(SI), X9 133 | CSA(X4, X5, X7) 134 | MOVOA 10*16(SI), X7 135 | CSA(X1, X5, X6) 136 | MOVOA 11*16(SI), X6 137 | CSA(X0, X4, X9) 138 | MOVOA 12*16(SI), X9 139 | CSA(X2, X5, X8) 140 | MOVOA 13*16(SI), X8 141 | CSA(X0, X6, X7) 142 | MOVOA 14*16(SI), X7 143 | CSA(X1, X4, X6) 144 | MOVOA 15*16(SI), X6 145 | CSA(X7, X8, X9) 146 | MOVOU magic<>+8(SB), X9 // 55555555, aaaaaaaa, 33333333, cccccccc 147 | CSA(X0, X6, X7) 148 | ADDQ $16*16, SI 149 | #define D 90 150 | PREFETCHT0 (D+ 0)*16(SI) 151 | CSA(X1, X6, X8) 152 | PREFETCHT0 (D+ 4)*16(SI) 153 | CSA(X2, X4, X6) 154 | PREFETCHT0 (D+ 8)*16(SI) 155 | CSA(X3, X4, X5) 156 | PREFETCHT0 (D+12)*16(SI) 157 | 158 | MOVQ magic<>+24(SB), X8 // 0f0f0f0f, 00ff00ff 159 | 160 | // now X0..X4 hold counters; preserve X0..X4 for the next round 161 | // and add X4 to the the counters. 
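 | // X0--X3 carry the CSA place values 1, 2, 4, and 8 between iterations; X4 is
 | // the weight-16 output of this round. The transposition below scales its
 | // counts by 16 (the shift left by four) before they are added to the word
 | // counters in X8--X15.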
162 | 163 | // split into even/odd and reduce into crumbs 164 | PSHUFD $0x00, X9, X7 // X7 = 55..55 165 | MOVOA X4, X5 166 | PAND X7, X5 // X5 = 02468ace x8 167 | PANDN X4, X7 // X7 = 13579bdf x8 168 | PSRLL $1, X7 169 | MOVOA X5, X4 170 | PUNPCKLQDQ X7, X4 171 | PUNPCKHQDQ X7, X5 172 | PADDL X5, X4 // X4 = 02468ace x4 13579bdf x4 173 | 174 | // split again into nibbles 175 | PSHUFD $0xaa, X9, X5 // X7 = 33..33 176 | MOVOA X5, X7 177 | PANDN X4, X5 // X5 = 26ae x4 37bf x4 178 | PAND X7, X4 // X4 = 048c x4 159d x4 179 | PSRLL $2, X5 180 | 181 | // split into bytes and shuffle into order 182 | PSHUFD $0x00, X8, X6 // X6 = 0f..0f 183 | MOVOA X6, X7 184 | PANDN X4, X6 // X6 = 4c x4 5d x4 185 | PAND X7, X4 // X4 = 08 x4 19 x4 186 | MOVOA X7, X9 187 | PANDN X5, X7 // X7 = 6e x4 7f x4 188 | PAND X9, X5 // X5 = 2a x4 3b x4 189 | PSLLL $4, X4 190 | PSLLL $4, X5 191 | 192 | MOVOA X4, X9 193 | PUNPCKLWL X5, X4 // X4 = 082a x4 194 | PUNPCKHWL X5, X9 // X9 = 193b x4 195 | MOVOA X6, X5 196 | PUNPCKLWL X7, X5 // X5 = 4c6e x4 197 | PUNPCKHWL X7, X6 // X6 = 5d7f x4 198 | MOVOA X4, X7 199 | PUNPCKLWL X9, X4 // X4 = 08192a3b[0:1] 200 | PUNPCKHWL X9, X7 // X7 = 08192a3b[2:3] 201 | MOVOA X5, X9 202 | PUNPCKLWL X6, X5 // X5 = 4c5d6e7f[0:1] 203 | PUNPCKHWL X6, X9 // X9 = 4c5d6e7f[2:3] 204 | MOVOA X4, X6 205 | PUNPCKLQDQ X5, X4 // X4 = 08192a3b4c5d6e7f[0] 206 | PUNPCKHQDQ X5, X6 // X6 = 08192a3b4c5d6e7f[1] 207 | MOVOA X7, X5 208 | PUNPCKLQDQ X9, X5 // X5 = 08192a3b4c5d6e7f[2] 209 | PUNPCKHQDQ X9, X7 // X7 = 08192a3b4c5d6e7f[3] 210 | 211 | // split into words and add to counters 212 | PSHUFD $0x55, X8, X8 // X8 = 00ff..00ff 213 | MOVOA X6, X9 214 | PAND X8, X6 // X6 = 01234678[1] 215 | PSRLW $8, X9 // X9 = 89abcdef[1] 216 | PADDW X6, X10 217 | PADDW X9, X11 218 | 219 | MOVOA X8, X6 220 | MOVOU X8save-32(SP), X8 221 | MOVOA X5, X9 222 | PAND X6, X5 // X5 = 01234567[2] 223 | PSRLW $8, X9 // X9 = 89abcdef[2] 224 | PADDW X5, X12 225 | PADDW X9, X13 226 | 227 | MOVOU X9save-16(SP), X9 228 | MOVOA X7, X5 229 | PAND X6, X7 // X7 = 01234567[3] 230 | PSRLW $8, X5 // X5 = 89abcdef[3] 231 | PADDW X7, X14 232 | PADDW X5, X15 233 | 234 | MOVOA X4, X5 235 | PAND X6, X4 // X4 = 01234567[0] 236 | PSRLW $8, X5 // X5 = 89abcdef[0] 237 | PADDW X4, X8 238 | PADDW X5, X9 239 | 240 | SUBL $16*2, AX // account for possible overflow 241 | CMPL AX, $(15+15)*2 // enough space left in the counters? 
242 | JGE have_space 243 | 244 | PXOR X7, X7 245 | CALL *BX // call accumulation function 246 | PXOR X8, X8 // clear counters for next round 247 | PXOR X9, X9 248 | PXOR X10, X10 249 | PXOR X11, X11 250 | PXOR X12, X12 251 | PXOR X13, X13 252 | PXOR X14, X14 253 | PXOR X15, X15 254 | 255 | MOVL $65535, AX // space left til overflow could occur 256 | 257 | have_space: 258 | SUBQ $16*16, CX // account for bytes consumed 259 | JGE vec 260 | 261 | post: MOVQ magic<>+8(SB), X5 // load magic constants 262 | PSHUFD $0x55, X5, X6 // 0xaaaaaaaa 263 | PSHUFD $0x00, X5, X7 // 0x55555555 264 | 265 | // group X0--X3 into nibbles in the same register 266 | MOVOA X0, X5 267 | PAND X6, X5 268 | PSRLL $1, X5 269 | MOVOA X1, X4 270 | PAND X7, X4 271 | PADDL X4, X4 272 | PAND X7, X0 273 | PAND X6, X1 274 | POR X4, X0 // X0 = eca86420 (low crumbs) 275 | POR X5, X1 // X1 = fdb97531 (high crumbs) 276 | 277 | MOVOA X2, X5 278 | PAND X6, X5 279 | PSRLL $1, X5 280 | MOVOA X3, X4 281 | PAND X7, X4 282 | PADDL X4, X4 283 | PAND X7, X2 284 | PAND X6, X3 285 | POR X4, X2 // X0 = eca86420 (low crumbs) 286 | POR X5, X3 // X1 = fdb97531 (high crumbs) 287 | 288 | MOVQ magic<>+16(SB), X7 289 | PSHUFD $0x55, X7, X6 // 0xcccccccc 290 | PSHUFD $0x00, X7, X7 // 0x33333333 291 | 292 | MOVOA X0, X5 293 | PAND X6, X5 294 | PSRLL $2, X5 295 | MOVOA X2, X4 296 | PAND X7, X4 297 | PSLLL $2, X4 298 | PAND X7, X0 299 | PAND X6, X2 300 | POR X4, X0 // X0 = c840 301 | POR X5, X2 // X2 = ea62 302 | 303 | MOVOA X1, X5 304 | PAND X6, X5 305 | PSRLL $2, X5 306 | MOVOA X3, X4 307 | PAND X7, X4 308 | PSLLL $2, X4 309 | PAND X7, X1 310 | PAND X6, X3 311 | POR X4, X1 // X1 = d951 312 | POR X5, X3 // X3 = fb73 313 | 314 | MOVD magic<>+24(SB), X7 315 | PSHUFD $0x00, X7, X7 // 0x0f0f0f0f 316 | 317 | // pre-shuffle nibbles 318 | MOVOA X2, X5 319 | PUNPCKLBW X3, X2 // X2 = fbea7362 (3:2:1:0) 320 | PUNPCKHBW X3, X5 // X5 = fbea7362 (7:6:5:4) 321 | MOVOA X0, X3 322 | PUNPCKLBW X1, X0 // X0 = d9c85140 (3:2:1:0) 323 | PUNPCKHBW X1, X3 // X4 = d9c85140 (7:6:5:4) 324 | MOVOA X0, X1 325 | PUNPCKLWL X2, X0 // X0 = fbead9c873625140 (1:0) 326 | PUNPCKHWL X2, X1 // X1 = fbead9c873625140 (3:2) 327 | MOVOA X3, X2 328 | PUNPCKLWL X5, X2 // X2 = fbead9c873625140 (5:4) 329 | PUNPCKHWL X5, X3 // X3 = fbead9c873625140 (7:6) 330 | 331 | // pull high and low nibbles and reduce once 332 | MOVOA X0, X4 333 | PSRLL $4, X4 334 | PAND X7, X0 // X0 = ba983210 (1:0) 335 | PAND X7, X4 // X4 = fedc7654 (1:0) 336 | 337 | MOVOA X2, X6 338 | PSRLL $4, X2 339 | PAND X7, X6 // X6 = ba983210 (5:4) 340 | PAND X7, X2 // X2 = fedc7654 (5:4) 341 | 342 | PADDB X6, X0 // X0 = ba983210 (1:0) 343 | PADDB X4, X2 // X2 = fedc7654 (1:0) 344 | 345 | MOVOA X1, X4 346 | PSRLL $4, X4 347 | PAND X7, X1 // X1 = ba983210 (3:2) 348 | PAND X7, X4 // X4 = fedc7654 (3:2) 349 | 350 | MOVOA X3, X6 351 | PSRLL $4, X3 352 | PAND X7, X6 // X6 = ba983210 (7:6) 353 | PAND X7, X3 // X3 = fedc7654 (7:6) 354 | 355 | PADDB X6, X1 // X1 = ba983210 (3:2) 356 | PADDB X4, X3 // X3 = fedc7654 (3:2) 357 | 358 | // unpack one last time 359 | MOVOA X0, X4 360 | PUNPCKLLQ X2, X0 // X0 = fedcba9876543210 (0) 361 | PUNPCKHLQ X2, X4 // X4 = fedcba9876543210 (1) 362 | MOVOA X1, X5 363 | PUNPCKLLQ X3, X1 // X1 = fedcba9876543210 (2) 364 | PUNPCKHLQ X3, X5 // X5 = fedcba9876543210 (3) 365 | 366 | // add to counters 367 | PXOR X7, X7 // zero register 368 | ACCUM( X8, X9, X0) 369 | ACCUM(X10, X11, X4) 370 | ACCUM(X12, X13, X1) 371 | ACCUM(X14, X15, X5) 372 | 373 | // constants for processing the tail 374 | endvec: MOVQ 
magic<>+0(SB), X6 // bit position mask 375 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 376 | PXOR X0, X0 // counter registers 377 | PXOR X1, X1 378 | PXOR X2, X2 379 | PXOR X3, X3 380 | 381 | // process tail, 4 bytes at a time 382 | SUBL $8-16*16, CX // 8 bytes left to process? 383 | JLT tail1 384 | 385 | tail8: MOVL 0(SI), X4 386 | COUNT4(X0, X1) 387 | MOVL 4(SI), X4 388 | COUNT4(X2, X3) 389 | ADDQ $8, SI 390 | SUBL $8, CX 391 | JGE tail8 392 | 393 | // process remaining 0--7 byte 394 | tail1: SUBL $-8, CX // anything left to process? 395 | JLE end 396 | 397 | MOVQ (SI), X5 // load 8 bytes from buffer. Note that 398 | // buffer is aligned to 8 byte here 399 | MOVQ $window<>+16(SB), AX // load window address 400 | SUBQ CX, AX // adjust mask 401 | MOVQ (AX), X7 // load window mask 402 | PANDN X5, X7 // and mask out the desired bytes 403 | 404 | // process rest 405 | MOVOA X7, X4 406 | COUNT4(X0, X1) 407 | PSRLO $4, X7 408 | MOVOA X7, X4 409 | COUNT4(X2, X3) 410 | 411 | // add tail to counters 412 | end: PXOR X7, X7 // zero register 413 | MOVOA X0, X4 414 | PUNPCKLBW X7, X0 415 | PUNPCKHBW X7, X4 416 | PADDW X0, X8 417 | PADDW X4, X9 418 | MOVOA X1, X4 419 | PUNPCKLBW X7, X1 420 | PUNPCKHBW X7, X4 421 | PADDW X1, X10 422 | PADDW X4, X11 423 | MOVOA X2, X4 424 | PUNPCKLBW X7, X2 425 | PUNPCKHBW X7, X4 426 | PADDW X2, X12 427 | PADDW X4, X13 428 | MOVOA X3, X4 429 | PUNPCKLBW X7, X3 430 | PUNPCKHBW X7, X4 431 | PADDW X3, X14 432 | PADDW X4, X15 433 | 434 | CALL *BX 435 | RET 436 | 437 | // buffer is short, do just head/tail processing 438 | runt: SUBL $8, CX // 8 bytes left to process? 439 | JLT runt1 440 | 441 | // process runt 8 bytes at a time 442 | runt8: MOVL 0(SI), X4 443 | COUNT4(X8, X10) 444 | MOVL 4(SI), X4 445 | COUNT4(X12, X14) 446 | ADDQ $8, SI 447 | SUBL $8, CX 448 | JGE runt8 449 | 450 | // process remaining 0--7 byte 451 | // while making sure we don't get a page fault 452 | runt1: ADDL $8, CX // anything left to process? 453 | JLE runt_accum 454 | 455 | MOVL SI, AX 456 | ANDL $7, AX // offset from 8 byte alignment 457 | LEAL (AX)(CX*1), DX // length of buffer plus alignment 458 | SHLL $3, CX // remaining length in bits 459 | XORQ R9, R9 460 | BTSQ CX, R9 461 | DECQ R9 // mask of bits where R8 is in range 462 | CMPL DX, $8 // if this exceeds the alignment boundary 463 | JGT crossrunt1 // we can safely load directly 464 | 465 | ANDQ $~7, SI // align buffer to 8 bytes 466 | MOVQ (SI), R8 // and and load 8 bytes from buffer 467 | LEAL (AX*8), CX // offset from 8 byte alignment in bits 468 | SHRQ CX, R8 // buffer starting from the beginning 469 | JMP dorunt1 470 | 471 | crossrunt1: 472 | MOVQ (SI), R8 // load 8 bytes from unaligned buffer 473 | 474 | dorunt1: 475 | ANDQ R9, R8 // mask out bytes behind the buffer 476 | MOVL R8, X4 477 | SHRQ $32, R8 478 | COUNT4(X8, X10) 479 | MOVL R8, X4 480 | COUNT4(X12, X14) 481 | 482 | // move tail to counters and perform final accumulation 483 | runt_accum: 484 | MOVOA X8, X9 485 | PUNPCKLBW X7, X8 486 | PUNPCKHBW X7, X9 487 | MOVOA X10, X11 488 | PUNPCKLBW X7, X10 489 | PUNPCKHBW X7, X11 490 | MOVOA X12, X13 491 | PUNPCKLBW X7, X12 492 | PUNPCKHBW X7, X13 493 | MOVOA X14, X15 494 | PUNPCKLBW X7, X14 495 | PUNPCKHBW X7, X15 496 | 497 | CALL *BX 498 | RET 499 | 500 | // zero-extend dwords in X trashing X, X4, and X5. Add the low half 501 | // dwords to a*8(DI) and the high half to (a+2)*8(DI). 502 | // Assumes X7 == 0. 
503 | #define ACCUMQ(a, X) \ 504 | MOVOA X, X4 \ 505 | PUNPCKLLQ X7, X \ 506 | PUNPCKHLQ X7, X4 \ 507 | MOVOU (a)*8(DI), X5 \ 508 | PADDQ X, X5 \ 509 | MOVOU X5, (a)*8(DI) \ 510 | MOVOU (a+2)*8(DI), X5 \ 511 | PADDQ X4, X5 \ 512 | MOVOU X5, (a+2)*8(DI) 513 | 514 | // zero-extend words in X to qwords and add to a*8(DI) to (a+7)*8(DI). 515 | // Trashes X4, X5, and X6. Assumes X7 == 0 an X8 <= X <= X15. 516 | #define ACCUMO(a, X) \ 517 | MOVOA X, X6 \ 518 | PUNPCKLWL X7, X6 \ 519 | PUNPCKHWL X7, X \ 520 | ACCUMQ(a, X6) \ 521 | ACCUMQ(a+4, X) 522 | 523 | // zero-extend words in X and Y to dwords, sum them, and move the 524 | // halves back into X and Y. Assumes X7 == 0. Trashes X4, X5. 525 | #define FOLDW(X, Y) \ 526 | MOVOA X, X4 \ 527 | PUNPCKLWL X7, X \ 528 | PUNPCKHWL X7, X4 \ 529 | MOVOA Y, X5 \ 530 | PUNPCKLWL X7, X5 \ 531 | PUNPCKHWL X7, Y \ 532 | PADDL X5, X \ 533 | PADDL X4, Y 534 | 535 | // Count8 accumulation function. Accumulates words X8--X15 into 536 | // 8 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 537 | TEXT accum8<>(SB), NOSPLIT, $0-0 538 | FOLDW(X8, X12) 539 | FOLDW(X9, X13) 540 | FOLDW(X10, X14) 541 | FOLDW(X11, X15) 542 | PADDL X10, X8 543 | PADDL X11, X9 544 | PADDL X14, X12 545 | PADDL X15, X13 546 | PADDL X9, X8 547 | ACCUMQ(0, X8) 548 | PADDL X13, X12 549 | ACCUMQ(4, X12) 550 | RET 551 | 552 | // Count16 accumulation function. Accumulates words X8--X15 into 553 | // 16 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 554 | TEXT accum16<>(SB), NOSPLIT, $0-0 555 | FOLDW(X8, X12) 556 | FOLDW(X9, X13) 557 | FOLDW(X10, X14) 558 | FOLDW(X11, X15) 559 | PADDL X10, X8 560 | ACCUMQ(0, X8) 561 | PADDL X14, X12 562 | ACCUMQ(4, X12) 563 | PADDL X11, X9 564 | ACCUMQ(8, X9) 565 | PADDL X15, X13 566 | ACCUMQ(12, X13) 567 | RET 568 | 569 | // Count32 accumulation function. Accumulates words X8--X15 into 570 | // 32 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 571 | TEXT accum32<>(SB), NOSPLIT, $0-0 572 | FOLDW(X8, X12) 573 | ACCUMQ(0, X8) 574 | ACCUMQ(4, X12) 575 | FOLDW(X9, X13) 576 | ACCUMQ(8, X9) 577 | ACCUMQ(12, X13) 578 | FOLDW(X10, X14) 579 | ACCUMQ(16, X10) 580 | ACCUMQ(20, X14) 581 | FOLDW(X11, X15) 582 | ACCUMQ(24, X11) 583 | ACCUMQ(28, X15) 584 | RET 585 | 586 | // Count64 accumulation function. Accumulates words X8--X15 into 587 | // 64 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 
588 | TEXT accum64<>(SB), NOSPLIT, $0-0 589 | ACCUMO(0, X8) 590 | ACCUMO(8, X9) 591 | ACCUMO(16, X10) 592 | ACCUMO(24, X11) 593 | ACCUMO(32, X12) 594 | ACCUMO(40, X13) 595 | ACCUMO(48, X14) 596 | ACCUMO(56, X15) 597 | RET 598 | 599 | // func count8sse2(counts *[8]int, buf []uint8) 600 | TEXT ·count8sse2(SB), 0, $0-32 601 | MOVQ counts+0(FP), DI 602 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 603 | MOVQ buf_len+16(FP), CX // CX = len(buf) 604 | MOVQ $accum8<>(SB), BX 605 | CALL countsse2<>(SB) 606 | RET 607 | 608 | // func count16sse2(counts *[16]int, buf []uint16) 609 | TEXT ·count16sse2(SB), 0, $0-32 610 | MOVQ counts+0(FP), DI 611 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 612 | MOVQ buf_len+16(FP), CX // CX = len(buf) 613 | MOVQ $accum16<>(SB), BX 614 | SHLQ $1, CX // count in bytes 615 | CALL countsse2<>(SB) 616 | RET 617 | 618 | // func count32sse2(counts *[32]int, buf []uint32) 619 | TEXT ·count32sse2(SB), 0, $0-32 620 | MOVQ counts+0(FP), DI 621 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 622 | MOVQ buf_len+16(FP), CX // CX = len(buf) 623 | MOVQ $accum32<>(SB), BX 624 | SHLQ $2, CX // count in bytes 625 | CALL countsse2<>(SB) 626 | RET 627 | 628 | // func count64sse2(counts *[64]int, buf []uint64) 629 | TEXT ·count64sse2(SB), 0, $0-32 630 | MOVQ counts+0(FP), DI 631 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 632 | MOVQ buf_len+16(FP), CX // CX = len(buf) 633 | MOVQ $accum64<>(SB), BX 634 | SHLQ $3, CX // count in bytes 635 | CALL countsse2<>(SB) 636 | RET 637 | -------------------------------------------------------------------------------- /dispatch.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2024 Robert Clausecker 2 | 3 | // Positional population counts. 4 | // 5 | // This package contains a set of functions to compute positional 6 | // population counts for arrays of uint8, uint16, uint32, or uint64. 7 | // Optimised assembly implementations are provided for amd64 (AVX-512, 8 | // AVX2, SSE2), 386 (AVX2, SSE2), and ARM64 (NEON). An optimal 9 | // implementation constrained by the instruction set extensions 10 | // available on your CPU is chosen automatically at runtime. If no 11 | // assembly implementation exists, a generic fallback implementation 12 | // will be used. The pospop package thus works on all architectures 13 | // supported by the Go toolchain. 14 | // 15 | // The kernels work on a block size of 240, 480, or 960 bytes. A 16 | // buffer size that is a multiple of 64 bytes and at least 10 kB in size 17 | // is recommended. The author's benchmarks show that a buffer size 18 | // around 100 kB appears optimal. 19 | // 20 | // See the example on the Count8 function for what the positional 21 | // population count operation does. 22 | package pospop 23 | 24 | import "unsafe" 25 | 26 | // each platform must provide arrays count8funcs, count16funcs, 27 | // count32funcs, and count64funcs of type count8impl, ... listing 28 | // the available implementations. The member available indicates that 29 | // the function would run on this machine. The dispatch code picks the 30 | // lowest-numbered function in the array for which available is true. 31 | // The generic implementation should be available under all 32 | // circumstances so it can be run by the unit tests. The name field 33 | // should be the name of the implementation and should not repeat the 34 | // "count#" prefix.
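// Illustration: a hypothetical select file for a new port (the riscv64
// names below are invented for this sketch and are not part of the
// package) would satisfy the contract described above by listing its
// kernels in order of preference, with the generic kernel last. The real
// per-platform tables are in select_386.go, select_amd64.go,
// select_arm64.go, and select_generic.go later in this listing.
//
//	//go:build riscv64
//
//	func count8rvv(counts *[8]int, buf []uint8) // hypothetical vector kernel
//
//	var count8funcs = []count8impl{
//		{count8rvv, "rvv", hasRVV},       // picked first when hasRVV is true
//		{count8generic, "generic", true}, // always available, used by the tests
//	}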
35 | 36 | type count8impl struct { 37 | count8 func(*[8]int, []uint8) 38 | name string 39 | available bool 40 | } 41 | 42 | type count16impl struct { 43 | count16 func(*[16]int, []uint16) 44 | name string 45 | available bool 46 | } 47 | 48 | type count32impl struct { 49 | count32 func(*[32]int, []uint32) 50 | name string 51 | available bool 52 | } 53 | 54 | type count64impl struct { 55 | count64 func(*[64]int, []uint64) 56 | name string 57 | available bool 58 | } 59 | 60 | // optimal count8 implementation selected at runtime 61 | var count8func = func() func(*[8]int, []uint8) { 62 | for _, f := range count8funcs { 63 | if f.available { 64 | return f.count8 65 | } 66 | } 67 | 68 | panic("no implementation of count8 available") 69 | }() 70 | 71 | // optimal count16 implementation selected at runtime 72 | var count16func = func() func(*[16]int, []uint16) { 73 | for _, f := range count16funcs { 74 | if f.available { 75 | return f.count16 76 | } 77 | } 78 | 79 | panic("no implementation of count16 available") 80 | }() 81 | 82 | // optimal count32 implementation selected at runtime 83 | var count32func = func() func(*[32]int, []uint32) { 84 | for _, f := range count32funcs { 85 | if f.available { 86 | return f.count32 87 | } 88 | } 89 | 90 | panic("no implementation of count32 available") 91 | }() 92 | 93 | // optimal count64 implementation selected at runtime 94 | var count64func = func() func(*[64]int, []uint64) { 95 | for _, f := range count64funcs { 96 | if f.available { 97 | return f.count64 98 | } 99 | } 100 | 101 | panic("no implementation of count64 available") 102 | }() 103 | 104 | // Count the number of corresponding set bits of the bytes in str and 105 | // add the results to counts. Each element of counts keeps track of a 106 | // different place; counts[0] for 0x01, counts[1] for 0x02, and so on to 107 | // counts[7] for 0x80. 108 | func CountString(counts *[8]int, str string) { 109 | buf := unsafe.Slice(unsafe.StringData(str), len(str)) 110 | count8func(counts, buf) 111 | } 112 | 113 | // Count the number of corresponding set bits of the bytes in buf and 114 | // add the results to counts. Each element of counts keeps track of a 115 | // different place; counts[0] for 0x01, counts[1] for 0x02, and so on to 116 | // counts[7] for 0x80. 117 | func Count8(counts *[8]int, buf []uint8) { 118 | count8func(counts, buf) 119 | } 120 | 121 | // Count the number of corresponding set bits of the values in buf and 122 | // add the results to counts. Each element of counts keeps track of a 123 | // different place; counts[0] for 0x0001, counts[1] for 0x0002, and so 124 | // on to counts[15] for 0x8000. 125 | func Count16(counts *[16]int, buf []uint16) { 126 | count16func(counts, buf) 127 | } 128 | 129 | // Count the number of corresponding set bits of the values in buf and 130 | // add the results to counts. Each element of counts keeps track of a 131 | // different place; counts[0] for 0x0000001, counts[1] for 0x00000002, 132 | // and so on to counts[31] for 0x80000000. 133 | func Count32(counts *[32]int, buf []uint32) { 134 | count32func(counts, buf) 135 | } 136 | 137 | // Count the number of corresponding set bits of the values in buf and 138 | // add the results to counts. Each element of counts keeps track of a 139 | // different place; counts[0] for 0x000000000000001, counts[1] for 140 | // 0x0000000000000002, and so on to counts[63] for 0x8000000000000000. 
141 | func Count64(counts *[64]int, buf []uint64) { 142 | count64func(counts, buf) 143 | } 144 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "fmt" 6 | 7 | // This example illustrates the positional population count operation. 8 | // For each number in the input, Count8() checks which of its bits are 9 | // set and increments the corresponding counters. In this example, 10 | // four numbers (1, 3, 5, 9) have bit 0 set; three numbers (2, 3, 6) 11 | // have bit 1 set, two numbers (5, 6) have bit 2 set and only the number 12 | // 9 has bit 3 set. 13 | func ExampleCount8() { 14 | var counts [8]int 15 | numbers := []uint8{ 16 | 1, // bit 0 set 17 | 2, // bit 1 set 18 | 3, // bits 0 and 1 set 19 | 5, // bits 0 and 2 set 20 | 6, // bits 1 and 2 set 21 | 9, // bits 0 and 3 set 22 | } 23 | 24 | Count8(&counts, numbers) 25 | fmt.Println(counts) 26 | // Output: [4 3 2 1 0 0 0 0] 27 | } 28 | -------------------------------------------------------------------------------- /generic.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | // 8-bit full adder 6 | func csa8(a, b, c uint8) (c_out, s uint8) { 7 | s_ab := a ^ b 8 | c_ab := a & b 9 | 10 | s = s_ab ^ c 11 | c_out = c_ab | s_ab&c 12 | 13 | return 14 | } 15 | 16 | // count8 generic implementation. Uses the same CSA15 17 | // kernel as the vectorised implementations. 18 | func count8generic(counts *[8]int, buf []uint8) { 19 | var i int 20 | 21 | for i = 0; i < len(buf)-14; i += 15 { 22 | b0, a0 := csa8(buf[i+0], buf[i+1], buf[i+2]) 23 | b1, a1 := csa8(buf[i+3], buf[i+4], buf[i+5]) 24 | b2, a2 := csa8(a0, a1, buf[i+6]) 25 | c0, b3 := csa8(b0, b1, b2) 26 | b4, a3 := csa8(a2, buf[i+7], buf[i+8]) 27 | b5, a4 := csa8(a3, buf[i+9], buf[i+10]) 28 | c1, b6 := csa8(b3, b4, b5) 29 | b7, a5 := csa8(a4, buf[i+11], buf[i+12]) 30 | b8, a := csa8(a5, buf[i+13], buf[i+14]) 31 | c2, b := csa8(b6, b7, b8) 32 | d, c := csa8(c0, c1, c2) 33 | 34 | // d:c:b:a now holds the counters 35 | 36 | ba0 := a&0x55 | b<<1&0xaa 37 | ba1 := a>>1&0x55 | b&0xaa 38 | dc0 := c&0x55 | d<<1&0xaa 39 | dc1 := c>>1&0x55 | d&0xaa 40 | 41 | dcba0 := ba0&0x33 | dc0<<2&0xcc 42 | dcba1 := ba0>>2&0x33 | dc0&0xcc 43 | dcba2 := ba1&0x33 | dc1<<2&0xcc 44 | dcba3 := ba1>>2&0x33 | dc1&0xcc 45 | 46 | // add to counters 47 | counts[0] += int(dcba0 & 0x0f) 48 | counts[1] += int(dcba2 & 0x0f) 49 | counts[2] += int(dcba1 & 0x0f) 50 | counts[3] += int(dcba3 & 0x0f) 51 | counts[4] += int(dcba0 >> 4) 52 | counts[5] += int(dcba2 >> 4) 53 | counts[6] += int(dcba1 >> 4) 54 | counts[7] += int(dcba3 >> 4) 55 | } 56 | 57 | // count8safe() manually inlined 58 | for ; i < len(buf); i++ { 59 | for j := 0; j < 8; j++ { 60 | counts[j] += int(buf[i] >> j & 1) 61 | } 62 | } 63 | } 64 | 65 | // 16-bit full adder 66 | func csa16(a, b, c uint16) (c_out, s uint16) { 67 | s_ab := a ^ b 68 | c_ab := a & b 69 | 70 | s = s_ab ^ c 71 | c_out = c_ab | s_ab&c 72 | 73 | return 74 | } 75 | 76 | // count16 generic implementation. Uses the same CSA15 77 | // kernel as the vectorised implementations. 
78 | func count16generic(counts *[16]int, buf []uint16) { 79 | var i int 80 | 81 | for i = 0; i < len(buf)-14; i += 15 { 82 | b0, a0 := csa16(buf[i+0], buf[i+1], buf[i+2]) 83 | b1, a1 := csa16(buf[i+3], buf[i+4], buf[i+5]) 84 | b2, a2 := csa16(a0, a1, buf[i+6]) 85 | c0, b3 := csa16(b0, b1, b2) 86 | b4, a3 := csa16(a2, buf[i+7], buf[i+8]) 87 | b5, a4 := csa16(a3, buf[i+9], buf[i+10]) 88 | c1, b6 := csa16(b3, b4, b5) 89 | b7, a5 := csa16(a4, buf[i+11], buf[i+12]) 90 | b8, a6 := csa16(a5, buf[i+13], buf[i+14]) 91 | c2, b9 := csa16(b6, b7, b8) 92 | d0, c3 := csa16(c0, c1, c2) 93 | 94 | // d:c:b:a now holds the counters 95 | a := uint(a6) 96 | b := uint(b9) 97 | c := uint(c3) 98 | d := uint(d0) 99 | 100 | ba0 := a&0x5555 | b<<1&0xaaaa 101 | ba1 := a>>1&0x5555 | b&0xaaaa 102 | dc0 := c&0x5555 | d<<1&0xaaaa 103 | dc1 := c>>1&0x5555 | d&0xaaaa 104 | 105 | dcba0 := ba0&0x3333 | dc0<<2&0xcccc 106 | dcba1 := ba0>>2&0x3333 | dc0&0xcccc 107 | dcba2 := ba1&0x3333 | dc1<<2&0xcccc 108 | dcba3 := ba1>>2&0x3333 | dc1&0xcccc 109 | 110 | // add to counters 111 | counts[0] += int(dcba0 & 0x0f) 112 | counts[1] += int(dcba2 & 0x0f) 113 | counts[2] += int(dcba1 & 0x0f) 114 | counts[3] += int(dcba3 & 0x0f) 115 | counts[4] += int(dcba0 >> 4 & 0x0f) 116 | counts[5] += int(dcba2 >> 4 & 0x0f) 117 | counts[6] += int(dcba1 >> 4 & 0x0f) 118 | counts[7] += int(dcba3 >> 4 & 0x0f) 119 | counts[8] += int(dcba0 >> 8 & 0x0f) 120 | counts[9] += int(dcba2 >> 8 & 0x0f) 121 | counts[10] += int(dcba1 >> 8 & 0x0f) 122 | counts[11] += int(dcba3 >> 8 & 0x0f) 123 | counts[12] += int(dcba0 >> 12) 124 | counts[13] += int(dcba2 >> 12) 125 | counts[14] += int(dcba1 >> 12) 126 | counts[15] += int(dcba3 >> 12) 127 | } 128 | 129 | // count16safe() manually inlined 130 | for ; i < len(buf); i++ { 131 | for j := 0; j < 16; j++ { 132 | counts[j] += int(buf[i] >> j & 1) 133 | } 134 | } 135 | } 136 | 137 | // 32-bit full adder 138 | func csa32(a, b, c uint32) (c_out, s uint32) { 139 | s_ab := a ^ b 140 | c_ab := a & b 141 | 142 | s = s_ab ^ c 143 | c_out = c_ab | s_ab&c 144 | 145 | return 146 | } 147 | 148 | // count32 generic implementation. Uses the same CSA15 149 | // kernel as the vectorised implementations. 
150 | func count32generic(counts *[32]int, buf []uint32) { 151 | var i int 152 | 153 | for i = 0; i < len(buf)-14; i += 15 { 154 | b0, a0 := csa32(buf[i+0], buf[i+1], buf[i+2]) 155 | b1, a1 := csa32(buf[i+3], buf[i+4], buf[i+5]) 156 | b2, a2 := csa32(a0, a1, buf[i+6]) 157 | c0, b3 := csa32(b0, b1, b2) 158 | b4, a3 := csa32(a2, buf[i+7], buf[i+8]) 159 | b5, a4 := csa32(a3, buf[i+9], buf[i+10]) 160 | c1, b6 := csa32(b3, b4, b5) 161 | b7, a5 := csa32(a4, buf[i+11], buf[i+12]) 162 | b8, a := csa32(a5, buf[i+13], buf[i+14]) 163 | c2, b := csa32(b6, b7, b8) 164 | d, c := csa32(c0, c1, c2) 165 | 166 | // d:c:b:a now holds the counters 167 | 168 | ba0 := a&0x55555555 | b<<1&0xaaaaaaaa 169 | ba1 := a>>1&0x55555555 | b&0xaaaaaaaa 170 | dc0 := c&0x55555555 | d<<1&0xaaaaaaaa 171 | dc1 := c>>1&0x55555555 | d&0xaaaaaaaa 172 | 173 | dcba0 := ba0&0x33333333 | dc0<<2&0xcccccccc 174 | dcba1 := ba0>>2&0x33333333 | dc0&0xcccccccc 175 | dcba2 := ba1&0x33333333 | dc1<<2&0xcccccccc 176 | dcba3 := ba1>>2&0x33333333 | dc1&0xcccccccc 177 | 178 | // add to counters 179 | counts[0] += int(dcba0 & 0x0f) 180 | counts[1] += int(dcba2 & 0x0f) 181 | counts[2] += int(dcba1 & 0x0f) 182 | counts[3] += int(dcba3 & 0x0f) 183 | counts[4] += int(dcba0 >> 4 & 0x0f) 184 | counts[5] += int(dcba2 >> 4 & 0x0f) 185 | counts[6] += int(dcba1 >> 4 & 0x0f) 186 | counts[7] += int(dcba3 >> 4 & 0x0f) 187 | counts[8] += int(dcba0 >> 8 & 0x0f) 188 | counts[9] += int(dcba2 >> 8 & 0x0f) 189 | counts[10] += int(dcba1 >> 8 & 0x0f) 190 | counts[11] += int(dcba3 >> 8 & 0x0f) 191 | counts[12] += int(dcba0 >> 12 & 0x0f) 192 | counts[13] += int(dcba2 >> 12 & 0x0f) 193 | counts[14] += int(dcba1 >> 12 & 0x0f) 194 | counts[15] += int(dcba3 >> 12 & 0x0f) 195 | counts[16] += int(dcba0 >> 16 & 0x0f) 196 | counts[17] += int(dcba2 >> 16 & 0x0f) 197 | counts[18] += int(dcba1 >> 16 & 0x0f) 198 | counts[19] += int(dcba3 >> 16 & 0x0f) 199 | counts[20] += int(dcba0 >> 20 & 0x0f) 200 | counts[21] += int(dcba2 >> 20 & 0x0f) 201 | counts[22] += int(dcba1 >> 20 & 0x0f) 202 | counts[23] += int(dcba3 >> 20 & 0x0f) 203 | counts[24] += int(dcba0 >> 24 & 0x0f) 204 | counts[25] += int(dcba2 >> 24 & 0x0f) 205 | counts[26] += int(dcba1 >> 24 & 0x0f) 206 | counts[27] += int(dcba3 >> 24 & 0x0f) 207 | counts[28] += int(dcba0 >> 28) 208 | counts[29] += int(dcba2 >> 28) 209 | counts[30] += int(dcba1 >> 28) 210 | counts[31] += int(dcba3 >> 28) 211 | } 212 | 213 | // count32safe() manually inlined 214 | for ; i < len(buf); i++ { 215 | for j := 0; j < 32; j++ { 216 | counts[j] += int(buf[i] >> j & 1) 217 | } 218 | } 219 | } 220 | 221 | // 64-bit full adder 222 | func csa64(a, b, c uint64) (c_out, s uint64) { 223 | s_ab := a ^ b 224 | c_ab := a & b 225 | 226 | s = s_ab ^ c 227 | c_out = c_ab | s_ab&c 228 | 229 | return 230 | } 231 | 232 | // count64 generic implementation. Uses the same CSA15 233 | // kernel as the vectorised implementations. 
234 | func count64generic(counts *[64]int, buf []uint64) { 235 | var i int 236 | 237 | for i = 0; i < len(buf)-14; i += 15 { 238 | b0, a0 := csa64(buf[i+0], buf[i+1], buf[i+2]) 239 | b1, a1 := csa64(buf[i+3], buf[i+4], buf[i+5]) 240 | b2, a2 := csa64(a0, a1, buf[i+6]) 241 | c0, b3 := csa64(b0, b1, b2) 242 | b4, a3 := csa64(a2, buf[i+7], buf[i+8]) 243 | b5, a4 := csa64(a3, buf[i+9], buf[i+10]) 244 | c1, b6 := csa64(b3, b4, b5) 245 | b7, a5 := csa64(a4, buf[i+11], buf[i+12]) 246 | b8, a := csa64(a5, buf[i+13], buf[i+14]) 247 | c2, b := csa64(b6, b7, b8) 248 | d, c := csa64(c0, c1, c2) 249 | 250 | // d:c:b:a now holds the counters 251 | 252 | ba0 := a&0x5555555555555555 | b<<1&0xaaaaaaaaaaaaaaaa 253 | ba1 := a>>1&0x5555555555555555 | b&0xaaaaaaaaaaaaaaaa 254 | dc0 := c&0x5555555555555555 | d<<1&0xaaaaaaaaaaaaaaaa 255 | dc1 := c>>1&0x5555555555555555 | d&0xaaaaaaaaaaaaaaaa 256 | 257 | dcba0 := ba0&0x3333333333333333 | dc0<<2&0xcccccccccccccccc 258 | dcba1 := ba0>>2&0x3333333333333333 | dc0&0xcccccccccccccccc 259 | dcba2 := ba1&0x3333333333333333 | dc1<<2&0xcccccccccccccccc 260 | dcba3 := ba1>>2&0x3333333333333333 | dc1&0xcccccccccccccccc 261 | 262 | // split counters for better performance on 32 bit systems 263 | dcba0l := uint(uint32(dcba0)) 264 | dcba0h := uint(dcba0 >> 32) 265 | dcba1l := uint(uint32(dcba1)) 266 | dcba1h := uint(dcba1 >> 32) 267 | dcba2l := uint(uint32(dcba2)) 268 | dcba2h := uint(dcba2 >> 32) 269 | dcba3l := uint(uint32(dcba3)) 270 | dcba3h := uint(dcba3 >> 32) 271 | 272 | // add to counters 273 | counts[0] += int(dcba0l & 0x0f) 274 | counts[1] += int(dcba2l & 0x0f) 275 | counts[2] += int(dcba1l & 0x0f) 276 | counts[3] += int(dcba3l & 0x0f) 277 | counts[4] += int(dcba0l >> 4 & 0x0f) 278 | counts[5] += int(dcba2l >> 4 & 0x0f) 279 | counts[6] += int(dcba1l >> 4 & 0x0f) 280 | counts[7] += int(dcba3l >> 4 & 0x0f) 281 | counts[8] += int(dcba0l >> 8 & 0x0f) 282 | counts[9] += int(dcba2l >> 8 & 0x0f) 283 | counts[10] += int(dcba1l >> 8 & 0x0f) 284 | counts[11] += int(dcba3l >> 8 & 0x0f) 285 | counts[12] += int(dcba0l >> 12 & 0x0f) 286 | counts[13] += int(dcba2l >> 12 & 0x0f) 287 | counts[14] += int(dcba1l >> 12 & 0x0f) 288 | counts[15] += int(dcba3l >> 12 & 0x0f) 289 | counts[16] += int(dcba0l >> 16 & 0x0f) 290 | counts[17] += int(dcba2l >> 16 & 0x0f) 291 | counts[18] += int(dcba1l >> 16 & 0x0f) 292 | counts[19] += int(dcba3l >> 16 & 0x0f) 293 | counts[20] += int(dcba0l >> 20 & 0x0f) 294 | counts[21] += int(dcba2l >> 20 & 0x0f) 295 | counts[22] += int(dcba1l >> 20 & 0x0f) 296 | counts[23] += int(dcba3l >> 20 & 0x0f) 297 | counts[24] += int(dcba0l >> 24 & 0x0f) 298 | counts[25] += int(dcba2l >> 24 & 0x0f) 299 | counts[26] += int(dcba1l >> 24 & 0x0f) 300 | counts[27] += int(dcba3l >> 24 & 0x0f) 301 | counts[28] += int(dcba0l >> 28) 302 | counts[29] += int(dcba2l >> 28) 303 | counts[30] += int(dcba1l >> 28) 304 | counts[31] += int(dcba3l >> 28) 305 | 306 | counts[32] += int(dcba0h & 0x0f) 307 | counts[33] += int(dcba2h & 0x0f) 308 | counts[34] += int(dcba1h & 0x0f) 309 | counts[35] += int(dcba3h & 0x0f) 310 | counts[36] += int(dcba0h >> 4 & 0x0f) 311 | counts[37] += int(dcba2h >> 4 & 0x0f) 312 | counts[38] += int(dcba1h >> 4 & 0x0f) 313 | counts[39] += int(dcba3h >> 4 & 0x0f) 314 | counts[40] += int(dcba0h >> 8 & 0x0f) 315 | counts[41] += int(dcba2h >> 8 & 0x0f) 316 | counts[42] += int(dcba1h >> 8 & 0x0f) 317 | counts[43] += int(dcba3h >> 8 & 0x0f) 318 | counts[44] += int(dcba0h >> 12 & 0x0f) 319 | counts[45] += int(dcba2h >> 12 & 0x0f) 320 | counts[46] += int(dcba1h >> 12 & 0x0f) 
321 | counts[47] += int(dcba3h >> 12 & 0x0f) 322 | counts[48] += int(dcba0h >> 16 & 0x0f) 323 | counts[49] += int(dcba2h >> 16 & 0x0f) 324 | counts[50] += int(dcba1h >> 16 & 0x0f) 325 | counts[51] += int(dcba3h >> 16 & 0x0f) 326 | counts[52] += int(dcba0h >> 20 & 0x0f) 327 | counts[53] += int(dcba2h >> 20 & 0x0f) 328 | counts[54] += int(dcba1h >> 20 & 0x0f) 329 | counts[55] += int(dcba3h >> 20 & 0x0f) 330 | counts[56] += int(dcba0h >> 24 & 0x0f) 331 | counts[57] += int(dcba2h >> 24 & 0x0f) 332 | counts[58] += int(dcba1h >> 24 & 0x0f) 333 | counts[59] += int(dcba3h >> 24 & 0x0f) 334 | counts[60] += int(dcba0h >> 28) 335 | counts[61] += int(dcba2h >> 28) 336 | counts[62] += int(dcba1h >> 28) 337 | counts[63] += int(dcba3h >> 28) 338 | } 339 | 340 | // count64safe() manually inlined 341 | for ; i < len(buf); i++ { 342 | for j := 0; j < 64; j++ { 343 | counts[j] += int(buf[i] >> j & 1) 344 | } 345 | } 346 | } 347 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/clausecker/pospop 2 | 3 | go 1.20 4 | 5 | require golang.org/x/sys v0.0.0-20200929083018-4d22bbb62b3c 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/sys v0.0.0-20200929083018-4d22bbb62b3c h1:/h0vtH0PyU0xAoZJVcRw1k0Ng+U0JAy3QDiFmppIlIE= 2 | golang.org/x/sys v0.0.0-20200929083018-4d22bbb62b3c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 3 | -------------------------------------------------------------------------------- /minimize_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021, 2022 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "fmt" 6 | import "strings" 7 | 8 | const ( 9 | // max number of entries in a test case 10 | maxTestcaseSize = 100 11 | ) 12 | 13 | // Take a count64 function and a test case and return true if the 14 | // test case is processed correctly. 15 | func testPasses64(count64 func(*[64]int, []uint64), buf []uint64) bool { 16 | var counts, refCounts [64]int 17 | 18 | count64(&counts, buf) 19 | count64safe(&refCounts, buf) 20 | 21 | return counts == refCounts 22 | } 23 | 24 | // Take a failing test case for testCount64 and try to find the 25 | // smallest possible test case to trigger the error. This is done 26 | // by repeatedly clearing bits that do not cause the test case to 27 | // pass when cleared. An attempt is also made to reduce the length 28 | // of the test case. This function modifies its argument and 29 | // returns a subslice of it. 30 | func minimizeTestcase64(count64 func(*[64]int, []uint64), tc []uint64) []uint64 { 31 | // sanity check 32 | if testPasses64(count64, tc) { 33 | return nil 34 | } 35 | 36 | // try to turn off bits 37 | for i := len(tc) - 1; i >= 0; i-- { 38 | for j := 63; j >= 0; j-- { 39 | if tc[i]&(1<<j) == 0 { 40 | continue 41 | } 42 | 43 | tc[i] &^= 1 << j 44 | if testPasses64(count64, tc) { 45 | tc[i] |= 1 << j // bit is needed to trigger the failure 46 | } 47 | } 48 | } 49 | 50 | // try to reduce the length of the test case 51 | for len(tc) > 0 && !testPasses64(count64, tc[:len(tc)-1]) { 52 | tc = tc[:len(tc)-1] 53 | } 54 | 55 | return tc 56 | } 57 | 58 | // build a string representation of the minimised test case if it is 59 | // not too long. If it is too long, return the empty string.
60 | func testcaseString64(tc []uint64) string { 61 | if len(tc) == 0 { 62 | return "\tvar buf [0]uint64" 63 | } 64 | 65 | var w strings.Builder 66 | entries := 0 67 | fmt.Fprintf(&w, "\tvar buf [%d]uint64 // %p\n", len(tc), &tc[0]) 68 | for i := range tc { 69 | if tc[i] == 0 { 70 | continue 71 | } 72 | 73 | entries++ 74 | if entries > maxTestcaseSize { 75 | return "" 76 | } 77 | 78 | fmt.Fprintf(&w, "\tbuf[%d] = %#016x\n", i, tc[i]) 79 | } 80 | 81 | return w.String() 82 | } 83 | -------------------------------------------------------------------------------- /overflow_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "testing" 6 | 7 | // Check if we can get the accumulators to overflow 8 | func TestOverflow(t *testing.T) { 9 | for i := range count64funcs { 10 | t.Run(count64funcs[i].name, func(tt *testing.T) { 11 | if !count64funcs[i].available { 12 | tt.SkipNow() 13 | } 14 | 15 | testOverflow(tt, count64funcs[i].count64) 16 | }) 17 | } 18 | } 19 | 20 | func testOverflow(t *testing.T, count64 func(*[64]int, []uint64)) { 21 | const imax = 16 22 | const jmax = 16 23 | var buf [imax*65536 + jmax]uint64 24 | 25 | for i := range buf { 26 | buf[i] = ^uint64(0) 27 | } 28 | 29 | for i := 1; i <= imax; i++ { 30 | for j := -jmax; j <= jmax; j++ { 31 | testOverflowBuf(t, count64, buf[:i * 65536 + j]) 32 | } 33 | } 34 | } 35 | 36 | func testOverflowBuf(t *testing.T, count64 func(*[64]int, []uint64), buf []uint64) { 37 | var counts, refCounts [64]int 38 | 39 | for i := range refCounts { 40 | refCounts[i] = len(buf) 41 | } 42 | 43 | count64(&counts, buf) 44 | if counts != refCounts { 45 | t.Errorf("length %d: counts don't match: %v", len(buf), countDiff(counts[:], refCounts[:])) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /overread_test.go: -------------------------------------------------------------------------------- 1 | //go:build unix 2 | 3 | // Copyright (c) 2024 Robert Clausecker 4 | 5 | package pospop 6 | 7 | import ( 8 | "golang.org/x/sys/unix" 9 | "testing" 10 | ) 11 | 12 | // Allocate three pages of memory. Make the first and last page 13 | // inaccessible. Return the full array as well as just the part 14 | // in the middle (which is accessible). 15 | func mapGuarded() (mapping []byte, slice []byte, err error) { 16 | pagesize := unix.Getpagesize() 17 | mapping, err = unix.Mmap(-1, 0, 3*pagesize, unix.PROT_NONE, unix.MAP_ANON|unix.MAP_PRIVATE) 18 | if err != nil { 19 | return nil, nil, err 20 | } 21 | 22 | slice = mapping[pagesize : 2*pagesize : 2*pagesize] 23 | err = unix.Mprotect(slice, unix.PROT_READ|unix.PROT_WRITE) 24 | if err != nil { 25 | unix.Munmap(mapping) 26 | return nil, nil, err 27 | } 28 | 29 | return 30 | } 31 | 32 | // Verify that our count functions only overread memory in benign ways, 33 | // i.e. such that we never cross a page size boundary. 
34 | func TestOverread(t *testing.T) { 35 | for i := range count8funcs { 36 | t.Run(count8funcs[i].name, func(tt *testing.T) { 37 | if !count8funcs[i].available { 38 | tt.SkipNow() 39 | } 40 | 41 | testOverread(tt, count8funcs[i].count8) 42 | }) 43 | } 44 | } 45 | 46 | func testOverread(t *testing.T, count8 func(*[8]int, []uint8)) { 47 | var counters [8]int 48 | 49 | mapping, slice, err := mapGuarded() 50 | defer unix.Munmap(mapping) 51 | if err != nil { 52 | t.Log("Cannot allocate memory:", err) 53 | t.SkipNow() 54 | } 55 | 56 | // test large slices that start/end right at the page boundary 57 | for i := 0; i < 64; i++ { 58 | for j := len(slice) - 64; j <= len(slice); j++ { 59 | count8(&counters, slice[i:j]) 60 | } 61 | } 62 | 63 | // test small slices that start right after the page boundary 64 | for i := 0; i < 64; i++ { 65 | for j := i; j <= 64; j++ { 66 | count8(&counters, slice[i:j]) 67 | } 68 | } 69 | 70 | // test small slices that end right before the page boundary 71 | for i := len(slice) - 64; i <= len(slice); i++ { 72 | for j := i; j <= len(slice); j++ { 73 | count8(&counters, slice[i:j]) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /safe.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | // count8 reference implementation for tests. Do not alter. 6 | func count8safe(counts *[8]int, buf []uint8) { 7 | for i := range buf { 8 | for j := 0; j < 8; j++ { 9 | counts[j] += int(buf[i] >> j & 1) 10 | } 11 | } 12 | } 13 | 14 | // count16 reference implementation for tests. Do not alter. 15 | func count16safe(counts *[16]int, buf []uint16) { 16 | for i := range buf { 17 | for j := 0; j < 16; j++ { 18 | counts[j] += int(buf[i] >> j & 1) 19 | } 20 | } 21 | } 22 | 23 | // count32 reference implementation for tests. Do not alter. 24 | func count32safe(counts *[32]int, buf []uint32) { 25 | for i := range buf { 26 | for j := 0; j < 32; j++ { 27 | counts[j] += int(buf[i] >> j & 1) 28 | } 29 | } 30 | } 31 | 32 | // count64 reference implementation for tests. Do not alter. 
33 | func count64safe(counts *[64]int, buf []uint64) { 34 | for i := range buf { 35 | for j := 0; j < 64; j++ { 36 | counts[j] += int(buf[i] >> j & 1) 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /select_386.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "golang.org/x/sys/cpu" 6 | 7 | func count8avx2(counts *[8]int, buf []byte) 8 | func count8sse2(counts *[8]int, buf []byte) 9 | 10 | func count16avx2(counts *[16]int, buf []uint16) 11 | func count16sse2(counts *[16]int, buf []uint16) 12 | 13 | func count32avx2(counts *[32]int, buf []uint32) 14 | func count32sse2(counts *[32]int, buf []uint32) 15 | 16 | func count64sse2(counts *[64]int, buf []uint64) 17 | func count64avx2(counts *[64]int, buf []uint64) 18 | 19 | var count8funcs = []count8impl{ 20 | {count8avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 21 | {count8sse2, "sse2", cpu.X86.HasSSE2}, 22 | {count8generic, "generic", true}, 23 | } 24 | 25 | var count16funcs = []count16impl{ 26 | {count16avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 27 | {count16sse2, "sse2", cpu.X86.HasSSE2}, 28 | {count16generic, "generic", true}, 29 | } 30 | 31 | var count32funcs = []count32impl{ 32 | {count32avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 33 | {count32sse2, "sse2", cpu.X86.HasSSE2}, 34 | {count32generic, "generic", true}, 35 | } 36 | 37 | var count64funcs = []count64impl{ 38 | {count64avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 39 | {count64sse2, "sse2", cpu.X86.HasSSE2}, 40 | {count64generic, "generic", true}, 41 | } 42 | -------------------------------------------------------------------------------- /select_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "golang.org/x/sys/cpu" 6 | 7 | func count8avx512(counts *[8]int, buf []byte) 8 | func count8avx2(counts *[8]int, buf []byte) 9 | func count8sse2(counts *[8]int, buf []byte) 10 | 11 | func count16avx512(counts *[16]int, buf []uint16) 12 | func count16avx2(counts *[16]int, buf []uint16) 13 | func count16sse2(counts *[16]int, buf []uint16) 14 | 15 | func count32avx512(counts *[32]int, buf []uint32) 16 | func count32avx2(counts *[32]int, buf []uint32) 17 | func count32sse2(counts *[32]int, buf []uint32) 18 | 19 | func count64avx512(counts *[64]int, buf []uint64) 20 | func count64avx2(counts *[64]int, buf []uint64) 21 | func count64sse2(counts *[64]int, buf []uint64) 22 | 23 | var count8funcs = []count8impl{ 24 | {count8avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 25 | {count8avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 26 | {count8sse2, "sse2", cpu.X86.HasSSE2}, 27 | {count8generic, "generic", true}, 28 | } 29 | 30 | var count16funcs = []count16impl{ 31 | {count16avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 32 | {count16avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 33 | {count16sse2, "sse2", cpu.X86.HasSSE2}, 34 | {count16generic, "generic", true}, 35 | } 36 | 37 | var count32funcs = []count32impl{ 38 | {count32avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 39 | {count32avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 40 | {count32sse2, "sse2", cpu.X86.HasSSE2}, 41 | {count32generic, "generic", true}, 42 | } 43 | 44 | var count64funcs = []count64impl{ 45 | {count64avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 46 
| {count64avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 47 | {count64sse2, "sse2", cpu.X86.HasSSE2}, 48 | {count64generic, "generic", true}, 49 | } 50 | -------------------------------------------------------------------------------- /select_arm64.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2024 Robert Clausecker 2 | 3 | package pospop 4 | 5 | func count8neon(counts *[8]int, buf []uint8) 6 | func count16neon(counts *[16]int, buf []uint16) 7 | func count32neon(counts *[32]int, buf []uint32) 8 | func count64neon(counts *[64]int, buf []uint64) 9 | 10 | var count8funcs = []count8impl{ 11 | {count8neon, "neon", true}, 12 | {count8generic, "generic", true}, 13 | } 14 | 15 | var count16funcs = []count16impl{ 16 | {count16neon, "neon", true}, 17 | {count16generic, "generic", true}, 18 | } 19 | 20 | var count32funcs = []count32impl{ 21 | {count32neon, "neon", true}, 22 | {count32generic, "generic", true}, 23 | } 24 | 25 | var count64funcs = []count64impl{ 26 | {count64neon, "neon", true}, 27 | {count64generic, "generic", true}, 28 | } 29 | -------------------------------------------------------------------------------- /select_generic.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2024 Robert Clausecker 2 | 3 | //go:build !386 && !amd64 && !arm64 4 | 5 | package pospop 6 | 7 | // generic variants only 8 | var count8funcs = []count8impl{{count8generic, "generic", true}} 9 | var count16funcs = []count16impl{{count16generic, "generic", true}} 10 | var count32funcs = []count32impl{{count32generic, "generic", true}} 11 | var count64funcs = []count64impl{{count64generic, "generic", true}} 12 | --------------------------------------------------------------------------------
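Usage sketch (hypothetical, not a file from this repository): the package
documentation in dispatch.go recommends buffers that are a multiple of
64 bytes and roughly 100 kB in size. A caller streaming a large file through
Count8 might therefore process it in chunks of, say, 96 KiB; the counters
simply accumulate across calls. The file name and chunk size below are
illustrative only.

    package main

    import (
        "fmt"
        "io"
        "os"

        "github.com/clausecker/pospop"
    )

    func main() {
        f, err := os.Open("data.bin") // illustrative input file
        if err != nil {
            panic(err)
        }
        defer f.Close()

        var counts [8]int
        buf := make([]byte, 96*1024) // multiple of 64 bytes, near the ~100 kB sweet spot
        for {
            n, err := f.Read(buf)
            pospop.Count8(&counts, buf[:n]) // results are added to counts on every call
            if err == io.EOF {
                break
            }
            if err != nil {
                panic(err)
            }
        }

        fmt.Println(counts)
    }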