├── COPYING ├── README ├── bench_test.go ├── count_test.go ├── countavx2_386.s ├── countavx2_amd64.s ├── countavx512_amd64.s ├── countneon_arm64.s ├── countsse2_386.s ├── countsse2_amd64.s ├── dispatch.go ├── example_test.go ├── generic.go ├── go.mod ├── go.sum ├── minimize_test.go ├── overflow_test.go ├── overread_test.go ├── safe.go ├── select_386.go ├── select_amd64.go ├── select_arm64.go └── select_generic.go /COPYING: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020--2024 Robert Clausecker 2 | 3 | Redistribution and use in source and binary forms, with or without 4 | modification, are permitted provided that the following conditions 5 | are met: 6 | 7 | 1. Redistributions of source code must retain the above copyright 8 | notice, this list of conditions and the following disclaimer. 9 | 10 | 2. Redistributions in binary form must reproduce the above copyright 11 | notice, this list of conditions and the following disclaimer in the 12 | documentation and/or other materials provided with the distribution. 13 | 14 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 15 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 16 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 17 | PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 18 | HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 19 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 20 | TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 21 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 22 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 23 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 | -------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | High-performance vectorised positional popcount routines for Go 2 | =============================================================== 3 | 4 | This repository contains implementations of the positional population 5 | count functions for Go. Details on the algorithms used will be 6 | published in a future research paper. 7 | 8 | To use this library, import it as follows: 9 | 10 | import "github.com/clausecker/pospop" 11 | 12 | You can then count populations using the Count8, Count16, Count32, 13 | Count64, and CountString functions: 14 | 15 | var counts [8]int 16 | pospop.Count8(&counts, buf) 17 | 18 | The positional population count for buf is added to the contents of 19 | counts. 20 | 21 | Supported Platforms 22 | ------------------- 23 | 24 | The kernels work on a block size of 240 or 480 bytes (depending on 25 | whether AVX2 is available or not). A buffer size that is a multiple 26 | of 480 bytes and at least 10 kB is recommended. 27 | 28 | Implementations are provided for the following SIMD extensions: 29 | 30 | * AVX-512 F/BW (amd64) 31 | * AVX2 (amd64, 386) 32 | * SSE2 (amd64, 386) 33 | * NEON (arm64) 34 | * generic kernel (all architectures) 35 | 36 | The three kernels for amd64 correspond to the v4, v3, and v1 values 37 | of the GOAMD64 environment variable. However, all kernels are 38 | compiled in regardless of what value GOAMD64 is set to.
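
For reference, the usage described above fits into a complete program as follows. This is only a sketch: the buffer length and contents are arbitrary, but the import path and the functions are the ones named in this README.

    package main

    import (
            "fmt"

            "github.com/clausecker/pospop"
    )

    func main() {
            // Arbitrary test data; a length that is a multiple of 480 bytes
            // keeps the SIMD kernels on their fast path.
            buf := make([]uint8, 32*480)
            for i := range buf {
                    buf[i] = uint8(3 * i)
            }

            // counts[i] accumulates how many elements of buf have bit i set.
            var counts [8]int
            pospop.Count8(&counts, buf)

            // Results are added to the existing contents of counts, so the
            // same array can be reused across several inputs.
            pospop.CountString(&counts, "hello, world")

            fmt.Println(counts)
    }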
39 | 40 | The library automatically chooses the fastest available kernel for 41 | the system it is running on. 42 | 43 | Performance 44 | ----------- 45 | 46 | As all functions (Count8, Count16, Count32, Count64, CountString) of 47 | one set are based on the same kernel with a different accumulation 48 | function, they all perform equally well. This does not apply to the 49 | generic implementations whose performance is therefore given for every 50 | function individually. 51 | 52 | The following performance table is grouped by the instruction set used 53 | and the architecture it runs on. A buffer size of 100 kB was used to 54 | find these results. 55 | 56 | 57 | amd64 386 arm64 arm 58 | avx512 82.1 GB/s --- --- --- 59 | avx2 34.8 GB/s 31.6 GB/s --- --- 60 | sse2 16.0 GB/s 15.6 GB/s --- --- 61 | neon --- --- 36.9 GB/s --- 62 | generic8 1.02 GB/s 297 MB/s 1.68 GB/s 49.0 MB/s 63 | generic16 1.71 GB/s 1.36 GB/s 3.03 GB/s 67.1 MB/s 64 | generic32 2.66 GB/s 2.21 GB/s 3.83 GB/s 105 MB/s 65 | generic64 3.43 GB/s 1.89 GB/s 6.56 GB/s 82.9 MB/s 66 | 67 | The following systems were used for benchmarks, all using Go 1.16: 68 | 69 | * amd64, 386: Intel(R) Xeon(R) Gold 6138 CPU @ 2.00GHz 70 | * arm64: Apple M1 71 | * arm: ARM Cortex-A72 r0p3 (Raspberry Pi 4B) 72 | 73 | Remaining Work 74 | -------------- 75 | 76 | * provide assembly kernels for arm, ppcle, and others 77 | (hardware donations appreciated for further targets) 78 | * provide variants of Count16, Count32, and Count64 working on byte 79 | arrays 80 | 81 | (c) 2020--2024 Robert Clausecker . All Rights Reserved. 82 | 83 | This code is published under a 2-clause BSD license. See the file 84 | COPYING for details. 85 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2021 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "math/rand" 6 | import "testing" 7 | import "strconv" 8 | 9 | // sizes to benchmark 10 | var benchmarkLengths = []int{ 11 | 1, 10, 100, 1000, 10 * 1000, 100 * 1000, 1000 * 1000, 10 * 1000 * 1000, 100 * 1000 * 1000, 12 | } 13 | 14 | // sizes to benchmark in a short benchmark 15 | var benchmarkLengthsShort = []int{100 * 1000} 16 | 17 | // benchmark a count8 implementation 18 | func benchmarkCount8(b *testing.B, buf []uint8, lengths []int, count8 func(*[8]int, []uint8)) { 19 | for _, l := range lengths { 20 | b.Run(strconv.Itoa(l)+"B", func(b *testing.B) { 21 | var counts [8]int 22 | testbuf := buf[:l] 23 | b.SetBytes(int64(l) * 1) 24 | for i := 0; i < b.N; i++ { 25 | count8(&counts, testbuf) 26 | } 27 | }) 28 | } 29 | } 30 | 31 | // benchmark all Count8 implementations 32 | func BenchmarkCount8(b *testing.B) { 33 | funcs := count8funcs 34 | lengths := benchmarkLengths 35 | 36 | // short benchmark: only test the implementation 37 | // actually used and keep it to one size 38 | if testing.Short() { 39 | funcs = []count8impl{{Count8, "dispatch", true}} 40 | lengths = benchmarkLengthsShort 41 | } 42 | 43 | maxlen := lengths[len(lengths)-1] 44 | buf := make([]uint8, maxlen) 45 | rand.Read(buf) 46 | 47 | for _, impl := range funcs { 48 | b.Run(impl.name, func(bb *testing.B) { 49 | if !impl.available { 50 | bb.SkipNow() 51 | } 52 | 53 | benchmarkCount8(bb, buf, lengths, impl.count8) 54 | }) 55 | } 56 | } 57 | 58 | // benchmark a count16 implementation 59 | func benchmarkCount16(b *testing.B, buf []uint16, lengths []int, count16 func(*[16]int, []uint16)) { 60 | for _, l := range 
lengths { 61 | b.Run(strconv.Itoa(l), func(b *testing.B) { 62 | var counts [16]int 63 | testbuf := buf[:l/2] 64 | b.SetBytes(int64(l)) 65 | for i := 0; i < b.N; i++ { 66 | count16(&counts, testbuf) 67 | } 68 | }) 69 | } 70 | } 71 | 72 | // benchmark all Count16 implementations 73 | func BenchmarkCount16(b *testing.B) { 74 | funcs := count16funcs 75 | lengths := benchmarkLengths 76 | 77 | // short benchmark: only test the implementation 78 | // actually used and keep it to one size 79 | if testing.Short() { 80 | funcs = []count16impl{{Count16, "dispatch", true}} 81 | lengths = benchmarkLengthsShort 82 | } 83 | 84 | maxlen := lengths[len(lengths)-1] / 2 85 | buf := make([]uint16, maxlen) 86 | for i := range buf { 87 | buf[i] = uint16(rand.Int63()) 88 | } 89 | 90 | for _, impl := range funcs { 91 | b.Run(impl.name, func(bb *testing.B) { 92 | if !impl.available { 93 | bb.SkipNow() 94 | } 95 | 96 | benchmarkCount16(bb, buf, lengths, impl.count16) 97 | }) 98 | } 99 | } 100 | 101 | // benchmark a count32 implementation 102 | func benchmarkCount32(b *testing.B, buf []uint32, lengths []int, count32 func(*[32]int, []uint32)) { 103 | for _, l := range lengths { 104 | b.Run(strconv.Itoa(l), func(b *testing.B) { 105 | var counts [32]int 106 | testbuf := buf[:l/4] 107 | b.SetBytes(int64(l)) 108 | for i := 0; i < b.N; i++ { 109 | count32(&counts, testbuf) 110 | } 111 | }) 112 | } 113 | } 114 | 115 | // benchmark all Count32 implementations 116 | func BenchmarkCount32(b *testing.B) { 117 | funcs := count32funcs 118 | lengths := benchmarkLengths 119 | 120 | // short benchmark: only test the implementation 121 | // actually used and keep it to one size 122 | if testing.Short() { 123 | funcs = []count32impl{{Count32, "dispatch", true}} 124 | lengths = benchmarkLengthsShort 125 | } 126 | 127 | maxlen := lengths[len(lengths)-1] / 4 128 | buf := make([]uint32, maxlen) 129 | for i := range buf { 130 | buf[i] = uint32(rand.Int63()) 131 | } 132 | 133 | for _, impl := range funcs { 134 | b.Run(impl.name, func(bb *testing.B) { 135 | if !impl.available { 136 | bb.SkipNow() 137 | } 138 | 139 | benchmarkCount32(bb, buf, lengths, impl.count32) 140 | }) 141 | } 142 | } 143 | 144 | // benchmark a count64 implementation 145 | func benchmarkCount64(b *testing.B, buf []uint64, lengths []int, count64 func(*[64]int, []uint64)) { 146 | for _, l := range lengths { 147 | b.Run(strconv.Itoa(l), func(b *testing.B) { 148 | var counts [64]int 149 | testbuf := buf[:l/8] 150 | b.SetBytes(int64(l)) 151 | for i := 0; i < b.N; i++ { 152 | count64(&counts, testbuf) 153 | } 154 | }) 155 | } 156 | } 157 | 158 | // benchmark all Count64 implementations 159 | func BenchmarkCount64(b *testing.B) { 160 | funcs := count64funcs 161 | lengths := benchmarkLengths 162 | 163 | // short benchmark: only test the implementation 164 | // actually used and keep it to one size 165 | if testing.Short() { 166 | funcs = []count64impl{{Count64, "dispatch", true}} 167 | lengths = benchmarkLengthsShort 168 | } 169 | 170 | maxlen := lengths[len(lengths)-1] / 8 171 | buf := make([]uint64, maxlen) 172 | for i := range buf { 173 | buf[i] = rand.Uint64() 174 | } 175 | 176 | for _, impl := range funcs { 177 | b.Run(impl.name, func(bb *testing.B) { 178 | if !impl.available { 179 | bb.SkipNow() 180 | } 181 | 182 | benchmarkCount64(bb, buf, lengths, impl.count64) 183 | }) 184 | } 185 | } 186 | -------------------------------------------------------------------------------- /count_test.go: -------------------------------------------------------------------------------- 1 | 
// Copyright (c) 2020--2022, 2024 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import ( 6 | "math/rand" 7 | "testing" 8 | ) 9 | 10 | // standard test lengths to try 11 | var testLengths = []int{ 12 | 0, 1, 2, 3, 13 | 4, 5, 6, 7, 14 | 8, 9, 10, 11, 15 | 12, 13, 14, 15, 16 | 16, 17, 18, 19, 17 | 31, 32, 33, 18 | 63, 64, 65, 19 | 95, 97, 98, 20 | 119, 120, 121, 21 | 239, 240, 241, 22 | 2*240 - 1, 2 * 240, 2*240 + 1, 23 | 4*240 - 1, 4 * 240, 4*240 + 1, 24 | 1023, 1024, 1025, 25 | (15 + 16) * 8, (15 + 16) * 16, (15 + 16) * 32, (15 + 16) * 64, 26 | 27 | // long length to trigger counter overflow 28 | (255*16 + 15) * 64, 29 | } 30 | 31 | // minimizing the failure causes timeout for long test cases 32 | const minimizationThreshold = (15 + 16) * 64 33 | 34 | // fill counts with random integers 35 | func randomCounts(counts []int) { 36 | for i := range counts { 37 | counts[i] = rand.Int() 38 | } 39 | } 40 | 41 | // compute the difference in length between two equally long integers slices. 42 | func countDiff(a []int, b []int) []int { 43 | res := make([]int, len(a)) 44 | 45 | for i := range a { 46 | res[i] = b[i] - a[i] 47 | } 48 | 49 | return res 50 | } 51 | 52 | // test the correctness of a count8 implementation 53 | func testCount8(t *testing.T, count8 func(*[8]int, []uint8)) { 54 | for _, len := range testLengths { 55 | buf := make([]uint8, len+1) 56 | buf = buf[1 : len+1] // ensure misalignment 57 | for i := range buf { 58 | buf[i] = uint8(rand.Int63()) 59 | } 60 | 61 | var counts [8]int 62 | randomCounts(counts[:]) 63 | refCounts := counts 64 | 65 | count8(&counts, buf) 66 | count8safe(&refCounts, buf) 67 | 68 | if counts != refCounts { 69 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 70 | } 71 | } 72 | } 73 | 74 | // test the correctness of a count16 implementation 75 | func testCount16(t *testing.T, count16 func(*[16]int, []uint16)) { 76 | for _, len := range testLengths { 77 | buf := make([]uint16, len+1) 78 | buf = buf[1 : len+1] // ensure misalignment 79 | for i := range buf { 80 | buf[i] = uint16(rand.Int63()) 81 | } 82 | 83 | var counts [16]int 84 | randomCounts(counts[:]) 85 | refCounts := counts 86 | 87 | count16(&counts, buf) 88 | count16safe(&refCounts, buf) 89 | 90 | if counts != refCounts { 91 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 92 | } 93 | } 94 | } 95 | 96 | // test the correctness of a count32 implementation 97 | func testCount32(t *testing.T, count32 func(*[32]int, []uint32)) { 98 | for _, len := range testLengths { 99 | buf := make([]uint32, len+1) 100 | buf = buf[1 : len+1] // ensure misalignment 101 | for i := range buf { 102 | buf[i] = rand.Uint32() 103 | } 104 | 105 | var counts [32]int 106 | randomCounts(counts[:]) 107 | refCounts := counts 108 | 109 | count32(&counts, buf) 110 | count32safe(&refCounts, buf) 111 | 112 | if counts != refCounts { 113 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 114 | } 115 | } 116 | } 117 | 118 | // test the correctness of a count64 implementation 119 | func testCount64(t *testing.T, count64 func(*[64]int, []uint64)) { 120 | for _, len := range testLengths { 121 | buf := make([]uint64, len+1) 122 | buf = buf[1 : len+1] // ensure misalignment 123 | for i := range buf { 124 | buf[i] = rand.Uint64() 125 | } 126 | 127 | var counts [64]int 128 | randomCounts(counts[:]) 129 | refCounts := counts 130 | 131 | count64(&counts, buf) 132 | count64safe(&refCounts, buf) 133 | 134 | if counts != refCounts { 
135 | t.Errorf("length %d: counts don't match: %v\n", len, countDiff(counts[:], refCounts[:])) 136 | 137 | if len > minimizationThreshold { 138 | continue 139 | } 140 | 141 | min := minimizeTestcase64(count64, buf) 142 | tcstr := testcaseString64(min) 143 | if tcstr != "" { 144 | t.Log("minimized test case:\n", tcstr) 145 | } 146 | } 147 | } 148 | } 149 | 150 | // test the correctness of CountString 151 | func TestCountString(t *testing.T) { 152 | testCount8(t, func(counts *[8]int, buf []uint8) { CountString(counts, string(buf)) }) 153 | } 154 | 155 | // test the correctness of all Count8 implementations 156 | func TestCount8(t *testing.T) { 157 | t.Run("dispatch", func(tt *testing.T) { testCount8(tt, Count8) }) 158 | 159 | for i := range count8funcs { 160 | t.Run(count8funcs[i].name, func(tt *testing.T) { 161 | if !count8funcs[i].available { 162 | tt.SkipNow() 163 | } 164 | 165 | testCount8(tt, count8funcs[i].count8) 166 | }) 167 | } 168 | } 169 | 170 | // test the correctness of Count16 171 | func TestCount16(t *testing.T) { 172 | t.Run("dispatch", func(tt *testing.T) { testCount16(tt, Count16) }) 173 | 174 | for i := range count16funcs { 175 | t.Run(count16funcs[i].name, func(tt *testing.T) { 176 | if !count16funcs[i].available { 177 | tt.SkipNow() 178 | } 179 | 180 | testCount16(tt, count16funcs[i].count16) 181 | }) 182 | } 183 | } 184 | 185 | // test the correctness of Count32 186 | func TestCount32(t *testing.T) { 187 | t.Run("dispatch", func(tt *testing.T) { testCount32(tt, Count32) }) 188 | 189 | for i := range count32funcs { 190 | t.Run(count32funcs[i].name, func(tt *testing.T) { 191 | if !count32funcs[i].available { 192 | tt.SkipNow() 193 | } 194 | 195 | testCount32(tt, count32funcs[i].count32) 196 | }) 197 | } 198 | } 199 | 200 | // test the correctness of Count64 201 | func TestCount64(t *testing.T) { 202 | t.Run("dispatch", func(tt *testing.T) { testCount64(tt, Count64) }) 203 | 204 | for i := range count64funcs { 205 | t.Run(count64funcs[i].name, func(tt *testing.T) { 206 | if !count64funcs[i].available { 207 | tt.SkipNow() 208 | } 209 | 210 | testCount64(tt, count64funcs[i].count64) 211 | }) 212 | } 213 | } 214 | -------------------------------------------------------------------------------- /countavx2_386.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // AVX2 based kernels for the positional population count operation. 4 | // All these kernels have the same backbone based on a 15-fold CSA 5 | // reduction to first reduce 480 byte into 4x32 byte, followed by a 6 | // bunch of shuffles to group the positional registers into nibbles. 7 | // These are then summed up using a width-specific summation function. 8 | // Required CPU extension: AVX2. 9 | 10 | // magic transposition constants, comparison constants 11 | DATA magic<>+ 0(SB)/8, $0x0000000000000000 12 | DATA magic<>+ 8(SB)/8, $0x0101010101010101 13 | DATA magic<>+16(SB)/8, $0x0202020202020202 14 | DATA magic<>+24(SB)/8, $0x0303030303030303 15 | DATA magic<>+32(SB)/8, $0x8040201008040201 16 | DATA magic<>+40(SB)/4, $0x55555555 17 | DATA magic<>+44(SB)/4, $0x33333333 18 | DATA magic<>+48(SB)/4, $0x0f0f0f0f 19 | GLOBL magic<>(SB), RODATA|NOPTR, $52 20 | 21 | // sliding window for head/tail loads. Unfortunately, there doesn't seem to be 22 | // a good way to do this with less memory wasted. 
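// The table is 32 zero bytes followed by 32 0xff bytes: a load at a variable
// offset into it produces a byte mask made of a run of 0x00 followed by a run
// of 0xff, with the split point determined by the offset.  The head and tail
// code below uses such masks (inverted where necessary) to blank out the bytes
// of a full-width load that lie outside the buffer.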
23 | DATA window<>+ 0(SB)/8, $0x0000000000000000 24 | DATA window<>+ 8(SB)/8, $0x0000000000000000 25 | DATA window<>+16(SB)/8, $0x0000000000000000 26 | DATA window<>+24(SB)/8, $0x0000000000000000 27 | DATA window<>+32(SB)/8, $0xffffffffffffffff 28 | DATA window<>+40(SB)/8, $0xffffffffffffffff 29 | DATA window<>+48(SB)/8, $0xffffffffffffffff 30 | DATA window<>+56(SB)/8, $0xffffffffffffffff 31 | GLOBL window<>(SB), RODATA|NOPTR, $64 32 | 33 | // B:A = A+B+C, D used for scratch space 34 | #define CSA(A, B, C, D) \ 35 | VPAND A, B, D \ 36 | VPXOR A, B, A \ 37 | VPAND A, C, B \ 38 | VPXOR A, C, A \ 39 | VPOR B, D, B 40 | 41 | // Generic kernel. This function expects a pointer to a width-specific 42 | // accumulation function in BX, a possibly unaligned input buffer in SI, 43 | // counters in DI and a remaining length in BP. 44 | TEXT countavx<>(SB), NOSPLIT, $160-0 45 | TESTL BP, BP // any data to process at all? 46 | CMOVLEQ BP, SI // if not, avoid loading head 47 | 48 | // constants for processing the head 49 | VPBROADCASTQ magic<>+32(SB), Y6 // bit position mask 50 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 51 | VPXOR Y0, Y0, Y0 // lower counter register 52 | VPXOR Y1, Y1, Y1 // upper counter register 53 | 54 | // load head into scratch space (until alignment/end is reached) 55 | MOVL SI, DX 56 | ANDL $31, DX // offset of the buffer start from 32 byte alignment 57 | JEQ nohead // if source buffer is aligned, skip head processing 58 | MOVL $32, AX 59 | SUBL DX, AX // number of bytes til alignment is reached (head length) 60 | VMOVDQA -32(SI)(AX*1), Y7 // load head 61 | MOVL $window<>(SB), DX // load window mask base pointer 62 | VMOVDQU (DX)(AX*1), Y5 // load mask of the bytes that are part of the head 63 | VPAND Y5, Y7, Y7 // and mask out those bytes that are not 64 | CMPL AX, BP // is the head shorter than the buffer? 65 | JLT norunt // if yes, perform special processing 66 | 67 | // buffer is short and does not cross a 32 byte boundary 68 | SUBL BP, AX // number of bytes by which we overshoot the buffer 69 | VMOVDQU (DX)(AX*1), Y5 // load mask of bytes that overshoot the buffer 70 | VPANDN Y7, Y5, Y7 // and clear them in Y4 71 | MOVL BP, AX // set up the true prefix length 72 | 73 | norunt: VMOVDQU Y7, scratch-160(SP) // copy to scratch space 74 | SUBL AX, BP // mark head as accounted for 75 | MOVL SI, DX // keep a copy of the head pointer 76 | ADDL AX, SI // and advance past head 77 | 78 | ANDL $31, DX // compute misalignment again 79 | SHRL $3, DX // misalignment in qwords (rounded down) 80 | ANDL $3, DX // and reduced to range 0--3 81 | 82 | // process head, 8 bytes at a time (up to 4 times) 83 | head: VPBROADCASTD scratch-160+0(SP)(DX*8), Y4 84 | // Y4 = 3210:3210:3210:3210:3210:3210:3210:3210 85 | VPBROADCASTD scratch-160+4(SP)(DX*8), Y5 86 | VPSHUFB Y3, Y4, Y4 // Y4 = 3333:3333:2222:2222:1111:1111:0000:0000 87 | VPSHUFB Y3, Y5, Y5 88 | VPAND Y6, Y4, Y4 // mask out one bit in each copy of the bytes 89 | VPAND Y6, Y5, Y5 90 | VPCMPEQB Y6, Y4, Y4 // set bytes to -1 if the bits were set 91 | VPCMPEQB Y6, Y5, Y5 // or to 0 otherwise 92 | VPSUBB Y4, Y0, Y0 // add 1/0 (subtract -1/0) to counters 93 | VPSUBB Y5, Y1, Y1 94 | ADDL $1, DX 95 | CMPL DX, $4 // have we processed the full head? 
96 | JLT head 97 | 98 | // produce 16 byte aligned point to counter vector in DX 99 | nohead: MOVL $counts-160+31(SP), DX 100 | ANDL $~31, DX // align to 32 bytes 101 | 102 | // initialise counters to what we have 103 | VPXOR Y7, Y7, Y7 // zero register 104 | VPUNPCKLBW Y7, Y0, Y4 // 0-7, 16-23 105 | VMOVDQA Y4, 0*32(DX) 106 | VPUNPCKHBW Y7, Y0, Y5 // 8-15, 24-31 107 | VMOVDQA Y5, 1*32(DX) 108 | VPUNPCKLBW Y7, Y1, Y6 // 32-39, 48-55 109 | VMOVDQA Y6, 2*32(DX) 110 | VPUNPCKHBW Y7, Y1, Y7 // 40-47, 56-63 111 | VMOVDQA Y7, 3*32(DX) 112 | 113 | SUBL $15*32, BP // enough data left to process? 114 | JLT endvec // also, pre-subtract 115 | 116 | MOVL $65535-4, AX // space left til overflow could occur in Y8--Y11 117 | 118 | vec: VMOVDQU 0*32(SI), Y0 // load 480 bytes from buf 119 | VMOVDQU 1*32(SI), Y1 // and sum them into Y3:Y2:Y1:Y0 120 | VMOVDQU 2*32(SI), Y4 121 | VMOVDQU 3*32(SI), Y2 122 | VMOVDQU 4*32(SI), Y3 123 | VMOVDQU 5*32(SI), Y5 124 | VMOVDQU 6*32(SI), Y6 125 | CSA(Y0, Y1, Y4, Y7) 126 | VMOVDQU 7*32(SI), Y4 127 | CSA(Y3, Y2, Y5, Y7) 128 | VMOVDQU 8*32(SI), Y5 129 | CSA(Y0, Y3, Y6, Y7) 130 | VMOVDQU 9*32(SI), Y6 131 | CSA(Y1, Y2, Y3, Y7) 132 | VMOVDQU 10*32(SI), Y3 133 | CSA(Y0, Y4, Y5, Y7) 134 | VMOVDQU 11*32(SI), Y5 135 | CSA(Y0, Y3, Y6, Y7) 136 | VMOVDQU 12*32(SI), Y6 137 | CSA(Y1, Y3, Y4, Y7) 138 | VMOVDQU 13*32(SI), Y4 139 | CSA(Y0, Y5, Y6, Y7) 140 | VMOVDQU 14*32(SI), Y6 141 | CSA(Y0, Y4, Y6, Y7) 142 | CSA(Y1, Y4, Y5, Y7) 143 | CSA(Y2, Y3, Y4, Y7) 144 | 145 | // load magic constants 146 | VPBROADCASTD magic<>+40(SB), Y7 // 0x55555555 147 | VPADDD Y7, Y7, Y6 // 0xaaaaaaaa 148 | 149 | ADDL $15*32, SI 150 | 151 | // group Y0--Y3 into nibbles in the same registers 152 | VPAND Y0, Y6, Y5 153 | VPSRLD $1, Y5, Y5 154 | VPAND Y1, Y7, Y4 155 | VPADDD Y4, Y4, Y4 156 | VPAND Y0, Y7, Y0 157 | VPAND Y1, Y6, Y1 158 | VPOR Y0, Y4, Y0 // Y0 = eca86420 (low crumbs) 159 | VPOR Y1, Y5, Y1 // Y1 = fdb97531 (low crumbs) 160 | 161 | VPAND Y2, Y6, Y5 162 | VPSRLD $1, Y5, Y5 163 | VPAND Y3, Y7, Y4 164 | VPADDD Y4, Y4, Y4 165 | VPAND Y2, Y7, Y2 166 | VPBROADCASTD magic<>+44(SB), Y7 // 0x33333333 167 | VPAND Y3, Y6, Y3 168 | VPSLLD $2, Y7, Y6 // 0xcccccccc 169 | VPOR Y2, Y4, Y2 // Y2 = eca86420 (high crumbs) 170 | VPOR Y3, Y5, Y3 // Y3 = fdb97531 (high crumbs) 171 | 172 | VPAND Y0, Y6, Y5 173 | VPSRLD $2, Y5, Y5 174 | VPAND Y2, Y7, Y4 175 | VPSLLD $2, Y4, Y4 176 | VPAND Y0, Y7, Y0 177 | VPAND Y2, Y6, Y2 178 | VPOR Y0, Y4, Y0 // Y0 = c840 179 | VPOR Y2, Y5, Y2 // Y2 = ea62 180 | 181 | VPAND Y1, Y6, Y5 182 | VPSRLD $2, Y5, Y5 183 | VPAND Y3, Y7, Y4 184 | VPSLLD $2, Y4, Y4 185 | VPAND Y1, Y7, Y1 186 | VPAND Y3, Y6, Y3 187 | VPOR Y1, Y4, Y1 // Y1 = d951 188 | VPOR Y3, Y5, Y3 // Y3 = fb73 189 | 190 | VPBROADCASTD magic<>+48(SB), Y7 // 0x0f0f0f0f for deinterleaving nibbles 191 | 192 | // pre-shuffle nibbles 193 | VPUNPCKLBW Y1, Y0, Y4 // Y4 = d9c85140 (3:2:1:0) 194 | VPUNPCKHBW Y1, Y0, Y5 // Y5 = d9c85140 (7:6:5:4) 195 | VPUNPCKLBW Y3, Y2, Y6 // Y6 = fbea7362 (3:2:1:0) 196 | VPUNPCKHBW Y3, Y2, Y3 // Y3 = fbea7362 (7:6:5:4) 197 | VPUNPCKLWD Y6, Y4, Y0 // Y0 = fbead9c873625140 (1:0) 198 | VPUNPCKHWD Y6, Y4, Y1 // Y1 = fbead9c873625140 (3:2) 199 | VPUNPCKLWD Y3, Y5, Y2 // Y2 = fbead9c873625140 (5:4) 200 | VPUNPCKHWD Y3, Y5, Y3 // Y3 = fbead9c873625140 (7:6) 201 | 202 | // pull out high and low nibbles and reduce once 203 | VPAND Y0, Y7, Y4 204 | VPSRLD $4, Y0, Y0 205 | VPAND Y0, Y7, Y5 206 | VPAND Y2, Y7, Y6 207 | VPSRLD $4, Y2, Y2 208 | VPADDB Y4, Y6, Y0 // Y0 = ba98:3210:ba98:3210 (1:0) 209 | VPAND Y2, Y7, Y2 210 
| VPADDB Y2, Y5, Y2 // Y2 = fedc:7654:fedc:7654 (1:0) 211 | 212 | VPAND Y1, Y7, Y4 213 | VPSRLD $4, Y1, Y1 214 | VPAND Y1, Y7, Y5 215 | VPAND Y3, Y7, Y6 216 | VPSRLD $4, Y3, Y3 217 | VPADDB Y4, Y6, Y1 // Y1 = ba98:3210:ba98:3210 (3:2) 218 | VPAND Y3, Y7, Y3 219 | VPADDB Y3, Y5, Y3 // Y3 = fedc:7654:fedc:7654 (3:2) 220 | 221 | // shuffle dwords and group them 222 | VPUNPCKLDQ Y2, Y0, Y4 223 | VPUNPCKHDQ Y2, Y0, Y5 224 | VPUNPCKLDQ Y3, Y1, Y6 225 | VPUNPCKHDQ Y3, Y1, Y7 226 | // VPERM2I128 $0x20, Y5, Y4, Y0 227 | BYTE $0xc4 228 | BYTE $0xe3 229 | BYTE $0x5d 230 | BYTE $0x46 231 | BYTE $0xc5 232 | BYTE $0x20 233 | // VPERM2I128 $0x31, Y5, Y4, Y2 234 | BYTE $0xc4 235 | BYTE $0xe3 236 | BYTE $0x5d 237 | BYTE $0x46 238 | BYTE $0xd5 239 | BYTE $0x31 240 | // VPERM2I128 $0x20, Y7, Y6, Y1 241 | BYTE $0xc4 242 | BYTE $0xe3 243 | BYTE $0x4d 244 | BYTE $0x46 245 | BYTE $0xcf 246 | BYTE $0x20 247 | // VPERM2I128 $0x31, Y7, Y6, Y3 248 | BYTE $0xc4 249 | BYTE $0xe3 250 | BYTE $0x4d 251 | BYTE $0x46 252 | BYTE $0xdf 253 | BYTE $0x31 254 | VPADDB Y2, Y0, Y0 // Y0 = fedc:ba98:7654:3210 (1:0) 255 | VPADDB Y3, Y1, Y1 // Y1 = fedc:ba98:7654:3210 (3:2) 256 | 257 | 258 | // zero-extend and add to Y8--Y11 259 | VPXOR Y7, Y7, Y7 260 | VPUNPCKLBW Y7, Y0, Y4 261 | VPUNPCKHBW Y7, Y0, Y5 262 | VPUNPCKLBW Y7, Y1, Y6 263 | VPUNPCKHBW Y7, Y1, Y1 264 | 265 | VPADDW 0*32(DX), Y4, Y4 266 | VPADDW 1*32(DX), Y5, Y5 267 | VPADDW 2*32(DX), Y6, Y6 268 | VPADDW 3*32(DX), Y1, Y1 269 | 270 | // write back to counters 271 | VMOVDQA Y4, 0*32(DX) 272 | VMOVDQA Y5, 1*32(DX) 273 | VMOVDQA Y6, 2*32(DX) 274 | VMOVDQA Y1, 3*32(DX) 275 | 276 | SUBL $15*4, AX // account for possible overflow 277 | CMPL AX, $15*4 // enough space left in the counters? 278 | JGE have_space 279 | 280 | // flush accumulators into counters 281 | CALL *BX // call accumulation function 282 | VPXOR Y7, Y7, Y7 283 | VMOVDQA Y7, 0*32(DX) 284 | VMOVDQA Y7, 1*32(DX) 285 | VMOVDQA Y7, 2*32(DX) 286 | VMOVDQA Y7, 3*32(DX) 287 | 288 | MOVL $65535, AX // space left til overflow could occur 289 | 290 | have_space: 291 | SUBL $15*32, BP // account for bytes consumed 292 | JGE vec 293 | 294 | endvec: VPBROADCASTQ magic<>+32(SB), Y2 // byte mask 295 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 296 | VPXOR Y0, Y0, Y0 // lower counter register 297 | VPXOR Y1, Y1, Y1 // upper counter register 298 | 299 | // process tail, 8 bytes at a time 300 | SUBL $8-15*32, BP // 8 bytes left to process? 301 | JLT tail1 302 | 303 | tail8: VPBROADCASTD 0(SI), Y4 304 | VPBROADCASTD 4(SI), Y5 305 | ADDL $8, SI 306 | VPSHUFB Y3, Y4, Y4 307 | VPSHUFB Y3, Y5, Y5 308 | VPAND Y2, Y4, Y4 309 | VPAND Y2, Y5, Y5 310 | VPCMPEQB Y2, Y4, Y4 311 | VPCMPEQB Y2, Y5, Y5 312 | VPSUBB Y4, Y0, Y0 313 | VPSUBB Y5, Y1, Y1 314 | SUBL $8, BP 315 | JGE tail8 316 | 317 | // process remaining 0--7 byte 318 | tail1: SUBL $-8, BP // anything left to process? 319 | JLE end 320 | 321 | // VMOVQ (SI), X5 // load 8 byte from buffer. 
This is ok 322 | // as buffer is aligned to 8 byte here 323 | BYTE $0xc5 324 | BYTE $0xfa 325 | BYTE $0x7e 326 | BYTE $0x2e 327 | MOVL $window<>+32(SB), AX // load window address 328 | SUBL BP, AX // adjust mask pointer 329 | VMOVQ (AX), X6 // load window mask 330 | VPANDN X5, X6, X5 // and mask out the desired bytes 331 | 332 | VPBROADCASTD X5, Y4 333 | VPSRLDQ $4, X5, X5 334 | VPBROADCASTD X5, Y5 335 | VPSHUFB Y3, Y4, Y4 336 | VPSHUFB Y3, Y5, Y5 337 | VPAND Y2, Y4, Y4 338 | VPAND Y2, Y5, Y5 339 | VPCMPEQB Y2, Y4, Y4 340 | VPCMPEQB Y2, Y5, Y5 341 | VPSUBB Y4, Y0, Y0 342 | VPSUBB Y5, Y1, Y1 343 | 344 | // add tail to counters 345 | end: VPXOR Y7, Y7, Y7 346 | VPUNPCKLBW Y7, Y0, Y4 347 | VPUNPCKHBW Y7, Y0, Y5 348 | VPUNPCKLBW Y7, Y1, Y6 349 | VPUNPCKHBW Y7, Y1, Y1 350 | 351 | VPADDW 0*32(DX), Y4, Y4 352 | VPADDW 1*32(DX), Y5, Y5 353 | VPADDW 2*32(DX), Y6, Y6 354 | VPADDW 3*32(DX), Y1, Y1 355 | 356 | // write back to counters 357 | VMOVDQA Y4, 0*32(DX) 358 | VMOVDQA Y5, 1*32(DX) 359 | VMOVDQA Y6, 2*32(DX) 360 | VMOVDQA Y1, 3*32(DX) 361 | 362 | // and perform a final accumulation 363 | CALL *BX 364 | VZEROUPPER 365 | RET 366 | 367 | // Count8 accumulation function. Accumulates words 368 | // into 8 dword counters at (DI). Trashes Y0--Y7. 369 | TEXT accum8<>(SB), NOSPLIT, $0-0 370 | VPMOVZXWD 0*16(DX), Y0 371 | VPMOVZXWD 1*16(DX), Y2 372 | VPMOVZXWD 2*16(DX), Y1 373 | VPMOVZXWD 3*16(DX), Y3 374 | VPMOVZXWD 4*16(DX), Y4 375 | VPMOVZXWD 5*16(DX), Y6 376 | VPMOVZXWD 6*16(DX), Y5 377 | VPMOVZXWD 7*16(DX), Y7 378 | VPADDD Y0, Y4, Y0 379 | VPADDD Y1, Y5, Y1 380 | VPADDD Y2, Y6, Y2 381 | VPADDD Y3, Y7, Y3 382 | VPADDD Y0, Y2, Y0 383 | VPADDD Y1, Y3, Y1 384 | VPADDD Y1, Y0, Y0 385 | VPADDD 0*32(DI), Y0, Y0 386 | VMOVDQU Y0, 0*32(DI) 387 | RET 388 | 389 | // Count16 accumulation function. Accumulates words 390 | // into 16 dword counters at (DI). Trashes Y0--Y7. 391 | TEXT accum16<>(SB), NOSPLIT, $0-0 392 | VPMOVZXWD 0*16(DX), Y0 393 | VPMOVZXWD 1*16(DX), Y2 394 | VPMOVZXWD 2*16(DX), Y1 395 | VPMOVZXWD 3*16(DX), Y3 396 | VPMOVZXWD 4*16(DX), Y4 397 | VPMOVZXWD 5*16(DX), Y6 398 | VPMOVZXWD 6*16(DX), Y5 399 | VPMOVZXWD 7*16(DX), Y7 400 | VPADDD Y0, Y4, Y0 401 | VPADDD Y1, Y5, Y1 402 | VPADDD Y2, Y6, Y2 403 | VPADDD Y3, Y7, Y3 404 | VPADDD Y0, Y2, Y0 405 | VPADDD Y1, Y3, Y1 406 | VPADDD 0*32(DI), Y0, Y0 407 | VPADDD 1*32(DI), Y1, Y1 408 | VMOVDQU Y0, 0*32(DI) 409 | VMOVDQU Y1, 1*32(DI) 410 | RET 411 | 412 | // Count32 accumulation function. Accumulates words 413 | // into 32 dword counters at (DI). Trashes Y0--Y7. 414 | TEXT accum32<>(SB), NOSPLIT, $0-0 415 | VPMOVZXWD 0*16(DX), Y0 416 | VPMOVZXWD 1*16(DX), Y2 417 | VPMOVZXWD 2*16(DX), Y1 418 | VPMOVZXWD 3*16(DX), Y3 419 | VPMOVZXWD 4*16(DX), Y4 420 | VPMOVZXWD 5*16(DX), Y6 421 | VPMOVZXWD 6*16(DX), Y5 422 | VPMOVZXWD 7*16(DX), Y7 423 | VPADDD Y0, Y4, Y0 424 | VPADDD Y1, Y5, Y1 425 | VPADDD Y2, Y6, Y2 426 | VPADDD Y3, Y7, Y3 427 | VPADDD 0*32(DI), Y0, Y0 428 | VPADDD 1*32(DI), Y1, Y1 429 | VPADDD 2*32(DI), Y2, Y2 430 | VPADDD 3*32(DI), Y3, Y3 431 | VMOVDQU Y0, 0*32(DI) 432 | VMOVDQU Y1, 1*32(DI) 433 | VMOVDQU Y2, 2*32(DI) 434 | VMOVDQU Y3, 3*32(DI) 435 | RET 436 | 437 | // Count64 accumulation function. Accumulates words 438 | // into 64 dword counters at (DI). Trashes Y0--Y3. 
439 | TEXT accum64<>(SB), NOSPLIT, $0-0 440 | VPMOVZXWD 0*16(DX), Y0 441 | VPMOVZXWD 1*16(DX), Y2 442 | VPMOVZXWD 2*16(DX), Y1 443 | VPMOVZXWD 3*16(DX), Y3 444 | VPADDD 0*32(DI), Y0, Y0 445 | VPADDD 1*32(DI), Y1, Y1 446 | VPADDD 2*32(DI), Y2, Y2 447 | VPADDD 3*32(DI), Y3, Y3 448 | VMOVDQU Y0, 0*32(DI) 449 | VMOVDQU Y1, 1*32(DI) 450 | VMOVDQU Y2, 2*32(DI) 451 | VMOVDQU Y3, 3*32(DI) 452 | VPMOVZXWD 4*16(DX), Y0 453 | VPMOVZXWD 5*16(DX), Y2 454 | VPMOVZXWD 6*16(DX), Y1 455 | VPMOVZXWD 7*16(DX), Y3 456 | VPADDD 4*32(DI), Y0, Y0 457 | VPADDD 5*32(DI), Y1, Y1 458 | VPADDD 6*32(DI), Y2, Y2 459 | VPADDD 7*32(DI), Y3, Y3 460 | VMOVDQU Y0, 4*32(DI) 461 | VMOVDQU Y1, 5*32(DI) 462 | VMOVDQU Y2, 6*32(DI) 463 | VMOVDQU Y3, 7*32(DI) 464 | RET 465 | 466 | // func count8avx2(counts *[8]int, buf []uint8) 467 | TEXT ·count8avx2(SB), 0, $0-16 468 | MOVL counts+0(FP), DI 469 | MOVL buf_base+4(FP), SI // SI = &buf[0] 470 | MOVL buf_len+8(FP), BP // BP = len(buf) 471 | MOVL $accum8<>(SB), BX 472 | CALL countavx<>(SB) 473 | RET 474 | 475 | // func count16avx2(counts *[16]int, buf []uint16) 476 | TEXT ·count16avx2(SB), 0, $0-16 477 | MOVL counts+0(FP), DI 478 | MOVL buf_base+4(FP), SI // SI = &buf[0] 479 | MOVL buf_len+8(FP), BP // BP = len(buf) 480 | MOVL $accum16<>(SB), BX 481 | SHLL $1, BP // count in bytes 482 | CALL countavx<>(SB) 483 | RET 484 | 485 | // func count32avx2(counts *[32]int, buf []uint32) 486 | TEXT ·count32avx2(SB), 0, $0-16 487 | MOVL counts+0(FP), DI 488 | MOVL buf_base+4(FP), SI // SI = &buf[0] 489 | MOVL buf_len+8(FP), BP // BP = len(buf) 490 | MOVL $accum32<>(SB), BX 491 | SHLL $2, BP // count in bytes 492 | CALL countavx<>(SB) 493 | RET 494 | 495 | // func count64avx2(counts *[64]int, buf []uint64) 496 | TEXT ·count64avx2(SB), 0, $0-16 497 | MOVL counts+0(FP), DI 498 | MOVL buf_base+4(FP), SI // SI = &buf[0] 499 | MOVL buf_len+8(FP), BP // BP = len(buf) 500 | MOVL $accum64<>(SB), BX 501 | SHLL $3, BP // count in bytes 502 | CALL countavx<>(SB) 503 | RET 504 | -------------------------------------------------------------------------------- /countavx2_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // An AVX2 based kernel first doing a 15-fold CSA reduction and then 4 | // a 16-fold CSA reduction, carrying over place-value vectors between 5 | // iterations. 6 | // Required CPU extension: AVX2, BMI2. 7 | 8 | // magic transposition constants, comparison constants 9 | DATA magic<>+ 0(SB)/8, $0x0000000000000000 10 | DATA magic<>+ 8(SB)/8, $0x0101010101010101 11 | DATA magic<>+16(SB)/8, $0x0202020202020202 12 | DATA magic<>+24(SB)/8, $0x0303030303030303 13 | DATA magic<>+32(SB)/8, $0x0404040404040404 14 | DATA magic<>+40(SB)/8, $0x0505050505050505 15 | DATA magic<>+48(SB)/8, $0x0606060606060606 16 | DATA magic<>+56(SB)/8, $0x0707070707070707 17 | DATA magic<>+64(SB)/8, $0x8040201008040201 18 | DATA magic<>+72(SB)/4, $0x55555555 19 | DATA magic<>+76(SB)/4, $0x33333333 20 | DATA magic<>+80(SB)/4, $0x0f0f0f0f 21 | DATA magic<>+84(SB)/4, $0x00ff00ff 22 | GLOBL magic<>(SB), RODATA|NOPTR, $88 23 | 24 | // sliding window for head/tail loads. Unfortunately, there doesn't seem to be 25 | // a good way to do this with less memory wasted. 
26 | DATA window<>+ 0(SB)/8, $0x0000000000000000 27 | DATA window<>+ 8(SB)/8, $0x0000000000000000 28 | DATA window<>+16(SB)/8, $0x0000000000000000 29 | DATA window<>+24(SB)/8, $0x0000000000000000 30 | DATA window<>+32(SB)/8, $0xffffffffffffffff 31 | DATA window<>+40(SB)/8, $0xffffffffffffffff 32 | DATA window<>+48(SB)/8, $0xffffffffffffffff 33 | DATA window<>+56(SB)/8, $0xffffffffffffffff 34 | GLOBL window<>(SB), RODATA|NOPTR, $64 35 | 36 | // B:A = A+B+C, D used for scratch space 37 | #define CSA(A, B, C, D) \ 38 | VPAND A, B, D \ 39 | VPXOR A, B, A \ 40 | VPAND A, C, B \ 41 | VPXOR A, C, A \ 42 | VPOR B, D, B 43 | 44 | // count 8 bytes from L into Y0 and Y1, 45 | // using Y4, and Y5 for scratch space 46 | #define COUNT8(L) \ 47 | VPBROADCASTQ L, Y4 \ // Y4 = 7654:3210:7654:3210:7654:3210:7654:3210 48 | VPSHUFB Y7, Y4, Y5 \ // Y5 = 7777:7777:6666:6666:5555:5555:4444:4444 49 | VPSHUFB Y3, Y4, Y4 \ // Y4 = 3333:3333:2222:2222:1111:1111:0000:0000 50 | VPAND Y2, Y5, Y5 \ 51 | VPAND Y2, Y4, Y4 \ // mask out one bit in each copy of the bytes 52 | VPCMPEQB Y2, Y5, Y5 \ // set bytes to -1 if the bits were set 53 | VPCMPEQB Y2, Y4, Y4 \ // or to 0 otherwise 54 | VPSUBB Y5, Y1, Y1 \ 55 | VPSUBB Y4, Y0, Y0 // add 1/0 (subtract -1/0) to counters 56 | 57 | 58 | // Generic kernel. This function expects a pointer to a width-specific 59 | // accumulation function in BX, a possibly unaligned input buffer in SI, 60 | // counters in DI and a remaining length in CX. 61 | TEXT countavx2<>(SB), NOSPLIT, $0-0 62 | CMPQ CX, $15*32 // is the CSA kernel worth using? 63 | JLT runt 64 | 65 | // load head until alignment/end is reached 66 | MOVL SI, DX 67 | ANDL $31, DX // offset of the buffer start from 32 byte alignment 68 | MOVL $32, AX 69 | SUBL DX, AX // number of bytes til alignment is reached (head length) 70 | SUBQ DX, SI // align source to 32 bytes 71 | VMOVDQA (SI), Y0 // load head 72 | ADDQ DX, CX // and account for head length 73 | LEAQ window<>(SB), DX // load window mask base pointer 74 | VPAND (DX)(AX*1), Y0, Y0 // mask out bytes not in head 75 | 76 | VMOVDQA 1*32(SI), Y1 // load 480 (-32) bytes from buf 77 | VMOVDQA 2*32(SI), Y4 // and sum them into Y3:Y2:Y1:Y0 78 | VMOVDQA 3*32(SI), Y2 79 | VMOVDQA 4*32(SI), Y3 80 | VMOVDQA 5*32(SI), Y5 81 | VMOVDQA 6*32(SI), Y6 82 | CSA(Y0, Y1, Y4, Y7) 83 | VMOVDQA 7*32(SI), Y4 84 | CSA(Y3, Y2, Y5, Y7) 85 | VMOVDQA 8*32(SI), Y5 86 | CSA(Y0, Y3, Y6, Y7) 87 | VMOVDQA 9*32(SI), Y6 88 | CSA(Y1, Y2, Y3, Y7) 89 | VMOVDQA 10*32(SI), Y3 90 | CSA(Y0, Y4, Y5, Y7) 91 | VMOVDQA 11*32(SI), Y5 92 | CSA(Y0, Y3, Y6, Y7) 93 | VMOVDQA 12*32(SI), Y6 94 | CSA(Y1, Y3, Y4, Y7) 95 | VMOVDQA 13*32(SI), Y4 96 | CSA(Y0, Y5, Y6, Y7) 97 | VMOVDQA 14*32(SI), Y6 98 | VPBROADCASTD magic<>+72(SB), Y15 // 0x55555555 99 | VPBROADCASTD magic<>+76(SB), Y13 // 0x33333333 100 | CSA(Y0, Y4, Y6, Y7) 101 | VPXOR Y8, Y8, Y8 // initialise counters 102 | VPXOR Y9, Y9, Y9 103 | CSA(Y1, Y4, Y5, Y7) 104 | VPXOR Y10, Y10, Y10 105 | VPXOR Y11, Y11, Y11 106 | CSA(Y2, Y3, Y4, Y7) 107 | 108 | ADDQ $15*32, SI 109 | SUBQ $(15+16)*32, CX // enough data left to process? 
110 | JLT post 111 | 112 | MOVL $65535, AX // space left til overflow could occur in Y8--Y11 113 | 114 | // load 512 bytes from buf, add them to Y0..Y3 into Y0..Y4 115 | vec: VMOVDQA 0*32(SI), Y4 116 | VMOVDQA 1*32(SI), Y5 117 | VMOVDQA 2*32(SI), Y6 118 | VMOVDQA 3*32(SI), Y12 119 | VMOVDQA 4*32(SI), Y14 120 | CSA(Y0, Y4, Y5, Y7) 121 | VMOVDQA 5*32(SI), Y5 122 | CSA(Y6, Y12, Y14, Y7) 123 | VMOVDQA 6*32(SI), Y14 124 | CSA(Y1, Y4, Y12, Y7) 125 | VMOVDQA 7*32(SI), Y12 126 | CSA(Y0, Y5, Y6, Y7) 127 | VMOVDQA 8*32(SI), Y6 128 | CSA(Y6, Y12, Y14, Y7) 129 | VMOVDQA 9*32(SI), Y14 130 | CSA(Y1, Y5, Y12, Y7) 131 | VMOVDQA 10*32(SI), Y12 132 | CSA(Y0, Y12, Y14, Y7) 133 | VMOVDQA 11*32(SI), Y14 134 | CSA(Y2, Y4, Y5, Y7) 135 | VMOVDQA 12*32(SI), Y5 136 | CSA(Y0, Y6, Y14, Y7) 137 | VMOVDQA 13*32(SI), Y14 138 | CSA(Y1, Y6, Y12, Y7) 139 | VMOVDQA 14*32(SI), Y12 140 | CSA(Y5, Y12, Y14, Y7) 141 | VMOVDQA 15*32(SI), Y14 142 | CSA(Y0, Y5, Y14, Y7) 143 | ADDQ $16*32, SI 144 | PREFETCHT0 0(SI) 145 | PREFETCHT0 32(SI) 146 | CSA(Y1, Y5, Y12, Y7) 147 | CSA(Y2, Y5, Y6, Y7) 148 | CSA(Y3, Y4, Y5, Y7) 149 | 150 | 151 | VPBROADCASTD magic<>+84(SB), Y12 // 0x00ff00ff 152 | VPBROADCASTD magic<>+80(SB), Y14 // 0x0f0f0f0f 153 | 154 | // now Y0..Y4 hold counters; preserve Y0..Y4 for the next round 155 | // and add Y4 to the counters. 156 | 157 | // split into even/odd and reduce into crumbs 158 | VPAND Y4, Y15, Y5 // Y5 = 02468ace x16 159 | VPANDN Y4, Y15, Y6 // Y6 = 13579bdf x16 160 | VPSRLD $1, Y6, Y6 161 | VPERM2I128 $0x20, Y6, Y5, Y4 162 | VPERM2I128 $0x31, Y6, Y5, Y5 163 | VPADDD Y5, Y4, Y4 // Y4 = 02468ace x8 13579bdf x8 164 | 165 | // split again and reduce into nibbles 166 | VPAND Y4, Y13, Y5 // Y5 = 048c x8 159d x8 167 | VPANDN Y4, Y13, Y6 // Y6 = 26ae x8 37bf x8 168 | VPSRLD $2, Y6, Y6 169 | VPUNPCKLQDQ Y6, Y5, Y4 170 | VPUNPCKHQDQ Y6, Y5, Y5 171 | VPADDD Y5, Y4, Y4 // Y4 = 048c x4 26ae x4 159d x4 37bf x4 172 | 173 | // split again into bytes and shuffle into order 174 | VPAND Y4, Y14, Y5 // Y5 = 08 x4 2a x4 19 x4 3b x4 175 | VPANDN Y4, Y14, Y6 // Y4 = 4c x4 6e x4 5d x4 7f x4 176 | VPSLLD $4, Y5, Y5 177 | VPERM2I128 $0x20, Y6, Y5, Y4 // Y4 = 08 x4 2a x4 4c x4 6e x4 178 | VPERM2I128 $0x31, Y6, Y5, Y5 // Y5 = 19 x4 3b x4 5d x4 7f x4 179 | VPUNPCKLWD Y5, Y4, Y6 // Y6 = 0819 x4 4c5d x4 180 | VPUNPCKHWD Y5, Y4, Y7 // Y7 = 2a3b x4 6e7f x4 181 | VPUNPCKLDQ Y7, Y6, Y4 // Y4 = 08192a3b[0:1] 4c5d6e7f[0:1] 182 | VPUNPCKHDQ Y7, Y6, Y5 // Y5 = 08192a3b[2:3] 4c5d6e7f[2:3] 183 | VPERMQ $0xd8, Y4, Y4 // Y4 = 08192a3b4c5d6e7f[0:1] 184 | VPERMQ $0xd8, Y5, Y5 // Y5 = 08192a3b4c5d6e7f[2:3] 185 | 186 | // split again into words and add to counters 187 | VPAND Y4, Y12, Y6 // Y6 = 01234567[0:1] 188 | VPAND Y5, Y12, Y7 // Y7 = 01234567[2:3] 189 | VPADDW Y6, Y8, Y8 190 | VPADDW Y7, Y10, Y10 191 | VPSRLW $8, Y4, Y4 // Y4 = 89abcdef[0:1] 192 | VPSRLW $8, Y5, Y5 // Y5 = 89abcdef[2:3] 193 | VPADDW Y4, Y9, Y9 194 | VPADDW Y5, Y11, Y11 195 | 196 | SUBL $16*4, AX // account for possible overflow 197 | CMPL AX, $(15+15)*4 // enough space left in the counters? 
198 | JGE have_space 199 | 200 | // flush accumulators into counters 201 | VPXOR Y7, Y7, Y7 202 | CALL *BX // call accumulation function 203 | VPXOR Y8, Y8, Y8 // clear accumulators for next round 204 | VPXOR Y9, Y9, Y9 205 | VPXOR Y10, Y10, Y10 206 | VPXOR Y11, Y11, Y11 207 | 208 | MOVL $65535, AX // space left til overflow could occur 209 | 210 | have_space: 211 | SUBQ $16*32, CX // account for bytes consumed 212 | JGE vec 213 | 214 | // group nibbles in Y0, Y1, Y2, and Y3 into Y4, Y5, Y6, and Y7 215 | post: VPBROADCASTD magic<>+80(SB), Y14 // 0x0f0f0f0f 216 | 217 | VPAND Y1, Y15, Y5 218 | VPADDD Y5, Y5, Y5 219 | VPAND Y3, Y15, Y7 220 | VPADDD Y7, Y7, Y7 221 | VPAND Y0, Y15, Y4 222 | VPAND Y2, Y15, Y6 223 | VPOR Y4, Y5, Y4 // Y4 = eca86420 (low crumbs) 224 | VPOR Y6, Y7, Y5 // Y5 = eca86420 (high crumbs) 225 | 226 | VPANDN Y0, Y15, Y0 227 | VPSRLD $1, Y0, Y0 228 | VPANDN Y2, Y15, Y2 229 | VPSRLD $1, Y2, Y2 230 | VPANDN Y1, Y15, Y1 231 | VPANDN Y3, Y15, Y3 232 | VPOR Y0, Y1, Y6 // Y6 = fdb97531 (low crumbs) 233 | VPOR Y2, Y3, Y7 // Y7 = fdb97531 (high crumbs) 234 | 235 | VPAND Y5, Y13, Y1 236 | VPSLLD $2, Y1, Y1 237 | VPAND Y7, Y13, Y3 238 | VPSLLD $2, Y3, Y3 239 | VPAND Y4, Y13, Y0 240 | VPAND Y6, Y13, Y2 241 | VPOR Y0, Y1, Y0 // Y0 = c840 242 | VPOR Y2, Y3, Y1 // Y1 = d951 243 | 244 | VPANDN Y4, Y13, Y4 245 | VPSRLD $2, Y4, Y4 246 | VPANDN Y6, Y13, Y6 247 | VPSRLD $2, Y6, Y6 248 | VPANDN Y5, Y13, Y5 249 | VPANDN Y7, Y13, Y7 250 | VPOR Y4, Y5, Y2 // Y2 = ea62 251 | VPOR Y6, Y7, Y3 // Y3 = fb73 252 | 253 | // pre-shuffle nibbles 254 | VPUNPCKLBW Y1, Y0, Y5 // Y5 = d9c85140 (3:2:1:0) 255 | VPUNPCKHBW Y1, Y0, Y0 // Y0 = d9c85140 (7:6:5:4) 256 | VPUNPCKLBW Y3, Y2, Y6 // Y6 = fbea7362 (3:2:1:0) 257 | VPUNPCKHBW Y3, Y2, Y1 // Y1 = fbea7362 (3:2:1:0) 258 | VPUNPCKLWD Y6, Y5, Y4 // Y4 = fbead9c873625140 (1:0) 259 | VPUNPCKHWD Y6, Y5, Y5 // Y5 = fbead9c873625140 (3:2) 260 | VPUNPCKLWD Y1, Y0, Y6 // Y6 = fbead9c873624150 (5:4) 261 | VPUNPCKHWD Y1, Y0, Y7 // Y7 = fbead9c873624150 (7:6) 262 | 263 | // pull out high and low nibbles 264 | VPAND Y4, Y14, Y0 265 | VPSRLD $4, Y4, Y4 266 | VPAND Y4, Y14, Y4 267 | VPAND Y5, Y14, Y1 268 | VPSRLD $4, Y5, Y5 269 | VPAND Y5, Y14, Y5 270 | VPAND Y6, Y14, Y2 271 | VPSRLD $4, Y6, Y6 272 | VPAND Y6, Y14, Y6 273 | VPAND Y7, Y14, Y3 274 | VPSRLD $4, Y7, Y7 275 | VPAND Y7, Y14, Y7 276 | 277 | // reduce common values 278 | VPADDB Y2, Y0, Y0 // Y0 = ba98:3210:ba98:3210 (1:0) 279 | VPADDB Y3, Y1, Y1 // Y1 = ba98:3210:ba98:3210 (3:2) 280 | VPADDB Y6, Y4, Y2 // Y2 = fedc:7654:fedc:7654 (1:0) 281 | VPADDB Y7, Y5, Y3 // Y3 = fedc:7654:fedc:7654 (3:2) 282 | 283 | // shuffle dwords and group them 284 | VPUNPCKLDQ Y2, Y0, Y4 285 | VPUNPCKHDQ Y2, Y0, Y5 286 | VPUNPCKLDQ Y3, Y1, Y6 287 | VPUNPCKHDQ Y3, Y1, Y7 288 | VPERM2I128 $0x20, Y5, Y4, Y0 289 | VPERM2I128 $0x31, Y5, Y4, Y2 290 | VPERM2I128 $0x20, Y7, Y6, Y1 291 | VPERM2I128 $0x31, Y7, Y6, Y3 292 | VPADDB Y2, Y0, Y0 // Y0 = fedc:ba98:7654:3210 (1:0) 293 | VPADDB Y3, Y1, Y1 // Y1 = fedc:ba98:7654:3210 (3:2) 294 | 295 | // zero-extend and add to Y8--Y11 296 | VPXOR Y7, Y7, Y7 297 | VPUNPCKLBW Y7, Y0, Y4 298 | VPUNPCKHBW Y7, Y0, Y5 299 | VPUNPCKLBW Y7, Y1, Y6 300 | VPUNPCKHBW Y7, Y1, Y1 301 | 302 | VPADDW Y4, Y8, Y8 303 | VPADDW Y5, Y9, Y9 304 | VPADDW Y6, Y10, Y10 305 | VPADDW Y1, Y11, Y11 306 | 307 | endvec: CMPL CX, $-16*32 // no bytes left to process? 
308 | JE end 309 | 310 | VPBROADCASTQ magic<>+64(SB), Y2 // byte mask 311 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 312 | VMOVDQU magic<>+32(SB), Y7 313 | VPXOR Y0, Y0, Y0 // lower counter register 314 | VPXOR Y1, Y1, Y1 // upper counter register 315 | 316 | // process tail, 8 bytes at a time 317 | SUBL $8-16*32, CX // 8 bytes left to process? 318 | JLE tail1 319 | 320 | tail8: COUNT8((SI)) 321 | ADDQ $8, SI 322 | SUBL $8, CX 323 | JGT tail8 324 | 325 | // process remaining 1--8 bytes 326 | tail1: MOVL $8*8(CX*8), CX 327 | BZHIQ CX, (SI), AX // load tail into AX (will never fault) 328 | VMOVQ AX, X6 329 | COUNT8(X6) 330 | 331 | // add tail to counters 332 | VPXOR Y7, Y7, Y7 333 | VPUNPCKLBW Y7, Y0, Y4 334 | VPUNPCKHBW Y7, Y0, Y5 335 | VPUNPCKLBW Y7, Y1, Y6 336 | VPUNPCKHBW Y7, Y1, Y7 337 | 338 | VPADDW Y4, Y8, Y8 339 | VPADDW Y5, Y9, Y9 340 | VPADDW Y6, Y10, Y10 341 | VPADDW Y7, Y11, Y11 342 | 343 | // and perform a final accumulation 344 | end: VPXOR Y7, Y7, Y7 345 | CALL *BX 346 | VZEROUPPER 347 | RET 348 | 349 | // buffer is short, do just head/tail processing 350 | runt: VPBROADCASTQ magic<>+64(SB), Y2 // bit position mask 351 | VMOVDQU magic<>+0(SB), Y3 // permutation mask 352 | VMOVDQU magic<>+32(SB), Y7 353 | VPXOR Y0, Y0, Y0 // lower counter register 354 | VPXOR Y1, Y1, Y1 // upper counter register 355 | SUBL $8, CX // 8 byte left to process? 356 | JLT runt1 357 | 358 | // process runt, 8 bytes at a time 359 | runt8: COUNT8((SI)) 360 | ADDQ $8, SI 361 | SUBL $8, CX 362 | JGE runt8 363 | 364 | // process remaining 0--7 byte 365 | // while making sure we don't get a page fault 366 | runt1: CMPL CX, $-8 // anything left to process? 367 | JLE runt_accum 368 | 369 | LEAL 7(SI)(CX*1), DX // last address of buffer 370 | XORL SI, DX // which bits changed? 371 | LEAL 8*8(CX*8), CX // CX scaled to a bit length 372 | TESTL $8, DX // did we cross an alignment boundary? 373 | JNE crossrunt1 // if yes, we can safely load directly 374 | 375 | LEAL (SI*8), AX 376 | ANDQ $~7, SI // align buffer to 8 bytes 377 | MOVQ (SI), R8 // and load 8 bytes from buffer 378 | SHRXQ AX, R8, R8 // buffer starting at the beginning 379 | BZHIQ CX, R8, R8 // mask out bytes past the buffer 380 | JMP dorunt1 381 | 382 | crossrunt1: 383 | BZHIQ CX, (SI), R8 // load 8 bytes from unaligned buffer 384 | 385 | dorunt1:VMOVQ R8, X6 386 | COUNT8(X6) 387 | 388 | // move tail to counters and perform final accumulation 389 | runt_accum: 390 | VPXOR Y7, Y7, Y7 391 | VPUNPCKLBW Y7, Y0, Y8 392 | VPUNPCKHBW Y7, Y0, Y9 393 | VPUNPCKLBW Y7, Y1, Y10 394 | VPUNPCKHBW Y7, Y1, Y11 395 | CALL *BX 396 | VZEROUPPER 397 | RET 398 | 399 | // zero extend Y8--Y11 into dwords and fold the upper 32 counters 400 | // over the lower 32 counters, leaving the registers with 401 | // Y12 contains 0- 3, 16-19 402 | // Y8 contains 4- 7, 20-23 403 | // Y14 contains 8-11, 24-27 404 | // Y9 contains 12-15, 28-31 405 | // Assumes Y7 == 0. 406 | #define FOLD32 \ 407 | VPUNPCKLWD Y7, Y8, Y12 \ 408 | VPUNPCKHWD Y7, Y8, Y8 \ 409 | VPUNPCKLWD Y7, Y9, Y14 \ 410 | VPUNPCKHWD Y7, Y9, Y9 \ 411 | VPUNPCKLWD Y7, Y10, Y4 \ 412 | VPUNPCKHWD Y7, Y10, Y10 \ 413 | VPUNPCKLWD Y7, Y11, Y5 \ 414 | VPUNPCKHWD Y7, Y11, Y11 \ 415 | VPADDD Y12, Y4, Y12 \ 416 | VPADDD Y8, Y10, Y8 \ 417 | VPADDD Y14, Y5, Y14 \ 418 | VPADDD Y9, Y11, Y9 419 | 420 | // zero-extend dwords in Y trashing Y and Z. Add the low 421 | // half dwords to a*8(DI) and the high half to b*8(DI). 
422 | // Assumes Y7 == 0 423 | #define ACCUM(a, b, Y, Z) \ 424 | VPERMQ $0xd8, Y, Y \ 425 | VPUNPCKHDQ Y7, Y, Z \ 426 | VPUNPCKLDQ Y7, Y, Y \ 427 | VPADDQ (a)*8(DI), Y, Y \ 428 | VPADDQ (b)*8(DI), Z, Z \ 429 | VMOVDQU Y, (a)*8(DI) \ 430 | VMOVDQU Z, (b)*8(DI) 431 | 432 | // Count8 accumulation function. Accumulates words Y8--Y11 433 | // into 8 qword counters at (DI). Trashes Y0--Y12. 434 | TEXT accum8<>(SB), NOSPLIT, $0-0 435 | FOLD32 436 | 437 | VPADDD Y14, Y12, Y12 // 0- 3, 0- 3 438 | VPADDD Y9, Y8, Y8 // 4- 7, 4- 7 439 | VPERM2I128 $0x20, Y8, Y12, Y14 440 | VPERM2I128 $0x31, Y8, Y12, Y4 441 | VPADDD Y4, Y14, Y12 // 0- 3, 4- 7 442 | ACCUM(0, 4, Y12, Y14) 443 | RET 444 | 445 | // Count16 accumulation function. Accumulates words Y8--Y11 446 | // into 16 qword counters at (DI). Trashes Y0--Y12. 447 | TEXT accum16<>(SB), NOSPLIT, $0-0 448 | FOLD32 449 | 450 | // fold over upper 16 bit over lower 32 counters 451 | VPERM2I128 $0x20, Y8, Y12, Y4 // 0- 3, 4- 7 452 | VPERM2I128 $0x31, Y8, Y12, Y10 // 16-19, 20-23 453 | VPADDD Y4, Y10, Y12 // 0- 7 454 | VPERM2I128 $0x20, Y9, Y14, Y5 // 8-11, 12-15 455 | VPERM2I128 $0x31, Y9, Y14, Y11 // 24-27, 29-31 456 | VPADDD Y5, Y11, Y4 // 8-15 457 | 458 | // zero extend into qwords and add to counters 459 | ACCUM(0, 4, Y12, Y14) 460 | ACCUM(8, 12, Y4, Y5) 461 | 462 | RET 463 | 464 | // Count32 accumulation function. Accumulates words Y8--Y11 465 | // int 32 qword counters at (DI). Trashes Y0--Y12 466 | TEXT accum32<>(SB), NOSPLIT, $0-0 467 | FOLD32 468 | 469 | ACCUM( 0, 16, Y12, Y4) 470 | ACCUM( 4, 20, Y8, Y4) 471 | ACCUM( 8, 24, Y14, Y4) 472 | ACCUM(12, 28, Y9, Y4) 473 | 474 | RET 475 | 476 | // accumulate the 16 counters in Y into k*8(DI) to (k+15)*8(DI) 477 | // trashes Y0--Y3. Assumes Y12 == 0 478 | #define ACCUM64(k, Y) \ 479 | VPUNPCKLWD Y7, Y, Y12 \ 480 | VPUNPCKHWD Y7, Y, Y14 \ 481 | ACCUM(k, k+16, Y12, Y4) \ 482 | ACCUM(k+4, k+20, Y14, Y4) 483 | 484 | // Count64 accumulation function. Accumulates words Y8--Y11 485 | // into 64 qword counters at (DI). Trashes Y0--Y12. 
486 | TEXT accum64<>(SB), NOSPLIT, $0-0 487 | ACCUM64(0, Y8) 488 | ACCUM64(8, Y9) 489 | ACCUM64(32, Y10) 490 | ACCUM64(40, Y11) 491 | RET 492 | 493 | // func count8avx2(counts *[8]int, buf []uint8) 494 | TEXT ·count8avx2(SB), 0, $0-32 495 | MOVQ counts+0(FP), DI 496 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 497 | MOVQ buf_len+16(FP), CX // CX = len(buf) 498 | MOVQ $accum8<>(SB), BX 499 | CALL countavx2<>(SB) 500 | RET 501 | 502 | // func count16avx2(counts *[16]int, buf []uint16) 503 | TEXT ·count16avx2(SB), 0, $0-32 504 | MOVQ counts+0(FP), DI 505 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 506 | MOVQ buf_len+16(FP), CX // CX = len(buf) 507 | MOVQ $accum16<>(SB), BX 508 | SHLQ $1, CX // count in bytes 509 | CALL countavx2<>(SB) 510 | RET 511 | 512 | // func count32avx2(counts *[32]int, buf []uint32) 513 | TEXT ·count32avx2(SB), 0, $0-32 514 | MOVQ counts+0(FP), DI 515 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 516 | MOVQ buf_len+16(FP), CX // CX = len(buf) 517 | MOVQ $accum32<>(SB), BX 518 | SHLQ $2, CX // count in bytes 519 | CALL countavx2<>(SB) 520 | RET 521 | 522 | // func count64avx2(counts *[64]int, buf []uint64) 523 | TEXT ·count64avx2(SB), 0, $0-32 524 | MOVQ counts+0(FP), DI 525 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 526 | MOVQ buf_len+16(FP), CX // CX = len(buf) 527 | MOVQ $accum64<>(SB), BX 528 | SHLQ $3, CX // count in bytes 529 | CALL countavx2<>(SB) 530 | RET 531 | -------------------------------------------------------------------------------- /countavx512_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // An AVX512 based kernel first doing a 15-fold CSA reduction 4 | // and then a 16-fold CSA reduction, carrying over place-value 5 | // vectors between iterations. 6 | // Required CPU extensions: BMI2, AVX-512 -F, -BW. 7 | 8 | // magic constants 9 | DATA magic<>+ 0(SB)/4, $0x55555555 10 | DATA magic<>+ 4(SB)/4, $0x33333333 11 | DATA magic<>+ 8(SB)/4, $0x0f0f0f0f 12 | DATA magic<>+12(SB)/4, $0x00ff00ff 13 | 14 | // permutation vectors for the last permutation step of the vec loop 15 | // permutes words 16 | // A = 0000 1111 2222 3333 4444 5555 6666 7777 17 | // B = 8888 9999 AAAA BBBB CCCC DDDD EEEE FFFF 18 | // into the order used by the counters: 19 | // Q1 = 0123 4567 0123 4567 0123 4567 0123 4567 20 | // Q2 = 89AB CDEF 89AB CDEF 89AB CDEF 89AB CDEF 21 | DATA magic<>+16(SB)/8, $0x1c1814100c080400 22 | DATA magic<>+24(SB)/8, $0x1d1915110d090501 23 | DATA magic<>+32(SB)/8, $0x1e1a16120e0a0602 24 | DATA magic<>+40(SB)/8, $0x1f1b17130f0b0703 25 | GLOBL magic<>(SB), RODATA|NOPTR, $48 26 | 27 | // B:A = A+B+C, D used as scratch space 28 | #define CSA(A, B, C, D) \ 29 | VMOVDQA64 A, D \ 30 | VPTERNLOGD $0x96, C, B, A \ 31 | VPTERNLOGD $0xe8, C, D, B 32 | 33 | // Generic kernel. This function expects a pointer to a width-specific 34 | // accumulation function in BX, a possibly unaligned input buffer in SI, 35 | // counters in DI and an array length in CX. 36 | TEXT countavx512<>(SB), NOSPLIT, $0-0 37 | // head and tail constants, counter registers 38 | VPTERNLOGD $0xff, Z30, Z30, Z30 // ffffffff 39 | VPXORD Y25, Y25, Y25 // zero register 40 | 41 | CMPQ CX, $15*64 // is the CSA kernel worth using? 
42 | JLT runt 43 | 44 | // compute misalignment mask 45 | MOVQ $-1, AX 46 | SHLXQ SI, AX, AX // mask out the head of the load 47 | KMOVQ AX, K1 // prepare mask register 48 | ADDQ SI, CX 49 | ANDQ $~63, SI // align source to 64 byte 50 | SUBQ SI, CX // account for head length in CX 51 | 52 | VMOVDQU8.Z 0*64(SI), K1, Z0 // load 960 bytes from buf 53 | VMOVDQA64 1*64(SI), Z1 // and sum them into Z3:Z2:Z1:Z0 54 | VMOVDQA64 2*64(SI), Z4 55 | VPXOR Y8, Y8, Y8 // initialise counters 56 | VPXOR Y9, Y9, Y9 57 | VMOVDQA64 3*64(SI), Z2 58 | VMOVDQA64 4*64(SI), Z3 59 | VMOVDQA64 5*64(SI), Z5 60 | CSA(Z0, Z1, Z4, Z22) 61 | VMOVDQA64 6*64(SI), Z6 62 | VMOVDQA64 7*64(SI), Z7 63 | VMOVDQA64 8*64(SI), Z10 64 | CSA(Z2, Z3, Z5, Z22) 65 | VMOVDQA64 9*64(SI), Z11 66 | VMOVDQA64 10*64(SI), Z12 67 | VMOVDQA64 11*64(SI), Z13 68 | CSA(Z6, Z7, Z10, Z22) 69 | VMOVDQA64 12*64(SI), Z4 70 | VMOVDQA64 13*64(SI), Z5 71 | VMOVDQA64 14*64(SI), Z10 72 | CSA(Z11, Z12, Z13, Z22) 73 | VPBROADCASTD magic<>+0(SB), Z28 // 0x55555555 for transposition 74 | VPBROADCASTD magic<>+4(SB), Z27 // 0x33333333 for transposition 75 | VPBROADCASTD magic<>+8(SB), Z26 // 0x0f0f0f0f for transposition 76 | CSA(Z4, Z5, Z10, Z22) 77 | CSA(Z0, Z2, Z6, Z22) 78 | CSA(Z1, Z3, Z7, Z22) 79 | CSA(Z0, Z11, Z4, Z22) 80 | CSA(Z2, Z12, Z5, Z22) 81 | CSA(Z1, Z2, Z11, Z22) 82 | CSA(Z2, Z3, Z12, Z22) 83 | 84 | ADDQ $15*64, SI 85 | SUBQ $(15+16)*64, CX // enough data left to process? 86 | JLT post 87 | 88 | VPBROADCASTD magic<>+12(SB), Z24 // 0x00ff00ff 89 | VPMOVZXBW magic<>+16(SB), Z23 // transposition vector 90 | MOVL $65535, AX // space left til overflow could occur in Z8, Z9 91 | 92 | // load 1024 bytes from buf, add them to Z0..Z3 into Z0..Z4 93 | vec: VMOVDQA64 0*64(SI), Z4 94 | VMOVDQA64 1*64(SI), Z5 95 | VMOVDQA64 2*64(SI), Z6 96 | VMOVDQA64 3*64(SI), Z7 97 | VMOVDQA64 4*64(SI), Z10 98 | CSA(Z0, Z4, Z5, Z22) 99 | VMOVDQA64 5*64(SI), Z5 100 | VMOVDQA64 6*64(SI), Z11 101 | VMOVDQA64 7*64(SI), Z12 102 | CSA(Z6, Z7, Z10, Z22) 103 | VMOVDQA64 8*64(SI), Z10 104 | VMOVDQA64 9*64(SI), Z13 105 | VMOVDQA64 10*64(SI), Z14 106 | CSA(Z5, Z11, Z12, Z22) 107 | VMOVDQA64 11*64(SI), Z12 108 | VMOVDQA64 12*64(SI), Z15 109 | VMOVDQA64 13*64(SI), Z16 110 | CSA(Z10, Z13, Z14, Z22) 111 | VMOVDQA64 14*64(SI), Z14 112 | VMOVDQA64 15*64(SI), Z17 113 | CSA(Z12, Z15, Z16, Z22) 114 | ADDQ $16*64, SI 115 | PREFETCHT0 (SI) 116 | CSA(Z0, Z5, Z6, Z22) 117 | PREFETCHT0 64(SI) 118 | CSA(Z1, Z4, Z7, Z22) 119 | CSA(Z10, Z12, Z14, Z22) 120 | CSA(Z11, Z13, Z15, Z22) 121 | CSA(Z0, Z10, Z17, Z22) 122 | CSA(Z1, Z5, Z11, Z22) 123 | CSA(Z2, Z4, Z13, Z22) 124 | CSA(Z1, Z10, Z12, Z22) 125 | CSA(Z2, Z5, Z10, Z22) 126 | CSA(Z3, Z4, Z5, Z22) 127 | 128 | // now Z0..Z4 hold counters; preserve Z0..Z3 for next round and 129 | // add Z4 to counters. 130 | 131 | // split into even/odd and reduce into crumbs 132 | VPANDD Z4, Z28, Z5 // Z5 = bits 02468ace x32 133 | VPANDND Z4, Z28, Z6 // Z6 = bits 13579bdf x32 134 | VPSRLD $1, Z6, Z6 135 | VSHUFI64X2 $0x44, Z6, Z5, Z10 136 | VSHUFI64X2 $0xee, Z6, Z5, Z11 137 | VPADDD Z10, Z11, Z4 // Z4 = 02468ace x16 ... 13579bdf x16 138 | 139 | // split again and reduce into nibbles 140 | VPANDD Z4, Z27, Z5 // Z5 = 048c x16 ... 159d x16 141 | VPANDND Z4, Z27, Z6 // Z6 = 26ae x16 ... 
37bf x16 142 | VPSRLD $2, Z6, Z6 143 | VSHUFI64X2 $0x88, Z6, Z5, Z10 144 | VSHUFI64X2 $0xdd, Z6, Z5, Z11 145 | VPADDD Z10, Z11, Z4 // Z4 = 048c x8 159d x8 26ae x8 37bf x8 146 | 147 | // split again and reduce into bytes (shifted left by 4) 148 | VPANDD Z4, Z26, Z5 // Z5 = 08 x8 19 x8 2a x8 3b x8 149 | VPANDND Z4, Z26, Z6 // Z6 = 4c x8 5d x8 6e x8 7f x8 150 | VPSLLD $4, Z5, Z5 151 | VPERMQ $0xd8, Z5, Z5 // Z5 = 08x4 19x4 08x4 19x4 2ax4 3bx4 2ax4 3bx4 152 | VPERMQ $0xd8, Z6, Z6 // Z6 = 4cx4 5dx4 4cx4 5dx4 6ex4 7fx4 6ex4 7fx4 153 | VSHUFI64X2 $0x88, Z6, Z5, Z10 154 | VSHUFI64X2 $0xdd, Z6, Z5, Z11 155 | VPADDD Z10, Z11, Z4 // Z4 = 08x4 19x4 2ax4 3bx4 4cx4 5dx4 6ex4 7fx4 156 | 157 | // split again into 16 bit counters 158 | VPSRLW $8, Z4, Z6 // Z6 = 8888 9999 aaaa bbbb cccc dddd eeee ffff 159 | VPANDD Z4, Z24, Z5 // Z5 = 0000 1111 2222 3333 4444 5555 6666 7777 160 | 161 | // accumulate in permuted order 162 | VPADDW Z5, Z8, Z8 163 | VPADDW Z6, Z9, Z9 164 | 165 | SUBL $16*8, AX // account for possible overflow 166 | CMPL AX, $(15+15)*8 // enough space left in the counters? 167 | JGE have_space 168 | 169 | // fix permutation and flush into counters 170 | VPERMW Z8, Z23, Z8 // Z5 = 0123 4567 0123 4567 0123 4567 0123 4567 171 | VPERMW Z9, Z23, Z9 // Z6 = 89ab cdef 89ab cdef 89ab cdef 89ab cdef 172 | CALL *BX // call accumulation function 173 | VPXOR Y8, Y8, Y8 // clear accumulators for next round 174 | VPXOR Y9, Y9, Y9 175 | MOVL $65535, AX // space left til overflow could occur 176 | 177 | have_space: 178 | SUBQ $16*64, CX // account for bytes consumed 179 | JGE vec 180 | 181 | // fix permutation for final step 182 | VPERMW Z8, Z23, Z8 // Z5 = 0123 4567 0123 4567 0123 4567 0123 4567 183 | VPERMW Z9, Z23, Z9 // Z6 = 89ab cdef 89ab cdef 89ab cdef 89ab cdef 184 | 185 | // sum up Z0..Z3 into the counter registers 186 | post: VPSRLD $1, Z0, Z4 // group nibbles in Z0--Z3 into Z4--Z7 187 | VPADDD Z1, Z1, Z5 188 | VPSRLD $1, Z2, Z6 189 | VPADDD Z3, Z3, Z7 190 | VPTERNLOGD $0xe4, Z28, Z5, Z0 // Z0 = eca86420 (low crumbs) 191 | VPTERNLOGD $0xd8, Z28, Z4, Z1 // Z1 = fdb97531 (high crumbs) 192 | VPTERNLOGD $0xe4, Z28, Z7, Z2 // Z2 = eca86420 (low crumbs) 193 | VPTERNLOGD $0xd8, Z28, Z6, Z3 // Z3 = fdb97531 (high crumbs) 194 | 195 | VPSRLD $2, Z0, Z4 196 | VPSRLD $2, Z1, Z6 197 | VPSLLD $2, Z2, Z5 198 | VPSLLD $2, Z3, Z7 199 | VPTERNLOGD $0xd8, Z27, Z4, Z2 // Z2 = ea63 200 | VPTERNLOGD $0xd8, Z27, Z6, Z3 // Z3 = fb73 201 | VPTERNLOGD $0xe4, Z27, Z5, Z0 // Z0 = c840 202 | VPTERNLOGD $0xe4, Z27, Z7, Z1 // Z1 = d951 203 | 204 | // pre-shuffle nibbles (within 128 bit lanes)! 
205 | VPUNPCKLBW Z3, Z2, Z6 // Z6 = fbea7362 (3:2:1:0) 206 | VPUNPCKHBW Z3, Z2, Z3 // Z3 = fbea7362 (7:6:5:4) 207 | VPUNPCKLBW Z1, Z0, Z5 // Z5 = d9c85140 (3:2:1:0) 208 | VPUNPCKHBW Z1, Z0, Z2 // Z2 = d9c85140 (7:6:5:4) 209 | VPUNPCKLWD Z6, Z5, Z4 // Z4 = fbead9c873625140 (1:0) 210 | VPUNPCKHWD Z6, Z5, Z5 // Z5 = fbead9c873625140 (3:2) 211 | VPUNPCKLWD Z3, Z2, Z6 // Z6 = fbead9c873625140 (5:4) 212 | VPUNPCKHWD Z3, Z2, Z7 // Z7 = fbead9c873625140 (7:6) 213 | 214 | // pull out high and low nibbles 215 | VPANDD Z26, Z4, Z0 216 | VPSRLD $4, Z4, Z4 217 | VPANDD Z26, Z4, Z4 218 | 219 | VPANDD Z26, Z5, Z1 220 | VPSRLD $4, Z5, Z5 221 | VPANDD Z26, Z5, Z5 222 | 223 | VPANDD Z26, Z6, Z2 224 | VPSRLD $4, Z6, Z6 225 | VPANDD Z26, Z6, Z6 226 | 227 | VPANDD Z26, Z7, Z3 228 | VPSRLD $4, Z7, Z7 229 | VPANDD Z26, Z7, Z7 230 | 231 | // reduce once 232 | VPADDB Z2, Z0, Z0 // Z0 = ba983210 (1:0) 233 | VPADDB Z3, Z1, Z1 // Z1 = ba983210 (3:2) 234 | VPADDB Z6, Z4, Z2 // Z2 = fedc7654 (1:0) 235 | VPADDB Z7, Z5, Z3 // Z3 = fedc7654 (3:2) 236 | 237 | // shuffle again to form ordered groups of 16 counters in each lane 238 | VPUNPCKLDQ Z2, Z0, Z4 // Z4 = fedcba9876543210 (0) 239 | VPUNPCKHDQ Z2, Z0, Z5 // Z5 = fedcba9876543210 (1) 240 | VPUNPCKLDQ Z3, Z1, Z6 // Z6 = fedcba9876543210 (2) 241 | VPUNPCKHDQ Z3, Z1, Z7 // Z7 = fedcba9876543210 (3) 242 | 243 | // reduce lanes once (4x1 lane -> 2x2 lanes) 244 | VSHUFI64X2 $0x44, Z5, Z4, Z0 // Z0 = fedcba9876543210 (1:1:0:0) 245 | VSHUFI64X2 $0xee, Z5, Z4, Z1 // Z1 = fedcba9876543210 (1:1:0:0) 246 | VSHUFI64X2 $0x44, Z7, Z6, Z2 // Z2 = fedcba9876543210 (3:3:2:2) 247 | VSHUFI64X2 $0xee, Z7, Z6, Z3 // Z2 = fedcba9876543210 (3:3:2:2) 248 | VPADDB Z1, Z0, Z0 249 | VPADDB Z3, Z2, Z2 250 | 251 | // reduce lanes again (2x2 lanes -> 1x4 lane) 252 | VSHUFI64X2 $0x88, Z2, Z0, Z1 // Z1 = fedcba9876543210 (3:2:1:0) 253 | VSHUFI64X2 $0xdd, Z2, Z0, Z0 // Z0 = fedcba9876543210 (3:2:1:0) 254 | VPADDB Z1, Z0, Z0 255 | 256 | // Zero extend and add to Z8, Z9 257 | VPUNPCKLBW Z25, Z0, Z1 // Z1 = 76543210 (3:2:1:0) 258 | VPUNPCKHBW Z25, Z0, Z2 // Z2 = fedcba98 (3:2:1:0) 259 | VPADDW Z1, Z8, Z8 260 | VPADDW Z2, Z9, Z9 261 | 262 | endvec: VPXOR Y0, Y0, Y0 // counter register 263 | 264 | // process tail, 8 bytes at a time 265 | CMPL CX, $-16*64 // no bytes left to process? 266 | JE end 267 | SUBL $8-16*64, CX // 8 bytes left to process? 268 | JLE tail1 269 | 270 | tail8: KMOVQ (SI), K1 271 | ADDQ $8, SI 272 | VPSUBB Z30, Z0, K1, Z0 273 | SUBL $8, CX 274 | JGT tail8 275 | 276 | // process remaining 1--8 bytes 277 | tail1: MOVL $8*8(CX*8), CX 278 | BZHIQ CX, (SI), AX // load tail into AX (will never fault) 279 | KMOVQ AX, K1 280 | VPSUBB Z30, Z0, K1, Z0 281 | 282 | // add tail to counters 283 | VPUNPCKLBW Z25, Z0, Z1 284 | VPUNPCKHBW Z25, Z0, Z2 285 | VPADDW Z1, Z8, Z8 286 | VPADDW Z2, Z9, Z9 287 | 288 | // and perform a final accumulation 289 | end: CALL *BX 290 | VZEROUPPER 291 | RET 292 | 293 | // special processing for when the data is less than 294 | // one iteration of the kernel 295 | runt: VPXOR Y0, Y0, Y0 // counter register 296 | SUBL $8, CX // 8 bytes left to process? 297 | JLE runtrunt // input of 0--8 bytes? 
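 | // otherwise count whole 8-byte chunks first: each chunk becomes a 64-bit
 | // mask in K1 selecting which of the 64 byte counters in Z0 to update,
 | // just as in the tail8 loop above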
298 | 299 | runt8: KMOVQ (SI), K1 300 | ADDQ $8, SI 301 | VPSUBB Z30, Z0, K1, Z0 302 | SUBQ $8, CX 303 | JGT runt8 304 | 305 | // process last 1--7 bytes 306 | // as SI has no particular alignment, we cannot savely overread 307 | // instead overlap previous chunk and shift out junk 308 | MOVL $(CX*8), DX 309 | NEGL DX // number of bits to be masked out 310 | SHRXQ DX, (SI)(CX*1), AX 311 | KMOVQ AX, K1 312 | VPSUBB Z30, Z0, K1, Z0 313 | 314 | // populate counters and accumulate 315 | VPUNPCKLBW Z25, Z0, Z8 316 | VPUNPCKHBW Z25, Z0, Z9 317 | CALL *BX 318 | VZEROUPPER 319 | RET 320 | 321 | // process runt of 0--8 bytes 322 | runtrunt: 323 | ADDL $8, CX 324 | XORL AX, AX 325 | BTSL CX, AX // 1 << CX 326 | DECL AX // mask of CX ones 327 | KMOVD AX, K1 328 | VMOVDQU8.Z (SI), K1, X4 // just the runt bytes 329 | VMOVQ X4, AX 330 | KMOVQ AX, K1 331 | VPSUBB Z30, Z0, K1, Z0 332 | 333 | // populate counters and accumulate 334 | VPUNPCKLBW Z25, Z0, Z8 335 | VPUNPCKHBW Z25, Z0, Z9 336 | CALL *BX 337 | VZEROUPPER 338 | RET 339 | 340 | TEXT accum8<>(SB), NOSPLIT, $0-0 341 | // unpack and zero-extend 342 | VPMOVZXWQ X8, Z10 343 | VEXTRACTI128 $1, Y8, X11 344 | VPMOVZXWQ X11, Z11 345 | VEXTRACTI64X2 $2, Z8, X12 346 | VPMOVZXWQ X12, Z12 347 | VEXTRACTI64X2 $3, Z8, X13 348 | VPMOVZXWQ X13, Z13 349 | VPMOVZXWQ X9, Z14 350 | VEXTRACTI128 $1, Y9, X15 351 | VPMOVZXWQ X15, Z15 352 | VEXTRACTI64X2 $2, Z9, X16 353 | VPMOVZXWQ X16, Z16 354 | VEXTRACTI64X2 $3, Z9, X17 355 | VPMOVZXWQ X17, Z17 356 | 357 | // fold over thrice 358 | VPADDQ Z12, Z10, Z10 359 | VPADDQ Z13, Z11, Z11 360 | VPADDQ Z16, Z14, Z14 361 | VPADDQ Z17, Z15, Z15 362 | VPADDQ Z11, Z10, Z10 363 | VPADDQ Z15, Z14, Z14 364 | VPADDQ Z14, Z10, Z10 365 | 366 | // add to counters 367 | VPADDQ 0*64(DI), Z10, Z10 368 | VMOVDQU64 Z10, 0*64(DI) 369 | 370 | RET 371 | 372 | TEXT accum16<>(SB), NOSPLIT, $0-0 373 | // unpack and zero-extend 374 | VPMOVZXWQ X8, Z10 375 | VEXTRACTI128 $1, Y8, X11 376 | VPMOVZXWQ X11, Z11 377 | VEXTRACTI64X2 $2, Z8, X12 378 | VPMOVZXWQ X12, Z12 379 | VEXTRACTI64X2 $3, Z8, X13 380 | VPMOVZXWQ X13, Z13 381 | VPMOVZXWQ X9, Z14 382 | VEXTRACTI128 $1, Y9, X15 383 | VPMOVZXWQ X15, Z15 384 | VEXTRACTI64X2 $2, Z9, X16 385 | VPMOVZXWQ X16, Z16 386 | VEXTRACTI64X2 $3, Z9, X17 387 | VPMOVZXWQ X17, Z17 388 | 389 | // fold over twice 390 | VPADDQ Z12, Z10, Z10 391 | VPADDQ Z13, Z11, Z11 392 | VPADDQ Z16, Z14, Z14 393 | VPADDQ Z17, Z15, Z15 394 | VPADDQ Z11, Z10, Z10 395 | VPADDQ Z15, Z14, Z14 396 | 397 | // add to counters 398 | VPADDQ 0*64(DI), Z10, Z10 399 | VPADDQ 1*64(DI), Z14, Z14 400 | VMOVDQU64 Z10, 0*64(DI) 401 | VMOVDQU64 Z14, 1*64(DI) 402 | 403 | RET 404 | 405 | TEXT accum32<>(SB), NOSPLIT, $0-0 406 | // fold high half over low half and reduce 407 | VEXTRACTI64X2 $2, Z8, X12 408 | VEXTRACTI64X2 $2, Z9, X13 409 | VPMOVZXWQ X8, Z10 410 | VPMOVZXWQ X9, Z11 411 | VPMOVZXWQ X12, Z12 412 | VPMOVZXWQ X13, Z13 413 | VPADDQ Z12, Z10, Z10 414 | VPADDQ Z13, Z11, Z11 415 | VPADDQ 0*64(DI), Z10, Z10 416 | VPADDQ 1*64(DI), Z11, Z11 417 | VMOVDQU64 Z10, 0*64(DI) 418 | VMOVDQU64 Z11, 1*64(DI) 419 | 420 | VEXTRACTI128 $1, Y8, X10 421 | VEXTRACTI128 $1, Y9, X11 422 | VEXTRACTI64X2 $3, Z8, X12 423 | VEXTRACTI64X2 $3, Z9, X13 424 | VPMOVZXWQ X10, Z10 425 | VPMOVZXWQ X11, Z11 426 | VPMOVZXWQ X12, Z12 427 | VPMOVZXWQ X13, Z13 428 | VPADDQ Z12, Z10, Z10 429 | VPADDQ Z13, Z11, Z11 430 | VPADDQ 2*64(DI), Z10, Z10 431 | VPADDQ 3*64(DI), Z11, Z11 432 | VMOVDQU64 Z10, 2*64(DI) 433 | VMOVDQU64 Z11, 3*64(DI) 434 | 435 | RET 436 | 437 | TEXT accum64<>(SB), NOSPLIT, 
$0-0 438 | VPMOVZXWQ X8, Z13 439 | VPMOVZXWQ X9, Z14 440 | VPADDQ 0*64(DI), Z13, Z13 441 | VPADDQ 1*64(DI), Z14, Z14 442 | VMOVDQU64 Z13, 0*64(DI) 443 | VMOVDQU64 Z14, 1*64(DI) 444 | 445 | VEXTRACTI128 $1, Y8, X13 446 | VEXTRACTI128 $1, Y9, X14 447 | VPMOVZXWQ X13, Z13 448 | VPMOVZXWQ X14, Z14 449 | VPADDQ 2*64(DI), Z13, Z13 450 | VPADDQ 3*64(DI), Z14, Z14 451 | VMOVDQU64 Z13, 2*64(DI) 452 | VMOVDQU64 Z14, 3*64(DI) 453 | 454 | VEXTRACTI64X2 $2, Z8, X13 455 | VEXTRACTI64X2 $2, Z9, X14 456 | VPMOVZXWQ X13, Z13 457 | VPMOVZXWQ X14, Z14 458 | VPADDQ 4*64(DI), Z13, Z13 459 | VPADDQ 5*64(DI), Z14, Z14 460 | VMOVDQU64 Z13, 4*64(DI) 461 | VMOVDQU64 Z14, 5*64(DI) 462 | 463 | VEXTRACTI64X2 $3, Z8, X13 464 | VEXTRACTI64X2 $3, Z9, X14 465 | VPMOVZXWQ X13, Z13 466 | VPMOVZXWQ X14, Z14 467 | VPADDQ 6*64(DI), Z13, Z13 468 | VPADDQ 7*64(DI), Z14, Z14 469 | VMOVDQU64 Z13, 6*64(DI) 470 | VMOVDQU64 Z14, 7*64(DI) 471 | 472 | RET 473 | 474 | // func count8avx512(counts *[8]int, buf []uint8) 475 | TEXT ·count8avx512(SB), 0, $0-32 476 | MOVQ counts+0(FP), DI 477 | MOVQ buf_base+8(FP), SI 478 | MOVQ buf_len+16(FP), CX 479 | MOVQ $accum8<>(SB), BX 480 | CALL countavx512<>(SB) 481 | RET 482 | 483 | // func count16avx512(counts *[16]int, buf []uint16) 484 | TEXT ·count16avx512(SB), 0, $0-32 485 | MOVQ counts+0(FP), DI 486 | MOVQ buf_base+8(FP), SI 487 | MOVQ buf_len+16(FP), CX 488 | MOVQ $accum16<>(SB), BX 489 | SHLQ $1, CX 490 | CALL countavx512<>(SB) 491 | RET 492 | 493 | // func count32avx512(counts *[32]int, buf []uint32) 494 | TEXT ·count32avx512(SB), 0, $0-32 495 | MOVQ counts+0(FP), DI 496 | MOVQ buf_base+8(FP), SI 497 | MOVQ buf_len+16(FP), CX 498 | MOVQ $accum32<>(SB), BX 499 | SHLQ $2, CX 500 | CALL countavx512<>(SB) 501 | RET 502 | 503 | // func count64avx512(counts *[64]int, buf []uint64) 504 | TEXT ·count64avx512(SB), 0, $0-32 505 | MOVQ counts+0(FP), DI 506 | MOVQ buf_base+8(FP), SI 507 | MOVQ buf_len+16(FP), CX 508 | MOVQ $accum64<>(SB), BX 509 | SHLQ $3, CX 510 | CALL countavx512<>(SB) 511 | RET 512 | -------------------------------------------------------------------------------- /countneon_arm64.s: -------------------------------------------------------------------------------- 1 | //+build arm64,go1.16 2 | 3 | #include "textflag.h" 4 | 5 | // A NEON based kernel first doing a 15-fold CSA reduction and then a 6 | // 16-fold CSA reduction, carrying over place-value vectors between 7 | // iterations. 8 | 9 | // magic transposition constants, sliding window 10 | DATA magic<>+ 0(SB)/8, $0x8040201008040201 11 | DATA magic<>+ 8(SB)/8, $0x0000000000000000 12 | DATA magic<>+16(SB)/8, $0x0000000000000000 13 | DATA magic<>+24(SB)/8, $0xffffffffffffffff 14 | DATA magic<>+32(SB)/8, $0xffffffffffffffff 15 | GLOBL magic<>(SB), RODATA|NOPTR, $40 16 | 17 | // B:A = A+B+C, V31 used for scratch space 18 | #define CSA(A, B, C) \ 19 | VEOR B.B16, A.B16, V31.B16 \ 20 | VEOR C.B16, V31.B16, A.B16 \ 21 | VBIT V31.B16, C.B16, B.B16 22 | 23 | // D:A = A+B+C 24 | #define CSAC(A, B, C, D) \ 25 | VEOR A.B16, B.B16, D.B16 \ 26 | VEOR D.B16, C.B16, A.B16 \ 27 | VBSL B.B16, C.B16, D.B16 28 | 29 | // Process 4 bytes from S. Add low word counts to L, high to H. 30 | // Assumes masks loaded into V28, V29, and V30. Trashes V4, V5. 
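 | // VTBL broadcasts each input byte across eight lanes, VCMTST tests those
 | // copies against the bit-position mask in V28 (yielding all-ones bytes where
 | // a bit is set), and the VSUBs turn each all-ones byte into +1 on the counters.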
31 | #define COUNT4(L, H, S) \ 32 | VTBL V30.B16, [S.B16], V4.B16 \ // V4 = 0000:0000:1111:1111 33 | VTBL V29.B16, [S.B16], V5.B16 \ // V5 = 2222:2222:3333:3333 34 | VCMTST V28.B16, V4.B16, V4.B16 \ 35 | VCMTST V28.B16, V5.B16, V5.B16 \ 36 | VSUB V4.B16, L.B16, L.B16 \ 37 | VSUB V5.B16, H.B16, H.B16 38 | 39 | // Generic kernel. This function expects a pointer to a width-specific 40 | // accumulation function in R0, a possibly unaligned input buffer in R1, 41 | // counters in R2 and a remaining length in R3. 42 | TEXT countneon<>(SB), NOSPLIT, $0-0 43 | // constant for processing the head 44 | MOVD $magic<>(SB), R4 45 | VLD1R.P 8(R4), [V28.D2] // 80402010080402018040201008040201 46 | VMOVI $1, V30.B8 // 00000000000000000101010101010101 47 | VMOVI $2, V29.B16 // 02020202020202020202020202020202 48 | VADD V30.B16, V29.B16, V29.B16 // 02020202020202020303030303030303 49 | VMOVI $0, V8.B16 // counter registers 50 | VMOVI $0, V10.B16 51 | VMOVI $0, V12.B16 52 | VMOVI $0, V14.B16 53 | 54 | CMP $15*16, R3 // is the CSA kernel worth using? 55 | BLT runt 56 | 57 | // load head until alignment/end is reached 58 | AND $15, R1, R6 // offset of the buffer start from 16 byte alignment 59 | AND $~15, R1, R1 // align the source buffer pointer 60 | SUB $16, R6, R5 // negated number of bytes til alignment is reached 61 | ADD R6, R3, R3 // account for head length in CX 62 | NEG R5, R5 // number of bytes til alignment is reached 63 | VLD1.P 16(R1), [V3.B16] // load head, advance past it 64 | // VMOVQ (R4)(R5), V5 // load mask of bytes that are part of the head 65 | WORD $0x3ce56885 66 | VAND V5.B16, V3.B16, V0.B16 // and mask out those bytes that are not 67 | 68 | // load 15 registers worth of data and accumulate into V3--V0 69 | VLD1.P 2*16(R1), [V1.B16, V2.B16] 70 | VLD1.P 4*16(R1), [V3.B16, V4.B16, V5.B16, V6.B16] 71 | VLD1.P 4*16(R1), [V16.B16, V17.B16, V18.B16, V19.B16] 72 | CSA(V0, V1, V2) 73 | VMOVI $0x55, V27.B16 // 55555555 for transposition 74 | CSAC(V0, V3, V4, V2) 75 | VMOVI $0x33, V26.B16 // 33333333 for transposition 76 | CSAC(V0, V5, V6, V3) 77 | VLD1.P 4*16(R1), [V4.B16, V5.B16, V6.B16, V7.B16] 78 | CSA(V1, V2, V3) 79 | CSA(V0, V16, V17) 80 | VMOVI $0x0f, V25.B16 // 0f0f0f0f for extracting nibbles 81 | CSA(V0, V18, V19) 82 | MOVD $65535, R6 // space left til overflow could occur in V8--V15 83 | CSAC(V1, V16, V18, V3) 84 | VMOVI $0, V9.B16 85 | CSA(V0, V4, V5) 86 | VMOVI $0, V11.B16 87 | CSA(V0, V6, V7) 88 | VMOVI $0, V13.B16 89 | CSA(V1, V4, V6) 90 | VMOVI $0, V15.B16 91 | CSA(V2, V3, V4) 92 | 93 | SUBS $(15+16)*16, R3, R3 // enough data left to process? 94 | BLT post 95 | 96 | // load 16 registers worth of data and accumulate into V4--V0 97 | vec: VLD1.P 4*16(R1), [V4.B16, V5.B16, V6.B16, V7.B16] 98 | VLD1.P 4*16(R1), [V16.B16, V17.B16, V18.B16, V19.B16] 99 | VLD1.P 4*16(R1), [V20.B16, V21.B16, V22.B16, V23.B16] 100 | CSA(V4, V5, V6) 101 | CSA(V0, V17, V19) 102 | CSA(V7, V16, V18) 103 | CSA(V21, V22, V20) 104 | CSA(V1, V5, V17) 105 | VLD1.P 4*16(R1), [V17.B16, V18.B16, V19.B16, V20.B16] 106 | CSA(V0, V4, V7) 107 | CSA(V17, V18, V23) 108 | CSA(V19, V20, V21) 109 | CSA(V16, V18, V22) 110 | CSA(V1, V4, V20) 111 | CSA(V0, V17, V19) 112 | CSA(V2, V5, V18) 113 | CSA(V1, V16, V17) 114 | CSA(V2, V4, V16) 115 | CSA(V3, V4, V5) 116 | 117 | // now V0..V4 hold counters; preserve V0..V3 for the next round and 118 | // add V4 to counters. 
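 | // V0--V3 carry the CSA place values 1, 2, 4, and 8 between iterations; V4 is
 | // the weight-16 output of this round. The transposition below also scales its
 | // counts by 16 (the shift left by four) before adding them to the 16-bit
 | // counters in V8--V15.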
119 | 120 | // split into even/odd and reduce into crumbs 121 | VAND V27.B16, V4.B16, V5.B16 // V5 = bits 02468ace x8 122 | // VBIC V27.B16, V4.B16, V6.B16 // V6 = bits 13579bdf x8 123 | WORD $0x4e7b1c86 124 | VUSHR $1, V6.B16, V6.B16 125 | VZIP1 V6.D2, V5.D2, V4.D2 126 | VZIP2 V6.D2, V5.D2, V5.D2 127 | VADD V5.B16, V4.B16, V4.B16 // V4 = 02468ace x4 13579bdf x4 128 | 129 | // split again into nibbles 130 | VAND V26.B16, V4.B16, V5.B16 // V5 = 048c x4 159d x4 131 | // VBIC V26.B16, V4.B16, V6.B16 // V6 = 26ae x4 37bf x4 132 | WORD $0x4e7a1c86 133 | VUSHR $2, V6.B16, V6.B16 134 | 135 | // split again into bytes and shuffle into order (also scale) 136 | VAND V25.B16, V5.B16, V4.B16 // V4 = 08 x4 19 x4 137 | // VBIC V25.B16, V5.B16, V5.B16 // V5 = 4c x4 5d x4 138 | WORD $0x4e791ca5 139 | // VBIC V25.B16, V6.B16, V7.B16 // V7 = 6e x4 7f x4 140 | WORD $0x4e791cc7 141 | VAND V25.B16, V6.B16, V6.B16 // V6 = 2a x4 3b x4 142 | VSHL $4, V4.B16, V4.B16 143 | VSHL $4, V6.B16, V6.B16 144 | 145 | VZIP1 V6.B16, V4.B16, V16.B16 // V16 = 028a x4 146 | VZIP2 V6.B16, V4.B16, V17.B16 // V17 = 139b x4 147 | VZIP1 V7.B16, V5.B16, V18.B16 // V18 = 46ce x4 148 | VZIP2 V7.B16, V5.B16, V19.B16 // V19 = 57df x4 149 | 150 | VZIP1 V17.B16, V16.B16, V4.B16 // V4 = 012389ab[0:1] 151 | VZIP2 V17.B16, V16.B16, V5.B16 // V5 = 012389ab[2:3] 152 | VZIP1 V19.B16, V18.B16, V6.B16 // V6 = 4567cdef[0:1] 153 | VZIP2 V19.B16, V18.B16, V7.B16 // V7 = 4567cdef[2:3] 154 | 155 | VZIP1 V6.S4, V4.S4, V16.S4 // V16 = 01234567[0:1] 156 | VZIP2 V6.S4, V4.S4, V17.S4 // V17 = 89abcdef[0:1] 157 | VZIP1 V7.S4, V5.S4, V18.S4 // V18 = 01234567[2:3] 158 | VZIP2 V7.S4, V5.S4, V19.S4 // V19 = 89abcdef[2:3] 159 | 160 | // add to counters 161 | VUADDW V16.B8, V8.H8, V8.H8 162 | VUADDW2 V16.B16, V9.H8, V9.H8 163 | VUADDW V17.B8, V10.H8, V10.H8 164 | VUADDW2 V17.B16, V11.H8, V11.H8 165 | VUADDW V18.B8, V12.H8, V12.H8 166 | VUADDW2 V18.B16, V13.H8, V13.H8 167 | VUADDW V19.B8, V14.H8, V14.H8 168 | VUADDW2 V19.B16, V15.H8, V15.H8 169 | 170 | SUB $16*2, R6, R6 // account for possible overflow 171 | CMP $(15+15)*2, R6 // enough space left in the counters? 
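 | // if not, flush the 16-bit counters into the 64-bit counts via the
 | // accumulation function and start them over from zero; R6 tracks the
 | // remaining headroom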
172 | 173 | BGE have_space 174 | 175 | CALL *R0 // call accumulation function 176 | VMOVI $0, V8.B16 // clear counters for next round 177 | VMOVI $0, V9.B16 178 | VMOVI $0, V10.B16 179 | VMOVI $0, V11.B16 180 | VMOVI $0, V12.B16 181 | VMOVI $0, V13.B16 182 | VMOVI $0, V14.B16 183 | VMOVI $0, V15.B16 184 | 185 | MOVD $65535, R6 // space left til overflow could occur 186 | 187 | have_space: 188 | SUBS $16*16, R3, R3 // account for bytes consumed 189 | BGE vec 190 | 191 | // group V0--V3 into nibbles in the same register 192 | post: VUSHR $1, V0.B16, V4.B16 193 | VADD V1.B16, V1.B16, V5.B16 194 | VUSHR $1, V2.B16, V6.B16 195 | VADD V3.B16, V3.B16, V7.B16 196 | VBIF V27.B16, V5.B16, V0.B16 // V0 = eca86420 (low crumbs) 197 | VBIT V27.B16, V4.B16, V1.B16 // V1 = fdb97531 (high crumbs) 198 | VBIF V27.B16, V7.B16, V2.B16 // V2 = eca86420 (low crumbs) 199 | VBIT V27.B16, V6.B16, V3.B16 // V3 = fdb97531 (high crumbs) 200 | 201 | VUSHR $2, V0.B16, V4.B16 202 | VUSHR $2, V1.B16, V6.B16 203 | VSHL $2, V2.B16, V5.B16 204 | VSHL $2, V3.B16, V7.B16 205 | VBIT V26.B16, V4.B16, V2.B16 // V2 = ea62 206 | VBIT V26.B16, V6.B16, V3.B16 // V3 = fb73 207 | VBIF V26.B16, V5.B16, V0.B16 // V0 = c840 208 | VBIF V26.B16, V7.B16, V1.B16 // V1 = d951 209 | 210 | // pre-shuffle nibbles 211 | VZIP1 V3.B16, V2.B16, V6.B16 // V6 = fbea7362 (3:2:1:0) 212 | VZIP2 V3.B16, V2.B16, V3.B16 // V3 = fbea7362 (7:6:5:4) 213 | VZIP1 V1.B16, V0.B16, V5.B16 // V5 = d9c85140 (3:2:1:0) 214 | VZIP2 V1.B16, V0.B16, V2.B16 // V2 = d9c85140 (7:6:5:4) 215 | VZIP1 V6.H8, V5.H8, V4.H8 // V4 = fbead9c873625140 (1:0) 216 | VZIP2 V6.H8, V5.H8, V5.H8 // V5 = fbead9c873625140 (3:2) 217 | VZIP1 V3.H8, V2.H8, V6.H8 // V6 = fbead9c873625150 (5:4) 218 | VZIP2 V3.H8, V2.H8, V7.H8 // V7 = fbead9c873625150 (7:6) 219 | 220 | // pull out high and low nibbles and reduce once 221 | VAND V4.B16, V25.B16, V0.B16 222 | VUSHR $4, V4.B16, V4.B16 223 | VAND V5.B16, V25.B16, V1.B16 224 | VUSHR $4, V5.B16, V5.B16 225 | VAND V6.B16, V25.B16, V2.B16 226 | VADD V0.B16, V2.B16, V0.B16 // V0 = ba983210 (1:0) 227 | VUSRA $4, V6.B16, V4.B16 // V4 = fedc7654 (1:0) 228 | VAND V7.B16, V25.B16, V3.B16 229 | VADD V1.B16, V3.B16, V1.B16 // V1 = ba983210 (3:2) 230 | VUSRA $4, V7.B16, V5.B16 // V5 = fedc7654 (3:2) 231 | 232 | // shuffle one last time 233 | VZIP1 V4.S4, V0.S4, V2.S4 // V2 = fedcba987654 (0) 234 | VZIP2 V4.S4, V0.S4, V3.S4 // V3 = fedcba987654 (1) 235 | VZIP1 V5.S4, V1.S4, V6.S4 // V6 = fedcba987654 (2) 236 | VZIP2 V5.S4, V1.S4, V7.S4 // V7 = fedcba987654 (3) 237 | 238 | // add to counters 239 | VUADDW V2.B8, V8.H8, V8.H8 240 | VUADDW2 V2.B16, V9.H8, V9.H8 241 | VUADDW V3.B8, V10.H8, V10.H8 242 | VUADDW2 V3.B16, V11.H8, V11.H8 243 | VUADDW V6.B8, V12.H8, V12.H8 244 | VUADDW2 V6.B16, V13.H8, V13.H8 245 | VUADDW V7.B8, V14.H8, V14.H8 246 | VUADDW2 V7.B16, V15.H8, V15.H8 247 | 248 | endvec: VMOVI $0, V0.B16 // counter registers 249 | VMOVI $0, V1.B16 250 | VMOVI $0, V2.B16 251 | VMOVI $0, V3.B16 252 | 253 | // process tail, 8 bytes at a time 254 | ADDS $16*16-8, R3, R3 // 8 bytes left to process? 255 | BLT tail1 256 | 257 | tail8: SUBS $8, R3 258 | FMOVS.P 4(R1), F6 259 | FMOVS.P 4(R1), F7 260 | COUNT4(V0, V1, V6) 261 | COUNT4(V2, V3, V7) 262 | BGE tail8 263 | 264 | // process remaining 0--7 bytes 265 | tail1: ADDS $8, R3 // anything left to process? 
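 | // if so: the buffer pointer is 8-byte aligned at this point, so the full
 | // 8-byte load below cannot cross a page boundary; the window mask then
 | // clears the bytes that lie past the end of the buffer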
266 | BLE end 267 | 268 | FMOVD (R1), F6 // load 8 bytes from buffer 269 | SUB R3, R4, R6 // shifted window address 270 | FMOVQ 16(R6), F5 // load window mask 271 | // VBIC V5.B16, V6.B16, V6.B16 // mask out the desired bytes 272 | WORD $0x4e651cc6 273 | 274 | // process tail 275 | VEXT $4, V6.B16, V6.B16, V7.B16 276 | COUNT4(V0, V1, V6) 277 | COUNT4(V2, V3, V7) 278 | 279 | // add tail to counters 280 | end: VUADDW V0.B8, V9.H8, V9.H8 281 | VUADDW2 V0.B16, V8.H8, V8.H8 282 | VUADDW V1.B8, V11.H8, V11.H8 283 | VUADDW2 V1.B16, V10.H8, V10.H8 284 | VUADDW V2.B8, V13.H8, V13.H8 285 | VUADDW2 V2.B16, V12.H8, V12.H8 286 | VUADDW V3.B8, V15.H8, V15.H8 287 | VUADDW2 V3.B16, V14.H8, V14.H8 288 | 289 | CALL *R0 290 | RET 291 | 292 | // very short input, use tail routine only 293 | runt: SUBS $8, R3 // 8 bytes left to process? 294 | BLT runt1 295 | 296 | // process runt, 8 bytes at a time 297 | runt8: SUBS $8, R3 298 | FMOVS.P 4(R1), F6 299 | FMOVS.P 4(R1), F7 300 | COUNT4(V8, V10, V6) 301 | COUNT4(V12, V14, V7) 302 | BGE runt8 303 | 304 | // process remaining 0--7 bytes 305 | // while making sure we don't get a page fault 306 | runt1: ADDS $8, R3 // anything left to process? 307 | BLE runt_accum 308 | 309 | AND $7, R1, R5 // offset from 8 byte alignment 310 | ADD R5, R3, R8 // length of buffer including alignment 311 | LSL $3, R3, R3 // remaining length in bits 312 | MOVD $-1, R7 313 | LSL R3, R7, R7 // mask of bits where R6 is out of range 314 | CMP $8, R8 // if this exceeds an alignment boundary 315 | BGT crossrunt1 // we can safely load directly 316 | 317 | AND $~7, R1, R1 // align buffer to 8 bytes 318 | MOVD (R1), R6 // and load 8 bytes from buffer 319 | LSL $3, R5, R5 // offset from 8 byte alignment in bits 320 | LSR R5, R6, R6 // buffer starting at the beginning 321 | B dorunt1 322 | 323 | crossrunt1: 324 | MOVD (R1), R6 // load 8 bytes from unaligned buffer 325 | 326 | dorunt1: 327 | BIC R7, R6, R6 // clear out of range bits 328 | FMOVD R6, F6 // move buffer to SIMD unit 329 | VEXT $4, V6.B16, V6.B16, V7.B16 330 | COUNT4(V8, V10, V6) 331 | COUNT4(V12, V14, V7) 332 | 333 | // initialise counters with tail 334 | runt_accum: 335 | VUXTL V8.B8, V9.H8 // 8--15 89abcdef[0] 336 | VUXTL2 V8.B16, V8.H8 // 0-- 7 01234567[0] 337 | VUXTL V10.B8, V11.H8 // 24--31 89abcdef[1] 338 | VUXTL2 V10.B16, V10.H8 // 16--23 01234567[1] 339 | VUXTL V12.B8, V13.H8 // 40--47 89abcdef[2] 340 | VUXTL2 V12.B16, V12.H8 // 32--39 01234567[2] 341 | VUXTL V14.B8, V15.H8 // 56--63 89abcdef[3] 342 | VUXTL2 V14.B16, V14.H8 // 48--55 01234567[3] 343 | 344 | CALL *R0 345 | RET 346 | 347 | TEXT accum8<>(SB), NOSPLIT, $0-0 348 | // load counts registers 349 | VLD1 (R2), [V4.D2, V5.D2, V6.D2, V7.D2] 350 | 351 | // zero extend into dwords and fold 352 | // VUADDL V8.H4, V10.H4, V16.S4 353 | // VUADDL2 V8.H8, V10.H8, V17.S4 354 | // VUADDL V9.H4, V11.H4, V18.S4 355 | // VUADDL2 V9.H8, V11.H8, V19.S4 356 | // VUADDL V12.H4, V14.H4, V20.S4 357 | // VUADDL2 V12.H8, V14.H8, V21.S4 358 | // VUADDL V13.H4, V15.H4, V22.S4 359 | // VUADDL2 V13.H8, V15.H8, V23.S4 360 | WORD $0x2e680150 361 | WORD $0x6e680151 362 | WORD $0x2e690172 363 | WORD $0x6e690173 364 | WORD $0x2e6c01d4 365 | WORD $0x6e6c01d5 366 | WORD $0x2e6d01f6 367 | WORD $0x6e6d01f7 368 | 369 | // reduce integer pairs 370 | VADD V18.S4, V16.S4, V16.S4 371 | VADD V19.S4, V17.S4, V17.S4 372 | VADD V22.S4, V20.S4, V20.S4 373 | VADD V23.S4, V21.S4, V21.S4 374 | VADD V20.S4, V16.S4, V16.S4 375 | VADD V21.S4, V17.S4, V17.S4 376 | 377 | // accumulate 378 | VUADDW V16.S2, V4.D2, V4.D2 379 | 
VUADDW2 V16.S4, V5.D2, V5.D2 380 | VUADDW V17.S2, V6.D2, V6.D2 381 | VUADDW2 V17.S4, V7.D2, V7.D2 382 | 383 | // write back counts registers 384 | VST1 [V4.D2, V5.D2, V6.D2, V7.D2], (R2) 385 | RET 386 | 387 | TEXT accum16<>(SB), NOSPLIT, $0-0 388 | // load first half of the counts 389 | VLD1.P 4*16(R2), [V4.D2, V5.D2, V6.D2, V7.D2] 390 | 391 | // zero extend into dwords and fold 392 | // VUADDL V8.H4, V10.H4, V16.S4 393 | // VUADDL2 V8.H8, V10.H8, V17.S4 394 | // VUADDL V9.H4, V11.H4, V18.S4 395 | // VUADDL2 V9.H8, V11.H8, V19.S4 396 | // VUADDL V12.H4, V14.H4, V20.S4 397 | // VUADDL2 V12.H8, V14.H8, V21.S4 398 | // VUADDL V13.H4, V15.H4, V22.S4 399 | // VUADDL2 V13.H8, V15.H8, V23.S4 400 | WORD $0x2e680150 401 | WORD $0x6e680151 402 | WORD $0x2e690172 403 | WORD $0x6e690173 404 | WORD $0x2e6c01d4 405 | WORD $0x6e6c01d5 406 | WORD $0x2e6d01f6 407 | WORD $0x6e6d01f7 408 | 409 | // reduce integer pairs 410 | VADD V20.S4, V16.S4, V16.S4 411 | VADD V21.S4, V17.S4, V17.S4 412 | VADD V22.S4, V18.S4, V18.S4 413 | VADD V23.S4, V19.S4, V19.S4 414 | 415 | // load second half of the counts 416 | VLD1 (R2), [V20.D2, V21.D2, V22.D2, V23.D2] 417 | SUB $4*16, R2, R2 // move R2 back to the beginning 418 | 419 | // accumulate 420 | VUADDW V16.S2, V4.D2, V4.D2 421 | VUADDW2 V16.S4, V5.D2, V5.D2 422 | VUADDW V17.S2, V6.D2, V6.D2 423 | VUADDW2 V17.S4, V7.D2, V7.D2 424 | VUADDW V18.S2, V20.D2, V20.D2 425 | VUADDW2 V18.S4, V21.D2, V21.D2 426 | VUADDW V19.S2, V22.D2, V22.D2 427 | VUADDW2 V19.S4, V23.D2, V23.D2 428 | 429 | // write back 430 | VST1.P [V4.D2, V5.D2, V6.D2, V7.D2], 4*16(R2) 431 | VST1 [V20.D2, V21.D2, V22.D2, V23.D2], (R2) 432 | SUB $4*16, R2, R2 // restore R2 433 | 434 | RET 435 | 436 | TEXT accum32<>(SB), NOSPLIT, $0-0 437 | MOVD R2, R7 // source register 438 | MOVD R2, R8 // destination register 439 | MOVD $2, R9 // counter 440 | 441 | // load counts registers 442 | loop: VLD1.P 4*16(R7), [V20.D2, V21.D2, V22.D2, V23.D2] 443 | VLD1.P 4*16(R7), [V4.D2, V5.D2, V6.D2, V7.D2] 444 | 445 | SUB $1, R9, R9 446 | 447 | // zero extend into dwords and fold 448 | // VUADDL V8.H4, V12.H4, V16.S4 449 | // VUADDL2 V8.H8, V12.H8, V17.S4 450 | // VUADDL V9.H4, V13.H4, V18.S4 451 | // VUADDL2 V9.H8, V13.H8, V19.S4 452 | WORD $0x2e680190 453 | WORD $0x6e680191 454 | WORD $0x2e6901b2 455 | WORD $0x6e6901b3 456 | 457 | // shift remaining counters forwards 458 | // can't use the VMOV alias because the assembler 459 | // doesn't support it. 
VORR does the trick though 460 | VORR V10.B16, V10.B16, V8.B16 461 | VORR V11.B16, V11.B16, V9.B16 462 | VORR V14.B16, V14.B16, V12.B16 463 | VORR V15.B16, V15.B16, V13.B16 464 | 465 | // accumulate 466 | VUADDW V16.S2, V20.D2, V20.D2 467 | VUADDW2 V16.S4, V21.D2, V21.D2 468 | VUADDW V17.S2, V22.D2, V22.D2 469 | VUADDW2 V17.S4, V23.D2, V23.D2 470 | VUADDW V18.S2, V4.D2, V4.D2 471 | VUADDW2 V18.S4, V5.D2, V5.D2 472 | VUADDW V19.S2, V6.D2, V6.D2 473 | VUADDW2 V19.S4, V7.D2, V7.D2 474 | 475 | // write back 476 | VST1.P [V20.D2, V21.D2, V22.D2, V23.D2], 4*16(R8) 477 | VST1.P [V4.D2, V5.D2, V6.D2, V7.D2], 4*16(R8) 478 | 479 | CBNZ R9, loop 480 | 481 | RET 482 | 483 | TEXT accum64<>(SB), NOSPLIT, $0-0 484 | MOVD R2, R7 // source register 485 | MOVD R2, R8 // destination register 486 | MOVD $4, R9 // counter 487 | 488 | // load counts registers 489 | loop: VLD1.P 4*16(R7), [V20.D2, V21.D2, V22.D2, V23.D2] 490 | VLD1.P 4*16(R7), [V4.D2, V5.D2, V6.D2, V7.D2] 491 | 492 | SUB $1, R9, R9 493 | 494 | // zero extend into dwords 495 | VUXTL V8.H4, V16.S4 496 | VUXTL2 V8.H8, V17.S4 497 | VUXTL V9.H4, V18.S4 498 | VUXTL2 V9.H8, V19.S4 499 | 500 | // shift remaining counters forwards 501 | // can't use the VMOV alias because the assembler 502 | // doesn't support it. VORR does the trick though 503 | VORR V10.B16, V10.B16, V8.B16 504 | VORR V11.B16, V11.B16, V9.B16 505 | VORR V12.B16, V12.B16, V10.B16 506 | VORR V13.B16, V13.B16, V11.B16 507 | VORR V14.B16, V14.B16, V12.B16 508 | VORR V15.B16, V15.B16, V13.B16 509 | 510 | // accumulate 511 | VUADDW V16.S2, V20.D2, V20.D2 512 | VUADDW2 V16.S4, V21.D2, V21.D2 513 | VUADDW V17.S2, V22.D2, V22.D2 514 | VUADDW2 V17.S4, V23.D2, V23.D2 515 | VUADDW V18.S2, V4.D2, V4.D2 516 | VUADDW2 V18.S4, V5.D2, V5.D2 517 | VUADDW V19.S2, V6.D2, V6.D2 518 | VUADDW2 V19.S4, V7.D2, V7.D2 519 | 520 | // write back 521 | VST1.P [V20.D2, V21.D2, V22.D2, V23.D2], 4*16(R8) 522 | VST1.P [V4.D2, V5.D2, V6.D2, V7.D2], 4*16(R8) 523 | 524 | CBNZ R9, loop 525 | 526 | RET 527 | 528 | TEXT ·count8neon(SB), 0, $0-32 529 | LDP counts+0(FP), (R2, R1) 530 | MOVD buf_len+16(FP), R3 531 | MOVD $accum8<>(SB), R0 532 | CALL countneon<>(SB) 533 | RET 534 | 535 | TEXT ·count16neon(SB), 0, $0-32 536 | LDP counts+0(FP), (R2, R1) 537 | MOVD buf_len+16(FP), R3 538 | MOVD $accum16<>(SB), R0 539 | LSL $1, R3, R3 // count in bytes 540 | CALL countneon<>(SB) 541 | RET 542 | 543 | TEXT ·count32neon(SB), 0, $0-32 544 | LDP counts+0(FP), (R2, R1) 545 | MOVD buf_len+16(FP), R3 546 | MOVD $accum32<>(SB), R0 547 | LSL $2, R3, R3 // count in bytes 548 | CALL countneon<>(SB) 549 | RET 550 | 551 | TEXT ·count64neon(SB), 0, $0-32 552 | LDP counts+0(FP), (R2, R1) 553 | MOVD buf_len+16(FP), R3 554 | MOVD $accum64<>(SB), R0 555 | LSL $3, R3, R3 // count in bytes 556 | CALL countneon<>(SB) 557 | RET 558 | -------------------------------------------------------------------------------- /countsse2_386.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // SSE2 based kernels for the positional population count operation. 4 | // All these kernels have the same backbone based on a 15-fold CSA 5 | // reduction to first reduce 240 byte into 4x16 byte, followed by a 6 | // bunch of shuffles to group the positional registers into nibbles. 7 | // These are then summed up using a width-specific summation function. 8 | // Required CPU extension: SSE2. 
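 | // Each CSA (carry-save adder) step computes A + B + C = A' + 2*B' bitwise,
 | // with A' = A XOR B XOR C as the sum and B' = majority(A, B, C) as the carry;
 | // chaining these steps is what collapses 15 input vectors into four vectors
 | // of place value 1, 2, 4, and 8.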
9 | 10 | // magic transposition constants 11 | DATA magic<> +0(SB)/8, $0x8040201008040201 12 | DATA magic<>+ 8(SB)/8, $0xaaaaaaaa55555555 13 | DATA magic<>+16(SB)/8, $0xcccccccc33333333 14 | DATA magic<>+24(SB)/4, $0x0f0f0f0f 15 | GLOBL magic<>(SB), RODATA|NOPTR, $28 16 | 17 | // sliding window for head/tail loads. Unfortunately, there doesn't 18 | // seem to be a good way to do this with less memory wasted. 19 | DATA window<> +0(SB)/8, $0x0000000000000000 20 | DATA window<> +8(SB)/8, $0x0000000000000000 21 | DATA window<>+16(SB)/8, $0xffffffffffffffff 22 | DATA window<>+24(SB)/8, $0xffffffffffffffff 23 | GLOBL window<>(SB), RODATA|NOPTR, $32 24 | 25 | // B:A = A+B+C, D used for scratch space 26 | #define CSA(A, B, C, D) \ 27 | MOVOA A, D \ 28 | PAND B, D \ 29 | PXOR B, A \ 30 | MOVOA A, B \ 31 | PAND C, B \ 32 | PXOR C, A \ 33 | POR D, B 34 | 35 | // Process 4 bytes from X4. Add low word counts to L, high to H 36 | // assumes mask loaded into X2. Trashes X4, X5. 37 | #define COUNT4(L, H) \ // X4 = ----:----:----:3210 38 | PUNPCKLBW X4, X4 \ // X4 = ----:----:3322:1100 39 | PUNPCKLWL X4, X4 \ // X4 = 3333:2222:1111:0000 40 | PSHUFD $0xfa, X4, X5 \ // X5 = 3333:3333:2222:2222 41 | PUNPCKLLQ X4, X4 \ // X5 = 1111:1111:0000:0000 42 | PAND X6, X4 \ 43 | PAND X6, X5 \ 44 | PCMPEQB X6, X4 \ 45 | PCMPEQB X6, X5 \ 46 | PSUBB X4, L \ 47 | PSUBB X5, H 48 | 49 | // zero extend X from bytes into words and add to the counter vectors 50 | // S1 and S2. X7 is expected to be a zero register, X6 and X are trashed. 51 | #define ACCUM(S1, S2, X) \ 52 | MOVOA X, X6 \ 53 | PUNPCKLBW X7, X \ 54 | PUNPCKHBW X7, X6 \ 55 | PADDW S1, X \ 56 | PADDW S2, X6 \ 57 | MOVOA X, S1 \ 58 | MOVOA X6, S2 59 | 60 | // Generic kernel. This function expects a pointer to a width-specific 61 | // accumulation funciton in BX, a possibly unaligned input buffer in SI, 62 | // counters in DI and a remaining length in BP. 63 | TEXT countsse<>(SB), NOSPLIT, $144-0 64 | TESTL BP, BP // any data to process at all? 65 | CMOVLEQ BP, SI // if not, avoid loading head 66 | 67 | // constants for processing the head 68 | MOVQ magic<>+0(SB), X6 // bit position mask 69 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 70 | PXOR X0, X0 // counter registers 71 | PXOR X1, X1 72 | PXOR X2, X2 73 | PXOR X3, X3 74 | 75 | // load head into scratch space (until alignment/end is reached) 76 | MOVL SI, DX 77 | ANDL $15, DX // offset of the buffer start from 16 byte alignment 78 | JEQ nohead // if source buffer is aligned, skip head processing 79 | MOVL $16, AX 80 | SUBL DX, AX // number of bytes til alignment is reached (head length) 81 | MOVL $window<>(SB), DX 82 | MOVOA -16(SI)(AX*1), X7 // load head 83 | MOVOU (DX)(AX*1), X5 // load mask of the bytes that are part of the head 84 | PAND X5, X7 // and mask out those bytes that are not 85 | CMPL AX, BP // is the head shorter than the buffer? 
86 | JLT norunt 87 | 88 | // buffer is short and does not cross a 16 byte boundary 89 | SUBL BP, AX // number of bytes by which we overshoot the buffer 90 | MOVOU (DX)(AX*1), X5 // load mask of bytes that overshoot the buffer 91 | PANDN X7, X5 // and clear them 92 | MOVOA X5, X7 // move head buffer back to X4 93 | MOVL BP, AX // set up true prefix length 94 | 95 | norunt: SUBL AX, BP // mark head as accounted for 96 | ADDL AX, SI // and advance past the head 97 | 98 | // process head in four increments of 4 bytes 99 | MOVOA X7, X4 100 | PSRLO $4, X7 101 | COUNT4(X0, X1) 102 | MOVOA X7, X4 103 | PSRLO $4, X7 104 | COUNT4(X2, X3) 105 | MOVOA X7, X4 106 | PSRLO $4, X7 107 | COUNT4(X0, X1) 108 | MOVOA X7, X4 109 | COUNT4(X2, X3) 110 | 111 | // produce 16 byte aligned pointer to counter vector in DX 112 | nohead: MOVL $counts-144+15(SP), DX 113 | ANDL $~15, DX // align to 16 bytes 114 | 115 | // initialise counters in (DX) to what we have 116 | PXOR X7, X7 // zero register 117 | MOVOA X0, X4 118 | PUNPCKLBW X7, X0 119 | PUNPCKHBW X7, X4 120 | MOVOA X0, 0*16(DX) 121 | MOVOA X4, 1*16(DX) 122 | MOVOA X1, X4 123 | PUNPCKLBW X7, X1 124 | PUNPCKHBW X7, X4 125 | MOVOA X1, 2*16(DX) 126 | MOVOA X4, 3*16(DX) 127 | MOVOA X2, X4 128 | PUNPCKLBW X7, X2 129 | PUNPCKHBW X7, X4 130 | MOVOA X2, 4*16(DX) 131 | MOVOA X4, 5*16(DX) 132 | MOVOA X3, X4 133 | PUNPCKLBW X7, X3 134 | PUNPCKHBW X7, X4 135 | MOVOA X3, 6*16(DX) 136 | MOVOA X4, 7*16(DX) 137 | 138 | SUBL $15*16, BP // enough data left to process? 139 | JLT endvec // also, pre-subtract 140 | 141 | MOVL $65535-4, AX // space left til overflow could occur in Y8--Y11 142 | 143 | vec: MOVOA 0*16(SI), X0 // load 240 bytes from buf 144 | MOVOA 1*16(SI), X1 // and sum them into Y3:Y2:Y1:Y0 145 | MOVOA 2*16(SI), X4 146 | MOVOA 3*16(SI), X2 147 | MOVOA 4*16(SI), X3 148 | MOVOA 5*16(SI), X5 149 | MOVOA 6*16(SI), X6 150 | CSA(X0, X1, X4, X7) 151 | MOVOA 7*16(SI), X4 152 | CSA(X3, X2, X5, X7) 153 | MOVOA 8*16(SI), X5 154 | CSA(X0, X3, X6, X7) 155 | MOVOA 9*16(SI), X6 156 | CSA(X1, X2, X3, X7) 157 | MOVOA 10*16(SI), X3 158 | CSA(X0, X4, X5, X7) 159 | MOVOA 11*16(SI), X5 160 | CSA(X0, X3, X6, X7) 161 | MOVOA 12*16(SI), X6 162 | CSA(X1, X3, X4, X7) 163 | MOVOA 13*16(SI), X4 164 | CSA(X0, X5, X6, X7) 165 | MOVOA 14*16(SI), X6 166 | CSA(X0, X4, X6, X7) 167 | CSA(X1, X4, X5, X7) 168 | CSA(X2, X3, X4, X7) 169 | 170 | // load magic constants 171 | MOVQ magic<>+8(SB), X7 172 | PSHUFD $0x55, X7, X6 // 0xaaaaaaaa 173 | PSHUFD $0x00, X7, X7 // 0x55555555 174 | 175 | ADDL $15*16, SI 176 | 177 | // group X0--X3 into nibbles in the same register 178 | MOVOA X0, X5 179 | PAND X6, X5 180 | PSRLL $1, X5 181 | MOVOA X1, X4 182 | PAND X7, X4 183 | PADDL X4, X4 184 | PAND X7, X0 185 | PAND X6, X1 186 | POR X4, X0 // X0 = eca86420 (low crumbs) 187 | POR X5, X1 // X1 = fdb97531 (high crumbs) 188 | 189 | MOVOA X2, X5 190 | PAND X6, X5 191 | PSRLL $1, X5 192 | MOVOA X3, X4 193 | PAND X7, X4 194 | PADDL X4, X4 195 | PAND X7, X2 196 | PAND X6, X3 197 | POR X4, X2 // X0 = eca86420 (low crumbs) 198 | POR X5, X3 // X1 = fdb97531 (high crumbs) 199 | 200 | MOVQ magic<>+16(SB), X7 201 | PSHUFD $0x55, X7, X6 // 0xcccccccc 202 | PSHUFD $0x00, X7, X7 // 0x33333333 203 | 204 | MOVOA X0, X5 205 | PAND X6, X5 206 | PSRLL $2, X5 207 | MOVOA X2, X4 208 | PAND X7, X4 209 | PSLLL $2, X4 210 | PAND X7, X0 211 | PAND X6, X2 212 | POR X4, X0 // X0 = c840 213 | POR X5, X2 // X2 = ea62 214 | 215 | MOVOA X1, X5 216 | PAND X6, X5 217 | PSRLL $2, X5 218 | MOVOA X3, X4 219 | PAND X7, X4 220 | PSLLL $2, X4 221 | PAND X7, 
X1 222 | PAND X6, X3 223 | POR X4, X1 // X1 = d951 224 | POR X5, X3 // X3 = fb73 225 | 226 | MOVD magic<>+24(SB), X7 227 | PSHUFD $0x00, X7, X7 // 0x0f0f0f0f 228 | 229 | // pre-shuffle nibbles 230 | MOVOA X2, X5 231 | PUNPCKLBW X3, X2 // X2 = fbea7362 (3:2:1:0) 232 | PUNPCKHBW X3, X5 // X5 = fbea7362 (7:6:5:4) 233 | MOVOA X0, X3 234 | PUNPCKLBW X1, X0 // X0 = d9c85140 (3:2:1:0) 235 | PUNPCKHBW X1, X3 // X4 = d9c85140 (7:6:5:4) 236 | MOVOA X0, X1 237 | PUNPCKLWL X2, X0 // X0 = fbead9c873625140 (1:0) 238 | PUNPCKHWL X2, X1 // X1 = fbead9c873625140 (3:2) 239 | MOVOA X3, X2 240 | PUNPCKLWL X5, X2 // X2 = fbead9c873625140 (5:4) 241 | PUNPCKHWL X5, X3 // X3 = fbead9c873625140 (7:6) 242 | 243 | // pull high and low nibbles and reduce once 244 | MOVOA X0, X4 245 | PSRLL $4, X4 246 | PAND X7, X0 // X0 = ba983210 (1:0) 247 | PAND X7, X4 // X4 = fedc7654 (1:0) 248 | 249 | MOVOA X2, X6 250 | PSRLL $4, X2 251 | PAND X7, X6 // X6 = ba983210 (5:4) 252 | PAND X7, X2 // X2 = fedc7654 (5:4) 253 | 254 | PADDB X6, X0 // X0 = ba983210 (1:0) 255 | PADDB X4, X2 // X2 = fedc7654 (1:0) 256 | 257 | MOVOA X1, X4 258 | PSRLL $4, X4 259 | PAND X7, X1 // X1 = ba983210 (3:2) 260 | PAND X7, X4 // X4 = fedc7654 (3:2) 261 | 262 | MOVOA X3, X6 263 | PSRLL $4, X3 264 | PAND X7, X6 // X6 = ba983210 (7:6) 265 | PAND X7, X3 // X3 = fedc7654 (7:6) 266 | 267 | PADDB X6, X1 // X1 = ba983210 (3:2) 268 | PADDB X4, X3 // X3 = fedc7654 (3:2) 269 | 270 | // unpack one last time 271 | MOVOA X0, X4 272 | PUNPCKLLQ X2, X0 // X0 = fedcba9876543210 (0) 273 | PUNPCKHLQ X2, X4 // X4 = fedcba9876543210 (1) 274 | MOVOA X1, X5 275 | PUNPCKLLQ X3, X1 // X1 = fedcba9876543210 (2) 276 | PUNPCKHLQ X3, X5 // X5 = fedcba9876543210 (3) 277 | 278 | // add to counters 279 | PXOR X7, X7 // zero register 280 | ACCUM(0*16(DX), 1*16(DX), X0) 281 | ACCUM(2*16(DX), 3*16(DX), X4) 282 | ACCUM(4*16(DX), 5*16(DX), X1) 283 | ACCUM(6*16(DX), 7*16(DX), X5) 284 | 285 | SUBL $15*2, AX // account for possible overflow 286 | CMPL AX, $15*2 // enough space left in the counters? 287 | JGE have_space 288 | 289 | CALL *BX // call accumulation function 290 | 291 | // clear counts for next round 292 | PXOR X7, X7 293 | MOVOA X7, 0*16(DX) 294 | MOVOA X7, 1*16(DX) 295 | MOVOA X7, 2*16(DX) 296 | MOVOA X7, 3*16(DX) 297 | MOVOA X7, 4*16(DX) 298 | MOVOA X7, 5*16(DX) 299 | MOVOA X7, 6*16(DX) 300 | MOVOA X7, 7*16(DX) 301 | 302 | MOVL $65535, AX // space left til overflow could occur 303 | 304 | have_space: 305 | SUBL $15*16, BP // account for bytes consumed 306 | JGE vec 307 | 308 | // constants for processing the tail 309 | endvec: MOVQ magic<>+0(SB), X6 // bit position mask 310 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 311 | PXOR X0, X0 // counter registers 312 | PXOR X1, X1 313 | PXOR X2, X2 314 | PXOR X3, X3 315 | 316 | // process tail, 4 bytes at a time 317 | SUBL $8-15*16, BP // 8 bytes left to process? 318 | JLT tail1 319 | 320 | tail8: MOVL (SI), X4 321 | COUNT4(X0, X1) 322 | MOVL 4(SI), X4 323 | COUNT4(X2, X3) 324 | ADDL $8, SI 325 | SUBL $8, BP 326 | JGE tail8 327 | 328 | // process remaining 0--7 byte 329 | tail1: SUBL $-8, BP // anything left to process? 330 | JLE end 331 | 332 | MOVQ (SI), X5 // load 8 bytes from buffer. 
Note that 333 | // buffer is aligned to 8 byte here 334 | MOVL $window<>+16(SB), AX // load window address 335 | SUBL BP, AX // adjust mask pointer 336 | MOVQ (AX), X7 // load window mask 337 | PANDN X5, X7 // and mask out the desired bytes 338 | 339 | // process rest 340 | MOVOA X7, X4 341 | PSRLO $4, X7 342 | COUNT4(X0, X1) 343 | MOVOA X7, X4 344 | COUNT4(X2, X3) 345 | 346 | // add tail to counters 347 | end: PXOR X7, X7 // zero register 348 | ACCUM(0*16(DX), 1*16(DX), X0) 349 | ACCUM(2*16(DX), 3*16(DX), X1) 350 | ACCUM(4*16(DX), 5*16(DX), X2) 351 | ACCUM(6*16(DX), 7*16(DX), X3) 352 | 353 | CALL *BX 354 | RET 355 | 356 | // zero-extend words in X and Y to dwords, sum them, and move the 357 | // halves back into X and Y. Assumes X7 == 0. Trashes X2 and X3. 358 | #define FOLDW(X, Y) \ 359 | MOVOA X, X2 \ 360 | PUNPCKLWL X7, X \ 361 | PUNPCKHWL X7, X2 \ 362 | MOVOA Y, X3 \ 363 | PUNPCKLWL X7, X3 \ 364 | PUNPCKHWL X7, Y \ 365 | PADDL X3, X \ 366 | PADDL X2, Y 367 | 368 | // add dwords in X to (a)*4(DI), trashing X2. 369 | #define ACCUMQ(a, X) \ 370 | MOVOU (a)*4(DI), X2 \ 371 | PADDL X, X2 \ 372 | MOVOU X2, (a)*4(DI) 373 | 374 | // zero-extend words in s*16(DX) to dwords and add to a*4(DI) to (a+7)*4(DI). 375 | // Assumes X7 == 0 and trashes X0, X1, and X2. 376 | #define ACCUMO(a, s) \ 377 | MOVOA (s)*16(DX), X0 \ 378 | MOVOA X0, X1 \ 379 | PUNPCKLWL X7, X0 \ 380 | PUNPCKHWL X7, X1 \ 381 | ACCUMQ(a, X0) \ 382 | ACCUMQ(a+4, X1) 383 | 384 | // Count8 accumulation function. Accumulates words into 385 | // 8 dword counters at (DI). Trashes X0--X7. 386 | TEXT accum8<>(SB), NOSPLIT, $0-0 387 | MOVOA 0*16(DX), X0 388 | MOVOA 4*16(DX), X1 389 | MOVOA 2*16(DX), X4 390 | MOVOA 6*16(DX), X5 391 | FOLDW(X0, X1) 392 | FOLDW(X4, X5) 393 | PADDL X4, X0 394 | PADDL X5, X1 395 | ACCUMQ(0, X0) 396 | ACCUMQ(4, X1) 397 | MOVOA 1*16(DX), X0 398 | MOVOA 5*16(DX), X1 399 | MOVOA 3*16(DX), X4 400 | MOVOA 7*16(DX), X5 401 | FOLDW(X0, X1) 402 | FOLDW(X4, X5) 403 | PADDL X4, X0 404 | PADDL X5, X1 405 | ACCUMQ(0, X0) 406 | ACCUMQ(4, X1) 407 | RET 408 | 409 | // Count16 accumulation function. Accumulates words into 410 | // 16 dword counters at (DI). Trashes X0--X7. 411 | TEXT accum16<>(SB), NOSPLIT, $0-0 412 | MOVOA 0*16(DX), X0 413 | MOVOA 4*16(DX), X1 414 | MOVOA 2*16(DX), X4 415 | MOVOA 6*16(DX), X5 416 | FOLDW(X0, X1) 417 | FOLDW(X4, X5) 418 | PADDL X4, X0 419 | PADDL X5, X1 420 | ACCUMQ(0, X0) 421 | ACCUMQ(4, X1) 422 | MOVOA 1*16(DX), X0 423 | MOVOA 5*16(DX), X1 424 | MOVOA 3*16(DX), X4 425 | MOVOA 7*16(DX), X5 426 | FOLDW(X0, X1) 427 | FOLDW(X4, X5) 428 | PADDL X4, X0 429 | PADDL X5, X1 430 | ACCUMQ(8, X0) 431 | ACCUMQ(12, X1) 432 | RET 433 | 434 | // Count32 accumulation function. Accumulates words into 435 | // 32 dword counters at (DI). Trashes X0--X7. 436 | TEXT accum32<>(SB), NOSPLIT, $0-0 437 | MOVOA 0*16(DX), X0 438 | MOVOA 4*16(DX), X1 439 | FOLDW(X0, X1) 440 | ACCUMQ(0, X0) 441 | ACCUMQ(4, X1) 442 | MOVOA 1*16(DX), X0 443 | MOVOA 5*16(DX), X1 444 | FOLDW(X0, X1) 445 | ACCUMQ(8, X0) 446 | ACCUMQ(12, X1) 447 | MOVOA 2*16(DX), X0 448 | MOVOA 6*16(DX), X1 449 | FOLDW(X0, X1) 450 | ACCUMQ(16, X0) 451 | ACCUMQ(20, X1) 452 | MOVOA 3*16(DX), X0 453 | MOVOA 7*16(DX), X1 454 | FOLDW(X0, X1) 455 | ACCUMQ(24, X0) 456 | ACCUMQ(28, X1) 457 | RET 458 | 459 | // Count64 accumulation function. Accumulates words into 460 | // 64 dword counters at (DI). Trashes X0, X1, and X7. 
461 | TEXT accum64<>(SB), NOSPLIT, $0-0 462 | ACCUMO( 0, 0) 463 | ACCUMO( 8, 1) 464 | ACCUMO(16, 2) 465 | ACCUMO(24, 3) 466 | ACCUMO(32, 4) 467 | ACCUMO(40, 5) 468 | ACCUMO(48, 6) 469 | ACCUMO(56, 7) 470 | RET 471 | 472 | // func count8sse2(counts *[8]int, buf []uint8) 473 | TEXT ·count8sse2(SB), 0, $0-16 474 | MOVL counts+0(FP), DI 475 | MOVL buf_base+4(FP), SI // SI = &buf[0] 476 | MOVL buf_len+8(FP), BP // BP = len(buf) 477 | MOVL $accum8<>(SB), BX 478 | CALL countsse<>(SB) 479 | RET 480 | 481 | // func count16sse2(counts *[16]int, buf []uint16) 482 | TEXT ·count16sse2(SB), 0, $0-16 483 | MOVL counts+0(FP), DI 484 | MOVL buf_base+4(FP), SI // SI = &buf[0] 485 | MOVL buf_len+8(FP), BP // BP = len(buf) 486 | MOVL $accum16<>(SB), BX 487 | SHLL $1, BP // count in bytes 488 | CALL countsse<>(SB) 489 | RET 490 | 491 | // func count32sse2(counts *[32]int, buf []uint32) 492 | TEXT ·count32sse2(SB), 0, $0-16 493 | MOVL counts+0(FP), DI 494 | MOVL buf_base+4(FP), SI // SI = &buf[0] 495 | MOVL buf_len+8(FP), BP // BP = len(buf) 496 | MOVL $accum32<>(SB), BX 497 | SHLL $2, BP // count in bytes 498 | CALL countsse<>(SB) 499 | RET 500 | 501 | 502 | // func count64sse2(counts *[64]int, buf []uint64) 503 | TEXT ·count64sse2(SB), 0, $0-16 504 | MOVL counts+0(FP), DI 505 | MOVL buf_base+4(FP), SI // SI = &buf[0] 506 | MOVL buf_len+8(FP), BP // BP = len(buf) 507 | MOVL $accum64<>(SB), BX 508 | SHLL $3, BP // count in bytes 509 | CALL countsse<>(SB) 510 | RET 511 | -------------------------------------------------------------------------------- /countsse2_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // An SSE2 based kernel first doing a 15-fold CSA reduction and then 4 | // a 16-fold CSA reduction, carrying over place-value vectors between 5 | // iterations. Required CPU extension: SSE2. 6 | 7 | // magic transposition constants 8 | DATA magic<> +0(SB)/8, $0x8040201008040201 9 | DATA magic<>+ 8(SB)/8, $0xaaaaaaaa55555555 10 | DATA magic<>+16(SB)/8, $0xcccccccc33333333 11 | DATA magic<>+24(SB)/8, $0x00ff00ff0f0f0f0f 12 | GLOBL magic<>(SB), RODATA|NOPTR, $32 13 | 14 | // sliding window for head/tail loads. Unfortunately, there doesn't 15 | // seem to be a good way to do this with less memory wasted. 16 | DATA window<> +0(SB)/8, $0x0000000000000000 17 | DATA window<> +8(SB)/8, $0x0000000000000000 18 | DATA window<>+16(SB)/8, $0xffffffffffffffff 19 | DATA window<>+24(SB)/8, $0xffffffffffffffff 20 | GLOBL window<>(SB), RODATA|NOPTR, $32 21 | 22 | // B:A = A+B+C 23 | #define CSA(A, B, C) \ 24 | PXOR C, B \ 25 | PXOR A, C \ 26 | PXOR B, A \ 27 | POR C, B \ 28 | PXOR A, B 29 | 30 | // Process 4 bytes from X4. Add low word counts to L, high to H 31 | // assumes mask loaded into X2. Trashes X4, X5. 32 | #define COUNT4(L, H) \ // X4 = ----:----:----:3210 33 | PUNPCKLBW X4, X4 \ // X4 = ----:----:3322:1100 34 | PUNPCKLWL X4, X4 \ // X4 = 3333:2222:1111:0000 35 | PSHUFD $0xfa, X4, X5 \ // X5 = 3333:3333:2222:2222 36 | PUNPCKLLQ X4, X4 \ // X5 = 1111:1111:0000:0000 37 | PAND X6, X4 \ 38 | PAND X6, X5 \ 39 | PCMPEQB X6, X4 \ 40 | PCMPEQB X6, X5 \ 41 | PSUBB X4, L \ 42 | PSUBB X5, H 43 | 44 | // zero extend X from bytes into words and add to the counter vectors 45 | // S1 and S2. X7 is expected to be a zero register, X6 and X are trashed. 46 | #define ACCUM(S1, S2, X) \ 47 | MOVOA X, X6 \ 48 | PUNPCKLBW X7, X \ 49 | PUNPCKHBW X7, X6 \ 50 | PADDW X, S1 \ 51 | PADDW X6, S2 52 | 53 | // Generic kernel. 
This function expects a pointer to a width-specific 54 | // accumulation funciton in BX, a possibly unaligned input buffer in SI, 55 | // counters in DI and a remaining length in CX. 56 | TEXT countsse2<>(SB), NOSPLIT, $32-0 57 | // constants for processing the head 58 | MOVQ magic<>+0(SB), X6 // bit position mask 59 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 60 | PXOR X7, X7 // zero register 61 | PXOR X8, X8 // counter registers 62 | PXOR X10, X10 63 | PXOR X12, X12 64 | PXOR X14, X14 65 | 66 | CMPQ CX, $15*16 // is the CSA kernel worth using? 67 | JLT runt 68 | 69 | // load head until alignment/end is reached 70 | MOVL SI, DX 71 | ANDL $15, DX // offset of the buffer start from 16 byte alignment 72 | MOVL $16, AX 73 | SUBL DX, AX // number of bytes til alignment is reached (head length) 74 | SUBQ DX, SI // align source to 16 bytes 75 | ADDQ DX, CX // and account for head length 76 | MOVQ $window<>(SB), DX // load window mask base pointer 77 | MOVOU (DX)(AX*1), X2 // load mask of the bytes that are part of the head 78 | PAND (SI), X2 // load head and mask out bytes that are not in the head 79 | 80 | // load 240 - 16 bytes from buf and sum them into X3:X2:X1:X0 81 | MOVOA 1*16(SI), X1 82 | MOVOA 2*16(SI), X0 83 | MOVOA 3*16(SI), X5 84 | MOVOA 4*16(SI), X4 85 | MOVOA 5*16(SI), X3 86 | CSA(X0, X1, X2) 87 | MOVOA 6*16(SI), X7 88 | MOVOA 7*16(SI), X6 89 | MOVOA 8*16(SI), X2 90 | CSA(X3, X4, X5) 91 | MOVOA 9*16(SI), X5 92 | CSA(X2, X6, X7) 93 | MOVOA 10*16(SI), X7 94 | CSA(X0, X5, X3) 95 | MOVOA 11*16(SI), X3 96 | CSA(X1, X4, X6) 97 | MOVOA 12*16(SI), X6 98 | CSA(X0, X2, X7) 99 | MOVOA 13*16(SI), X7 100 | PXOR X9, X9 // initialise remaining counters 101 | PXOR X11, X11 102 | CSA(X3, X7, X6) 103 | MOVOA 14*16(SI), X6 104 | CSA(X1, X2, X5) 105 | ADDQ $15*16, SI 106 | CSA(X0, X3, X6) 107 | MOVL $65535, AX // space left til overflow could occur in Y8--Y11 108 | CSA(X1, X3, X7) 109 | PXOR X13, X13 110 | PXOR X15, X15 111 | CSA(X2, X3, X4) 112 | 113 | SUBQ $(15+16)*16, CX // enough data left to process? 114 | JLT post 115 | 116 | // load 256 bytes from buf, add them to X0..X3 into X0..X4 117 | vec: MOVOA 0*16(SI), X4 118 | MOVOA 1*16(SI), X5 119 | MOVOU X8, X8save-32(SP) // stash some counters to give us 120 | MOVOU X9, X9save-16(SP) // more registers to play with 121 | MOVOA 2*16(SI), X6 122 | MOVOA 3*16(SI), X7 123 | MOVOA 4*16(SI), X8 124 | MOVOA 5*16(SI), X9 125 | CSA(X0, X5, X4) 126 | MOVOA 6*16(SI), X4 127 | CSA(X6, X8, X7) 128 | MOVOA 7*16(SI), X7 129 | CSA(X1, X8, X5) 130 | MOVOA 8*16(SI), X5 131 | CSA(X0, X6, X9) 132 | MOVOA 9*16(SI), X9 133 | CSA(X4, X5, X7) 134 | MOVOA 10*16(SI), X7 135 | CSA(X1, X5, X6) 136 | MOVOA 11*16(SI), X6 137 | CSA(X0, X4, X9) 138 | MOVOA 12*16(SI), X9 139 | CSA(X2, X5, X8) 140 | MOVOA 13*16(SI), X8 141 | CSA(X0, X6, X7) 142 | MOVOA 14*16(SI), X7 143 | CSA(X1, X4, X6) 144 | MOVOA 15*16(SI), X6 145 | CSA(X7, X8, X9) 146 | MOVOU magic<>+8(SB), X9 // 55555555, aaaaaaaa, 33333333, cccccccc 147 | CSA(X0, X6, X7) 148 | ADDQ $16*16, SI 149 | #define D 90 150 | PREFETCHT0 (D+ 0)*16(SI) 151 | CSA(X1, X6, X8) 152 | PREFETCHT0 (D+ 4)*16(SI) 153 | CSA(X2, X4, X6) 154 | PREFETCHT0 (D+ 8)*16(SI) 155 | CSA(X3, X4, X5) 156 | PREFETCHT0 (D+12)*16(SI) 157 | 158 | MOVQ magic<>+24(SB), X8 // 0f0f0f0f, 00ff00ff 159 | 160 | // now X0..X4 hold counters; preserve X0..X4 for the next round 161 | // and add X4 to the the counters. 
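 | // X0--X3 carry the CSA place values 1, 2, 4, and 8 between iterations; X4 is
 | // the weight-16 output of this round. The transposition below scales its
 | // counts by 16 (the shift left by four) before they are added to the word
 | // counters in X8--X15.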
162 | 163 | // split into even/odd and reduce into crumbs 164 | PSHUFD $0x00, X9, X7 // X7 = 55..55 165 | MOVOA X4, X5 166 | PAND X7, X5 // X5 = 02468ace x8 167 | PANDN X4, X7 // X7 = 13579bdf x8 168 | PSRLL $1, X7 169 | MOVOA X5, X4 170 | PUNPCKLQDQ X7, X4 171 | PUNPCKHQDQ X7, X5 172 | PADDL X5, X4 // X4 = 02468ace x4 13579bdf x4 173 | 174 | // split again into nibbles 175 | PSHUFD $0xaa, X9, X5 // X7 = 33..33 176 | MOVOA X5, X7 177 | PANDN X4, X5 // X5 = 26ae x4 37bf x4 178 | PAND X7, X4 // X4 = 048c x4 159d x4 179 | PSRLL $2, X5 180 | 181 | // split into bytes and shuffle into order 182 | PSHUFD $0x00, X8, X6 // X6 = 0f..0f 183 | MOVOA X6, X7 184 | PANDN X4, X6 // X6 = 4c x4 5d x4 185 | PAND X7, X4 // X4 = 08 x4 19 x4 186 | MOVOA X7, X9 187 | PANDN X5, X7 // X7 = 6e x4 7f x4 188 | PAND X9, X5 // X5 = 2a x4 3b x4 189 | PSLLL $4, X4 190 | PSLLL $4, X5 191 | 192 | MOVOA X4, X9 193 | PUNPCKLWL X5, X4 // X4 = 082a x4 194 | PUNPCKHWL X5, X9 // X9 = 193b x4 195 | MOVOA X6, X5 196 | PUNPCKLWL X7, X5 // X5 = 4c6e x4 197 | PUNPCKHWL X7, X6 // X6 = 5d7f x4 198 | MOVOA X4, X7 199 | PUNPCKLWL X9, X4 // X4 = 08192a3b[0:1] 200 | PUNPCKHWL X9, X7 // X7 = 08192a3b[2:3] 201 | MOVOA X5, X9 202 | PUNPCKLWL X6, X5 // X5 = 4c5d6e7f[0:1] 203 | PUNPCKHWL X6, X9 // X9 = 4c5d6e7f[2:3] 204 | MOVOA X4, X6 205 | PUNPCKLQDQ X5, X4 // X4 = 08192a3b4c5d6e7f[0] 206 | PUNPCKHQDQ X5, X6 // X6 = 08192a3b4c5d6e7f[1] 207 | MOVOA X7, X5 208 | PUNPCKLQDQ X9, X5 // X5 = 08192a3b4c5d6e7f[2] 209 | PUNPCKHQDQ X9, X7 // X7 = 08192a3b4c5d6e7f[3] 210 | 211 | // split into words and add to counters 212 | PSHUFD $0x55, X8, X8 // X8 = 00ff..00ff 213 | MOVOA X6, X9 214 | PAND X8, X6 // X6 = 01234678[1] 215 | PSRLW $8, X9 // X9 = 89abcdef[1] 216 | PADDW X6, X10 217 | PADDW X9, X11 218 | 219 | MOVOA X8, X6 220 | MOVOU X8save-32(SP), X8 221 | MOVOA X5, X9 222 | PAND X6, X5 // X5 = 01234567[2] 223 | PSRLW $8, X9 // X9 = 89abcdef[2] 224 | PADDW X5, X12 225 | PADDW X9, X13 226 | 227 | MOVOU X9save-16(SP), X9 228 | MOVOA X7, X5 229 | PAND X6, X7 // X7 = 01234567[3] 230 | PSRLW $8, X5 // X5 = 89abcdef[3] 231 | PADDW X7, X14 232 | PADDW X5, X15 233 | 234 | MOVOA X4, X5 235 | PAND X6, X4 // X4 = 01234567[0] 236 | PSRLW $8, X5 // X5 = 89abcdef[0] 237 | PADDW X4, X8 238 | PADDW X5, X9 239 | 240 | SUBL $16*2, AX // account for possible overflow 241 | CMPL AX, $(15+15)*2 // enough space left in the counters? 
242 | JGE have_space 243 | 244 | PXOR X7, X7 245 | CALL *BX // call accumulation function 246 | PXOR X8, X8 // clear counters for next round 247 | PXOR X9, X9 248 | PXOR X10, X10 249 | PXOR X11, X11 250 | PXOR X12, X12 251 | PXOR X13, X13 252 | PXOR X14, X14 253 | PXOR X15, X15 254 | 255 | MOVL $65535, AX // space left til overflow could occur 256 | 257 | have_space: 258 | SUBQ $16*16, CX // account for bytes consumed 259 | JGE vec 260 | 261 | post: MOVQ magic<>+8(SB), X5 // load magic constants 262 | PSHUFD $0x55, X5, X6 // 0xaaaaaaaa 263 | PSHUFD $0x00, X5, X7 // 0x55555555 264 | 265 | // group X0--X3 into nibbles in the same register 266 | MOVOA X0, X5 267 | PAND X6, X5 268 | PSRLL $1, X5 269 | MOVOA X1, X4 270 | PAND X7, X4 271 | PADDL X4, X4 272 | PAND X7, X0 273 | PAND X6, X1 274 | POR X4, X0 // X0 = eca86420 (low crumbs) 275 | POR X5, X1 // X1 = fdb97531 (high crumbs) 276 | 277 | MOVOA X2, X5 278 | PAND X6, X5 279 | PSRLL $1, X5 280 | MOVOA X3, X4 281 | PAND X7, X4 282 | PADDL X4, X4 283 | PAND X7, X2 284 | PAND X6, X3 285 | POR X4, X2 // X0 = eca86420 (low crumbs) 286 | POR X5, X3 // X1 = fdb97531 (high crumbs) 287 | 288 | MOVQ magic<>+16(SB), X7 289 | PSHUFD $0x55, X7, X6 // 0xcccccccc 290 | PSHUFD $0x00, X7, X7 // 0x33333333 291 | 292 | MOVOA X0, X5 293 | PAND X6, X5 294 | PSRLL $2, X5 295 | MOVOA X2, X4 296 | PAND X7, X4 297 | PSLLL $2, X4 298 | PAND X7, X0 299 | PAND X6, X2 300 | POR X4, X0 // X0 = c840 301 | POR X5, X2 // X2 = ea62 302 | 303 | MOVOA X1, X5 304 | PAND X6, X5 305 | PSRLL $2, X5 306 | MOVOA X3, X4 307 | PAND X7, X4 308 | PSLLL $2, X4 309 | PAND X7, X1 310 | PAND X6, X3 311 | POR X4, X1 // X1 = d951 312 | POR X5, X3 // X3 = fb73 313 | 314 | MOVD magic<>+24(SB), X7 315 | PSHUFD $0x00, X7, X7 // 0x0f0f0f0f 316 | 317 | // pre-shuffle nibbles 318 | MOVOA X2, X5 319 | PUNPCKLBW X3, X2 // X2 = fbea7362 (3:2:1:0) 320 | PUNPCKHBW X3, X5 // X5 = fbea7362 (7:6:5:4) 321 | MOVOA X0, X3 322 | PUNPCKLBW X1, X0 // X0 = d9c85140 (3:2:1:0) 323 | PUNPCKHBW X1, X3 // X4 = d9c85140 (7:6:5:4) 324 | MOVOA X0, X1 325 | PUNPCKLWL X2, X0 // X0 = fbead9c873625140 (1:0) 326 | PUNPCKHWL X2, X1 // X1 = fbead9c873625140 (3:2) 327 | MOVOA X3, X2 328 | PUNPCKLWL X5, X2 // X2 = fbead9c873625140 (5:4) 329 | PUNPCKHWL X5, X3 // X3 = fbead9c873625140 (7:6) 330 | 331 | // pull high and low nibbles and reduce once 332 | MOVOA X0, X4 333 | PSRLL $4, X4 334 | PAND X7, X0 // X0 = ba983210 (1:0) 335 | PAND X7, X4 // X4 = fedc7654 (1:0) 336 | 337 | MOVOA X2, X6 338 | PSRLL $4, X2 339 | PAND X7, X6 // X6 = ba983210 (5:4) 340 | PAND X7, X2 // X2 = fedc7654 (5:4) 341 | 342 | PADDB X6, X0 // X0 = ba983210 (1:0) 343 | PADDB X4, X2 // X2 = fedc7654 (1:0) 344 | 345 | MOVOA X1, X4 346 | PSRLL $4, X4 347 | PAND X7, X1 // X1 = ba983210 (3:2) 348 | PAND X7, X4 // X4 = fedc7654 (3:2) 349 | 350 | MOVOA X3, X6 351 | PSRLL $4, X3 352 | PAND X7, X6 // X6 = ba983210 (7:6) 353 | PAND X7, X3 // X3 = fedc7654 (7:6) 354 | 355 | PADDB X6, X1 // X1 = ba983210 (3:2) 356 | PADDB X4, X3 // X3 = fedc7654 (3:2) 357 | 358 | // unpack one last time 359 | MOVOA X0, X4 360 | PUNPCKLLQ X2, X0 // X0 = fedcba9876543210 (0) 361 | PUNPCKHLQ X2, X4 // X4 = fedcba9876543210 (1) 362 | MOVOA X1, X5 363 | PUNPCKLLQ X3, X1 // X1 = fedcba9876543210 (2) 364 | PUNPCKHLQ X3, X5 // X5 = fedcba9876543210 (3) 365 | 366 | // add to counters 367 | PXOR X7, X7 // zero register 368 | ACCUM( X8, X9, X0) 369 | ACCUM(X10, X11, X4) 370 | ACCUM(X12, X13, X1) 371 | ACCUM(X14, X15, X5) 372 | 373 | // constants for processing the tail 374 | endvec: MOVQ 
magic<>+0(SB), X6 // bit position mask 375 | PSHUFD $0x44, X6, X6 // broadcast into both qwords 376 | PXOR X0, X0 // counter registers 377 | PXOR X1, X1 378 | PXOR X2, X2 379 | PXOR X3, X3 380 | 381 | // process tail, 4 bytes at a time 382 | SUBL $8-16*16, CX // 8 bytes left to process? 383 | JLT tail1 384 | 385 | tail8: MOVL 0(SI), X4 386 | COUNT4(X0, X1) 387 | MOVL 4(SI), X4 388 | COUNT4(X2, X3) 389 | ADDQ $8, SI 390 | SUBL $8, CX 391 | JGE tail8 392 | 393 | // process remaining 0--7 byte 394 | tail1: SUBL $-8, CX // anything left to process? 395 | JLE end 396 | 397 | MOVQ (SI), X5 // load 8 bytes from buffer. Note that 398 | // buffer is aligned to 8 byte here 399 | MOVQ $window<>+16(SB), AX // load window address 400 | SUBQ CX, AX // adjust mask 401 | MOVQ (AX), X7 // load window mask 402 | PANDN X5, X7 // and mask out the desired bytes 403 | 404 | // process rest 405 | MOVOA X7, X4 406 | COUNT4(X0, X1) 407 | PSRLO $4, X7 408 | MOVOA X7, X4 409 | COUNT4(X2, X3) 410 | 411 | // add tail to counters 412 | end: PXOR X7, X7 // zero register 413 | MOVOA X0, X4 414 | PUNPCKLBW X7, X0 415 | PUNPCKHBW X7, X4 416 | PADDW X0, X8 417 | PADDW X4, X9 418 | MOVOA X1, X4 419 | PUNPCKLBW X7, X1 420 | PUNPCKHBW X7, X4 421 | PADDW X1, X10 422 | PADDW X4, X11 423 | MOVOA X2, X4 424 | PUNPCKLBW X7, X2 425 | PUNPCKHBW X7, X4 426 | PADDW X2, X12 427 | PADDW X4, X13 428 | MOVOA X3, X4 429 | PUNPCKLBW X7, X3 430 | PUNPCKHBW X7, X4 431 | PADDW X3, X14 432 | PADDW X4, X15 433 | 434 | CALL *BX 435 | RET 436 | 437 | // buffer is short, do just head/tail processing 438 | runt: SUBL $8, CX // 8 bytes left to process? 439 | JLT runt1 440 | 441 | // process runt 8 bytes at a time 442 | runt8: MOVL 0(SI), X4 443 | COUNT4(X8, X10) 444 | MOVL 4(SI), X4 445 | COUNT4(X12, X14) 446 | ADDQ $8, SI 447 | SUBL $8, CX 448 | JGE runt8 449 | 450 | // process remaining 0--7 byte 451 | // while making sure we don't get a page fault 452 | runt1: ADDL $8, CX // anything left to process? 453 | JLE runt_accum 454 | 455 | MOVL SI, AX 456 | ANDL $7, AX // offset from 8 byte alignment 457 | LEAL (AX)(CX*1), DX // length of buffer plus alignment 458 | SHLL $3, CX // remaining length in bits 459 | XORQ R9, R9 460 | BTSQ CX, R9 461 | DECQ R9 // mask of bits where R8 is in range 462 | CMPL DX, $8 // if this exceeds the alignment boundary 463 | JGT crossrunt1 // we can safely load directly 464 | 465 | ANDQ $~7, SI // align buffer to 8 bytes 466 | MOVQ (SI), R8 // and and load 8 bytes from buffer 467 | LEAL (AX*8), CX // offset from 8 byte alignment in bits 468 | SHRQ CX, R8 // buffer starting from the beginning 469 | JMP dorunt1 470 | 471 | crossrunt1: 472 | MOVQ (SI), R8 // load 8 bytes from unaligned buffer 473 | 474 | dorunt1: 475 | ANDQ R9, R8 // mask out bytes behind the buffer 476 | MOVL R8, X4 477 | SHRQ $32, R8 478 | COUNT4(X8, X10) 479 | MOVL R8, X4 480 | COUNT4(X12, X14) 481 | 482 | // move tail to counters and perform final accumulation 483 | runt_accum: 484 | MOVOA X8, X9 485 | PUNPCKLBW X7, X8 486 | PUNPCKHBW X7, X9 487 | MOVOA X10, X11 488 | PUNPCKLBW X7, X10 489 | PUNPCKHBW X7, X11 490 | MOVOA X12, X13 491 | PUNPCKLBW X7, X12 492 | PUNPCKHBW X7, X13 493 | MOVOA X14, X15 494 | PUNPCKLBW X7, X14 495 | PUNPCKHBW X7, X15 496 | 497 | CALL *BX 498 | RET 499 | 500 | // zero-extend dwords in X trashing X, X4, and X5. Add the low half 501 | // dwords to a*8(DI) and the high half to (a+2)*8(DI). 502 | // Assumes X7 == 0. 
503 | #define ACCUMQ(a, X) \ 504 | MOVOA X, X4 \ 505 | PUNPCKLLQ X7, X \ 506 | PUNPCKHLQ X7, X4 \ 507 | MOVOU (a)*8(DI), X5 \ 508 | PADDQ X, X5 \ 509 | MOVOU X5, (a)*8(DI) \ 510 | MOVOU (a+2)*8(DI), X5 \ 511 | PADDQ X4, X5 \ 512 | MOVOU X5, (a+2)*8(DI) 513 | 514 | // zero-extend words in X to qwords and add to a*8(DI) to (a+7)*8(DI). 515 | // Trashes X4, X5, and X6. Assumes X7 == 0 an X8 <= X <= X15. 516 | #define ACCUMO(a, X) \ 517 | MOVOA X, X6 \ 518 | PUNPCKLWL X7, X6 \ 519 | PUNPCKHWL X7, X \ 520 | ACCUMQ(a, X6) \ 521 | ACCUMQ(a+4, X) 522 | 523 | // zero-extend words in X and Y to dwords, sum them, and move the 524 | // halves back into X and Y. Assumes X7 == 0. Trashes X4, X5. 525 | #define FOLDW(X, Y) \ 526 | MOVOA X, X4 \ 527 | PUNPCKLWL X7, X \ 528 | PUNPCKHWL X7, X4 \ 529 | MOVOA Y, X5 \ 530 | PUNPCKLWL X7, X5 \ 531 | PUNPCKHWL X7, Y \ 532 | PADDL X5, X \ 533 | PADDL X4, Y 534 | 535 | // Count8 accumulation function. Accumulates words X8--X15 into 536 | // 8 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 537 | TEXT accum8<>(SB), NOSPLIT, $0-0 538 | FOLDW(X8, X12) 539 | FOLDW(X9, X13) 540 | FOLDW(X10, X14) 541 | FOLDW(X11, X15) 542 | PADDL X10, X8 543 | PADDL X11, X9 544 | PADDL X14, X12 545 | PADDL X15, X13 546 | PADDL X9, X8 547 | ACCUMQ(0, X8) 548 | PADDL X13, X12 549 | ACCUMQ(4, X12) 550 | RET 551 | 552 | // Count16 accumulation function. Accumulates words X8--X15 into 553 | // 16 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 554 | TEXT accum16<>(SB), NOSPLIT, $0-0 555 | FOLDW(X8, X12) 556 | FOLDW(X9, X13) 557 | FOLDW(X10, X14) 558 | FOLDW(X11, X15) 559 | PADDL X10, X8 560 | ACCUMQ(0, X8) 561 | PADDL X14, X12 562 | ACCUMQ(4, X12) 563 | PADDL X11, X9 564 | ACCUMQ(8, X9) 565 | PADDL X15, X13 566 | ACCUMQ(12, X13) 567 | RET 568 | 569 | // Count32 accumulation function. Accumulates words X8--X15 into 570 | // 32 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 571 | TEXT accum32<>(SB), NOSPLIT, $0-0 572 | FOLDW(X8, X12) 573 | ACCUMQ(0, X8) 574 | ACCUMQ(4, X12) 575 | FOLDW(X9, X13) 576 | ACCUMQ(8, X9) 577 | ACCUMQ(12, X13) 578 | FOLDW(X10, X14) 579 | ACCUMQ(16, X10) 580 | ACCUMQ(20, X14) 581 | FOLDW(X11, X15) 582 | ACCUMQ(24, X11) 583 | ACCUMQ(28, X15) 584 | RET 585 | 586 | // Count64 accumulation function. Accumulates words X8--X15 into 587 | // 64 qword counters at (DI). Assumes X7 == 0. Trashes X4--X15. 
588 | TEXT accum64<>(SB), NOSPLIT, $0-0 589 | ACCUMO(0, X8) 590 | ACCUMO(8, X9) 591 | ACCUMO(16, X10) 592 | ACCUMO(24, X11) 593 | ACCUMO(32, X12) 594 | ACCUMO(40, X13) 595 | ACCUMO(48, X14) 596 | ACCUMO(56, X15) 597 | RET 598 | 599 | // func count8sse2(counts *[8]int, buf []uint8) 600 | TEXT ·count8sse2(SB), 0, $0-32 601 | MOVQ counts+0(FP), DI 602 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 603 | MOVQ buf_len+16(FP), CX // CX = len(buf) 604 | MOVQ $accum8<>(SB), BX 605 | CALL countsse2<>(SB) 606 | RET 607 | 608 | // func count16sse2(counts *[16]int, buf []uint16) 609 | TEXT ·count16sse2(SB), 0, $0-32 610 | MOVQ counts+0(FP), DI 611 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 612 | MOVQ buf_len+16(FP), CX // CX = len(buf) 613 | MOVQ $accum16<>(SB), BX 614 | SHLQ $1, CX // count in bytes 615 | CALL countsse2<>(SB) 616 | RET 617 | 618 | // func count32sse2(counts *[32]int, buf []uint32) 619 | TEXT ·count32sse2(SB), 0, $0-32 620 | MOVQ counts+0(FP), DI 621 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 622 | MOVQ buf_len+16(FP), CX // CX = len(buf) 623 | MOVQ $accum32<>(SB), BX 624 | SHLQ $2, CX // count in bytes 625 | CALL countsse2<>(SB) 626 | RET 627 | 628 | // func count64sse2(counts *[64]int, buf []uint64) 629 | TEXT ·count64sse2(SB), 0, $0-32 630 | MOVQ counts+0(FP), DI 631 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 632 | MOVQ buf_len+16(FP), CX // CX = len(buf) 633 | MOVQ $accum64<>(SB), BX 634 | SHLQ $3, CX // count in bytes 635 | CALL countsse2<>(SB) 636 | RET 637 | -------------------------------------------------------------------------------- /dispatch.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2024 Robert Clausecker 2 | 3 | // Positional population counts. 4 | // 5 | // This package contains a set of functions to compute positional 6 | // population counts for arrays of uint8, uint16, uint32, or uint64. 7 | // Optimised assembly implementations are provided for amd64 (AVX-512, 8 | // AVX2, SSE2), 386 (AVX2, SSE2), and ARM64 (NEON). An optimal 9 | // implementation constrained by the instruction set extensions 10 | // available on your CPU is chosen automatically at runtime. If no 11 | // assembly implementation exists, a generic fallback implementation 12 | // will be used. The pospop package thus works on all architectures 13 | // supported by the Go toolchain. 14 | // 15 | // The kernels work on a block size of 240, 480, or 960 bytes. A 16 | // buffer size that is a multiple of 64 bytes and at least 10 kB in size 17 | // is recommended. The author's benchmarks show that a buffer size 18 | // around 100 kB appears optimal. 19 | // 20 | // See the example on the Count8 function for what the positional 21 | // population count operation does. 22 | package pospop 23 | 24 | import "unsafe" 25 | 26 | // each platform must provide arrays count8funcs, count16funcs, 27 | // count32funcs, and count64funcs of type count8impl, ... listing 28 | // the available implementations. The member available indicates that 29 | // the function would run on this machine. The dispatch code picks the 30 | // lowest-numbered function in the array for which available is true. 31 | // The generic implementation should be available under all 32 | // circumstances so it can be run by the unit tests. The name field 33 | // should be the name of the implementation and should not repeat the 34 | // "count#" prefix.
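// Illustration: a hypothetical select file for a new port (the riscv64
// names below are invented for this sketch and are not part of the
// package) would satisfy the contract described above by listing its
// kernels in order of preference, with the generic kernel last. The real
// per-platform tables are in select_386.go, select_amd64.go,
// select_arm64.go, and select_generic.go later in this listing.
//
//	//go:build riscv64
//
//	func count8rvv(counts *[8]int, buf []uint8) // hypothetical vector kernel
//
//	var count8funcs = []count8impl{
//		{count8rvv, "rvv", hasRVV},       // picked first when hasRVV is true
//		{count8generic, "generic", true}, // always available, used by the tests
//	}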
35 | 36 | type count8impl struct { 37 | count8 func(*[8]int, []uint8) 38 | name string 39 | available bool 40 | } 41 | 42 | type count16impl struct { 43 | count16 func(*[16]int, []uint16) 44 | name string 45 | available bool 46 | } 47 | 48 | type count32impl struct { 49 | count32 func(*[32]int, []uint32) 50 | name string 51 | available bool 52 | } 53 | 54 | type count64impl struct { 55 | count64 func(*[64]int, []uint64) 56 | name string 57 | available bool 58 | } 59 | 60 | // optimal count8 implementation selected at runtime 61 | var count8func = func() func(*[8]int, []uint8) { 62 | for _, f := range count8funcs { 63 | if f.available { 64 | return f.count8 65 | } 66 | } 67 | 68 | panic("no implementation of count8 available") 69 | }() 70 | 71 | // optimal count16 implementation selected at runtime 72 | var count16func = func() func(*[16]int, []uint16) { 73 | for _, f := range count16funcs { 74 | if f.available { 75 | return f.count16 76 | } 77 | } 78 | 79 | panic("no implementation of count16 available") 80 | }() 81 | 82 | // optimal count32 implementation selected at runtime 83 | var count32func = func() func(*[32]int, []uint32) { 84 | for _, f := range count32funcs { 85 | if f.available { 86 | return f.count32 87 | } 88 | } 89 | 90 | panic("no implementation of count32 available") 91 | }() 92 | 93 | // optimal count64 implementation selected at runtime 94 | var count64func = func() func(*[64]int, []uint64) { 95 | for _, f := range count64funcs { 96 | if f.available { 97 | return f.count64 98 | } 99 | } 100 | 101 | panic("no implementation of count64 available") 102 | }() 103 | 104 | // Count the number of corresponding set bits of the bytes in str and 105 | // add the results to counts. Each element of counts keeps track of a 106 | // different place; counts[0] for 0x01, counts[1] for 0x02, and so on to 107 | // counts[7] for 0x80. 108 | func CountString(counts *[8]int, str string) { 109 | buf := unsafe.Slice(unsafe.StringData(str), len(str)) 110 | count8func(counts, buf) 111 | } 112 | 113 | // Count the number of corresponding set bits of the bytes in buf and 114 | // add the results to counts. Each element of counts keeps track of a 115 | // different place; counts[0] for 0x01, counts[1] for 0x02, and so on to 116 | // counts[7] for 0x80. 117 | func Count8(counts *[8]int, buf []uint8) { 118 | count8func(counts, buf) 119 | } 120 | 121 | // Count the number of corresponding set bits of the values in buf and 122 | // add the results to counts. Each element of counts keeps track of a 123 | // different place; counts[0] for 0x0001, counts[1] for 0x0002, and so 124 | // on to counts[15] for 0x8000. 125 | func Count16(counts *[16]int, buf []uint16) { 126 | count16func(counts, buf) 127 | } 128 | 129 | // Count the number of corresponding set bits of the values in buf and 130 | // add the results to counts. Each element of counts keeps track of a 131 | // different place; counts[0] for 0x0000001, counts[1] for 0x00000002, 132 | // and so on to counts[31] for 0x80000000. 133 | func Count32(counts *[32]int, buf []uint32) { 134 | count32func(counts, buf) 135 | } 136 | 137 | // Count the number of corresponding set bits of the values in buf and 138 | // add the results to counts. Each element of counts keeps track of a 139 | // different place; counts[0] for 0x000000000000001, counts[1] for 140 | // 0x0000000000000002, and so on to counts[63] for 0x8000000000000000. 
141 | func Count64(counts *[64]int, buf []uint64) { 142 | count64func(counts, buf) 143 | } 144 | -------------------------------------------------------------------------------- /example_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "fmt" 6 | 7 | // This example illustrates the positional population count operation. 8 | // For each number in the input, Count8() checks which of its bits are 9 | // set and increments the corresponding counters. In this example, 10 | // four numbers (1, 3, 5, 9) have bit 0 set; three numbers (2, 3, 6) 11 | // have bit 1 set, two numbers (5, 6) have bit 2 set and only the number 12 | // 9 has bit 3 set. 13 | func ExampleCount8() { 14 | var counts [8]int 15 | numbers := []uint8{ 16 | 1, // bit 0 set 17 | 2, // bit 1 set 18 | 3, // bits 0 and 1 set 19 | 5, // bits 0 and 2 set 20 | 6, // bits 1 and 2 set 21 | 9, // bits 0 and 3 set 22 | } 23 | 24 | Count8(&counts, numbers) 25 | fmt.Println(counts) 26 | // Output: [4 3 2 1 0 0 0 0] 27 | } 28 | -------------------------------------------------------------------------------- /generic.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | // 8-bit full adder 6 | func csa8(a, b, c uint8) (c_out, s uint8) { 7 | s_ab := a ^ b 8 | c_ab := a & b 9 | 10 | s = s_ab ^ c 11 | c_out = c_ab | s_ab&c 12 | 13 | return 14 | } 15 | 16 | // count8 generic implementation. Uses the same CSA15 17 | // kernel as the vectorised implementations. 18 | func count8generic(counts *[8]int, buf []uint8) { 19 | var i int 20 | 21 | for i = 0; i < len(buf)-14; i += 15 { 22 | b0, a0 := csa8(buf[i+0], buf[i+1], buf[i+2]) 23 | b1, a1 := csa8(buf[i+3], buf[i+4], buf[i+5]) 24 | b2, a2 := csa8(a0, a1, buf[i+6]) 25 | c0, b3 := csa8(b0, b1, b2) 26 | b4, a3 := csa8(a2, buf[i+7], buf[i+8]) 27 | b5, a4 := csa8(a3, buf[i+9], buf[i+10]) 28 | c1, b6 := csa8(b3, b4, b5) 29 | b7, a5 := csa8(a4, buf[i+11], buf[i+12]) 30 | b8, a := csa8(a5, buf[i+13], buf[i+14]) 31 | c2, b := csa8(b6, b7, b8) 32 | d, c := csa8(c0, c1, c2) 33 | 34 | // d:c:b:a now holds the counters 35 | 36 | ba0 := a&0x55 | b<<1&0xaa 37 | ba1 := a>>1&0x55 | b&0xaa 38 | dc0 := c&0x55 | d<<1&0xaa 39 | dc1 := c>>1&0x55 | d&0xaa 40 | 41 | dcba0 := ba0&0x33 | dc0<<2&0xcc 42 | dcba1 := ba0>>2&0x33 | dc0&0xcc 43 | dcba2 := ba1&0x33 | dc1<<2&0xcc 44 | dcba3 := ba1>>2&0x33 | dc1&0xcc 45 | 46 | // add to counters 47 | counts[0] += int(dcba0 & 0x0f) 48 | counts[1] += int(dcba2 & 0x0f) 49 | counts[2] += int(dcba1 & 0x0f) 50 | counts[3] += int(dcba3 & 0x0f) 51 | counts[4] += int(dcba0 >> 4) 52 | counts[5] += int(dcba2 >> 4) 53 | counts[6] += int(dcba1 >> 4) 54 | counts[7] += int(dcba3 >> 4) 55 | } 56 | 57 | // count8safe() manually inlined 58 | for ; i < len(buf); i++ { 59 | for j := 0; j < 8; j++ { 60 | counts[j] += int(buf[i] >> j & 1) 61 | } 62 | } 63 | } 64 | 65 | // 16-bit full adder 66 | func csa16(a, b, c uint16) (c_out, s uint16) { 67 | s_ab := a ^ b 68 | c_ab := a & b 69 | 70 | s = s_ab ^ c 71 | c_out = c_ab | s_ab&c 72 | 73 | return 74 | } 75 | 76 | // count16 generic implementation. Uses the same CSA15 77 | // kernel as the vectorised implementations. 
78 | func count16generic(counts *[16]int, buf []uint16) { 79 | var i int 80 | 81 | for i = 0; i < len(buf)-14; i += 15 { 82 | b0, a0 := csa16(buf[i+0], buf[i+1], buf[i+2]) 83 | b1, a1 := csa16(buf[i+3], buf[i+4], buf[i+5]) 84 | b2, a2 := csa16(a0, a1, buf[i+6]) 85 | c0, b3 := csa16(b0, b1, b2) 86 | b4, a3 := csa16(a2, buf[i+7], buf[i+8]) 87 | b5, a4 := csa16(a3, buf[i+9], buf[i+10]) 88 | c1, b6 := csa16(b3, b4, b5) 89 | b7, a5 := csa16(a4, buf[i+11], buf[i+12]) 90 | b8, a6 := csa16(a5, buf[i+13], buf[i+14]) 91 | c2, b9 := csa16(b6, b7, b8) 92 | d0, c3 := csa16(c0, c1, c2) 93 | 94 | // d:c:b:a now holds the counters 95 | a := uint(a6) 96 | b := uint(b9) 97 | c := uint(c3) 98 | d := uint(d0) 99 | 100 | ba0 := a&0x5555 | b<<1&0xaaaa 101 | ba1 := a>>1&0x5555 | b&0xaaaa 102 | dc0 := c&0x5555 | d<<1&0xaaaa 103 | dc1 := c>>1&0x5555 | d&0xaaaa 104 | 105 | dcba0 := ba0&0x3333 | dc0<<2&0xcccc 106 | dcba1 := ba0>>2&0x3333 | dc0&0xcccc 107 | dcba2 := ba1&0x3333 | dc1<<2&0xcccc 108 | dcba3 := ba1>>2&0x3333 | dc1&0xcccc 109 | 110 | // add to counters 111 | counts[0] += int(dcba0 & 0x0f) 112 | counts[1] += int(dcba2 & 0x0f) 113 | counts[2] += int(dcba1 & 0x0f) 114 | counts[3] += int(dcba3 & 0x0f) 115 | counts[4] += int(dcba0 >> 4 & 0x0f) 116 | counts[5] += int(dcba2 >> 4 & 0x0f) 117 | counts[6] += int(dcba1 >> 4 & 0x0f) 118 | counts[7] += int(dcba3 >> 4 & 0x0f) 119 | counts[8] += int(dcba0 >> 8 & 0x0f) 120 | counts[9] += int(dcba2 >> 8 & 0x0f) 121 | counts[10] += int(dcba1 >> 8 & 0x0f) 122 | counts[11] += int(dcba3 >> 8 & 0x0f) 123 | counts[12] += int(dcba0 >> 12) 124 | counts[13] += int(dcba2 >> 12) 125 | counts[14] += int(dcba1 >> 12) 126 | counts[15] += int(dcba3 >> 12) 127 | } 128 | 129 | // count16safe() manually inlined 130 | for ; i < len(buf); i++ { 131 | for j := 0; j < 16; j++ { 132 | counts[j] += int(buf[i] >> j & 1) 133 | } 134 | } 135 | } 136 | 137 | // 32-bit full adder 138 | func csa32(a, b, c uint32) (c_out, s uint32) { 139 | s_ab := a ^ b 140 | c_ab := a & b 141 | 142 | s = s_ab ^ c 143 | c_out = c_ab | s_ab&c 144 | 145 | return 146 | } 147 | 148 | // count32 generic implementation. Uses the same CSA15 149 | // kernel as the vectorised implementations. 
150 | func count32generic(counts *[32]int, buf []uint32) { 151 | var i int 152 | 153 | for i = 0; i < len(buf)-14; i += 15 { 154 | b0, a0 := csa32(buf[i+0], buf[i+1], buf[i+2]) 155 | b1, a1 := csa32(buf[i+3], buf[i+4], buf[i+5]) 156 | b2, a2 := csa32(a0, a1, buf[i+6]) 157 | c0, b3 := csa32(b0, b1, b2) 158 | b4, a3 := csa32(a2, buf[i+7], buf[i+8]) 159 | b5, a4 := csa32(a3, buf[i+9], buf[i+10]) 160 | c1, b6 := csa32(b3, b4, b5) 161 | b7, a5 := csa32(a4, buf[i+11], buf[i+12]) 162 | b8, a := csa32(a5, buf[i+13], buf[i+14]) 163 | c2, b := csa32(b6, b7, b8) 164 | d, c := csa32(c0, c1, c2) 165 | 166 | // d:c:b:a now holds the counters 167 | 168 | ba0 := a&0x55555555 | b<<1&0xaaaaaaaa 169 | ba1 := a>>1&0x55555555 | b&0xaaaaaaaa 170 | dc0 := c&0x55555555 | d<<1&0xaaaaaaaa 171 | dc1 := c>>1&0x55555555 | d&0xaaaaaaaa 172 | 173 | dcba0 := ba0&0x33333333 | dc0<<2&0xcccccccc 174 | dcba1 := ba0>>2&0x33333333 | dc0&0xcccccccc 175 | dcba2 := ba1&0x33333333 | dc1<<2&0xcccccccc 176 | dcba3 := ba1>>2&0x33333333 | dc1&0xcccccccc 177 | 178 | // add to counters 179 | counts[0] += int(dcba0 & 0x0f) 180 | counts[1] += int(dcba2 & 0x0f) 181 | counts[2] += int(dcba1 & 0x0f) 182 | counts[3] += int(dcba3 & 0x0f) 183 | counts[4] += int(dcba0 >> 4 & 0x0f) 184 | counts[5] += int(dcba2 >> 4 & 0x0f) 185 | counts[6] += int(dcba1 >> 4 & 0x0f) 186 | counts[7] += int(dcba3 >> 4 & 0x0f) 187 | counts[8] += int(dcba0 >> 8 & 0x0f) 188 | counts[9] += int(dcba2 >> 8 & 0x0f) 189 | counts[10] += int(dcba1 >> 8 & 0x0f) 190 | counts[11] += int(dcba3 >> 8 & 0x0f) 191 | counts[12] += int(dcba0 >> 12 & 0x0f) 192 | counts[13] += int(dcba2 >> 12 & 0x0f) 193 | counts[14] += int(dcba1 >> 12 & 0x0f) 194 | counts[15] += int(dcba3 >> 12 & 0x0f) 195 | counts[16] += int(dcba0 >> 16 & 0x0f) 196 | counts[17] += int(dcba2 >> 16 & 0x0f) 197 | counts[18] += int(dcba1 >> 16 & 0x0f) 198 | counts[19] += int(dcba3 >> 16 & 0x0f) 199 | counts[20] += int(dcba0 >> 20 & 0x0f) 200 | counts[21] += int(dcba2 >> 20 & 0x0f) 201 | counts[22] += int(dcba1 >> 20 & 0x0f) 202 | counts[23] += int(dcba3 >> 20 & 0x0f) 203 | counts[24] += int(dcba0 >> 24 & 0x0f) 204 | counts[25] += int(dcba2 >> 24 & 0x0f) 205 | counts[26] += int(dcba1 >> 24 & 0x0f) 206 | counts[27] += int(dcba3 >> 24 & 0x0f) 207 | counts[28] += int(dcba0 >> 28) 208 | counts[29] += int(dcba2 >> 28) 209 | counts[30] += int(dcba1 >> 28) 210 | counts[31] += int(dcba3 >> 28) 211 | } 212 | 213 | // count32safe() manually inlined 214 | for ; i < len(buf); i++ { 215 | for j := 0; j < 32; j++ { 216 | counts[j] += int(buf[i] >> j & 1) 217 | } 218 | } 219 | } 220 | 221 | // 64-bit full adder 222 | func csa64(a, b, c uint64) (c_out, s uint64) { 223 | s_ab := a ^ b 224 | c_ab := a & b 225 | 226 | s = s_ab ^ c 227 | c_out = c_ab | s_ab&c 228 | 229 | return 230 | } 231 | 232 | // count64 generic implementation. Uses the same CSA15 233 | // kernel as the vectorised implementations. 
234 | func count64generic(counts *[64]int, buf []uint64) { 235 | var i int 236 | 237 | for i = 0; i < len(buf)-14; i += 15 { 238 | b0, a0 := csa64(buf[i+0], buf[i+1], buf[i+2]) 239 | b1, a1 := csa64(buf[i+3], buf[i+4], buf[i+5]) 240 | b2, a2 := csa64(a0, a1, buf[i+6]) 241 | c0, b3 := csa64(b0, b1, b2) 242 | b4, a3 := csa64(a2, buf[i+7], buf[i+8]) 243 | b5, a4 := csa64(a3, buf[i+9], buf[i+10]) 244 | c1, b6 := csa64(b3, b4, b5) 245 | b7, a5 := csa64(a4, buf[i+11], buf[i+12]) 246 | b8, a := csa64(a5, buf[i+13], buf[i+14]) 247 | c2, b := csa64(b6, b7, b8) 248 | d, c := csa64(c0, c1, c2) 249 | 250 | // d:c:b:a now holds the counters 251 | 252 | ba0 := a&0x5555555555555555 | b<<1&0xaaaaaaaaaaaaaaaa 253 | ba1 := a>>1&0x5555555555555555 | b&0xaaaaaaaaaaaaaaaa 254 | dc0 := c&0x5555555555555555 | d<<1&0xaaaaaaaaaaaaaaaa 255 | dc1 := c>>1&0x5555555555555555 | d&0xaaaaaaaaaaaaaaaa 256 | 257 | dcba0 := ba0&0x3333333333333333 | dc0<<2&0xcccccccccccccccc 258 | dcba1 := ba0>>2&0x3333333333333333 | dc0&0xcccccccccccccccc 259 | dcba2 := ba1&0x3333333333333333 | dc1<<2&0xcccccccccccccccc 260 | dcba3 := ba1>>2&0x3333333333333333 | dc1&0xcccccccccccccccc 261 | 262 | // split counters for better performance on 32 bit systems 263 | dcba0l := uint(uint32(dcba0)) 264 | dcba0h := uint(dcba0 >> 32) 265 | dcba1l := uint(uint32(dcba1)) 266 | dcba1h := uint(dcba1 >> 32) 267 | dcba2l := uint(uint32(dcba2)) 268 | dcba2h := uint(dcba2 >> 32) 269 | dcba3l := uint(uint32(dcba3)) 270 | dcba3h := uint(dcba3 >> 32) 271 | 272 | // add to counters 273 | counts[0] += int(dcba0l & 0x0f) 274 | counts[1] += int(dcba2l & 0x0f) 275 | counts[2] += int(dcba1l & 0x0f) 276 | counts[3] += int(dcba3l & 0x0f) 277 | counts[4] += int(dcba0l >> 4 & 0x0f) 278 | counts[5] += int(dcba2l >> 4 & 0x0f) 279 | counts[6] += int(dcba1l >> 4 & 0x0f) 280 | counts[7] += int(dcba3l >> 4 & 0x0f) 281 | counts[8] += int(dcba0l >> 8 & 0x0f) 282 | counts[9] += int(dcba2l >> 8 & 0x0f) 283 | counts[10] += int(dcba1l >> 8 & 0x0f) 284 | counts[11] += int(dcba3l >> 8 & 0x0f) 285 | counts[12] += int(dcba0l >> 12 & 0x0f) 286 | counts[13] += int(dcba2l >> 12 & 0x0f) 287 | counts[14] += int(dcba1l >> 12 & 0x0f) 288 | counts[15] += int(dcba3l >> 12 & 0x0f) 289 | counts[16] += int(dcba0l >> 16 & 0x0f) 290 | counts[17] += int(dcba2l >> 16 & 0x0f) 291 | counts[18] += int(dcba1l >> 16 & 0x0f) 292 | counts[19] += int(dcba3l >> 16 & 0x0f) 293 | counts[20] += int(dcba0l >> 20 & 0x0f) 294 | counts[21] += int(dcba2l >> 20 & 0x0f) 295 | counts[22] += int(dcba1l >> 20 & 0x0f) 296 | counts[23] += int(dcba3l >> 20 & 0x0f) 297 | counts[24] += int(dcba0l >> 24 & 0x0f) 298 | counts[25] += int(dcba2l >> 24 & 0x0f) 299 | counts[26] += int(dcba1l >> 24 & 0x0f) 300 | counts[27] += int(dcba3l >> 24 & 0x0f) 301 | counts[28] += int(dcba0l >> 28) 302 | counts[29] += int(dcba2l >> 28) 303 | counts[30] += int(dcba1l >> 28) 304 | counts[31] += int(dcba3l >> 28) 305 | 306 | counts[32] += int(dcba0h & 0x0f) 307 | counts[33] += int(dcba2h & 0x0f) 308 | counts[34] += int(dcba1h & 0x0f) 309 | counts[35] += int(dcba3h & 0x0f) 310 | counts[36] += int(dcba0h >> 4 & 0x0f) 311 | counts[37] += int(dcba2h >> 4 & 0x0f) 312 | counts[38] += int(dcba1h >> 4 & 0x0f) 313 | counts[39] += int(dcba3h >> 4 & 0x0f) 314 | counts[40] += int(dcba0h >> 8 & 0x0f) 315 | counts[41] += int(dcba2h >> 8 & 0x0f) 316 | counts[42] += int(dcba1h >> 8 & 0x0f) 317 | counts[43] += int(dcba3h >> 8 & 0x0f) 318 | counts[44] += int(dcba0h >> 12 & 0x0f) 319 | counts[45] += int(dcba2h >> 12 & 0x0f) 320 | counts[46] += int(dcba1h >> 12 & 0x0f) 
321 | counts[47] += int(dcba3h >> 12 & 0x0f) 322 | counts[48] += int(dcba0h >> 16 & 0x0f) 323 | counts[49] += int(dcba2h >> 16 & 0x0f) 324 | counts[50] += int(dcba1h >> 16 & 0x0f) 325 | counts[51] += int(dcba3h >> 16 & 0x0f) 326 | counts[52] += int(dcba0h >> 20 & 0x0f) 327 | counts[53] += int(dcba2h >> 20 & 0x0f) 328 | counts[54] += int(dcba1h >> 20 & 0x0f) 329 | counts[55] += int(dcba3h >> 20 & 0x0f) 330 | counts[56] += int(dcba0h >> 24 & 0x0f) 331 | counts[57] += int(dcba2h >> 24 & 0x0f) 332 | counts[58] += int(dcba1h >> 24 & 0x0f) 333 | counts[59] += int(dcba3h >> 24 & 0x0f) 334 | counts[60] += int(dcba0h >> 28) 335 | counts[61] += int(dcba2h >> 28) 336 | counts[62] += int(dcba1h >> 28) 337 | counts[63] += int(dcba3h >> 28) 338 | } 339 | 340 | // count64safe() manually inlined 341 | for ; i < len(buf); i++ { 342 | for j := 0; j < 64; j++ { 343 | counts[j] += int(buf[i] >> j & 1) 344 | } 345 | } 346 | } 347 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/clausecker/pospop 2 | 3 | go 1.20 4 | 5 | require golang.org/x/sys v0.0.0-20200929083018-4d22bbb62b3c 6 | -------------------------------------------------------------------------------- /go.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/sys v0.0.0-20200929083018-4d22bbb62b3c h1:/h0vtH0PyU0xAoZJVcRw1k0Ng+U0JAy3QDiFmppIlIE= 2 | golang.org/x/sys v0.0.0-20200929083018-4d22bbb62b3c/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 3 | -------------------------------------------------------------------------------- /minimize_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2021, 2022 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "fmt" 6 | import "strings" 7 | 8 | const ( 9 | // max number of entries in a test case 10 | maxTestcaseSize = 100 11 | ) 12 | 13 | // Take a count64 function and a test case and return true if the 14 | // test case is processed correctly. 15 | func testPasses64(count64 func(*[64]int, []uint64), buf []uint64) bool { 16 | var counts, refCounts [64]int 17 | 18 | count64(&counts, buf) 19 | count64safe(&refCounts, buf) 20 | 21 | return counts == refCounts 22 | } 23 | 24 | // Take a failing test case for testCount64 and try to find the 25 | // smallest possible test case to trigger the error. This is done 26 | // by repeatedly clearing bits that do not cause the test case to 27 | // pass when cleared. An attempt is also made to reduce the length 28 | // of the test case. This function modifies its argument and 29 | // returns a subslice of it. 30 | func minimizeTestcase64(count64 func(*[64]int, []uint64), tc []uint64) []uint64 { 31 | // sanity check 32 | if testPasses64(count64, tc) { 33 | return nil 34 | } 35 | 36 | // try to turn off bits 37 | for i := len(tc) - 1; i >= 0; i-- { 38 | for j := 63; j >= 0; j-- { 39 | if tc[i]&(1<<j) == 0 { 40 | continue 41 | } 42 | 43 | tc[i] &^= 1 << j 44 | if testPasses64(count64, tc) { 45 | tc[i] |= 1 << j // bit is needed to trigger the failure 46 | } 47 | } 48 | } 49 | 50 | // try to reduce the length of the test case 51 | for len(tc) > 0 && !testPasses64(count64, tc[:len(tc)-1]) { 52 | tc = tc[:len(tc)-1] 53 | } 54 | 55 | return tc 56 | } 57 | 58 | // build a string representation of the minimised test case if it is 59 | // not too long. If it is too long, return the empty string.
60 | func testcaseString64(tc []uint64) string { 61 | if len(tc) == 0 { 62 | return "\tvar buf [0]uint64" 63 | } 64 | 65 | var w strings.Builder 66 | entries := 0 67 | fmt.Fprintf(&w, "\tvar buf [%d]uint64 // %p\n", len(tc), &tc[0]) 68 | for i := range tc { 69 | if tc[i] == 0 { 70 | continue 71 | } 72 | 73 | entries++ 74 | if entries > maxTestcaseSize { 75 | return "" 76 | } 77 | 78 | fmt.Fprintf(&w, "\tbuf[%d] = %#016x\n", i, tc[i]) 79 | } 80 | 81 | return w.String() 82 | } 83 | -------------------------------------------------------------------------------- /overflow_test.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2024 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "testing" 6 | 7 | // Check if we can get the accumulators to overflow 8 | func TestOverflow(t *testing.T) { 9 | for i := range count64funcs { 10 | t.Run(count64funcs[i].name, func(tt *testing.T) { 11 | if !count64funcs[i].available { 12 | tt.SkipNow() 13 | } 14 | 15 | testOverflow(tt, count64funcs[i].count64) 16 | }) 17 | } 18 | } 19 | 20 | func testOverflow(t *testing.T, count64 func(*[64]int, []uint64)) { 21 | const imax = 16 22 | const jmax = 16 23 | var buf [imax*65536 + jmax]uint64 24 | 25 | for i := range buf { 26 | buf[i] = ^uint64(0) 27 | } 28 | 29 | for i := 1; i <= imax; i++ { 30 | for j := -jmax; j <= jmax; j++ { 31 | testOverflowBuf(t, count64, buf[:i * 65536 + j]) 32 | } 33 | } 34 | } 35 | 36 | func testOverflowBuf(t *testing.T, count64 func(*[64]int, []uint64), buf []uint64) { 37 | var counts, refCounts [64]int 38 | 39 | for i := range refCounts { 40 | refCounts[i] = len(buf) 41 | } 42 | 43 | count64(&counts, buf) 44 | if counts != refCounts { 45 | t.Errorf("length %d: counts don't match: %v", len(buf), countDiff(counts[:], refCounts[:])) 46 | } 47 | } 48 | -------------------------------------------------------------------------------- /overread_test.go: -------------------------------------------------------------------------------- 1 | //go:build unix 2 | 3 | // Copyright (c) 2024 Robert Clausecker 4 | 5 | package pospop 6 | 7 | import ( 8 | "golang.org/x/sys/unix" 9 | "testing" 10 | ) 11 | 12 | // Allocate three pages of memory. Make the first and last page 13 | // inaccessible. Return the full array as well as just the part 14 | // in the middle (which is accessible). 15 | func mapGuarded() (mapping []byte, slice []byte, err error) { 16 | pagesize := unix.Getpagesize() 17 | mapping, err = unix.Mmap(-1, 0, 3*pagesize, unix.PROT_NONE, unix.MAP_ANON|unix.MAP_PRIVATE) 18 | if err != nil { 19 | return nil, nil, err 20 | } 21 | 22 | slice = mapping[pagesize : 2*pagesize : 2*pagesize] 23 | err = unix.Mprotect(slice, unix.PROT_READ|unix.PROT_WRITE) 24 | if err != nil { 25 | unix.Munmap(mapping) 26 | return nil, nil, err 27 | } 28 | 29 | return 30 | } 31 | 32 | // Verify that our count functions only overread memory in benign ways, 33 | // i.e. such that we never cross a page size boundary. 
34 | func TestOverread(t *testing.T) { 35 | for i := range count8funcs { 36 | t.Run(count8funcs[i].name, func(tt *testing.T) { 37 | if !count8funcs[i].available { 38 | tt.SkipNow() 39 | } 40 | 41 | testOverread(tt, count8funcs[i].count8) 42 | }) 43 | } 44 | } 45 | 46 | func testOverread(t *testing.T, count8 func(*[8]int, []uint8)) { 47 | var counters [8]int 48 | 49 | mapping, slice, err := mapGuarded() 50 | defer unix.Munmap(mapping) 51 | if err != nil { 52 | t.Log("Cannot allocate memory:", err) 53 | t.SkipNow() 54 | } 55 | 56 | // test large slices that start/end right at the page boundary 57 | for i := 0; i < 64; i++ { 58 | for j := len(slice) - 64; j <= len(slice); j++ { 59 | count8(&counters, slice[i:j]) 60 | } 61 | } 62 | 63 | // test small slices that start right after the page boundary 64 | for i := 0; i < 64; i++ { 65 | for j := i; j <= 64; j++ { 66 | count8(&counters, slice[i:j]) 67 | } 68 | } 69 | 70 | // test small slices that end right before the page boundary 71 | for i := len(slice) - 64; i <= len(slice); i++ { 72 | for j := i; j <= len(slice); j++ { 73 | count8(&counters, slice[i:j]) 74 | } 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /safe.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | // count8 reference implementation for tests. Do not alter. 6 | func count8safe(counts *[8]int, buf []uint8) { 7 | for i := range buf { 8 | for j := 0; j < 8; j++ { 9 | counts[j] += int(buf[i] >> j & 1) 10 | } 11 | } 12 | } 13 | 14 | // count16 reference implementation for tests. Do not alter. 15 | func count16safe(counts *[16]int, buf []uint16) { 16 | for i := range buf { 17 | for j := 0; j < 16; j++ { 18 | counts[j] += int(buf[i] >> j & 1) 19 | } 20 | } 21 | } 22 | 23 | // count32 reference implementation for tests. Do not alter. 24 | func count32safe(counts *[32]int, buf []uint32) { 25 | for i := range buf { 26 | for j := 0; j < 32; j++ { 27 | counts[j] += int(buf[i] >> j & 1) 28 | } 29 | } 30 | } 31 | 32 | // count64 reference implementation for tests. Do not alter. 
33 | func count64safe(counts *[64]int, buf []uint64) { 34 | for i := range buf { 35 | for j := 0; j < 64; j++ { 36 | counts[j] += int(buf[i] >> j & 1) 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /select_386.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "golang.org/x/sys/cpu" 6 | 7 | func count8avx2(counts *[8]int, buf []byte) 8 | func count8sse2(counts *[8]int, buf []byte) 9 | 10 | func count16avx2(counts *[16]int, buf []uint16) 11 | func count16sse2(counts *[16]int, buf []uint16) 12 | 13 | func count32avx2(counts *[32]int, buf []uint32) 14 | func count32sse2(counts *[32]int, buf []uint32) 15 | 16 | func count64sse2(counts *[64]int, buf []uint64) 17 | func count64avx2(counts *[64]int, buf []uint64) 18 | 19 | var count8funcs = []count8impl{ 20 | {count8avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 21 | {count8sse2, "sse2", cpu.X86.HasSSE2}, 22 | {count8generic, "generic", true}, 23 | } 24 | 25 | var count16funcs = []count16impl{ 26 | {count16avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 27 | {count16sse2, "sse2", cpu.X86.HasSSE2}, 28 | {count16generic, "generic", true}, 29 | } 30 | 31 | var count32funcs = []count32impl{ 32 | {count32avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 33 | {count32sse2, "sse2", cpu.X86.HasSSE2}, 34 | {count32generic, "generic", true}, 35 | } 36 | 37 | var count64funcs = []count64impl{ 38 | {count64avx2, "avx2", cpu.X86.HasAVX2 && cpu.X86.HasBMI2}, 39 | {count64sse2, "sse2", cpu.X86.HasSSE2}, 40 | {count64generic, "generic", true}, 41 | } 42 | -------------------------------------------------------------------------------- /select_amd64.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020 Robert Clausecker 2 | 3 | package pospop 4 | 5 | import "golang.org/x/sys/cpu" 6 | 7 | func count8avx512(counts *[8]int, buf []byte) 8 | func count8avx2(counts *[8]int, buf []byte) 9 | func count8sse2(counts *[8]int, buf []byte) 10 | 11 | func count16avx512(counts *[16]int, buf []uint16) 12 | func count16avx2(counts *[16]int, buf []uint16) 13 | func count16sse2(counts *[16]int, buf []uint16) 14 | 15 | func count32avx512(counts *[32]int, buf []uint32) 16 | func count32avx2(counts *[32]int, buf []uint32) 17 | func count32sse2(counts *[32]int, buf []uint32) 18 | 19 | func count64avx512(counts *[64]int, buf []uint64) 20 | func count64avx2(counts *[64]int, buf []uint64) 21 | func count64sse2(counts *[64]int, buf []uint64) 22 | 23 | var count8funcs = []count8impl{ 24 | {count8avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 25 | {count8avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 26 | {count8sse2, "sse2", cpu.X86.HasSSE2}, 27 | {count8generic, "generic", true}, 28 | } 29 | 30 | var count16funcs = []count16impl{ 31 | {count16avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 32 | {count16avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 33 | {count16sse2, "sse2", cpu.X86.HasSSE2}, 34 | {count16generic, "generic", true}, 35 | } 36 | 37 | var count32funcs = []count32impl{ 38 | {count32avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 39 | {count32avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 40 | {count32sse2, "sse2", cpu.X86.HasSSE2}, 41 | {count32generic, "generic", true}, 42 | } 43 | 44 | var count64funcs = []count64impl{ 45 | {count64avx512, "avx512", cpu.X86.HasBMI2 && cpu.X86.HasAVX512BW}, 46 
| {count64avx2, "avx2", cpu.X86.HasBMI2 && cpu.X86.HasAVX2}, 47 | {count64sse2, "sse2", cpu.X86.HasSSE2}, 48 | {count64generic, "generic", true}, 49 | } 50 | -------------------------------------------------------------------------------- /select_arm64.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2024 Robert Clausecker 2 | 3 | package pospop 4 | 5 | func count8neon(counts *[8]int, buf []uint8) 6 | func count16neon(counts *[16]int, buf []uint16) 7 | func count32neon(counts *[32]int, buf []uint32) 8 | func count64neon(counts *[64]int, buf []uint64) 9 | 10 | var count8funcs = []count8impl{ 11 | {count8neon, "neon", true}, 12 | {count8generic, "generic", true}, 13 | } 14 | 15 | var count16funcs = []count16impl{ 16 | {count16neon, "neon", true}, 17 | {count16generic, "generic", true}, 18 | } 19 | 20 | var count32funcs = []count32impl{ 21 | {count32neon, "neon", true}, 22 | {count32generic, "generic", true}, 23 | } 24 | 25 | var count64funcs = []count64impl{ 26 | {count64neon, "neon", true}, 27 | {count64generic, "generic", true}, 28 | } 29 | -------------------------------------------------------------------------------- /select_generic.go: -------------------------------------------------------------------------------- 1 | // Copyright (c) 2020, 2024 Robert Clausecker 2 | 3 | //go:build !386 && !amd64 && !arm64 4 | 5 | package pospop 6 | 7 | // generic variants only 8 | var count8funcs = []count8impl{{count8generic, "generic", true}} 9 | var count16funcs = []count16impl{{count16generic, "generic", true}} 10 | var count32funcs = []count32impl{{count32generic, "generic", true}} 11 | var count64funcs = []count64impl{{count64generic, "generic", true}} 12 | --------------------------------------------------------------------------------
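Usage sketch (hypothetical, not a file from this repository): the package
documentation in dispatch.go recommends buffers that are a multiple of
64 bytes and roughly 100 kB in size. A caller streaming a large file through
Count8 might therefore process it in chunks of, say, 96 KiB; the counters
simply accumulate across calls. The file name and chunk size below are
illustrative only.

    package main

    import (
        "fmt"
        "io"
        "os"

        "github.com/clausecker/pospop"
    )

    func main() {
        f, err := os.Open("data.bin") // illustrative input file
        if err != nil {
            panic(err)
        }
        defer f.Close()

        var counts [8]int
        buf := make([]byte, 96*1024) // multiple of 64 bytes, near the ~100 kB sweet spot
        for {
            n, err := f.Read(buf)
            pospop.Count8(&counts, buf[:n]) // results are added to counts on every call
            if err == io.EOF {
                break
            }
            if err != nil {
                panic(err)
            }
        }

        fmt.Println(counts)
    }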