├── .github
├── FUNDING.yml
├── bitmap1.png
├── bitmap2.png
├── bitmap3.png
├── bitmap4.png
├── logo.pdn
├── logo.png
└── workflows
│ └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── bench
└── main.go
├── bitmap.go
├── bitmap_amd64.go
├── bitmap_arm64.go
├── bitmap_generic.go
├── bitmap_test.go
├── codec.go
├── codec_test.go
├── codegen
├── bin
│ └── gocc
├── generate.sh
├── simd_apple.c
├── simd_avx.c
├── simd_avx512.c
└── simd_neon.c
├── go.mod
├── go.sum
├── range.go
├── range_test.go
├── simd_apple.go
├── simd_apple.s
├── simd_avx.go
├── simd_avx.s
├── simd_avx512.go
├── simd_avx512.s
├── simd_generic.go
├── simd_neon.go
└── simd_neon.s
/.github/FUNDING.yml:
--------------------------------------------------------------------------------
1 | github: [kelindar]
2 |
--------------------------------------------------------------------------------
/.github/bitmap1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/.github/bitmap1.png
--------------------------------------------------------------------------------
/.github/bitmap2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/.github/bitmap2.png
--------------------------------------------------------------------------------
/.github/bitmap3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/.github/bitmap3.png
--------------------------------------------------------------------------------
/.github/bitmap4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/.github/bitmap4.png
--------------------------------------------------------------------------------
/.github/logo.pdn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/.github/logo.pdn
--------------------------------------------------------------------------------
/.github/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/.github/logo.png
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | name: Test
2 | on: [push, pull_request]
3 | env:
4 | GITHUB_TOKEN: ${{ secrets.COVERALLS_TOKEN }}
5 | GO111MODULE: "on"
6 | jobs:
7 | test:
8 | name: Test with Coverage
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Set up Go
12 | uses: actions/setup-go@v1
13 | with:
14 | go-version: "1.20"
15 | - name: Check out code
16 | uses: actions/checkout@v2
17 | - name: Install dependencies
18 | run: |
19 | go mod download
20 | - name: Run Unit Tests
21 | run: |
22 | go test -tags noasm -race -covermode atomic -coverprofile=profile.cov ./...
23 | go test -race ./...
24 | - name: Upload Coverage
25 | uses: shogo82148/actions-goveralls@v1
26 | with:
27 | path-to-profile: profile.cov
28 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.exe
2 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Roman Atachiants
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 | ## SIMD-Vectorized Bitmap (Bitset) in Go
13 |
14 | This package contains a bitmap implementation, backed by a slice of `[]uint64` and designed for *dense* small or medium collections. This implementation focuses on high performance by avoiding heap allocations, unrolling loops and implementing SIMD vectorization in assembly.
15 |
16 | ## Features
17 |
18 | * Optimized for **zero heap allocation** for all of the important methods of the bitmap.
19 | * Optimized by **vectorized instructions (SIMD)** used for certain operations such as boolean algebra.
20 | * Support for **boolean algebra** that makes it perfect to implement [bitmap indexes](https://en.wikipedia.org/wiki/Bitmap_index).
21 | * Support for **bit counting** with operations such as `Min()`, `Max()`, `Count()` and more.
22 | * Support for **fast iteration** over bits set to one by using an unrolled loop.
23 | * Support for **in-place filtering** based on a user-defined predicate.
24 | * Support for **binary encoding**: bitmaps can be read/written and have a no-copy slice conversion.
25 | * Support for **reusability** by providing `Clone()` and `Clear()` operations.
26 |
27 | ## Documentation
28 |
29 | The general idea of this package is to have a dead simple way of creating bitmaps (bitsets) that provide maximum performance on the modern hardware by using vectorized single-instruction multiple data ([SIMD](https://en.wikipedia.org/wiki/SIMD)) operations. As opposed to something as [roaring bitmaps](https://github.com/RoaringBitmap/roaring) which are excellent for sparse data, this implementation is designed to be used for small or medium dense bit sets. I've used this package to build a columnar in-memory store, so if you want to see how it can be used for indexing, have a look at [kelindar/column](https://github.com/kelindar/column). I'd like to specifically point out the indexing part and how bitmaps can be used as a good alternative to B*Trees and Hash Maps.
30 |
31 | - [Boolean Algebra](#boolean-algebra)
32 | - [Single Bit Operations](#single-bit-operations)
33 | - [Bit Count and Search](#bit-count-and-search)
34 | - [Iterate and Filter](#iterate-and-filter)
35 | - [Example Usage](#example-usage)
36 | - [Benchmarks](#benchmarks)
37 | - [Contributing](#contributing)
38 |
39 | First, here's what you need to do in order to import this package.
40 |
41 | ```go
42 | import "github.com/kelindar/bitmap"
43 | ```
44 |
45 | ## Boolean Algebra
46 |
47 | Perhaps one of the most useful features of this package is the vectorized implementation of boolean operations allowing us to perform boolean algebra on multiple bitmaps. For example, let's imagine that we have a dataset containing books, and four bitmaps defining one of the four properties of each book. In the figure below, you can imagine that our books can be on "columns" and each bit in a bitmap defines whether this attribute exists on a book or not.
48 |
49 |
50 |
51 |
52 |
53 |
54 |
55 | Now, if we want to find all books that were recently published and have an ebook available, we can use the `And()` method on our two bitmaps in order to combine them. In the example below we retrieve 3 hypothetical bitmaps and combine them to answer our query by calling the `And()` method to mutate the `books` bitmap twice.
56 |
57 | ```go
58 | books := bitmapFor("books") // bitmap.Bitmap
59 | recent := bitmapFor("books_recent") // bitmap.Bitmap
60 | ebooks := bitmapFor("books_has_ebook") // bitmap.Bitmap
61 |
62 | // And operation actually mutates our "books" bitmap
63 | books.And(recent)
64 | books.And(ebooks)
65 | ```
66 |
67 |
68 |
69 |
70 |
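71 | Now, what if we want to find recently published books which have an e-book available but are *not* best-sellers? In that case, we could use the binary `AndNot()` operation that hardware exposes. In the example below we combine the two `And()` calls from before with an `AndNot()` call that removes the best-sellers from the result.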
72 |
73 | ```go
74 | books.And(recent)
75 | books.And(ebooks)
76 | books.AndNot(bestsellers)
77 | ```
78 |
79 |
80 |
81 |
82 |
83 | ## Single Bit Operations
84 |
85 | When dealing with single elements, this package supports simple single-bit operations. They include `Set()` and `Remove()` to set a bit to one and to zero respectively, as well as `Contains()` to check for the presence (value set to one) of a certain bit. These methods are simple to use, and setting a bit which is out of range automatically resizes the bitmap.
86 |
87 | In the example below we're creating a bitmap, setting one bit to one, checking its presence and setting it back to zero after.
88 |
89 | ```go
90 | var books bitmap.Bitmap
91 |
92 | books.Set(3) // Set the 3rd bit to '1'
93 | hasBook := books.Contains(3) // Returns 'true'
94 | books.Remove(3) // Set the 3rd bit to '0'
95 | ```
96 |
97 | ## Bit Count and Search
98 |
99 | When using a bitmap for indexing or free-list purposes, you will often find yourself in need of counting how many bits are set in a bitmap. This operation actually has a specialized hardware instruction `POPCNT` and an efficient implementation is included in this library. The example below shows how you can simply count the number of bits in a bitmap by calling the `Count()` method.
100 |
101 | ```go
102 | // Counts number of bits set to '1'
103 | numBooks := books.Count()
104 | ```
105 |
106 | On the other hand, you might want to find a specific bit either set to one or to zero, the methods `Min()`, `Max()` allow you to find first or last bit set to one while `MinZero()` and `MaxZero()` allow you to find first or last bit set to zero. The figure below demonstrates an example of that.
107 |
108 |
109 |
110 |
111 |
112 |
113 | ## Iterate and Filter
114 |
115 | The bits in the bitmap can also be iterated over using the `Range` method. It is a simple loop which iterates over and calls a callback. If the callback returns false, then the iteration is halted (similar to `sync.Map`).
116 |
117 | ```go
118 | // Iterate over the bits in the bitmap
119 | bitmap.Range(func(x uint32) bool {
120 | println(x)
121 | return true
122 | })
123 | ```
124 |
125 | Another way of iterating is using the `Filter` method. It iterates similarly to `Range` but the callback returns a boolean value, and if it returns `false` then the current bit will be cleared in the underlying bitmap. You could accomplish the same using `Range` and `Remove` but `Filter` is significantly faster.
126 |
127 | ```go
128 | // Filter iterates over the bits and applies a callback
129 | bitmap.Filter(func(x uint32) bool {
130 | return x % 2 == 0
131 | })
132 | ```
133 |
134 | ## Example Usage
135 |
136 | In its simplest form, you can use the bitmap as a bitset, set and remove bits. This is quite useful as an index (free/fill-list) for an array of data.
137 |
138 | ```go
139 | import "github.com/kelindar/bitmap"
140 | ```
141 |
142 | ```go
143 | var books bitmap.Bitmap
144 | books.Set(300) // sets 300-th bit
145 | books.Set(400) // sets 400-th bit
146 | books.Set(600) // sets 600-th bit (auto-resized)
147 | books.Contains(300) // returns true
148 | books.Contains(301) // returns false
149 | books.Remove(400) // clears 400-th bit
150 |
151 | // Min, Max, Count
152 | min, ok := books.Min() // returns 300
153 | max, ok := books.Max() // returns 600
154 | count := books.Count() // returns 2
155 |
156 | // Boolean algebra
157 | var other bitmap.Bitmap
158 | other.Set(300)
159 | books.And(other) // Intersection
160 | count = books.Count() // Now returns 1
161 | ```
162 |
163 | ## Benchmarks
164 | Benchmarks below were run on a pre-allocated bitmap of **100,000** elements with around 50% of the bits set to one.
165 |
166 | ```
167 | cpu: Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz
168 | BenchmarkBitmap/set-8 552331321 4.319 ns/op 0 B/op 0 allocs/op
169 | BenchmarkBitmap/remove-8 1000000000 1.621 ns/op 0 B/op 0 allocs/op
170 | BenchmarkBitmap/contains-8 1000000000 1.309 ns/op 0 B/op 0 allocs/op
171 | BenchmarkBitmap/clear-8 26083383 90.45 ns/op 0 B/op 0 allocs/op
172 | BenchmarkBitmap/ones-8 6751939 347.9 ns/op 0 B/op 0 allocs/op
173 | BenchmarkBitmap/min-8 757831477 3.137 ns/op 0 B/op 0 allocs/op
174 | BenchmarkBitmap/max-8 1000000000 1.960 ns/op 0 B/op 0 allocs/op
175 | BenchmarkBitmap/min-zero-8 776620110 3.081 ns/op 0 B/op 0 allocs/op
176 | BenchmarkBitmap/max-zero-8 1000000000 1.536 ns/op 0 B/op 0 allocs/op
177 | BenchmarkBitmap/count-8 6071037 382.5 ns/op 0 B/op 0 allocs/op
178 | BenchmarkBitmap/count-to-8 82777459 28.85 ns/op 0 B/op 0 allocs/op
179 | BenchmarkBitmap/clone-8 20654008 111.5 ns/op 0 B/op 0 allocs/op
180 | BenchmarkBitmap/and-8 16813963 143.6 ns/op 0 B/op 0 allocs/op
181 | BenchmarkBitmap/andnot-8 16961106 141.9 ns/op 0 B/op 0 allocs/op
182 | BenchmarkBitmap/or-8 16999562 141.7 ns/op 0 B/op 0 allocs/op
183 | BenchmarkBitmap/xor-8 16954036 144.7 ns/op 0 B/op 0 allocs/op
184 | BenchmarkRange/range-8 18225 131908 ns/op 0 B/op 0 allocs/op
185 | BenchmarkRange/filter-8 25636 93630 ns/op 0 B/op 0 allocs/op
186 | ```
187 |
188 | ## Contributing
189 |
190 | We are open to contributions, feel free to submit a pull request and we'll review it as quickly as we can. This library is maintained by [Roman Atachiants](https://www.linkedin.com/in/atachiants/)
191 |
192 | ## License
193 |
194 | Bitmap is licensed under the [MIT License](LICENSE).
195 |
--------------------------------------------------------------------------------
/bench/main.go:
--------------------------------------------------------------------------------
1 | package main
2 |
3 | import (
4 | "fmt"
5 | "time"
6 |
7 | "github.com/kelindar/bitmap"
8 | )
9 |
10 | func main() {
11 | const size = 10000000
12 | const iter = 500
13 | const inner = 500
14 |
15 | a := createBitmap(size)
16 | b := createBitmap(size)
17 |
18 | for i := 0; i < iter; i++ {
19 | start := time.Now()
20 |
21 | for j := 0; j < inner; j++ {
22 | a.And(b, b, b, b)
23 | //a.And(b)
24 | //a.And(b)
25 | //a.And(b)
26 | //a.And(b)
27 | }
28 |
29 | fmt.Printf("iteration %v took %v...\n", i*inner, time.Now().Sub(start))
30 | }
31 |
32 | }
33 |
34 | func createBitmap(size int) bitmap.Bitmap {
35 | index := make(bitmap.Bitmap, size/64)
36 | index.Grow(uint32(size - 1))
37 | for i := 0; i < len(index); i++ {
38 | index[i] = 0xf0f0f0f0f0f0f0f0
39 | }
40 | return index
41 | }
42 |
--------------------------------------------------------------------------------
/bitmap.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | package bitmap
5 |
6 | import (
7 | "math/bits"
8 | "unsafe"
9 |
10 | "github.com/klauspost/cpuid/v2"
11 | )
12 |
13 | const (
14 | isUnsupported = iota
15 | isAccelerated
16 | isAVX512
17 | )
18 |
19 | // Hardware contains the resolved acceleration level
20 | var hardware = levelOf(cpuid.CPU)
21 |
22 | // levelOf returns the hardware acceleration level
23 | func levelOf(cpu cpuid.CPUInfo) int {
24 | switch {
25 | case cpu.Supports(cpuid.AVX512F) && cpu.Supports(cpuid.AVX512DQ) && cpu.Supports(cpuid.AVX512BW):
26 | return isAVX512
27 | case cpu.Supports(cpuid.AVX2) && cpu.Supports(cpuid.FMA3):
28 | return isAccelerated
29 | case cpu.Supports(cpuid.ASIMD):
30 | return isAccelerated
31 | default:
32 | return isUnsupported
33 | }
34 | }
35 |
// Bitmap is a bitset stored as a slice of 64-bit blocks; bit x lives in block
// x/64 at position x%64.
type Bitmap []uint64
38 |
39 | // Set sets the bit x in the bitmap and grows it if necessary.
40 | func (dst *Bitmap) Set(x uint32) {
41 | blkAt := int(x >> 6)
42 | bitAt := int(x % 64)
43 | if size := len(*dst); blkAt >= size {
44 | dst.grow(blkAt)
45 | }
46 |
47 | (*dst)[blkAt] |= (1 << bitAt)
48 | }
49 |
50 | // Remove removes the bit x from the bitmap, but does not shrink it.
51 | func (dst *Bitmap) Remove(x uint32) {
52 | if blkAt := int(x >> 6); blkAt < len(*dst) {
53 | bitAt := int(x % 64)
54 | (*dst)[blkAt] &^= (1 << bitAt)
55 | }
56 | }
57 |
58 | // Contains checks whether a value is contained in the bitmap or not.
59 | func (dst Bitmap) Contains(x uint32) bool {
60 | blkAt := int(x >> 6)
61 | if size := len(dst); blkAt >= size {
62 | return false
63 | }
64 |
65 | bitAt := int(x % 64)
66 | return (dst[blkAt] & (1 << bitAt)) > 0
67 | }
68 |
69 | // Ones sets the entire bitmap to one.
70 | func (dst Bitmap) Ones() {
71 | for i := 0; i < len(dst); i++ {
72 | dst[i] = 0xffffffffffffffff
73 | }
74 | }
75 |
76 | // Min get the smallest value stored in this bitmap, assuming the bitmap is not empty.
77 | func (dst Bitmap) Min() (uint32, bool) {
78 | for blkAt, blk := range dst {
79 | if blk != 0x0 {
80 | return uint32(blkAt<<6 + bits.TrailingZeros64(blk)), true
81 | }
82 | }
83 |
84 | return 0, false
85 | }
86 |
87 | // Max get the largest value stored in this bitmap, assuming the bitmap is not empty.
88 | func (dst Bitmap) Max() (uint32, bool) {
89 | var blk uint64
90 | for blkAt := len(dst) - 1; blkAt >= 0; blkAt-- {
91 | if blk = dst[blkAt]; blk != 0x0 {
92 | return uint32(blkAt<<6 + (63 - bits.LeadingZeros64(blk))), true
93 | }
94 | }
95 | return 0, false
96 | }
97 |
98 | // MinZero finds the first zero bit and returns its index, assuming the bitmap is not empty.
99 | func (dst Bitmap) MinZero() (uint32, bool) {
100 | for blkAt, blk := range dst {
101 | if blk != 0xffffffffffffffff {
102 | return uint32(blkAt<<6 + bits.TrailingZeros64(^blk)), true
103 | }
104 | }
105 | return 0, false
106 | }
107 |
108 | // MaxZero get the last zero bit and return its index, assuming bitmap is not empty
109 | func (dst Bitmap) MaxZero() (uint32, bool) {
110 | var blk uint64
111 | for blkAt := len(dst) - 1; blkAt >= 0; blkAt-- {
112 | if blk = dst[blkAt]; blk != 0xffffffffffffffff {
113 | return uint32(blkAt<<6 + (63 - bits.LeadingZeros64(^blk))), true
114 | }
115 | }
116 | return 0, false
117 | }
118 |
119 | // CountTo counts the number of elements in the bitmap up until the specified index. If until
120 | // is math.MaxUint32, it will return the count. The count is non-inclusive of the index.
121 | func (dst Bitmap) CountTo(until uint32) int {
122 | if len(dst) == 0 {
123 | return 0
124 | }
125 |
126 | if maxUntil := uint32(len(dst) << 6); until > maxUntil {
127 | until = maxUntil
128 | }
129 |
130 | // Figure out the index of the last block
131 | blkUntil := until >> 6
132 | bitUntil := until % 64
133 |
134 | // Count the bits right before the last block
135 | sum := dst[:blkUntil].Count()
136 |
137 | // Count the bits at the end
138 | if bitUntil > 0 {
139 | sum += bits.OnesCount64(dst[blkUntil] << (64 - uint64(bitUntil)))
140 | }
141 | return sum
142 | }
143 |
144 | // Grow grows the bitmap size until we reach the desired bit.
145 | func (dst *Bitmap) Grow(desiredBit uint32) {
146 | dst.grow(int(desiredBit >> 6))
147 | }
148 |
149 | // grow grows the size of the bitmap until we reach the desired block offset
150 | func (dst *Bitmap) grow(blkAt int) {
151 | if len(*dst) > blkAt {
152 | return
153 | }
154 |
155 | // If there's space, resize the slice without copying.
156 | if cap(*dst) > blkAt {
157 | *dst = (*dst)[:blkAt+1]
158 | return
159 | }
160 |
161 | old := *dst
162 | *dst = make(Bitmap, blkAt+1, resize(cap(old), blkAt+1))
163 | copy(*dst, old)
164 | }
165 |
166 | // shrink shrinks the size of the bitmap and resets to zero
167 | func (dst *Bitmap) shrink(length int) {
168 | until := len(*dst)
169 | for i := length; i < until; i++ {
170 | (*dst)[i] = 0
171 | }
172 |
173 | // Trim without reallocating
174 | *dst = (*dst)[:length]
175 | }
176 |
177 | // minlen calculates the minimum length of all of the bitmaps
178 | func minlen(a, b Bitmap, extra []Bitmap) int {
179 | size := minint(len(a), len(b))
180 | for _, v := range extra {
181 | if m := minint(len(a), len(v)); m < size {
182 | size = m
183 | }
184 | }
185 | return size
186 | }
187 |
188 | // maxlen calculates the maximum length of all of the bitmaps
189 | func maxlen(a, b Bitmap, extra []Bitmap) int {
190 | size := maxint(len(a), len(b))
191 | for _, v := range extra {
192 | if m := maxint(len(a), len(v)); m > size {
193 | size = m
194 | }
195 | }
196 | return size
197 | }
198 |
// maxint returns the larger of two integers without branches.
//
// The arithmetic shift by the word size minus one turns the sign of v1-v2 into
// an all-ones or all-zeros mask. A fixed shift of 31 only does that for 32-bit
// ints and gives wrong results on 64-bit platforms once the difference reaches
// 2^31.
func maxint(v1, v2 int) int {
	diff := v1 - v2
	return v1 - (diff & (diff >> (bits.UintSize - 1)))
}
203 |
// minint returns the smaller of two integers without branches.
//
// The arithmetic shift by the word size minus one turns the sign of v1-v2 into
// an all-ones or all-zeros mask. A fixed shift of 31 only does that for 32-bit
// ints and gives wrong results on 64-bit platforms once the difference reaches
// 2^31.
func minint(v1, v2 int) int {
	diff := v1 - v2
	return v2 + (diff & (diff >> (bits.UintSize - 1)))
}
208 |
// resize computes the capacity to allocate so that v blocks fit. Below the
// threshold it returns the next power of two strictly greater than v; at or
// above it, the capacity (raised to at least the threshold) is increased in
// steps of (capacity + 3*threshold)/4 until it exceeds v.
func resize(capacity, v int) int {
	const threshold = 256
	if v < threshold {
		// Smear the highest set bit into all lower positions, then add one
		for shift := 1; shift <= 16; shift <<= 1 {
			v |= v >> shift
		}
		return v + 1
	}

	grown := capacity
	if grown < threshold {
		grown = threshold
	}

	for grown > 0 && grown < v+1 {
		grown += (grown + 3*threshold) / 4
	}
	return grown
}
231 |
// dimensionsOf packs two dimensions into one uint64: n goes into the low bits
// and m is shifted into the upper 32 bits. n is not masked, so it is expected
// to fit in 32 bits.
func dimensionsOf(n, m int) uint64 {
	low := uint64(n)
	high := uint64(m) << 32
	return high | low
}
236 |
237 | // pointersOf returns a pointer to an array containing pointers to the
238 | // first element of each bitmap and the maximum length of all bitmaps
239 | func pointersOf(other Bitmap, extra []Bitmap) (unsafe.Pointer, int) {
240 | out := make([]unsafe.Pointer, len(extra)+1)
241 | out[0] = unsafe.Pointer(&other[0])
242 | max := 0
243 |
244 | for i := range extra {
245 | out[i+1] = unsafe.Pointer(&extra[i][0])
246 | if len(extra[i]) > max {
247 | max = len(extra[i])
248 | }
249 | }
250 |
251 | return unsafe.Pointer(&out[0]), max
252 | }
253 |
--------------------------------------------------------------------------------
/bitmap_amd64.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | //go:build !noasm && amd64
5 |
6 | package bitmap
7 |
8 | import "unsafe"
9 |
10 | // And computes the intersection between two bitmaps and stores the result in the current bitmap
11 | func (dst *Bitmap) And(other Bitmap, extra ...Bitmap) {
12 | max := minlen(*dst, other, extra)
13 | dst.shrink(max)
14 | if max == 0 {
15 | return
16 | }
17 |
18 | switch hardware {
19 | case isAccelerated:
20 | switch len(extra) {
21 | case 0:
22 | _and(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(max))
23 | default:
24 | vx, _ := pointersOf(other, extra)
25 | _and_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
26 | }
27 | case isAVX512:
28 | switch len(extra) {
29 | case 0:
30 | _and_avx512(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(max))
31 | default:
32 | vx, _ := pointersOf(other, extra)
33 | _and_many_avx512(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
34 | }
35 | default:
36 | and(*dst, max, other, extra)
37 | return
38 | }
39 | }
40 |
41 | // AndNot computes the difference between two bitmaps and stores the result in the current bitmap.
42 | // Operation works as set subtract: dst - b
43 | func (dst *Bitmap) AndNot(other Bitmap, extra ...Bitmap) {
44 | max := minlen(*dst, other, extra)
45 | if max == 0 {
46 | return
47 | }
48 |
49 | switch hardware {
50 | case isAccelerated:
51 | switch len(extra) {
52 | case 0:
53 | _andn(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(max))
54 | default:
55 | vx, _ := pointersOf(other, extra)
56 | _andn_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
57 | }
58 | case isAVX512:
59 | switch len(extra) {
60 | case 0:
61 | _andn_avx512(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(max))
62 | default:
63 | vx, _ := pointersOf(other, extra)
64 | _andn_many_avx512(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
65 | }
66 | default:
67 | andn(*dst, max, other, extra)
68 | return
69 | }
70 | }
71 |
72 | // Or computes the union between two bitmaps and stores the result in the current bitmap
73 | func (dst *Bitmap) Or(other Bitmap, extra ...Bitmap) {
74 | max := maxlen(*dst, other, extra)
75 | if max == 0 {
76 | return
77 | }
78 |
79 | dst.grow(max - 1)
80 | switch hardware {
81 | case isAccelerated:
82 | switch len(extra) {
83 | case 0:
84 | _or(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(len(other)))
85 | default:
86 | vx, max := pointersOf(other, extra)
87 | _or_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
88 | }
89 | case isAVX512:
90 | switch len(extra) {
91 | case 0:
92 | _or_avx512(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(len(other)))
93 | default:
94 | vx, max := pointersOf(other, extra)
95 | _or_many_avx512(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
96 | }
97 | default:
98 | or(*dst, other, extra)
99 | }
100 | }
101 |
102 | // Xor computes the symmetric difference between two bitmaps and stores the result in the current bitmap
103 | func (dst *Bitmap) Xor(other Bitmap, extra ...Bitmap) {
104 | max := maxlen(*dst, other, extra)
105 | if max == 0 {
106 | return
107 | }
108 |
109 | dst.grow(max - 1)
110 | switch hardware {
111 | case isAccelerated:
112 | switch len(extra) {
113 | case 0:
114 | _xor(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(len(other)))
115 | default:
116 | vx, max := pointersOf(other, extra)
117 | _xor_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
118 | }
119 | case isAVX512:
120 | switch len(extra) {
121 | case 0:
122 | _xor_avx512(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(len(other)))
123 | default:
124 | vx, max := pointersOf(other, extra)
125 | _xor_many_avx512(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
126 | }
127 | default:
128 | xor(*dst, other, extra)
129 | }
130 | }
131 |
132 | // Count returns the number of elements in this bitmap
133 | func (dst Bitmap) Count() int {
134 | if len(dst) == 0 {
135 | return 0
136 | }
137 |
138 | switch hardware {
139 | case isAccelerated:
140 | var res uint64
141 | _count(unsafe.Pointer(&dst[0]), uint64(len(dst)), unsafe.Pointer(&res))
142 | return int(res)
143 | default:
144 | return count(dst)
145 | }
146 | }
147 |
--------------------------------------------------------------------------------
/bitmap_arm64.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | //go:build !noasm && arm64
5 |
6 | package bitmap
7 |
8 | import "unsafe"
9 |
10 | // And computes the intersection between two bitmaps and stores the result in the current bitmap
11 | func (dst *Bitmap) And(other Bitmap, extra ...Bitmap) {
12 | max := minlen(*dst, other, extra)
13 | dst.shrink(max)
14 | if max == 0 {
15 | return
16 | }
17 |
18 | switch hardware {
19 | case isAccelerated:
20 | switch len(extra) {
21 | case 0:
22 | _and(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(max))
23 | default:
24 | vx, _ := pointersOf(other, extra)
25 | _and_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
26 | }
27 | default:
28 | and(*dst, max, other, extra)
29 | return
30 | }
31 | }
32 |
33 | // AndNot computes the difference between two bitmaps and stores the result in the current bitmap.
34 | // Operation works as set subtract: dst - b
35 | func (dst *Bitmap) AndNot(other Bitmap, extra ...Bitmap) {
36 | max := minlen(*dst, other, extra)
37 | if max == 0 {
38 | return
39 | }
40 |
41 | switch hardware {
42 | case isAccelerated:
43 | switch len(extra) {
44 | case 0:
45 | _andn(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(max))
46 | default:
47 | vx, _ := pointersOf(other, extra)
48 | _andn_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
49 | }
50 | default:
51 | andn(*dst, max, other, extra)
52 | return
53 | }
54 | }
55 |
56 | // Or computes the union between two bitmaps and stores the result in the current bitmap
57 | func (dst *Bitmap) Or(other Bitmap, extra ...Bitmap) {
58 | max := maxlen(*dst, other, extra)
59 | if max == 0 {
60 | return
61 | }
62 |
63 | dst.grow(max - 1)
64 | switch hardware {
65 | case isAccelerated:
66 | switch len(extra) {
67 | case 0:
68 | _or(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(len(other)))
69 | default:
70 | vx, max := pointersOf(other, extra)
71 | _or_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
72 | }
73 | default:
74 | or(*dst, other, extra)
75 | }
76 | }
77 |
78 | // Xor computes the symmetric difference between two bitmaps and stores the result in the current bitmap
79 | func (dst *Bitmap) Xor(other Bitmap, extra ...Bitmap) {
80 | max := maxlen(*dst, other, extra)
81 | if max == 0 {
82 | return
83 | }
84 |
85 | dst.grow(max - 1)
86 | switch hardware {
87 | case isAccelerated:
88 | switch len(extra) {
89 | case 0:
90 | _xor(unsafe.Pointer(&(*dst)[0]), unsafe.Pointer(&other[0]), uint64(len(other)))
91 | default:
92 | vx, max := pointersOf(other, extra)
93 | _xor_many(unsafe.Pointer(&(*dst)[0]), vx, dimensionsOf(max, len(extra)+1))
94 | }
95 | default:
96 | xor(*dst, other, extra)
97 | }
98 | }
99 |
100 | // Count returns the number of elements in this bitmap
101 | func (dst Bitmap) Count() int {
102 | if len(dst) == 0 {
103 | return 0
104 | }
105 |
106 | switch hardware {
107 | case isAccelerated:
108 | var res uint64
109 | _count(unsafe.Pointer(&dst[0]), uint64(len(dst)), unsafe.Pointer(&res))
110 | return int(res)
111 | default:
112 | return count(dst)
113 | }
114 | }
115 |
--------------------------------------------------------------------------------
/bitmap_generic.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | //go:build noasm || (!amd64 && !arm64)
5 |
6 | package bitmap
7 |
8 | // And computes the intersection between two bitmaps and stores the result in the current bitmap
9 | func (dst *Bitmap) And(other Bitmap, extra ...Bitmap) {
10 | max := minlen(*dst, other, extra)
11 | dst.shrink(max)
12 | and(*dst, max, other, extra)
13 | }
14 |
15 | // AndNot computes the difference between two bitmaps and stores the result in the current bitmap
16 | func (dst *Bitmap) AndNot(other Bitmap, extra ...Bitmap) {
17 | max := minlen(*dst, other, extra)
18 | andn(*dst, max, other, extra)
19 | }
20 |
21 | // Or computes the union between two bitmaps and stores the result in the current bitmap
22 | func (dst *Bitmap) Or(other Bitmap, extra ...Bitmap) {
23 | max := maxlen(*dst, other, extra)
24 | dst.grow(max - 1)
25 | or(*dst, other, extra)
26 | }
27 |
28 | // Xor computes the symmetric difference between two bitmaps and stores the result in the current bitmap
29 | func (dst *Bitmap) Xor(other Bitmap, extra ...Bitmap) {
30 | max := maxlen(*dst, other, extra)
31 | dst.grow(max - 1)
32 | xor(*dst, other, extra)
33 | }
34 |
35 | // Count returns the number of elements in this bitmap
36 | func (dst Bitmap) Count() int {
37 | return count(dst)
38 | }
39 |
--------------------------------------------------------------------------------
/bitmap_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | package bitmap
5 |
6 | import (
7 | "fmt"
8 | "math"
9 | "strconv"
10 | "testing"
11 |
12 | "github.com/stretchr/testify/assert"
13 | )
14 |
15 | /*
16 | cpu: 13th Gen Intel(R) Core(TM) i7-13700K
17 | BenchmarkBitmap/set-24 655739137 1.803 ns/op 0 B/op 0 allocs/op
18 | BenchmarkBitmap/remove-24 1000000000 1.107 ns/op 0 B/op 0 allocs/op
19 | BenchmarkBitmap/contains-24 1000000000 0.8975 ns/op 0 B/op 0 allocs/op
20 | BenchmarkBitmap/clear-24 827574 1487 ns/op 0 B/op 0 allocs/op
21 | BenchmarkBitmap/ones-24 571444 2088 ns/op 0 B/op 0 allocs/op
22 | BenchmarkBitmap/min-24 979591036 1.252 ns/op 0 B/op 0 allocs/op
23 | BenchmarkBitmap/max-24 944884120 1.229 ns/op 0 B/op 0 allocs/op
24 | BenchmarkBitmap/min-zero-24 991736356 1.258 ns/op 0 B/op 0 allocs/op
25 | BenchmarkBitmap/max-zero-24 1000000000 1.157 ns/op 0 B/op 0 allocs/op
26 | BenchmarkBitmap/count-24 393440 3086 ns/op 0 B/op 0 allocs/op
27 | BenchmarkBitmap/count-to-24 58537441 20.20 ns/op 0 B/op 0 allocs/op
28 | BenchmarkBitmap/clone-24 648651 1875 ns/op 0 B/op 0 allocs/op
29 | BenchmarkBitmap/and-24 685710 1733 ns/op 0 B/op 0 allocs/op
30 | BenchmarkBitmap/andnot-24 705882 1709 ns/op 0 B/op 0 allocs/op
31 | BenchmarkBitmap/or-24 705894 1702 ns/op 0 B/op 0 allocs/op
32 | BenchmarkBitmap/xor-24 705919 1721 ns/op 0 B/op 0 allocs/op
33 | */
34 | func BenchmarkBitmap(b *testing.B) {
35 | other := make(Bitmap, 1000000/64)
36 | other.Set(1000000)
37 |
38 | run(b, "set", func(index Bitmap) {
39 | index.Set(5000)
40 | })
41 |
42 | run(b, "remove", func(index Bitmap) {
43 | index.Remove(5000)
44 | })
45 |
46 | run(b, "contains", func(index Bitmap) {
47 | index.Contains(5000)
48 | })
49 |
50 | run(b, "clear", func(index Bitmap) {
51 | index.Clear()
52 | })
53 |
54 | run(b, "ones", func(index Bitmap) {
55 | index.Ones()
56 | })
57 |
58 | run(b, "min", func(index Bitmap) {
59 | index.Min()
60 | })
61 |
62 | run(b, "max", func(index Bitmap) {
63 | index.Max()
64 | })
65 |
66 | run(b, "min-zero", func(index Bitmap) {
67 | index.MinZero()
68 | })
69 |
70 | run(b, "max-zero", func(index Bitmap) {
71 | index.MaxZero()
72 | })
73 |
74 | run(b, "count", func(index Bitmap) {
75 | index.Count()
76 | })
77 |
78 | run(b, "count-to", func(index Bitmap) {
79 | index.CountTo(5001)
80 | })
81 |
82 | into := make(Bitmap, len(other))
83 | run(b, "clone", func(index Bitmap) {
84 | index.Clone(&into)
85 | })
86 |
87 | run(b, "and", func(index Bitmap) {
88 | index.And(other)
89 | })
90 |
91 | run(b, "andnot", func(index Bitmap) {
92 | index.AndNot(other)
93 | })
94 |
95 | run(b, "or", func(index Bitmap) {
96 | index.AndNot(other)
97 | })
98 |
99 | run(b, "xor", func(index Bitmap) {
100 | index.AndNot(other)
101 | })
102 | }
103 |
104 | /*
105 | cpu: 13th Gen Intel(R) Core(TM) i7-13700K
106 | BenchmarkMany/and4-noasm-24 66297 18139 ns/op 0 B/op 0 allocs/op
107 | BenchmarkMany/and4-naive-24 179106 6803 ns/op 0 B/op 0 allocs/op
108 | BenchmarkMany/and4-batch-24 258091 4679 ns/op 32 B/op 1 allocs/op
109 | */
110 | func BenchmarkMany(b *testing.B) {
111 | other := make(Bitmap, 1000000/64)
112 | other.Set(1000000)
113 |
114 | run(b, "and4-noasm", func(index Bitmap) {
115 | max := minlen(index, other, nil)
116 | index.shrink(max)
117 | and(index, max, other, nil)
118 | and(index, max, other, nil)
119 | and(index, max, other, nil)
120 | and(index, max, other, nil)
121 | })
122 |
123 | run(b, "and4-naive", func(index Bitmap) {
124 | index.And(other)
125 | index.And(other)
126 | index.And(other)
127 | index.And(other)
128 | })
129 |
130 | run(b, "and4-batch", func(index Bitmap) {
131 | index.And(other, other, other, other)
132 | })
133 | }
134 |
135 | func TestSetRemove(t *testing.T) {
136 | index := Bitmap{}
137 | for i := uint32(100); i < 200; i++ {
138 | index.Set(i)
139 | assert.True(t, index.Contains(i))
140 | }
141 |
142 | for i := uint32(150); i < 180; i++ {
143 | index.Remove(i)
144 | assert.False(t, index.Contains(i))
145 | }
146 | }
147 |
148 | func TestClear(t *testing.T) {
149 | index := Bitmap{}
150 | for i := uint32(0); i < 500; i++ {
151 | index.Set(i)
152 | assert.True(t, index.Contains(i))
153 | }
154 |
155 | index.Clear()
156 | index.Set(500)
157 | for i := uint32(0); i < 500; i++ {
158 | assert.False(t, index.Contains(i), i)
159 | }
160 | assert.True(t, index.Contains(500))
161 | }
162 |
163 | func TestAnd(t *testing.T) {
164 | a, b := Bitmap{}, Bitmap{}
165 | for i := uint32(0); i < 100; i += 2 {
166 | a.Set(i)
167 | b.Set(i)
168 | }
169 |
170 | a.And(b)
171 | assert.False(t, a.Contains(1))
172 | for i := uint32(0); i < 100; i += 2 {
173 | assert.True(t, a.Contains(i))
174 | }
175 | }
176 |
177 | func TestAndNot(t *testing.T) {
178 | a, b := Bitmap{}, Bitmap{}
179 | for i := uint32(0); i < 100; i += 2 {
180 | a.Set(i)
181 | b.Set(i)
182 | }
183 |
184 | a.AndNot(b)
185 | assert.False(t, a.Contains(1))
186 | for i := uint32(0); i < 100; i += 2 {
187 | assert.False(t, a.Contains(i))
188 | }
189 | }
190 |
191 | func TestAndNot_TheSameBitmap(t *testing.T) {
192 | var a Bitmap
193 | for i := uint32(0); i < 100; i += 2 {
194 | a.Set(i)
195 | }
196 |
197 | a.AndNot(a)
198 |
199 | for i := uint32(0); i < 100; i++ {
200 | assert.Equal(t, false, a.Contains(i), "for "+strconv.Itoa(int(i)))
201 | }
202 | assert.Equal(t, 0, a.Count())
203 | }
204 |
205 | func TestAndNot_DifferentBitmapSizes(t *testing.T) {
206 | var a, b, c, d Bitmap
207 | for i := uint32(0); i < 100; i += 2 {
208 | a.Set(i)
209 | c.Set(i)
210 | }
211 |
212 | for i := uint32(0); i < 200; i += 2 {
213 | b.Set(i)
214 | d.Set(i)
215 | }
216 | a.AndNot(b)
217 | d.AndNot(c)
218 |
219 | for i := uint32(0); i < 100; i++ {
220 | assert.Equal(t, false, a.Contains(i), "for "+strconv.Itoa(int(i)))
221 | assert.Equal(t, false, d.Contains(i), "for "+strconv.Itoa(int(i)))
222 | }
223 | for i := uint32(100); i < 200; i++ {
224 | assert.Equal(t, b.Contains(i), d.Contains(i), "for "+strconv.Itoa(int(i)))
225 | }
226 | assert.Equal(t, 0, a.Count())
227 | assert.Equal(t, 50, d.Count())
228 | }
229 |
230 | func TestOr(t *testing.T) {
231 | a, b := Bitmap{}, Bitmap{}
232 | for i := uint32(0); i < 100; i += 2 {
233 | b.Set(i)
234 | }
235 |
236 | a.Or(b)
237 | assert.False(t, a.Contains(1))
238 | for i := uint32(0); i < 100; i += 2 {
239 | assert.True(t, a.Contains(i))
240 | }
241 | }
242 |
243 | func TestOr_DifferentBitmapSizes(t *testing.T) {
244 | var a, b, c, d Bitmap
245 | for i := uint32(0); i < 100; i += 2 {
246 | a.Set(i)
247 | c.Set(i)
248 | }
249 |
250 | for i := uint32(0); i < 200; i += 2 {
251 | b.Set(i)
252 | d.Set(i)
253 | }
254 | a.Or(b)
255 | d.Or(c)
256 |
257 | for i := uint32(0); i < 200; i++ {
258 | assert.Equal(t, d.Contains(i), a.Contains(i), "for "+strconv.Itoa(int(i)))
259 | }
260 | assert.Equal(t, 100, a.Count())
261 | assert.Equal(t, 100, d.Count())
262 | }
263 |
264 | func TestXor(t *testing.T) {
265 | a, b := Bitmap{}, Bitmap{}
266 | for i := uint32(0); i < 100; i += 2 {
267 | b.Set(i)
268 | }
269 |
270 | a.Xor(b)
271 | assert.False(t, a.Contains(1))
272 | for i := uint32(0); i < 100; i += 2 {
273 | assert.True(t, a.Contains(i))
274 | }
275 | }
276 |
277 | func TestXOr_DifferentBitmapSizes(t *testing.T) {
278 | var a, b, c, d Bitmap
279 | for i := uint32(0); i < 100; i += 2 {
280 | a.Set(i)
281 | c.Set(i)
282 | }
283 |
284 | for i := uint32(0); i < 200; i += 2 {
285 | b.Set(i)
286 | d.Set(i)
287 | }
288 | a.Xor(b)
289 | d.Xor(c)
290 |
291 | for i := uint32(0); i < 200; i++ {
292 | assert.Equal(t, d.Contains(i), a.Contains(i), "for "+strconv.Itoa(int(i)))
293 | }
294 | assert.Equal(t, 50, a.Count())
295 | assert.Equal(t, 50, d.Count())
296 | }
297 |
298 | func TestMin(t *testing.T) {
299 | {
300 | a := Bitmap{0x0, 0x0, 0xffffffffffffff00}
301 | v, ok := a.Min()
302 | assert.True(t, ok)
303 | assert.Equal(t, 64+64+8, int(v))
304 | assert.False(t, a.Contains(v-1))
305 | assert.True(t, a.Contains(v))
306 | }
307 |
308 | {
309 | a := Bitmap{0x0, 0x0}
310 | v, ok := a.Min()
311 | assert.False(t, ok)
312 | assert.Equal(t, 0, int(v))
313 | }
314 | }
315 |
316 | func TestMax(t *testing.T) {
317 | {
318 | a := Bitmap{0x0, 0x0, 0x00000000000000f0}
319 | v, ok := a.Max()
320 | assert.True(t, ok)
321 | assert.Equal(t, 64+64+7, int(v))
322 |
323 | assert.False(t, a.Contains(v-4))
324 | assert.True(t, a.Contains(v-3))
325 | assert.True(t, a.Contains(v-2))
326 | assert.True(t, a.Contains(v-1))
327 | assert.True(t, a.Contains(v))
328 | assert.False(t, a.Contains(v+1))
329 | assert.False(t, a.Contains(v+2))
330 | }
331 |
332 | {
333 | a := Bitmap{0x0, 0x0}
334 | v, ok := a.Max()
335 | assert.False(t, ok)
336 | assert.Equal(t, 0, int(v))
337 | }
338 | }
339 |
340 | func TestMinZero(t *testing.T) {
341 | {
342 | a := Bitmap{0xffffffffffffffff, 0xffffffffffffffff, 0xf0ffffffffffff0f}
343 | v, ok := a.MinZero()
344 | assert.True(t, ok)
345 | assert.Equal(t, 64+64+4, int(v))
346 | assert.False(t, a.Contains(v))
347 | }
348 |
349 | {
350 | a := Bitmap{0xffffffffffffffff, 0xffffffffffffffff}
351 | v, ok := a.MinZero()
352 | assert.False(t, ok)
353 | assert.Equal(t, 0, int(v))
354 | }
355 | }
356 |
357 | func TestMaxZero(t *testing.T) {
358 | {
359 | a := Bitmap{0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffff0f}
360 | v, ok := a.MaxZero()
361 | assert.True(t, ok)
362 | assert.Equal(t, 64+64+7, int(v))
363 | assert.False(t, a.Contains(v))
364 | }
365 |
366 | {
367 | a := Bitmap{0xffffffffffffffff, 0xffffffffffffffff}
368 | v, ok := a.MaxZero()
369 | assert.False(t, ok)
370 | assert.Equal(t, 0, int(v))
371 | }
372 | }
373 |
374 | func TestCount(t *testing.T) {
375 | a := Bitmap{}
376 | assert.Equal(t, 0, a.Count())
377 | assert.Equal(t, 0, a.CountTo(math.MaxUint32))
378 |
379 | b := Bitmap{}
380 | b.Set(1)
381 | b.Set(2)
382 | b.Set(5)
383 | b.Set(6)
384 | b.Set(101)
385 | b.Set(102)
386 | b.Set(105)
387 | b.Set(106)
388 |
389 | assert.Equal(t, 8, b.Count())
390 | assert.Equal(t, 1, b.CountTo(2))
391 | assert.Equal(t, 2, b.CountTo(4))
392 | assert.Equal(t, 4, b.CountTo(100))
393 | assert.Equal(t, 4, b.CountTo(101))
394 | assert.Equal(t, 5, b.CountTo(102))
395 | assert.Equal(t, 8, b.CountTo(128))
396 | assert.Equal(t, 8, b.CountTo(math.MaxUint32))
397 |
398 | b.Set(127)
399 |
400 | assert.Equal(t, 9, b.CountTo(128))
401 | }
402 |
403 | func TestGrow(t *testing.T) {
404 | bitmap := make(Bitmap, 1, 5)
405 | bitmap[0] = 42
406 |
407 | assert.Equal(t, 1, len(bitmap))
408 | assert.Equal(t, 5, cap(bitmap))
409 | assert.Equal(t, Bitmap{42}, bitmap)
410 |
411 | bitmap.grow(0)
412 | assert.Equal(t, 1, len(bitmap))
413 | assert.Equal(t, 5, cap(bitmap))
414 | assert.Equal(t, Bitmap{42}, bitmap)
415 |
416 | bitmap.grow(4)
417 | assert.Equal(t, 5, len(bitmap))
418 | assert.Equal(t, 5, cap(bitmap))
419 | assert.Equal(t, Bitmap{42, 0, 0, 0, 0}, bitmap)
420 |
421 | bitmap.grow(5)
422 | assert.Equal(t, 6, len(bitmap))
423 | assert.Equal(t, Bitmap{42, 0, 0, 0, 0, 0}, bitmap)
424 | bitmap.Grow(6)
425 | }
426 |
427 | func TestAnd_DifferentBitmapSizes(t *testing.T) {
428 | var a, b, c, d Bitmap
429 | for i := uint32(0); i < 100; i += 2 {
430 | a.Set(i)
431 | c.Set(i)
432 | }
433 |
434 | for i := uint32(0); i < 200; i += 2 {
435 | b.Set(i)
436 | d.Set(i)
437 | }
438 |
439 | a.And(b)
440 | d.And(c)
441 |
442 | for i := uint32(0); i < 200; i++ {
443 | assert.Equal(t, a.Contains(i), d.Contains(i), "for "+strconv.Itoa(int(i)))
444 | }
445 | assert.Equal(t, 50, a.Count())
446 | assert.Equal(t, 50, d.Count())
447 | }
448 |
449 | func TestAnd_ConsecutiveAnd_DifferentBitmapSizes(t *testing.T) {
450 | var a, b, c Bitmap
451 | for i := uint32(0); i < 200; i += 2 {
452 | a.Set(i)
453 | c.Set(i)
454 | }
455 |
456 | for i := uint32(0); i < 100; i += 2 {
457 | b.Set(i)
458 | }
459 |
460 | a.And(b)
461 | a.And(c)
462 |
463 | for i := uint32(0); i < 200; i++ {
464 | assert.Equal(t, a.Contains(i), b.Contains(i), "for "+strconv.Itoa(int(i)))
465 | }
466 | assert.Equal(t, 50, a.Count())
467 | }
468 |
469 | func TestResizeBitmap(t *testing.T) {
470 | assert.Equal(t, 1, resize(100, 0))
471 | assert.Equal(t, 2, resize(100, 1))
472 | assert.Equal(t, 4, resize(100, 2))
473 | assert.Equal(t, 16, resize(100, 11))
474 | assert.Equal(t, 256, resize(100, 255))
475 | assert.Equal(t, 1232, resize(100, 1000))
476 | assert.Equal(t, 1232, resize(200, 1000))
477 | assert.Equal(t, 1232, resize(512, 1000))
478 | assert.Equal(t, 1213, resize(500, 1000)) // Inconsistent
479 | assert.Equal(t, 22504, resize(512, 20000))
480 | assert.Equal(t, 28322, resize(22504, 22600))
481 | }
482 |
483 | func TestMinInteger(t *testing.T) {
484 | tests := [][3]int{
485 | {10, 20, 10},
486 | {20, 10, 10},
487 | {0, 10, 0},
488 | {10, 0, 0},
489 | {10, 10, 10},
490 | {10, -10, -10},
491 | {-10, 10, -10},
492 | {-10, 0, -10},
493 | {-10, -10, -10},
494 | }
495 |
496 | for _, tc := range tests {
497 | assert.Equal(t, tc[2], minint(tc[0], tc[1]), fmt.Sprintf("min(%v, %v)", tc[0], tc[1]))
498 | }
499 | }
500 |
501 | func TestMaxInteger(t *testing.T) {
502 | tests := [][3]int{
503 | {10, 20, 20},
504 | {20, 10, 20},
505 | {0, 10, 10},
506 | {10, 0, 10},
507 | {10, 10, 10},
508 | {10, -10, 10},
509 | {-10, 10, 10},
510 | {-10, 0, 0},
511 | {-10, -10, -10},
512 | }
513 |
514 | for _, tc := range tests {
515 | assert.Equal(t, tc[2], maxint(tc[0], tc[1]), fmt.Sprintf("max(%v, %v)", tc[0], tc[1]))
516 | }
517 | }
518 |
519 | func TestBatched(t *testing.T) {
520 | const bits = 0b0011
521 |
522 | // Functions to test
523 | tests := []func(Bitmap) func(Bitmap, ...Bitmap){
524 | func(b Bitmap) func(Bitmap, ...Bitmap) {
525 | return b.And
526 | },
527 | func(b Bitmap) func(Bitmap, ...Bitmap) {
528 | return b.AndNot
529 | },
530 | func(b Bitmap) func(Bitmap, ...Bitmap) {
531 | return b.Or
532 | },
533 | func(b Bitmap) func(Bitmap, ...Bitmap) {
534 | return b.Xor
535 | },
536 | }
537 |
538 | for _, withHw := range []int{isAccelerated, isUnsupported} {
539 | for i, tc := range tests {
540 | t.Run(fmt.Sprintf("%v,avx=%v", i, withHw), func(t *testing.T) {
541 | hardware = withHw
542 | naive := func(n int) Bitmap {
543 | input := Bitmap{bits}
544 | tc(input)(Bitmap{bits})
545 | for i := 0; i < n; i++ {
546 | tc(input)(Bitmap{bits})
547 | }
548 | return input
549 | }
550 |
551 | for n := 0; n < 5; n++ {
552 | input := Bitmap{bits}
553 | other := Bitmap{bits}
554 |
555 | extra := make([]Bitmap, 0, n)
556 | for i := 0; i < n; i++ {
557 | extra = append(extra, Bitmap{bits})
558 | }
559 |
560 | tc(input)(other, extra...)
561 | assert.Equal(t, naive(n), input)
562 | }
563 | })
564 | }
565 | }
566 | }
567 |
568 | func TestEmptyAnd(t *testing.T) {
569 | var a, b Bitmap
570 | a.And(b)
571 | assert.Equal(t, 0, a.Count())
572 | }
573 |
574 | func TestEmptyAndNot(t *testing.T) {
575 | var a, b Bitmap
576 | a.AndNot(b)
577 | assert.Equal(t, 0, a.Count())
578 | }
579 |
580 | func TestEmptyOr(t *testing.T) {
581 | var a, b Bitmap
582 | a.Or(b)
583 | assert.Equal(t, 0, a.Count())
584 | }
585 |
586 | func TestEmptyXor(t *testing.T) {
587 | var a, b Bitmap
588 | a.Xor(b)
589 | assert.Equal(t, 0, a.Count())
590 | }
591 |
592 | func TestTruthTables_NoSIMD(t *testing.T) {
593 | hardware = isUnsupported
594 | testTruthTables(t)
595 | }
596 |
597 | func TestTruthTables_SIMD(t *testing.T) {
598 | hardware = isAccelerated
599 | testTruthTables(t)
600 | }
601 |
602 | func testTruthTables(t *testing.T) {
603 | { // AND
604 | a := Bitmap{0b0011, 0b1011, 0b1100, 0b0000, 0b0011, 0b1011, 0b1100, 0b0000, 0b0011}
605 | a.And(Bitmap{0b0101, 0b1101, 0b1010, 0b1111, 0b0101, 0b1101, 0b1010, 0b1111, 0b0101})
606 | assert.Equal(t, 0b0001, int(a[0]))
607 | assert.Equal(t, 0b1001, int(a[1]))
608 | assert.Equal(t, 0b1000, int(a[2]))
609 | assert.Equal(t, 0b0000, int(a[3]))
610 | assert.Equal(t, 0b0001, int(a[4]))
611 | assert.Equal(t, 0b1001, int(a[5]))
612 | assert.Equal(t, 0b1000, int(a[6]))
613 | assert.Equal(t, 0b0000, int(a[7]))
614 | assert.Equal(t, 0b0001, int(a[8]))
615 | }
616 | { // AND NOT
617 | a := Bitmap{0b0011, 0, 0, 0}
618 | a.AndNot(Bitmap{0b0101})
619 | assert.Equal(t, 0b0010, int(a[0]))
620 | }
621 | { // OR
622 | a := Bitmap{0b0011, 0, 0, 0}
623 | a.Or(Bitmap{0b0101})
624 | assert.Equal(t, 0b0111, int(a[0]))
625 | }
626 | { // XOR
627 | a := Bitmap{0b0011, 0, 0, 0}
628 | a.Xor(Bitmap{0b0101})
629 | assert.Equal(t, 0b0110, int(a[0]))
630 | }
631 | }
632 |
--------------------------------------------------------------------------------
/codec.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | package bitmap
5 |
6 | import (
7 | "encoding/binary"
8 | "encoding/hex"
9 | "encoding/json"
10 | "fmt"
11 | "io"
12 | "reflect"
13 | "strconv"
14 | "strings"
15 | "unsafe"
16 | )
17 |
18 | // FromBytes reads a bitmap from a byte buffer without copying the buffer.
19 | func FromBytes(buffer []byte) (out Bitmap) {
20 | switch {
21 | case len(buffer) == 0:
22 | return nil
23 | case len(buffer)%8 != 0:
24 | panic(fmt.Sprintf("bitmap: buffer length expected to be multiple of 8, was %d", len(buffer)))
25 | }
26 |
27 | hdr := (*reflect.SliceHeader)(unsafe.Pointer(&out))
28 | hdr.Len = len(buffer) >> 3
29 | hdr.Cap = hdr.Len
30 | hdr.Data = uintptr(unsafe.Pointer(&(buffer)[0]))
31 | return out
32 | }
33 |
34 | // ToBytes converts the bitmap to binary representation without copying the underlying
35 | // data. The output buffer should not be modified, since it would also change the bitmap.
36 | func (dst *Bitmap) ToBytes() (out []byte) {
37 | if len(*dst) == 0 {
38 | return nil
39 | }
40 |
41 | hdr := (*reflect.SliceHeader)(unsafe.Pointer(&out))
42 | hdr.Len = len(*dst) * 8
43 | hdr.Cap = hdr.Len
44 | hdr.Data = uintptr(unsafe.Pointer(&(*dst)[0]))
45 | return out
46 | }
47 |
48 | // ReadFrom reads the bitmap from the reader.
49 | func ReadFrom(r io.Reader) (Bitmap, error) {
50 | var output Bitmap
51 | _, err := output.ReadFrom(r)
52 | return output, err
53 | }
54 |
55 | // WriteTo writes the bitmap to a specified writer.
56 | func (dst *Bitmap) WriteTo(w io.Writer) (int64, error) {
57 | buffer := dst.ToBytes()
58 |
59 | // Write the header into the stream
60 | var header [4]byte
61 | binary.BigEndian.PutUint32(header[:4], uint32(len(buffer)))
62 | n1, err := w.Write(header[:4])
63 | if err != nil {
64 | return int64(n1), err
65 | }
66 |
67 | // Write the buffer into the stream
68 | n2, err := w.Write(buffer)
69 | if err != nil {
70 | return int64(n2), err
71 | }
72 |
73 | return int64(n1 + n2), err
74 | }
75 |
76 | // ReadFrom reads data from r until EOF or error. The return value n is the number of
77 | // bytes read. Any error except EOF encountered during the read is also returned.
78 | func (dst *Bitmap) ReadFrom(r io.Reader) (int64, error) {
79 | var header [4]byte
80 | if n, err := io.ReadFull(r, header[:]); err != nil {
81 | return int64(n), err
82 | }
83 |
84 | // If bitmap is too small, create one of the required size
85 | if size := int(binary.BigEndian.Uint32(header[:4])) / 8; size > len(*dst) {
86 | *dst = make(Bitmap, size)
87 | }
88 |
89 | // Read into the buffer
90 | buffer := dst.ToBytes()
91 | n, err := io.ReadFull(r, buffer)
92 | return int64(n + 4), err
93 | }
94 |
95 | // Clone clones the bitmap. If a destination bitmap is provided, the bitmap will be
96 | // cloned inside, otherwise a new Bitmap will be allocated and returned
97 | func (dst Bitmap) Clone(into *Bitmap) Bitmap {
98 | if into == nil {
99 | newm := make(Bitmap, len(dst))
100 | into = &newm
101 | }
102 |
103 | max := maxlen(*into, dst, nil)
104 | into.grow(max - 1)
105 |
106 | copy(*into, dst)
107 | return (*into)[:len(dst)]
108 | }
109 |
110 | // Clear clears the bitmap and resizes it to zero.
111 | func (dst *Bitmap) Clear() {
112 | for i := range *dst {
113 | (*dst)[i] = 0
114 | }
115 | *dst = (*dst)[:0]
116 | }
117 |
118 | // MarshalJSON returns encoded string representation for the bitmap
119 | func (dst Bitmap) MarshalJSON() ([]byte, error) {
120 | var sb strings.Builder
121 | for i := len(dst) - 1; i >= 0; i-- {
122 | // convert each uint64 into 16 * 4-bit hexadecimal character
123 | writeHexdecimal(&sb, dst[i], true)
124 | }
125 |
126 | return json.Marshal(sb.String())
127 | }
128 |
129 | // writeHexdecimal write the hexdecimal representation for given value in buffer
130 | func writeHexdecimal(sb *strings.Builder, value uint64, pad bool) {
131 | maxLen := 16 // 64 bits / 4
132 |
133 | hexadecimal := strings.ToUpper(strconv.FormatUint(value, 16))
134 | hexaLen := len(hexadecimal)
135 |
136 | if !pad || hexaLen == maxLen {
137 | sb.WriteString(hexadecimal)
138 | return
139 | }
140 |
141 | // Add padding
142 | for i := hexaLen; i < maxLen; i++ {
143 | sb.WriteString("0")
144 | }
145 |
146 | sb.WriteString(hexadecimal)
147 | }
148 |
149 | // UnmarshalJSON decodes the received bytes and loads it to bitmap object
150 | func (dst *Bitmap) UnmarshalJSON(data []byte) (err error) {
151 | var str string
152 | if data == nil {
153 | *dst = make(Bitmap, 0)
154 | return
155 | }
156 |
157 | if err := json.Unmarshal(data, &str); err != nil {
158 | return err
159 | }
160 |
161 | mp, err := fromHex(str)
162 | if err != nil {
163 | return err
164 | }
165 |
166 | *dst = mp
167 | return nil
168 |
169 | }
170 |
171 | // fromHex reads a hexadecimal string and converts it to bitmap, character at index 0 is the most significant
172 | func fromHex(hexString string) (Bitmap, error) {
173 | bytes, err := hex.DecodeString(hexString)
174 |
175 | switch {
176 | case err != nil:
177 | return nil, err
178 | case len(bytes) == 0:
179 | return nil, nil
180 | }
181 |
182 | // reverse bytes to maintain bytes significance order (least significant = hexString tail = list head)
183 | for l, r := 0, len(bytes)-1; l < r; l, r = l+1, r-1 {
184 | bytes[l], bytes[r] = bytes[r], bytes[l]
185 | }
186 |
187 | for len(bytes)%8 != 0 {
188 | bytes = append(bytes, 0)
189 | }
190 | return FromBytes(bytes), nil
191 | }
192 |
--------------------------------------------------------------------------------
/codec_test.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | package bitmap
5 |
6 | import (
7 | "bytes"
8 | "encoding/json"
9 | "math"
10 | "math/rand"
11 | "strings"
12 | "testing"
13 |
14 | "github.com/klauspost/cpuid/v2"
15 | "github.com/stretchr/testify/assert"
16 | )
17 |
18 | func BenchmarkCodec(b *testing.B) {
19 | tmp := bytes.NewBuffer(nil)
20 | run(b, "write-to", func(index Bitmap) {
21 | tmp.Reset()
22 | index.WriteTo(tmp)
23 | })
24 |
25 | run(b, "read-from", func(index Bitmap) {
26 | ReadFrom(tmp)
27 | })
28 | }
29 |
30 | func TestSaveLoad(t *testing.T) {
31 | m := Bitmap{}
32 | for i := 0; i <= 5000; i += 10 {
33 | m.Set(uint32(i))
34 | }
35 |
36 | // Save the map
37 | enc := new(bytes.Buffer)
38 | cloned := m.Clone(nil)
39 | n, err := cloned.WriteTo(enc)
40 | assert.NoError(t, err)
41 | assert.Equal(t, int64(636), n)
42 |
43 | // Load the map back
44 | out, err := ReadFrom(enc)
45 | assert.NoError(t, err)
46 | assert.Equal(t, len(m), len(out))
47 | assert.Equal(t, m, out)
48 | }
49 |
50 | func TestFromBytes(t *testing.T) {
51 | m := Bitmap{}
52 | for i := 0; i <= 5000; i += 10 {
53 | m.Set(uint32(i))
54 | }
55 |
56 | out := FromBytes(m.ToBytes())
57 | assert.Equal(t, m, out)
58 | }
59 |
60 | func TestFromBytesNil(t *testing.T) {
61 | out := FromBytes(nil)
62 | assert.Nil(t, out)
63 | }
64 |
65 | func TestFromBytesInvalid(t *testing.T) {
66 | m := make([]byte, 10)
67 | for i := 1; i < 8; i++ {
68 | assert.Panics(t, func() {
69 | FromBytes(m[:i])
70 | })
71 | }
72 | }
73 |
74 | func TestToBytesNil(t *testing.T) {
75 | var m Bitmap
76 | out := m.ToBytes()
77 | assert.Nil(t, out)
78 | }
79 |
80 | func TestJSON(t *testing.T) {
81 | mp := Bitmap{}
82 |
83 | for i := 0; i < 1000; i++ {
84 | mp.Set(uint32(rand.Intn(10000)))
85 | }
86 |
87 | data, err := json.Marshal(mp)
88 | assert.NoError(t, err)
89 |
90 | newMp := Bitmap{}
91 | assert.NoError(t, json.Unmarshal(data, &newMp))
92 | assert.Equal(t, mp, newMp)
93 |
94 | assert.NoError(t, mp.UnmarshalJSON(nil))
95 | assert.Empty(t, mp)
96 |
97 | assert.Error(t, mp.UnmarshalJSON([]byte("\"notvalid")))
98 | assert.Error(t, mp.UnmarshalJSON([]byte("\"Z\"")))
99 | }
100 |
101 | func TestToHexadecimal(t *testing.T) {
102 | type Case struct {
103 | Input uint64
104 | Pad bool
105 | Output string
106 | }
107 | tests := []Case{{
108 | Input: 0,
109 | Pad: false,
110 | Output: "0",
111 | }, {
112 | Input: 42,
113 | Pad: false,
114 | Output: "2A",
115 | }, {
116 | Input: math.MaxUint64,
117 | Pad: false,
118 | Output: "FFFFFFFFFFFFFFFF",
119 | }, {
120 | Input: 15,
121 | Pad: true,
122 | Output: "000000000000000F",
123 | },
124 | }
125 |
126 | for _, tc := range tests {
127 | sb := strings.Builder{}
128 | writeHexdecimal(&sb, tc.Input, tc.Pad)
129 | assert.Equal(t, tc.Output, sb.String())
130 | }
131 |
132 | }
133 |
134 | func TestFromHex(t *testing.T) {
135 | bm, err := fromHex("FFA001")
136 | assert.NoError(t, err)
137 | assert.Equal(t, Bitmap{0xFFA001}, bm)
138 |
139 | bm, err = fromHex("000000000000000000000000000000000001")
140 | assert.NoError(t, err)
141 | assert.Equal(t, Bitmap{1, 0, 0}, bm)
142 |
143 | _, err = fromHex("Not Valid")
144 | assert.Error(t, err)
145 |
146 | bm, err = fromHex("")
147 | assert.NoError(t, err)
148 | assert.Nil(t, bm)
149 | }
150 |
151 | func TestDimensionsOf(t *testing.T) {
152 | testCases := []struct {
153 | n int
154 | m int
155 | expected uint64
156 | }{
157 | {0, 0, 0},
158 | {10, 11, 0xb0000000a},
159 | }
160 |
161 | for _, tc := range testCases {
162 | d := dimensionsOf(tc.n, tc.m)
163 | assert.Equal(t, tc.expected, d)
164 | }
165 | }
166 |
167 | func TestPointersOf(t *testing.T) {
168 | testCases := []struct {
169 | inputOther Bitmap
170 | inputExtra []Bitmap
171 | }{
172 | {Bitmap{1}, []Bitmap{{2}}},
173 | {Bitmap{1}, []Bitmap{{2}, {3}}},
174 | }
175 |
176 | for _, tc := range testCases {
177 | ptr, max := pointersOf(tc.inputOther, tc.inputExtra)
178 | assert.NotNil(t, ptr)
179 | assert.NotZero(t, uintptr(ptr))
180 | assert.NotZero(t, max)
181 | }
182 | }
183 |
184 | func TestLevelOfWithEnabledFeatures(t *testing.T) {
185 | testCases := []struct {
186 | name string
187 | featureIDs []cpuid.FeatureID
188 | expected int
189 | }{
190 | {
191 | name: "AVX-512F, AVX-512BW, and AVX-512DQ support",
192 | featureIDs: []cpuid.FeatureID{cpuid.AVX512F, cpuid.AVX512BW, cpuid.AVX512DQ},
193 | expected: isAVX512,
194 | },
195 | {
196 | name: "AVX2 and FMA3 support",
197 | featureIDs: []cpuid.FeatureID{cpuid.AVX2, cpuid.FMA3},
198 | expected: isAccelerated,
199 | },
200 | {
201 | name: "NEON support on ARM64",
202 | featureIDs: []cpuid.FeatureID{cpuid.ASIMD},
203 | expected: isAccelerated,
204 | },
205 | {
206 | name: "Unsupported feature combination",
207 | featureIDs: []cpuid.FeatureID{cpuid.SHA3, cpuid.AESARM},
208 | expected: isUnsupported,
209 | },
210 | }
211 |
212 | for _, tc := range testCases {
213 | t.Run(tc.name, func(t *testing.T) {
214 | cpu := cpuid.CPUInfo{}
215 | for _, feature := range tc.featureIDs {
216 | cpu.Enable(feature)
217 | }
218 |
219 | level := levelOf(cpu)
220 | assert.Equal(t, tc.expected, level, "expected to return %d, but got %d", tc.expected, level)
221 | })
222 | }
223 | }
224 |
--------------------------------------------------------------------------------
/codegen/bin/gocc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kelindar/bitmap/f172a65d34622e0289d3135db87c5612066feece/codegen/bin/gocc
--------------------------------------------------------------------------------
/codegen/generate.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Runs ./bin/gocc over each C source in this directory, one invocation per
# target architecture, writing output for package "bitmap" into the parent
# directory.

# Stop at the first failing command instead of continuing with stale output.
set -euo pipefail

# The commands use paths relative to this script, so run from its directory
# regardless of where the script was invoked.
cd "$(dirname "$0")"

# Note: the AVX2 build uses -O1 while the other targets use -O3.
./bin/gocc simd_avx.c --arch avx2 -O1 --package bitmap -o ../
./bin/gocc simd_avx512.c --arch avx512 -O3 --package bitmap -o ../
./bin/gocc simd_neon.c --arch neon -O3 --package bitmap -o ../
./bin/gocc simd_apple.c --arch apple -O3 --package bitmap -o ../
--------------------------------------------------------------------------------
/codegen/simd_apple.c:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | #include <stdint.h>
5 |
// Stores a[i] AND b[i] into a[i] for the first n elements.
void _and(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable)
    for (uint64_t idx = 0; idx < n; idx++) {
        a[idx] = a[idx] & b[idx];
    }
}
12 |
// Clears in a[i] every bit that is set in b[i], for the first n elements.
void _andn(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t idx = 0; idx < n; idx++) {
        const uint64_t mask = ~b[idx];
        a[idx] = a[idx] & mask;
    }
}
19 |
// Stores a[i] OR b[i] into a[i] for the first n elements.
void _or(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t idx = 0; idx < n; idx++) {
        a[idx] = a[idx] | b[idx];
    }
}
26 |
// Stores a[i] XOR b[i] into a[i] for the first n elements.
void _xor(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t idx = 0; idx < n; idx++) {
        a[idx] = a[idx] ^ b[idx];
    }
}
33 |
// ANDs m source arrays into a, element by element. dims packs the two sizes:
// the low 32 bits hold n (elements per array) and the high 32 bits hold m
// (number of arrays in b). Work proceeds in chunks of 512 elements, applying
// every source array to one chunk of a before moving on to the next chunk.
void _and_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t chunk_size = 512;
    int64_t n = (dims & 0xffffffff);
    int64_t m = (dims >> 32);

    for (int64_t start = 0; start < n; start += chunk_size) {
        // Clamp the last chunk to the end of the arrays.
        int64_t stop = (start + chunk_size > n) ? n : start + chunk_size;

        for (int64_t k = 0; k < m; ++k) {
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t i = start; i < stop; ++i) {
                a[i] &= b[k][i];
            }
        }
    }
}
54 |
// Clears from a every bit set in any of m source arrays. dims packs the two
// sizes: the low 32 bits hold n (elements per array) and the high 32 bits
// hold m (number of arrays in b). Work proceeds in chunks of 512 elements,
// applying every source array to one chunk of a before moving on.
void _andn_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t chunk_size = 512;
    int64_t n = (dims & 0xffffffff);
    int64_t m = (dims >> 32);

    for (int64_t start = 0; start < n; start += chunk_size) {
        // Clamp the last chunk to the end of the arrays.
        int64_t stop = (start + chunk_size > n) ? n : start + chunk_size;

        for (int64_t k = 0; k < m; ++k) {
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t i = start; i < stop; ++i) {
                a[i] &= ~b[k][i];
            }
        }
    }
}
75 |
// ORs m source arrays into a, element by element. dims packs the two sizes:
// the low 32 bits hold n (elements per array) and the high 32 bits hold m
// (number of arrays in b). Work proceeds in chunks of 512 elements, applying
// every source array to one chunk of a before moving on to the next chunk.
void _or_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t chunk_size = 512;
    int64_t n = (dims & 0xffffffff);
    int64_t m = (dims >> 32);

    for (int64_t start = 0; start < n; start += chunk_size) {
        // Clamp the last chunk to the end of the arrays.
        int64_t stop = (start + chunk_size > n) ? n : start + chunk_size;

        for (int64_t k = 0; k < m; ++k) {
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t i = start; i < stop; ++i) {
                a[i] |= b[k][i];
            }
        }
    }
}
96 |
// XORs m source arrays into a, element by element. dims packs the two sizes:
// the low 32 bits hold n (elements per array) and the high 32 bits hold m
// (number of arrays in b). Work proceeds in chunks of 512 elements, applying
// every source array to one chunk of a before moving on to the next chunk.
void _xor_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t chunk_size = 512;
    int64_t n = (dims & 0xffffffff);
    int64_t m = (dims >> 32);

    for (int64_t start = 0; start < n; start += chunk_size) {
        // Clamp the last chunk to the end of the arrays.
        int64_t stop = (start + chunk_size > n) ? n : start + chunk_size;

        for (int64_t k = 0; k < m; ++k) {
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t i = start; i < stop; ++i) {
                a[i] ^= b[k][i];
            }
        }
    }
}
117 |
// _count stores in *result the total number of set bits found in the first
// size words of a.
void _count(uint64_t *a, uint64_t size, uint64_t *result) {
    uint64_t count = 0;

    // The index has the same unsigned 64-bit type as size: a plain int would
    // be compared after an implicit conversion and would overflow (undefined
    // behaviour) once size exceeds INT_MAX.
    for (uint64_t i = 0; i < size; i++) {
        count += (uint64_t)__builtin_popcountll(a[i]);
    }
    *result = count;
}
126 |
--------------------------------------------------------------------------------
/codegen/simd_avx.c:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | #include <stdint.h>
5 |
// _and intersects a with b in place: each of the first n words of a is
// replaced by its bitwise AND with the matching word of b.
void _and(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] & b[w];
    }
}
12 |
// _andn clears, in place, from each of the first n words of a every bit that
// is set in the matching word of b.
void _andn(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] & ~b[w];
    }
}
19 |
// _or merges b into a in place: each of the first n words of a is replaced by
// its bitwise OR with the matching word of b.
void _or(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] | b[w];
    }
}
26 |
// _xor toggles a by b in place: each of the first n words of a is replaced by
// its bitwise XOR with the matching word of b.
void _xor(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] ^ b[w];
    }
}
33 |
// _and_many intersects, in place, the first n words of a with the matching
// words of each of the m bitmaps referenced by b. dims carries n in its low
// 32 bits and m in its high 32 bits.
void _and_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] & src[w];
            }
        }
    }
}
54 |
// _andn_many clears, in place, from the first n words of a every bit that is
// set in the matching word of any of the m bitmaps referenced by b. dims
// carries n in its low 32 bits and m in its high 32 bits.
void _andn_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] & ~src[w];
            }
        }
    }
}
75 |
// _or_many merges, in place, into the first n words of a the matching words
// of each of the m bitmaps referenced by b. dims carries n in its low 32 bits
// and m in its high 32 bits.
void _or_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] | src[w];
            }
        }
    }
}
96 |
// _xor_many XORs, in place, the first n words of a with the matching words of
// each of the m bitmaps referenced by b. dims carries n in its low 32 bits
// and m in its high 32 bits.
void _xor_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] ^ src[w];
            }
        }
    }
}
117 |
// _count stores in *result the total number of set bits found in the first
// size words of a.
void _count(uint64_t *a, uint64_t size, uint64_t *result) {
    uint64_t count = 0;

    // The index has the same unsigned 64-bit type as size: a plain int would
    // be compared after an implicit conversion and would overflow (undefined
    // behaviour) once size exceeds INT_MAX.
    for (uint64_t i = 0; i < size; i++) {
        count += (uint64_t)__builtin_popcountll(a[i]);
    }
    *result = count;
}
126 |
--------------------------------------------------------------------------------
/codegen/simd_avx512.c:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | #include <stdint.h>
5 |
// _and_avx512 intersects a with b in place: each of the first n words of a is
// replaced by its bitwise AND with the matching word of b.
void _and_avx512(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] & b[w];
    }
}
12 |
// _andn_avx512 clears, in place, from each of the first n words of a every
// bit that is set in the matching word of b.
void _andn_avx512(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] & ~b[w];
    }
}
19 |
// _or_avx512 merges b into a in place: each of the first n words of a is
// replaced by its bitwise OR with the matching word of b.
void _or_avx512(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] | b[w];
    }
}
26 |
// _xor_avx512 toggles a by b in place: each of the first n words of a is
// replaced by its bitwise XOR with the matching word of b.
void _xor_avx512(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] ^ b[w];
    }
}
33 |
// _and_many_avx512 intersects, in place, the first n words of a with the
// matching words of each of the m bitmaps referenced by b. dims carries n in
// its low 32 bits and m in its high 32 bits.
void _and_many_avx512(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] & src[w];
            }
        }
    }
}
54 |
// _andn_many_avx512 clears, in place, from the first n words of a every bit
// that is set in the matching word of any of the m bitmaps referenced by b.
// dims carries n in its low 32 bits and m in its high 32 bits.
void _andn_many_avx512(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] & ~src[w];
            }
        }
    }
}
75 |
// _or_many_avx512 merges, in place, into the first n words of a the matching
// words of each of the m bitmaps referenced by b. dims carries n in its low
// 32 bits and m in its high 32 bits.
void _or_many_avx512(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] | src[w];
            }
        }
    }
}
96 |
// _xor_many_avx512 XORs, in place, the first n words of a with the matching
// words of each of the m bitmaps referenced by b. dims carries n in its low
// 32 bits and m in its high 32 bits.
void _xor_many_avx512(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] ^ src[w];
            }
        }
    }
}
117 |
--------------------------------------------------------------------------------
/codegen/simd_neon.c:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | #include <stdint.h>
5 |
// _and intersects a with b in place: each of the first n words of a is
// replaced by its bitwise AND with the matching word of b.
void _and(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] & b[w];
    }
}
12 |
// _andn clears, in place, from each of the first n words of a every bit that
// is set in the matching word of b.
void _andn(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] & ~b[w];
    }
}
19 |
// _or merges b into a in place: each of the first n words of a is replaced by
// its bitwise OR with the matching word of b.
void _or(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] | b[w];
    }
}
26 |
// _xor toggles a by b in place: each of the first n words of a is replaced by
// its bitwise XOR with the matching word of b.
void _xor(uint64_t* a, uint64_t* b, uint64_t n) {
    #pragma clang loop vectorize(enable) interleave(enable)
    for (uint64_t w = 0; w < n; w++) {
        a[w] = a[w] ^ b[w];
    }
}
33 |
// _and_many intersects, in place, the first n words of a with the matching
// words of each of the m bitmaps referenced by b. dims carries n in its low
// 32 bits and m in its high 32 bits.
void _and_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] & src[w];
            }
        }
    }
}
54 |
// _andn_many clears, in place, from the first n words of a every bit that is
// set in the matching word of any of the m bitmaps referenced by b. dims
// carries n in its low 32 bits and m in its high 32 bits.
void _andn_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] & ~src[w];
            }
        }
    }
}
75 |
// _or_many merges, in place, into the first n words of a the matching words
// of each of the m bitmaps referenced by b. dims carries n in its low 32 bits
// and m in its high 32 bits.
void _or_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] | src[w];
            }
        }
    }
}
96 |
// _xor_many XORs, in place, the first n words of a with the matching words of
// each of the m bitmaps referenced by b. dims carries n in its low 32 bits
// and m in its high 32 bits.
void _xor_many(uint64_t* a, uint64_t** b, uint64_t dims) {
    const int64_t words = (int64_t)(dims & 0xffffffff);
    const int64_t sources = (int64_t)(dims >> 32);
    const int64_t chunk_size = 512;

    // Walk the destination in fixed-size chunks and apply every source bitmap
    // to one chunk before moving on (presumably to keep the chunk cache-hot).
    for (int64_t begin = 0; begin < words; begin += chunk_size) {
        // Clamp the final chunk to the end of the bitmap
        const int64_t end = (begin + chunk_size > words) ? words : begin + chunk_size;

        for (int64_t s = 0; s < sources; ++s) {
            const uint64_t* src = b[s];
            #pragma clang loop vectorize(enable) interleave(enable)
            for (int64_t w = begin; w < end; ++w) {
                a[w] = a[w] ^ src[w];
            }
        }
    }
}
117 |
// _count stores in *result the total number of set bits found in the first
// size words of a.
void _count(uint64_t *a, uint64_t size, uint64_t *result) {
    uint64_t count = 0;

    // The index has the same unsigned 64-bit type as size: a plain int would
    // be compared after an implicit conversion and would overflow (undefined
    // behaviour) once size exceeds INT_MAX.
    for (uint64_t i = 0; i < size; i++) {
        count += (uint64_t)__builtin_popcountll(a[i]);
    }
    *result = count;
}
126 |
--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/kelindar/bitmap
2 |
3 | go 1.18
4 |
5 | require (
6 | github.com/kelindar/simd v1.1.2
7 | github.com/klauspost/cpuid/v2 v2.2.4
8 | github.com/stretchr/testify v1.8.2
9 | )
10 |
11 | require (
12 | github.com/davecgh/go-spew v1.1.1 // indirect
13 | github.com/pmezard/go-difflib v1.0.0 // indirect
14 | golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e // indirect
15 | gopkg.in/yaml.v3 v3.0.1 // indirect
16 | )
17 |
--------------------------------------------------------------------------------
/go.sum:
--------------------------------------------------------------------------------
1 | github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
2 | github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
3 | github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
4 | github.com/kelindar/simd v1.1.2 h1:KduKb+M9cMY2HIH8S/cdJyD+5n5EGgq+Aeeleos55To=
5 | github.com/kelindar/simd v1.1.2/go.mod h1:inq4DFudC7W8L5fhxoeZflLRNpWSs0GNx6MlWFvuvr0=
6 | github.com/klauspost/cpuid/v2 v2.2.4 h1:acbojRNwl3o09bUq+yDCtZFc1aiwaAAxtcn8YkZXnvk=
7 | github.com/klauspost/cpuid/v2 v2.2.4/go.mod h1:RVVoqg1df56z8g3pUjL/3lE5UfnlrJX8tyFgg4nqhuY=
8 | github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
9 | github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
10 | github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
11 | github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw=
12 | github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo=
13 | github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
14 | github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU=
15 | github.com/stretchr/testify v1.8.2 h1:+h33VjcLVPDHtOdpUCuF+7gSuG3yGIftsP1YvFihtJ8=
16 | github.com/stretchr/testify v1.8.2/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4=
17 | golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e h1:CsOuNlbOuf0mzxJIefr6Q4uAUetRUwZE4qt7VfzP+xo=
18 | golang.org/x/sys v0.0.0-20220704084225-05e143d24a9e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
19 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
20 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
21 | gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
22 | gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
23 | gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
24 |
--------------------------------------------------------------------------------
/range.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | package bitmap
5 |
6 | import (
7 | "unsafe"
8 |
9 | "github.com/kelindar/simd"
10 | )
11 |
12 | const full = 0xffffffffffffffff
13 |
14 | // Range iterates over all of the bits set to one in this bitmap.
15 | func (dst Bitmap) Range(fn func(x uint32)) {
16 | for blkAt := 0; blkAt < len(dst); blkAt++ {
17 | blk := (dst)[blkAt]
18 | if blk == 0x0 {
19 | continue // Skip the empty page
20 | }
21 |
22 | // Iterate in a 4-bit chunks so we can reduce the number of function calls and skip
23 | // the bits for which we should not call our range function.
24 | offset := uint32(blkAt << 6)
25 | for ; blk > 0; blk = blk >> 4 {
26 | switch blk & 0b1111 {
27 | case 0b0001:
28 | fn(offset + 0)
29 | case 0b0010:
30 | fn(offset + 1)
31 | case 0b0011:
32 | fn(offset + 0)
33 | fn(offset + 1)
34 | case 0b0100:
35 | fn(offset + 2)
36 | case 0b0101:
37 | fn(offset + 0)
38 | fn(offset + 2)
39 | case 0b0110:
40 | fn(offset + 1)
41 | fn(offset + 2)
42 | case 0b0111:
43 | fn(offset + 0)
44 | fn(offset + 1)
45 | fn(offset + 2)
46 | case 0b1000:
47 | fn(offset + 3)
48 | case 0b1001:
49 | fn(offset + 0)
50 | fn(offset + 3)
51 | case 0b1010:
52 | fn(offset + 1)
53 | fn(offset + 3)
54 | case 0b1011:
55 | fn(offset + 0)
56 | fn(offset + 1)
57 | fn(offset + 3)
58 | case 0b1100:
59 | fn(offset + 2)
60 | fn(offset + 3)
61 | case 0b1101:
62 | fn(offset + 0)
63 | fn(offset + 2)
64 | fn(offset + 3)
65 | case 0b1110:
66 | fn(offset + 1)
67 | fn(offset + 2)
68 | fn(offset + 3)
69 | case 0b1111:
70 | fn(offset + 0)
71 | fn(offset + 1)
72 | fn(offset + 2)
73 | fn(offset + 3)
74 | }
75 | offset += 4
76 | }
77 | }
78 | }
79 |
// predicate has the shape of a Filter callback except that it returns a byte
// in place of a bool. Filter reinterprets its callback as this type so that
// the result can be shifted and OR-ed directly into a bit mask.
type predicate = func(x uint32) byte
82 |
// Filter iterates over the bitmap elements and calls the provided predicate for
// each element contained in the bitmap. If the predicate returns false, the bit
// at the element's position is cleared. The predicate is never called for bits
// that are not set, and for every visited block it is called in ascending order.
func (dst *Bitmap) Filter(f func(x uint32) bool) {
	// Reinterpret the callback as one returning a byte so its result can be used
	// in bit arithmetic without a branch.
	// NOTE(review): this relies on a bool result being represented as 0 or 1 — confirm.
	fn := *(*predicate)(unsafe.Pointer(&f))
	for blkAt := 0; blkAt < len(*dst); blkAt++ {
		blk := (*dst)[blkAt]
		if blk == 0x0 {
			continue // Skip the empty page
		}

		offset := uint32(blkAt << 6) // Index of the lowest bit of the current 4-bit group
		var mask uint64              // Predicate results, one bit per position of this block
		var i uint32                 // Bit position of the current 4-bit group within the block

		// Iterate in 4-bit chunks so that each group is handled by a single switch
		// and the filter function is only called for the bits that are set. The
		// loop ends once no set bits remain in the block.
		for ; blk > 0; blk = blk >> 4 {
			switch blk & 0b1111 {
			case 0b0001:
				mask |= uint64(fn(offset)) << i
			case 0b0010:
				mask |= uint64(fn(offset+1)<<1) << i
			case 0b0011:
				mask |= uint64(fn(offset)|(fn(offset+1)<<1)) << i
			case 0b0100:
				mask |= uint64(fn(offset+2)<<2) << i
			case 0b0101:
				mask |= uint64(fn(offset)|fn(offset+2)<<2) << i
			case 0b0110:
				mask |= uint64((fn(offset+1)<<1)|(fn(offset+2)<<2)) << i
			case 0b0111:
				mask |= uint64(fn(offset)|(fn(offset+1)<<1)|(fn(offset+2)<<2)) << i
			case 0b1000:
				mask |= uint64(fn(offset+3)<<3) << i
			case 0b1001:
				mask |= uint64(fn(offset)|(fn(offset+3)<<3)) << i
			case 0b1010:
				mask |= uint64((fn(offset+1)<<1)|(fn(offset+3)<<3)) << i
			case 0b1011:
				mask |= uint64(fn(offset)|(fn(offset+1)<<1)|(fn(offset+3)<<3)) << i
			case 0b1100:
				mask |= uint64((fn(offset+2)<<2)|(fn(offset+3)<<3)) << i
			case 0b1101:
				mask |= uint64(fn(offset)|(fn(offset+2)<<2)|(fn(offset+3)<<3)) << i
			case 0b1110:
				mask |= uint64((fn(offset+1)<<1)|(fn(offset+2)<<2)|(fn(offset+3)<<3)) << i
			case 0b1111:
				mask |= uint64(fn(offset)|(fn(offset+1)<<1)|(fn(offset+2)<<2)|(fn(offset+3)<<3)) << i
			}

			i += 4
			offset += 4
		}

		// Apply the mask: only positions for which the predicate returned a
		// non-zero value keep their bit.
		(*dst)[blkAt] &= mask
	}
}
142 |
143 | // Sum computes a horizontal sum of a slice, filtered by the provided bitmap
144 | func Sum[T simd.Number](src []T, filter Bitmap) (sum T) {
145 | tail := minint(len(src)/64, len(filter)) << 6 // End of 64-byte blocks
146 | last := minint(len(src), len(filter)*64) // End of slice or mask
147 |
148 | var frame [64]T
149 | var i0, i1 int
150 | for i1 = 0; i1 < tail; i1 += 64 {
151 | switch filter[i1>>6] {
152 | case full:
153 | continue // Continue buffering
154 | case 0:
155 | default:
156 | sum += simd.Sum(leftPack(&frame, src[i1:i1+64], filter[i1>>6]))
157 | }
158 |
159 | // Flush the current buffer
160 | if (i1 - i0) > 0 {
161 | sum += simd.Sum(src[i0:i1])
162 | }
163 | i0 = i1 + 64
164 | }
165 |
166 | // Flush the accumulated buffer so far
167 | if (i1 - i0) > 0 {
168 | sum += simd.Sum(src[i0:i1])
169 | }
170 |
171 | // Process the tail
172 | for i := tail; i < last; i++ {
173 | if filter.Contains(uint32(i)) {
174 | sum += src[i]
175 | }
176 | }
177 | return sum
178 | }
179 |
180 | // Min finds the smallest value in a slice, filtered by the provided bitmap
181 | func Min[T simd.Number](src []T, filter Bitmap) (min T, hit bool) {
182 | tail := minint(len(src)/64, len(filter)) << 6 // End of 64-byte blocks
183 | last := minint(len(src), len(filter)*64) // End of slice or mask
184 |
185 | var frame [64]T
186 | var i0, i1 int
187 | for i1 = 0; i1 < tail; i1 += 64 {
188 | switch filter[i1>>6] {
189 | case full:
190 | continue // Continue buffering
191 | case 0:
192 | default:
193 | if m := simd.Min(leftPack(&frame, src[i1:i1+64], filter[i1>>6])); m < min || !hit {
194 | hit = true
195 | min = m
196 | }
197 | }
198 |
199 | // Flush the current buffer
200 | if (i1 - i0) > 0 {
201 | if m := simd.Min(src[i0:i1]); m < min || !hit {
202 | hit = true
203 | min = m
204 | }
205 | }
206 | i0 = i1 + 64
207 | }
208 |
209 | // Flush the accumulated buffer so far
210 | if (i1 - i0) > 0 {
211 | if m := simd.Min(src[i0:i1]); m < min || !hit {
212 | hit = true
213 | min = m
214 | }
215 | }
216 |
217 | // Process the tail
218 | for i := tail; i < last; i++ {
219 | if filter.Contains(uint32(i)) && (src[i] < min || !hit) {
220 | hit = true
221 | min = src[i]
222 | }
223 | }
224 | return
225 | }
226 |
227 | // Max finds the largest value in a slice, filtered by the provided bitmap
228 | func Max[T simd.Number](src []T, filter Bitmap) (max T, hit bool) {
229 | tail := minint(len(src)/64, len(filter)) << 6 // End of 64-byte blocks
230 | last := minint(len(src), len(filter)*64) // End of slice or mask
231 |
232 | var frame [64]T
233 | var i0, i1 int
234 | for i1 = 0; i1 < tail; i1 += 64 {
235 | switch filter[i1>>6] {
236 | case full:
237 | continue // Continue buffering
238 | case 0:
239 | default:
240 | if m := simd.Max(leftPack(&frame, src[i1:i1+64], filter[i1>>6])); m > max || !hit {
241 | hit = true
242 | max = m
243 | }
244 | }
245 |
246 | // Flush the current buffer
247 | if (i1 - i0) > 0 {
248 | if m := simd.Max(src[i0:i1]); m > max || !hit {
249 | hit = true
250 | max = m
251 | }
252 | }
253 | i0 = i1 + 64
254 | }
255 |
256 | // Flush the accumulated buffer so far
257 | if (i1 - i0) > 0 {
258 | if m := simd.Max(src[i0:i1]); m > max || !hit {
259 | hit = true
260 | max = m
261 | }
262 | }
263 |
264 | // Process the tail
265 | for i := tail; i < last; i++ {
266 | if filter.Contains(uint32(i)) && (src[i] > max || !hit) {
267 | hit = true
268 | max = src[i]
269 | }
270 | }
271 | return
272 | }
273 |
// leftPack copies into the front of dst, in ascending position order, the
// elements of src whose positions are set in the single block blk, and returns
// the filled prefix of dst.
func leftPack[T any](dst *[64]T, src []T, blk uint64) []T {
	packed := 0

	// Consume the block four bits at a time, stopping as soon as no set bits
	// remain; base is the position of the lowest bit of the current group.
	for base := 0; blk != 0; blk, base = blk>>4, base+4 {
		if blk&0b0001 != 0 {
			dst[packed] = src[base]
			packed++
		}
		if blk&0b0010 != 0 {
			dst[packed] = src[base+1]
			packed++
		}
		if blk&0b0100 != 0 {
			dst[packed] = src[base+2]
			packed++
		}
		if blk&0b1000 != 0 {
			dst[packed] = src[base+3]
			packed++
		}
	}

	return dst[:packed]
}
349 |
--------------------------------------------------------------------------------
/range_test.go:
--------------------------------------------------------------------------------
1 | package bitmap
2 |
3 | import (
4 | "testing"
5 |
6 | "github.com/kelindar/simd"
7 | "github.com/stretchr/testify/assert"
8 | )
9 |
10 | /*
11 | cpu: Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz
12 | BenchmarkRange/range-8 1891 674656 ns/op 0 B/op 0 allocs/op
13 | BenchmarkRange/filter-8 2222 535359 ns/op 0 B/op 0 allocs/op
14 | */
15 | func BenchmarkRange(b *testing.B) {
16 | var i uint32
17 | run(b, "range", func(index Bitmap) {
18 | index.Range(func(x uint32) {
19 | i = x
20 | return
21 | })
22 | })
23 |
24 | run(b, "filter", func(index Bitmap) {
25 | index.Filter(func(x uint32) bool {
26 | return x%2 == 0
27 | })
28 | })
29 |
30 | _ = i
31 | }
32 |
33 | /*
34 | cpu: Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz
35 | BenchmarkAggregate/sum-8 1849 627004 ns/op 0 B/op 0 allocs/op
36 | BenchmarkAggregate/sum-full-8 16939 68971 ns/op 0 B/op 0 allocs/op
37 | BenchmarkAggregate/min-8 1474 868868 ns/op 0 B/op 0 allocs/op
38 | BenchmarkAggregate/min-full-8 17082 68719 ns/op 0 B/op 0 allocs/op
39 | BenchmarkAggregate/max-8 1322 864578 ns/op 0 B/op 0 allocs/op
40 | BenchmarkAggregate/max-full-8 17354 69015 ns/op 0 B/op 0 allocs/op
41 | */
42 | func BenchmarkAggregate(b *testing.B) {
43 | target := make([]float32, 1000000)
44 | run(b, "sum", func(index Bitmap) {
45 | Sum(target, index)
46 | })
47 |
48 | runFull(b, "sum-full", func(index Bitmap) {
49 | Sum(target, index)
50 | })
51 |
52 | run(b, "min", func(index Bitmap) {
53 | Min(target, index)
54 | })
55 |
56 | runFull(b, "min-full", func(index Bitmap) {
57 | Min(target, index)
58 | })
59 |
60 | run(b, "max", func(index Bitmap) {
61 | Max(target, index)
62 | })
63 |
64 | runFull(b, "max-full", func(index Bitmap) {
65 | Max(target, index)
66 | })
67 | }
68 |
69 | func TestFilter(t *testing.T) {
70 | a := make(Bitmap, 4)
71 | a.Ones()
72 | assert.Equal(t, 256, a.Count())
73 |
74 | // Filter out odd
75 | a.Filter(func(x uint32) bool {
76 | return x%2 == 0
77 | })
78 | assert.Equal(t, 128, a.Count())
79 |
80 | // Filter out even
81 | a.Filter(func(x uint32) bool {
82 | assert.Equal(t, 0, int(x%2)) // Must be odd
83 | return x%2 == 1
84 | })
85 | assert.Equal(t, 0, a.Count())
86 |
87 | // Filter cases
88 | for i := 0; i < 512; i++ {
89 | b := Bitmap{uint64(i)}
90 | c1 := b.Count()
91 | c2 := 0
92 | b.Filter(func(x uint32) bool {
93 | c2++
94 | return true
95 | })
96 |
97 | // We must have the minimum number of function calls
98 | assert.Equal(t, c1, c2)
99 | assert.Equal(t, uint64(i), b[0])
100 | }
101 | }
102 |
103 | func TestRangeCases(t *testing.T) {
104 | for i := 0; i < 512; i++ {
105 | b := Bitmap{uint64(i)}
106 | c1 := b.Count()
107 | c2 := 0
108 | b.Range(func(x uint32) {
109 | c2++
110 | return
111 | })
112 |
113 | // We must have the minimum number of function calls
114 | assert.Equal(t, c1, c2)
115 | assert.Equal(t, uint64(i), b[0])
116 | }
117 | }
118 |
119 | func TestRangeIndex(t *testing.T) {
120 | a := make(Bitmap, 2)
121 | a.Ones()
122 |
123 | triangular := 0
124 | a.Range(func(x uint32) {
125 | triangular += int(x)
126 | return
127 | })
128 | assert.Equal(t, 8128, triangular)
129 | }
130 |
131 | // ----------------------------- Aggregation -----------------------------
132 |
133 | func TestAggSum(t *testing.T) {
134 | { // Empty Bitmap
135 | arr, index := makeAggregateInput(0x0, 0x0)
136 | assert.Equal(t, sumNaive(arr, index), Sum(arr, index))
137 | }
138 |
139 | { // Partial Bitmap
140 | arr, index := makeAggregateInput(0xffffffffffffffff, 0x0123456789abcdef)
141 | assert.Equal(t, sumNaive(arr, index), Sum(arr, index))
142 | }
143 |
144 | { // Full Bitmap
145 | arr, index := makeAggregateInput(0xffffffffffffffff, 0xffffffffffffffff)
146 | assert.Equal(t, sumNaive(arr, index), Sum(arr, index))
147 | }
148 | { // Nil Bitmap
149 | arr, _ := makeAggregateInput(0x0, 0x0)
150 | assert.Equal(t, sumNaive(arr, nil), Sum(arr, nil))
151 | }
152 |
153 | { // Nil Array
154 | _, index := makeAggregateInput(0x0, 0x0)
155 | assert.Equal(t, sumNaive([]int{}, index), Sum([]int{}, index))
156 | }
157 | }
158 |
159 | func TestAggMin(t *testing.T) {
160 | { // Empty Bitmap
161 | arr, index := makeAggregateInput(0x0, 0x0)
162 | expect, ok1 := minNaive(arr, index)
163 | result, ok2 := Min(arr, index)
164 | assert.Equal(t, expect, result)
165 | assert.Equal(t, ok1, ok2)
166 | }
167 |
168 | { // Partial Bitmap
169 | arr, index := makeAggregateInput(0xffffffffffffffff, 0x0123456789abcdef)
170 | expect, ok1 := minNaive(arr, index)
171 | result, ok2 := Min(arr, index)
172 | assert.Equal(t, expect, result)
173 | assert.Equal(t, ok1, ok2)
174 | }
175 |
176 | { // Full Bitmap
177 | arr, index := makeAggregateInput(0xffffffffffffffff, 0xffffffffffffffff)
178 | expect, ok1 := minNaive(arr, index)
179 | result, ok2 := Min(arr, index)
180 | assert.Equal(t, expect, result)
181 | assert.Equal(t, ok1, ok2)
182 | }
183 |
184 | { // Nil Bitmap
185 | arr, _ := makeAggregateInput(0x0, 0x0)
186 | expect, ok1 := minNaive(arr, nil)
187 | result, ok2 := Min(arr, nil)
188 | assert.Equal(t, expect, result)
189 | assert.Equal(t, ok1, ok2)
190 | }
191 |
192 | { // Nil Array
193 | _, index := makeAggregateInput(0x0, 0x0)
194 | expect, ok1 := minNaive([]int{}, index)
195 | result, ok2 := Min([]int{}, index)
196 | assert.Equal(t, expect, result)
197 | assert.Equal(t, ok1, ok2)
198 | }
199 | }
200 |
201 | func TestAggMax(t *testing.T) {
202 | { // Empty Bitmap
203 | arr, index := makeAggregateInput(0x0, 0x0)
204 | expect, ok1 := maxNaive(arr, index)
205 | result, ok2 := Max(arr, index)
206 | assert.Equal(t, expect, result)
207 | assert.Equal(t, ok1, ok2)
208 | }
209 |
210 | { // Partial Bitmap
211 | arr, index := makeAggregateInput(0xffffffffffffffff, 0x0123456789abcdef)
212 | expect, ok1 := maxNaive(arr, index)
213 | result, ok2 := Max(arr, index)
214 | assert.Equal(t, expect, result)
215 | assert.Equal(t, ok1, ok2)
216 | }
217 |
218 | { // Full Bitmap
219 | arr, index := makeAggregateInput(0xffffffffffffffff, 0xffffffffffffffff)
220 | expect, ok1 := maxNaive(arr, index)
221 | result, ok2 := Max(arr, index)
222 | assert.Equal(t, expect, result)
223 | assert.Equal(t, ok1, ok2)
224 | }
225 |
226 | { // Nil Bitmap
227 | arr, _ := makeAggregateInput(0x0, 0x0)
228 | expect, ok1 := maxNaive(arr, nil)
229 | result, ok2 := Max(arr, nil)
230 | assert.Equal(t, expect, result)
231 | assert.Equal(t, ok1, ok2)
232 | }
233 |
234 | { // Nil Array
235 | _, index := makeAggregateInput(0x0, 0x0)
236 | expect, ok1 := maxNaive([]int{}, index)
237 | result, ok2 := Max([]int{}, index)
238 | assert.Equal(t, expect, result)
239 | assert.Equal(t, ok1, ok2)
240 | }
241 | }
242 |
243 | func TestLeftPack(t *testing.T) {
244 | src, index := makeAggregateInput(0x0123456789abcdef, 0x0123456789abcdef)
245 | dst := leftPack(&[64]int{}, src, index[0])
246 | assert.Equal(t, 32, len(dst))
247 | }
248 |
249 | // ----------------------------- Naive Aggregation Funcs -----------------------------
250 |
251 | func sumNaive[T simd.Number](src []T, index Bitmap) (out T) {
252 | size := minint(len(src), len(index)*64)
253 | for i := 0; i < size; i++ {
254 | if index.Contains(uint32(i)) {
255 | out += src[i]
256 | }
257 | }
258 | return
259 | }
260 |
261 | func minNaive[T simd.Number](src []T, index Bitmap) (T, bool) {
262 | if len(src) == 0 || index.Count() == 0 {
263 | return 0, false
264 | }
265 |
266 | size := minint(len(src), len(index)*64)
267 | out := src[0]
268 | for i := 0; i < size; i++ {
269 | if index.Contains(uint32(i)) && src[i] < out {
270 | out = src[i]
271 | }
272 | }
273 | return out, true
274 | }
275 |
276 | // maxNaive is the straightforward reference that TestMax compares Max with:
277 | // it visits every position below min(len(src), len(index)*64) and keeps the
278 | // largest element whose bit is set in index. The running maximum is seeded
279 | // from the first selected element, not from src[0], so an unselected src[0]
280 | // can never leak into the result. Returns (0, false) if nothing is selected.
281 | func maxNaive[T simd.Number](src []T, index Bitmap) (out T, ok bool) {
282 | 	size := minint(len(src), len(index)*64)
283 | 	for i := 0; i < size; i++ {
284 | 		if index.Contains(uint32(i)) && (!ok || src[i] > out) {
285 | 			out, ok = src[i], true
286 | 		}
287 | 	}
288 | 	return out, ok
289 | }
290 |
291 | // makeAggregateInput builds the shared fixture for the aggregation tests: a
292 | // 5000-element array with arr[i] = 100+i apart from six planted outliers, and
293 | // an 80-word bitmap that alternates filter1 and filter2.
294 | func makeAggregateInput(filter1, filter2 uint64) ([]int, Bitmap) {
295 | 	var values []int
296 | 	for v := 100; v < 5100; v++ {
297 | 		values = append(values, v)
298 | 	}
299 | 	// Outliers: a large and a small value next to each other in three places.
300 | 	values[101], values[102] = 5000, 50
301 | 	values[151], values[152] = 6000, 40
302 | 	values[4998], values[4999] = 20000, 30
303 |
304 | 	mask := make(Bitmap, 0, 80)
305 | 	for len(mask) < 80 {
306 | 		mask = append(mask, filter1, filter2)
307 | 	}
308 | 	return values, mask
309 | }
310 |
311 | // ----------------------------- Benchmark -----------------------------
312 |
313 | // run runs a benchmark
314 | func run(b *testing.B, name string, f func(index Bitmap)) {
315 | count := 1000064
316 | b.Run(name, func(b *testing.B) {
317 | index := make(Bitmap, count/64)
318 | index.Grow(uint32(count))
319 | for i := 0; i < len(index); i++ {
320 | index[i] = 0xf0f0f0f0f0f0f0f0
321 | }
322 |
323 | b.ReportAllocs()
324 | b.ResetTimer()
325 | for n := 0; n < b.N; n++ {
326 | f(index)
327 | }
328 | })
329 | }
330 |
331 | // run runs a benchmark on a full bitmap
332 | func runFull(b *testing.B, name string, f func(index Bitmap)) {
333 | count := 1000000
334 | b.Run(name, func(b *testing.B) {
335 | index := make(Bitmap, count/64)
336 | index.Grow(uint32(count - 1))
337 | for i := 0; i < len(index); i++ {
338 | index[i] = 0xffffffffffffffff
339 | }
340 |
341 | b.ReportAllocs()
342 | b.ResetTimer()
343 | for n := 0; n < b.N; n++ {
344 | f(index)
345 | }
346 | })
347 | }
348 |
--------------------------------------------------------------------------------
/simd_apple.go:
--------------------------------------------------------------------------------
1 | //go:build !noasm && darwin && arm64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
4 | package bitmap
5 |
6 | import "unsafe"
7 |
8 | //go:nosplit
9 | //go:noescape
10 | func _and(a unsafe.Pointer, b unsafe.Pointer, n uint64) // simd_apple.s: a[i] &= b[i] over n 64-bit words
11 |
12 | //go:nosplit
13 | //go:noescape
14 | func _andn(a unsafe.Pointer, b unsafe.Pointer, n uint64) // simd_apple.s: a[i] &^= b[i] over n 64-bit words
15 |
16 | //go:nosplit
17 | //go:noescape
18 | func _or(a unsafe.Pointer, b unsafe.Pointer, n uint64) // simd_apple.s: a[i] |= b[i] over n 64-bit words
19 |
20 | //go:nosplit
21 | //go:noescape
22 | func _xor(a unsafe.Pointer, b unsafe.Pointer, n uint64) // simd_apple.s: a[i] ^= b[i] over n 64-bit words
23 |
24 | //go:nosplit
25 | //go:noescape
26 | func _and_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // simd_apple.s: ANDs each bitmap pointed to by b into a; dims = pointer count<<32 | word count
27 |
28 | //go:nosplit
29 | //go:noescape
30 | func _andn_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // simd_apple.s: AND-NOTs each bitmap pointed to by b into a; dims = pointer count<<32 | word count
31 |
32 | //go:nosplit
33 | //go:noescape
34 | func _or_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // simd_apple.s: ORs each bitmap pointed to by b into a; dims = pointer count<<32 | word count
35 |
36 | //go:nosplit
37 | //go:noescape
38 | func _xor_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // simd_apple.s: XORs each bitmap pointed to by b into a; dims = pointer count<<32 | word count
39 |
40 | //go:nosplit
41 | //go:noescape
42 | func _count(a unsafe.Pointer, size uint64, result unsafe.Pointer) // simd_apple.s: stores the number of set bits in size 64-bit words at a into the 8 bytes at result
43 |
--------------------------------------------------------------------------------
/simd_apple.s:
--------------------------------------------------------------------------------
1 | //go:build !noasm && darwin && arm64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
4 | TEXT ·_and(SB), $0-32 // a[i] = a[i] & b[i] for i in [0, n), 64-bit words
5 | MOVD a+0(FP), R0
6 | MOVD b+8(FP), R1
7 | MOVD n+16(FP), R2
8 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
9 | WORD $0x910003fd // mov x29, sp
10 | WORD $0xb40002a2 // cbz x2, LBB0_7
11 | WORD $0xf100105f // cmp x2, #4
12 | WORD $0x54000103 // b.lo LBB0_4
13 | WORD $0xd37df048 // lsl x8, x2, #3
14 | WORD $0x8b080029 // add x9, x1, x8
15 | WORD $0xeb00013f // cmp x9, x0
16 | WORD $0x54000229 // b.ls LBB0_8
17 | WORD $0x8b080008 // add x8, x0, x8
18 | WORD $0xeb01011f // cmp x8, x1
19 | WORD $0x540001c9 // b.ls LBB0_8
20 |
21 | BB0_4: // scalar path from word 0: taken when n < 4 or the a/b ranges overlap
22 | WORD $0xd2800008 // mov x8, #0
23 |
24 | BB0_5: // scalar setup: x9 = n - x8 words left, x8/x10 = &a[x8], &b[x8]
25 | WORD $0xcb080049 // sub x9, x2, x8
26 | WORD $0xd37df10a // lsl x10, x8, #3
27 | WORD $0x8b0a0008 // add x8, x0, x10
28 | WORD $0x8b0a002a // add x10, x1, x10
29 |
30 | BB0_6: // scalar loop, one word per iteration
31 | WORD $0xf840854b // ldr x11, [x10], #8
32 | WORD $0xf940010c // ldr x12, [x8]
33 | WORD $0x8a0b018b // and x11, x12, x11
34 | WORD $0xf800850b // str x11, [x8], #8
35 | WORD $0xf1000529 // subs x9, x9, #1
36 | WORD $0x54ffff61 // b.ne LBB0_6
37 |
38 | BB0_7: // epilogue: restore x29/x30 and return
39 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
40 | WORD $0xd65f03c0 // ret
41 |
42 | BB0_8: // vector setup: x8 = n & ~3 (whole groups of 4 words)
43 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc
44 | WORD $0x91004029 // add x9, x1, #16
45 | WORD $0x9100400a // add x10, x0, #16
46 | WORD $0xaa0803eb // mov x11, x8
47 |
48 | BB0_9: // vector loop, 4 words per iteration; leftover words go to the scalar loop
49 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
50 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
51 | WORD $0x4e201c40 // and.16b v0, v2, v0
52 | WORD $0x4e211c61 // and.16b v1, v3, v1
53 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
54 | WORD $0x91008129 // add x9, x9, #32
55 | WORD $0x9100814a // add x10, x10, #32
56 | WORD $0xf100116b // subs x11, x11, #4
57 | WORD $0x54ffff01 // b.ne LBB0_9
58 | WORD $0xeb02011f // cmp x8, x2
59 | WORD $0x54fffe00 // b.eq LBB0_7
60 | WORD $0x17ffffe5 // b LBB0_5
61 |
62 | TEXT ·_andn(SB), $0-32 // a[i] = a[i] & ~b[i] for i in [0, n), 64-bit words
63 | MOVD a+0(FP), R0
64 | MOVD b+8(FP), R1
65 | MOVD n+16(FP), R2
66 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
67 | WORD $0x910003fd // mov x29, sp
68 | WORD $0xb40002a2 // cbz x2, LBB1_7
69 | WORD $0xf100105f // cmp x2, #4
70 | WORD $0x54000103 // b.lo LBB1_4
71 | WORD $0xd37df048 // lsl x8, x2, #3
72 | WORD $0x8b080029 // add x9, x1, x8
73 | WORD $0xeb00013f // cmp x9, x0
74 | WORD $0x54000229 // b.ls LBB1_8
75 | WORD $0x8b080008 // add x8, x0, x8
76 | WORD $0xeb01011f // cmp x8, x1
77 | WORD $0x540001c9 // b.ls LBB1_8
78 |
79 | BB1_4: // scalar path from word 0: taken when n < 4 or the a/b ranges overlap
80 | WORD $0xd2800008 // mov x8, #0
81 |
82 | BB1_5: // scalar setup: x9 = n - x8 words left, x8/x10 = &a[x8], &b[x8]
83 | WORD $0xcb080049 // sub x9, x2, x8
84 | WORD $0xd37df10a // lsl x10, x8, #3
85 | WORD $0x8b0a0008 // add x8, x0, x10
86 | WORD $0x8b0a002a // add x10, x1, x10
87 |
88 | BB1_6: // scalar loop, one word per iteration
89 | WORD $0xf840854b // ldr x11, [x10], #8
90 | WORD $0xf940010c // ldr x12, [x8]
91 | WORD $0x8a2b018b // bic x11, x12, x11
92 | WORD $0xf800850b // str x11, [x8], #8
93 | WORD $0xf1000529 // subs x9, x9, #1
94 | WORD $0x54ffff61 // b.ne LBB1_6
95 |
96 | BB1_7: // epilogue: restore x29/x30 and return
97 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
98 | WORD $0xd65f03c0 // ret
99 |
100 | BB1_8: // vector setup: x8 = n & ~3 (whole groups of 4 words)
101 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc
102 | WORD $0x91004029 // add x9, x1, #16
103 | WORD $0x9100400a // add x10, x0, #16
104 | WORD $0xaa0803eb // mov x11, x8
105 |
106 | BB1_9: // vector loop, 4 words per iteration; leftover words go to the scalar loop
107 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
108 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
109 | WORD $0x4e601c40 // bic.16b v0, v2, v0
110 | WORD $0x4e611c61 // bic.16b v1, v3, v1
111 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
112 | WORD $0x91008129 // add x9, x9, #32
113 | WORD $0x9100814a // add x10, x10, #32
114 | WORD $0xf100116b // subs x11, x11, #4
115 | WORD $0x54ffff01 // b.ne LBB1_9
116 | WORD $0xeb02011f // cmp x8, x2
117 | WORD $0x54fffe00 // b.eq LBB1_7
118 | WORD $0x17ffffe5 // b LBB1_5
119 |
120 | TEXT ·_or(SB), $0-32 // a[i] = a[i] | b[i] for i in [0, n), 64-bit words
121 | MOVD a+0(FP), R0
122 | MOVD b+8(FP), R1
123 | MOVD n+16(FP), R2
124 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
125 | WORD $0x910003fd // mov x29, sp
126 | WORD $0xb40002a2 // cbz x2, LBB2_7
127 | WORD $0xf100105f // cmp x2, #4
128 | WORD $0x54000103 // b.lo LBB2_4
129 | WORD $0xd37df048 // lsl x8, x2, #3
130 | WORD $0x8b080029 // add x9, x1, x8
131 | WORD $0xeb00013f // cmp x9, x0
132 | WORD $0x54000229 // b.ls LBB2_8
133 | WORD $0x8b080008 // add x8, x0, x8
134 | WORD $0xeb01011f // cmp x8, x1
135 | WORD $0x540001c9 // b.ls LBB2_8
136 |
137 | BB2_4: // scalar path from word 0: taken when n < 4 or the a/b ranges overlap
138 | WORD $0xd2800008 // mov x8, #0
139 |
140 | BB2_5: // scalar setup: x9 = n - x8 words left, x8/x10 = &a[x8], &b[x8]
141 | WORD $0xcb080049 // sub x9, x2, x8
142 | WORD $0xd37df10a // lsl x10, x8, #3
143 | WORD $0x8b0a0008 // add x8, x0, x10
144 | WORD $0x8b0a002a // add x10, x1, x10
145 |
146 | BB2_6: // scalar loop, one word per iteration
147 | WORD $0xf840854b // ldr x11, [x10], #8
148 | WORD $0xf940010c // ldr x12, [x8]
149 | WORD $0xaa0b018b // orr x11, x12, x11
150 | WORD $0xf800850b // str x11, [x8], #8
151 | WORD $0xf1000529 // subs x9, x9, #1
152 | WORD $0x54ffff61 // b.ne LBB2_6
153 |
154 | BB2_7: // epilogue: restore x29/x30 and return
155 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
156 | WORD $0xd65f03c0 // ret
157 |
158 | BB2_8: // vector setup: x8 = n & ~3 (whole groups of 4 words)
159 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc
160 | WORD $0x91004029 // add x9, x1, #16
161 | WORD $0x9100400a // add x10, x0, #16
162 | WORD $0xaa0803eb // mov x11, x8
163 |
164 | BB2_9: // vector loop, 4 words per iteration; leftover words go to the scalar loop
165 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
166 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
167 | WORD $0x4ea01c40 // orr.16b v0, v2, v0
168 | WORD $0x4ea11c61 // orr.16b v1, v3, v1
169 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
170 | WORD $0x91008129 // add x9, x9, #32
171 | WORD $0x9100814a // add x10, x10, #32
172 | WORD $0xf100116b // subs x11, x11, #4
173 | WORD $0x54ffff01 // b.ne LBB2_9
174 | WORD $0xeb02011f // cmp x8, x2
175 | WORD $0x54fffe00 // b.eq LBB2_7
176 | WORD $0x17ffffe5 // b LBB2_5
177 |
178 | TEXT ·_xor(SB), $0-32 // a[i] = a[i] ^ b[i] for i in [0, n), 64-bit words
179 | MOVD a+0(FP), R0
180 | MOVD b+8(FP), R1
181 | MOVD n+16(FP), R2
182 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
183 | WORD $0x910003fd // mov x29, sp
184 | WORD $0xb40002a2 // cbz x2, LBB3_7
185 | WORD $0xf100105f // cmp x2, #4
186 | WORD $0x54000103 // b.lo LBB3_4
187 | WORD $0xd37df048 // lsl x8, x2, #3
188 | WORD $0x8b080029 // add x9, x1, x8
189 | WORD $0xeb00013f // cmp x9, x0
190 | WORD $0x54000229 // b.ls LBB3_8
191 | WORD $0x8b080008 // add x8, x0, x8
192 | WORD $0xeb01011f // cmp x8, x1
193 | WORD $0x540001c9 // b.ls LBB3_8
194 |
195 | BB3_4: // scalar path from word 0: taken when n < 4 or the a/b ranges overlap
196 | WORD $0xd2800008 // mov x8, #0
197 |
198 | BB3_5: // scalar setup: x9 = n - x8 words left, x8/x10 = &a[x8], &b[x8]
199 | WORD $0xcb080049 // sub x9, x2, x8
200 | WORD $0xd37df10a // lsl x10, x8, #3
201 | WORD $0x8b0a0008 // add x8, x0, x10
202 | WORD $0x8b0a002a // add x10, x1, x10
203 |
204 | BB3_6: // scalar loop, one word per iteration
205 | WORD $0xf840854b // ldr x11, [x10], #8
206 | WORD $0xf940010c // ldr x12, [x8]
207 | WORD $0xca0b018b // eor x11, x12, x11
208 | WORD $0xf800850b // str x11, [x8], #8
209 | WORD $0xf1000529 // subs x9, x9, #1
210 | WORD $0x54ffff61 // b.ne LBB3_6
211 |
212 | BB3_7: // epilogue: restore x29/x30 and return
213 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
214 | WORD $0xd65f03c0 // ret
215 |
216 | BB3_8: // vector setup: x8 = n & ~3 (whole groups of 4 words)
217 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc
218 | WORD $0x91004029 // add x9, x1, #16
219 | WORD $0x9100400a // add x10, x0, #16
220 | WORD $0xaa0803eb // mov x11, x8
221 |
222 | BB3_9: // vector loop, 4 words per iteration; leftover words go to the scalar loop
223 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
224 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
225 | WORD $0x6e201c40 // eor.16b v0, v2, v0
226 | WORD $0x6e211c61 // eor.16b v1, v3, v1
227 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
228 | WORD $0x91008129 // add x9, x9, #32
229 | WORD $0x9100814a // add x10, x10, #32
230 | WORD $0xf100116b // subs x11, x11, #4
231 | WORD $0x54ffff01 // b.ne LBB3_9
232 | WORD $0xeb02011f // cmp x8, x2
233 | WORD $0x54fffe00 // b.eq LBB3_7
234 | WORD $0x17ffffe5 // b LBB3_5
235 |
236 | TEXT ·_and_many(SB), $0-32 // for each of the (dims >> 32) pointers p in b: a[i] = a[i] & p[i], i in [0, dims & 0xffffffff), in 512-word chunks
237 | MOVD a+0(FP), R0
238 | MOVD b+8(FP), R1
239 | MOVD dims+16(FP), R2
240 | WORD $0xa9bb67fa // stp x26, x25, [sp, #-80]! ; 16-byte Folded Spill
241 | WORD $0xa9015ff8 // stp x24, x23, [sp, #16] ; 16-byte Folded Spill
242 | WORD $0xa90257f6 // stp x22, x21, [sp, #32] ; 16-byte Folded Spill
243 | WORD $0xa9034ff4 // stp x20, x19, [sp, #48] ; 16-byte Folded Spill
244 | WORD $0xa9047bfd // stp x29, x30, [sp, #64] ; 16-byte Folded Spill
245 | WORD $0x910103fd // add x29, sp, #64
246 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff
247 | WORD $0x54000a20 // b.eq LBB4_14
248 | WORD $0xd360fc4b // lsr x11, x2, #32
249 | WORD $0xb40009eb // cbz x11, LBB4_14
250 | WORD $0xd2800009 // mov x9, #0
251 | WORD $0xd280000a // mov x10, #0
252 | WORD $0xd280000f // mov x15, #0
253 | WORD $0xf100057f // cmp x11, #1
254 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi
255 | WORD $0x9100400c // add x12, x0, #16
256 | WORD $0x5280400d // mov w13, #512
257 | WORD $0x5280020e // mov w14, #16
258 | WORD $0x14000009 // b LBB4_4
259 |
260 | BB4_3: // advance to the next 512-word chunk; exit once the chunk start reaches the word count (x8)
261 | WORD $0x910801ad // add x13, x13, #512
262 | WORD $0x9100054a // add x10, x10, #1
263 | WORD $0xd1080129 // sub x9, x9, #512
264 | WORD $0x914005ce // add x14, x14, #1, lsl #12 ; =4096
265 | WORD $0x9140058c // add x12, x12, #1, lsl #12 ; =4096
266 | WORD $0xaa1003ef // mov x15, x16
267 | WORD $0xeb08021f // cmp x16, x8
268 | WORD $0x540007c2 // b.hs LBB4_14
269 |
270 | BB4_4: // chunk setup: x17 = min(chunk end, x8); precompute bounds for the overlap check and the vector loop
271 | WORD $0xeb0801bf // cmp x13, x8
272 | WORD $0x9a8831b1 // csel x17, x13, x8, lo
273 | WORD $0x910801f0 // add x16, x15, #512
274 | WORD $0xeb08021f // cmp x16, x8
275 | WORD $0x9a883202 // csel x2, x16, x8, lo
276 | WORD $0xeb0201ff // cmp x15, x2
277 | WORD $0x54fffe42 // b.hs LBB4_3
278 | WORD $0xd2800002 // mov x2, #0
279 | WORD $0x8b090223 // add x3, x17, x9
280 | WORD $0x927ef463 // and x3, x3, #0xfffffffffffffffc
281 | WORD $0xcb0a2624 // sub x4, x17, x10, lsl #9
282 | WORD $0xd374cd45 // lsl x5, x10, #12
283 | WORD $0xd37df086 // lsl x6, x4, #3
284 | WORD $0xd10020d5 // sub x21, x6, #8
285 | WORD $0x8b050006 // add x6, x0, x5
286 | WORD $0x8b1500c7 // add x7, x6, x21
287 | WORD $0x910020e7 // add x7, x7, #8
288 | WORD $0x927ef493 // and x19, x4, #0xfffffffffffffffc
289 | WORD $0x8b1301f4 // add x20, x15, x19
290 | WORD $0x8b1500b5 // add x21, x5, x21
291 | WORD $0x14000004 // b LBB4_7
292 |
293 | BB4_6: // next pointer in b (index x2); after the last one move on to the next chunk
294 | WORD $0x91000442 // add x2, x2, #1
295 | WORD $0xeb0b005f // cmp x2, x11
296 | WORD $0x54fffc20 // b.eq LBB4_3
297 |
298 | BB4_7: // x22 = b[x2]; use the scalar loop if the chunk has < 4 words or overlaps the a chunk
299 | WORD $0xf8627836 // ldr x22, [x1, x2, lsl #3]
300 | WORD $0xaa0f03f8 // mov x24, x15
301 | WORD $0xf100109f // cmp x4, #4
302 | WORD $0x540002e3 // b.lo LBB4_12
303 | WORD $0x8b0502d7 // add x23, x22, x5
304 | WORD $0x8b1502d8 // add x24, x22, x21
305 | WORD $0x91002318 // add x24, x24, #8
306 | WORD $0xeb1800df // cmp x6, x24
307 | WORD $0xfa4732e2 // ccmp x23, x7, #2, lo
308 | WORD $0xaa0f03f8 // mov x24, x15
309 | WORD $0x54000203 // b.lo LBB4_12
310 | WORD $0x8b0e02d7 // add x23, x22, x14
311 | WORD $0xaa0c03f8 // mov x24, x12
312 | WORD $0xaa0303f9 // mov x25, x3
313 |
314 | BB4_10: // vector loop, 4 words per iteration
315 | WORD $0xad7f86e0 // ldp q0, q1, [x23, #-16]
316 | WORD $0xad7f8f02 // ldp q2, q3, [x24, #-16]
317 | WORD $0x4e201c40 // and.16b v0, v2, v0
318 | WORD $0x4e211c61 // and.16b v1, v3, v1
319 | WORD $0xad3f8700 // stp q0, q1, [x24, #-16]
320 | WORD $0x910082f7 // add x23, x23, #32
321 | WORD $0x91008318 // add x24, x24, #32
322 | WORD $0xf1001339 // subs x25, x25, #4
323 | WORD $0x54ffff01 // b.ne LBB4_10
324 | WORD $0xaa1403f8 // mov x24, x20
325 | WORD $0xeb13009f // cmp x4, x19
326 | WORD $0x54fffc80 // b.eq LBB4_6
327 |
328 | BB4_12: // scalar setup for the words of this chunk the vector loop did not cover
329 | WORD $0xcb180237 // sub x23, x17, x24
330 | WORD $0xd37df319 // lsl x25, x24, #3
331 | WORD $0x8b190018 // add x24, x0, x25
332 | WORD $0x8b1902d6 // add x22, x22, x25
333 |
334 | BB4_13: // scalar loop, one word per iteration
335 | WORD $0xf84086d9 // ldr x25, [x22], #8
336 | WORD $0xf940031a // ldr x26, [x24]
337 | WORD $0x8a190359 // and x25, x26, x25
338 | WORD $0xf8008719 // str x25, [x24], #8
339 | WORD $0xf10006f7 // subs x23, x23, #1
340 | WORD $0x54ffff61 // b.ne LBB4_13
341 | WORD $0x17ffffd9 // b LBB4_6
342 |
343 | BB4_14: // epilogue: reload the spilled registers and return
344 | WORD $0xa9447bfd // ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
345 | WORD $0xa9434ff4 // ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
346 | WORD $0xa94257f6 // ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
347 | WORD $0xa9415ff8 // ldp x24, x23, [sp, #16] ; 16-byte Folded Reload
348 | WORD $0xa8c567fa // ldp x26, x25, [sp], #80 ; 16-byte Folded Reload
349 | WORD $0xd65f03c0 // ret
350 |
351 | TEXT ·_andn_many(SB), $0-32 // for each of the (dims >> 32) pointers p in b: a[i] = a[i] & ~p[i], i in [0, dims & 0xffffffff), in 512-word chunks
352 | MOVD a+0(FP), R0
353 | MOVD b+8(FP), R1
354 | MOVD dims+16(FP), R2
355 | WORD $0xa9bb67fa // stp x26, x25, [sp, #-80]! ; 16-byte Folded Spill
356 | WORD $0xa9015ff8 // stp x24, x23, [sp, #16] ; 16-byte Folded Spill
357 | WORD $0xa90257f6 // stp x22, x21, [sp, #32] ; 16-byte Folded Spill
358 | WORD $0xa9034ff4 // stp x20, x19, [sp, #48] ; 16-byte Folded Spill
359 | WORD $0xa9047bfd // stp x29, x30, [sp, #64] ; 16-byte Folded Spill
360 | WORD $0x910103fd // add x29, sp, #64
361 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff
362 | WORD $0x54000a20 // b.eq LBB5_14
363 | WORD $0xd360fc4b // lsr x11, x2, #32
364 | WORD $0xb40009eb // cbz x11, LBB5_14
365 | WORD $0xd2800009 // mov x9, #0
366 | WORD $0xd280000a // mov x10, #0
367 | WORD $0xd280000f // mov x15, #0
368 | WORD $0xf100057f // cmp x11, #1
369 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi
370 | WORD $0x9100400c // add x12, x0, #16
371 | WORD $0x5280400d // mov w13, #512
372 | WORD $0x5280020e // mov w14, #16
373 | WORD $0x14000009 // b LBB5_4
374 |
375 | BB5_3: // advance to the next 512-word chunk; exit once the chunk start reaches the word count (x8)
376 | WORD $0x910801ad // add x13, x13, #512
377 | WORD $0x9100054a // add x10, x10, #1
378 | WORD $0xd1080129 // sub x9, x9, #512
379 | WORD $0x914005ce // add x14, x14, #1, lsl #12 ; =4096
380 | WORD $0x9140058c // add x12, x12, #1, lsl #12 ; =4096
381 | WORD $0xaa1003ef // mov x15, x16
382 | WORD $0xeb08021f // cmp x16, x8
383 | WORD $0x540007c2 // b.hs LBB5_14
384 |
385 | BB5_4: // chunk setup: x17 = min(chunk end, x8); precompute bounds for the overlap check and the vector loop
386 | WORD $0xeb0801bf // cmp x13, x8
387 | WORD $0x9a8831b1 // csel x17, x13, x8, lo
388 | WORD $0x910801f0 // add x16, x15, #512
389 | WORD $0xeb08021f // cmp x16, x8
390 | WORD $0x9a883202 // csel x2, x16, x8, lo
391 | WORD $0xeb0201ff // cmp x15, x2
392 | WORD $0x54fffe42 // b.hs LBB5_3
393 | WORD $0xd2800002 // mov x2, #0
394 | WORD $0x8b090223 // add x3, x17, x9
395 | WORD $0x927ef463 // and x3, x3, #0xfffffffffffffffc
396 | WORD $0xcb0a2624 // sub x4, x17, x10, lsl #9
397 | WORD $0xd374cd45 // lsl x5, x10, #12
398 | WORD $0xd37df086 // lsl x6, x4, #3
399 | WORD $0xd10020d5 // sub x21, x6, #8
400 | WORD $0x8b050006 // add x6, x0, x5
401 | WORD $0x8b1500c7 // add x7, x6, x21
402 | WORD $0x910020e7 // add x7, x7, #8
403 | WORD $0x927ef493 // and x19, x4, #0xfffffffffffffffc
404 | WORD $0x8b1301f4 // add x20, x15, x19
405 | WORD $0x8b1500b5 // add x21, x5, x21
406 | WORD $0x14000004 // b LBB5_7
407 |
408 | BB5_6: // next pointer in b (index x2); after the last one move on to the next chunk
409 | WORD $0x91000442 // add x2, x2, #1
410 | WORD $0xeb0b005f // cmp x2, x11
411 | WORD $0x54fffc20 // b.eq LBB5_3
412 |
413 | BB5_7: // x22 = b[x2]; use the scalar loop if the chunk has < 4 words or overlaps the a chunk
414 | WORD $0xf8627836 // ldr x22, [x1, x2, lsl #3]
415 | WORD $0xaa0f03f8 // mov x24, x15
416 | WORD $0xf100109f // cmp x4, #4
417 | WORD $0x540002e3 // b.lo LBB5_12
418 | WORD $0x8b0502d7 // add x23, x22, x5
419 | WORD $0x8b1502d8 // add x24, x22, x21
420 | WORD $0x91002318 // add x24, x24, #8
421 | WORD $0xeb1800df // cmp x6, x24
422 | WORD $0xfa4732e2 // ccmp x23, x7, #2, lo
423 | WORD $0xaa0f03f8 // mov x24, x15
424 | WORD $0x54000203 // b.lo LBB5_12
425 | WORD $0x8b0e02d7 // add x23, x22, x14
426 | WORD $0xaa0c03f8 // mov x24, x12
427 | WORD $0xaa0303f9 // mov x25, x3
428 |
429 | BB5_10: // vector loop, 4 words per iteration
430 | WORD $0xad7f86e0 // ldp q0, q1, [x23, #-16]
431 | WORD $0xad7f8f02 // ldp q2, q3, [x24, #-16]
432 | WORD $0x4e601c40 // bic.16b v0, v2, v0
433 | WORD $0x4e611c61 // bic.16b v1, v3, v1
434 | WORD $0xad3f8700 // stp q0, q1, [x24, #-16]
435 | WORD $0x910082f7 // add x23, x23, #32
436 | WORD $0x91008318 // add x24, x24, #32
437 | WORD $0xf1001339 // subs x25, x25, #4
438 | WORD $0x54ffff01 // b.ne LBB5_10
439 | WORD $0xaa1403f8 // mov x24, x20
440 | WORD $0xeb13009f // cmp x4, x19
441 | WORD $0x54fffc80 // b.eq LBB5_6
442 |
443 | BB5_12: // scalar setup for the words of this chunk the vector loop did not cover
444 | WORD $0xcb180237 // sub x23, x17, x24
445 | WORD $0xd37df319 // lsl x25, x24, #3
446 | WORD $0x8b190018 // add x24, x0, x25
447 | WORD $0x8b1902d6 // add x22, x22, x25
448 |
449 | BB5_13: // scalar loop, one word per iteration
450 | WORD $0xf84086d9 // ldr x25, [x22], #8
451 | WORD $0xf940031a // ldr x26, [x24]
452 | WORD $0x8a390359 // bic x25, x26, x25
453 | WORD $0xf8008719 // str x25, [x24], #8
454 | WORD $0xf10006f7 // subs x23, x23, #1
455 | WORD $0x54ffff61 // b.ne LBB5_13
456 | WORD $0x17ffffd9 // b LBB5_6
457 |
458 | BB5_14: // epilogue: reload the spilled registers and return
459 | WORD $0xa9447bfd // ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
460 | WORD $0xa9434ff4 // ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
461 | WORD $0xa94257f6 // ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
462 | WORD $0xa9415ff8 // ldp x24, x23, [sp, #16] ; 16-byte Folded Reload
463 | WORD $0xa8c567fa // ldp x26, x25, [sp], #80 ; 16-byte Folded Reload
464 | WORD $0xd65f03c0 // ret
465 |
466 | TEXT ·_or_many(SB), $0-32 // for each of the (dims >> 32) pointers p in b: a[i] = a[i] | p[i], i in [0, dims & 0xffffffff), in 512-word chunks
467 | MOVD a+0(FP), R0
468 | MOVD b+8(FP), R1
469 | MOVD dims+16(FP), R2
470 | WORD $0xa9bb67fa // stp x26, x25, [sp, #-80]! ; 16-byte Folded Spill
471 | WORD $0xa9015ff8 // stp x24, x23, [sp, #16] ; 16-byte Folded Spill
472 | WORD $0xa90257f6 // stp x22, x21, [sp, #32] ; 16-byte Folded Spill
473 | WORD $0xa9034ff4 // stp x20, x19, [sp, #48] ; 16-byte Folded Spill
474 | WORD $0xa9047bfd // stp x29, x30, [sp, #64] ; 16-byte Folded Spill
475 | WORD $0x910103fd // add x29, sp, #64
476 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff
477 | WORD $0x54000a20 // b.eq LBB6_14
478 | WORD $0xd360fc4b // lsr x11, x2, #32
479 | WORD $0xb40009eb // cbz x11, LBB6_14
480 | WORD $0xd2800009 // mov x9, #0
481 | WORD $0xd280000a // mov x10, #0
482 | WORD $0xd280000f // mov x15, #0
483 | WORD $0xf100057f // cmp x11, #1
484 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi
485 | WORD $0x9100400c // add x12, x0, #16
486 | WORD $0x5280400d // mov w13, #512
487 | WORD $0x5280020e // mov w14, #16
488 | WORD $0x14000009 // b LBB6_4
489 |
490 | BB6_3: // advance to the next 512-word chunk; exit once the chunk start reaches the word count (x8)
491 | WORD $0x910801ad // add x13, x13, #512
492 | WORD $0x9100054a // add x10, x10, #1
493 | WORD $0xd1080129 // sub x9, x9, #512
494 | WORD $0x914005ce // add x14, x14, #1, lsl #12 ; =4096
495 | WORD $0x9140058c // add x12, x12, #1, lsl #12 ; =4096
496 | WORD $0xaa1003ef // mov x15, x16
497 | WORD $0xeb08021f // cmp x16, x8
498 | WORD $0x540007c2 // b.hs LBB6_14
499 |
500 | BB6_4: // chunk setup: x17 = min(chunk end, x8); precompute bounds for the overlap check and the vector loop
501 | WORD $0xeb0801bf // cmp x13, x8
502 | WORD $0x9a8831b1 // csel x17, x13, x8, lo
503 | WORD $0x910801f0 // add x16, x15, #512
504 | WORD $0xeb08021f // cmp x16, x8
505 | WORD $0x9a883202 // csel x2, x16, x8, lo
506 | WORD $0xeb0201ff // cmp x15, x2
507 | WORD $0x54fffe42 // b.hs LBB6_3
508 | WORD $0xd2800002 // mov x2, #0
509 | WORD $0x8b090223 // add x3, x17, x9
510 | WORD $0x927ef463 // and x3, x3, #0xfffffffffffffffc
511 | WORD $0xcb0a2624 // sub x4, x17, x10, lsl #9
512 | WORD $0xd374cd45 // lsl x5, x10, #12
513 | WORD $0xd37df086 // lsl x6, x4, #3
514 | WORD $0xd10020d5 // sub x21, x6, #8
515 | WORD $0x8b050006 // add x6, x0, x5
516 | WORD $0x8b1500c7 // add x7, x6, x21
517 | WORD $0x910020e7 // add x7, x7, #8
518 | WORD $0x927ef493 // and x19, x4, #0xfffffffffffffffc
519 | WORD $0x8b1301f4 // add x20, x15, x19
520 | WORD $0x8b1500b5 // add x21, x5, x21
521 | WORD $0x14000004 // b LBB6_7
522 |
523 | BB6_6: // next pointer in b (index x2); after the last one move on to the next chunk
524 | WORD $0x91000442 // add x2, x2, #1
525 | WORD $0xeb0b005f // cmp x2, x11
526 | WORD $0x54fffc20 // b.eq LBB6_3
527 |
528 | BB6_7: // x22 = b[x2]; use the scalar loop if the chunk has < 4 words or overlaps the a chunk
529 | WORD $0xf8627836 // ldr x22, [x1, x2, lsl #3]
530 | WORD $0xaa0f03f8 // mov x24, x15
531 | WORD $0xf100109f // cmp x4, #4
532 | WORD $0x540002e3 // b.lo LBB6_12
533 | WORD $0x8b0502d7 // add x23, x22, x5
534 | WORD $0x8b1502d8 // add x24, x22, x21
535 | WORD $0x91002318 // add x24, x24, #8
536 | WORD $0xeb1800df // cmp x6, x24
537 | WORD $0xfa4732e2 // ccmp x23, x7, #2, lo
538 | WORD $0xaa0f03f8 // mov x24, x15
539 | WORD $0x54000203 // b.lo LBB6_12
540 | WORD $0x8b0e02d7 // add x23, x22, x14
541 | WORD $0xaa0c03f8 // mov x24, x12
542 | WORD $0xaa0303f9 // mov x25, x3
543 |
544 | BB6_10: // vector loop, 4 words per iteration
545 | WORD $0xad7f86e0 // ldp q0, q1, [x23, #-16]
546 | WORD $0xad7f8f02 // ldp q2, q3, [x24, #-16]
547 | WORD $0x4ea01c40 // orr.16b v0, v2, v0
548 | WORD $0x4ea11c61 // orr.16b v1, v3, v1
549 | WORD $0xad3f8700 // stp q0, q1, [x24, #-16]
550 | WORD $0x910082f7 // add x23, x23, #32
551 | WORD $0x91008318 // add x24, x24, #32
552 | WORD $0xf1001339 // subs x25, x25, #4
553 | WORD $0x54ffff01 // b.ne LBB6_10
554 | WORD $0xaa1403f8 // mov x24, x20
555 | WORD $0xeb13009f // cmp x4, x19
556 | WORD $0x54fffc80 // b.eq LBB6_6
557 |
558 | BB6_12: // scalar setup for the words of this chunk the vector loop did not cover
559 | WORD $0xcb180237 // sub x23, x17, x24
560 | WORD $0xd37df319 // lsl x25, x24, #3
561 | WORD $0x8b190018 // add x24, x0, x25
562 | WORD $0x8b1902d6 // add x22, x22, x25
563 |
564 | BB6_13: // scalar loop, one word per iteration
565 | WORD $0xf84086d9 // ldr x25, [x22], #8
566 | WORD $0xf940031a // ldr x26, [x24]
567 | WORD $0xaa190359 // orr x25, x26, x25
568 | WORD $0xf8008719 // str x25, [x24], #8
569 | WORD $0xf10006f7 // subs x23, x23, #1
570 | WORD $0x54ffff61 // b.ne LBB6_13
571 | WORD $0x17ffffd9 // b LBB6_6
572 |
573 | BB6_14: // epilogue: reload the spilled registers and return
574 | WORD $0xa9447bfd // ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
575 | WORD $0xa9434ff4 // ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
576 | WORD $0xa94257f6 // ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
577 | WORD $0xa9415ff8 // ldp x24, x23, [sp, #16] ; 16-byte Folded Reload
578 | WORD $0xa8c567fa // ldp x26, x25, [sp], #80 ; 16-byte Folded Reload
579 | WORD $0xd65f03c0 // ret
580 |
581 | TEXT ·_xor_many(SB), $0-32 // for each of the (dims >> 32) pointers p in b: a[i] = a[i] ^ p[i], i in [0, dims & 0xffffffff), in 512-word chunks
582 | MOVD a+0(FP), R0
583 | MOVD b+8(FP), R1
584 | MOVD dims+16(FP), R2
585 | WORD $0xa9bb67fa // stp x26, x25, [sp, #-80]! ; 16-byte Folded Spill
586 | WORD $0xa9015ff8 // stp x24, x23, [sp, #16] ; 16-byte Folded Spill
587 | WORD $0xa90257f6 // stp x22, x21, [sp, #32] ; 16-byte Folded Spill
588 | WORD $0xa9034ff4 // stp x20, x19, [sp, #48] ; 16-byte Folded Spill
589 | WORD $0xa9047bfd // stp x29, x30, [sp, #64] ; 16-byte Folded Spill
590 | WORD $0x910103fd // add x29, sp, #64
591 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff
592 | WORD $0x54000a20 // b.eq LBB7_14
593 | WORD $0xd360fc4b // lsr x11, x2, #32
594 | WORD $0xb40009eb // cbz x11, LBB7_14
595 | WORD $0xd2800009 // mov x9, #0
596 | WORD $0xd280000a // mov x10, #0
597 | WORD $0xd280000f // mov x15, #0
598 | WORD $0xf100057f // cmp x11, #1
599 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi
600 | WORD $0x9100400c // add x12, x0, #16
601 | WORD $0x5280400d // mov w13, #512
602 | WORD $0x5280020e // mov w14, #16
603 | WORD $0x14000009 // b LBB7_4
604 |
605 | BB7_3: // advance to the next 512-word chunk; exit once the chunk start reaches the word count (x8)
606 | WORD $0x910801ad // add x13, x13, #512
607 | WORD $0x9100054a // add x10, x10, #1
608 | WORD $0xd1080129 // sub x9, x9, #512
609 | WORD $0x914005ce // add x14, x14, #1, lsl #12 ; =4096
610 | WORD $0x9140058c // add x12, x12, #1, lsl #12 ; =4096
611 | WORD $0xaa1003ef // mov x15, x16
612 | WORD $0xeb08021f // cmp x16, x8
613 | WORD $0x540007c2 // b.hs LBB7_14
614 |
615 | BB7_4: // chunk setup: x17 = min(chunk end, x8); precompute bounds for the overlap check and the vector loop
616 | WORD $0xeb0801bf // cmp x13, x8
617 | WORD $0x9a8831b1 // csel x17, x13, x8, lo
618 | WORD $0x910801f0 // add x16, x15, #512
619 | WORD $0xeb08021f // cmp x16, x8
620 | WORD $0x9a883202 // csel x2, x16, x8, lo
621 | WORD $0xeb0201ff // cmp x15, x2
622 | WORD $0x54fffe42 // b.hs LBB7_3
623 | WORD $0xd2800002 // mov x2, #0
624 | WORD $0x8b090223 // add x3, x17, x9
625 | WORD $0x927ef463 // and x3, x3, #0xfffffffffffffffc
626 | WORD $0xcb0a2624 // sub x4, x17, x10, lsl #9
627 | WORD $0xd374cd45 // lsl x5, x10, #12
628 | WORD $0xd37df086 // lsl x6, x4, #3
629 | WORD $0xd10020d5 // sub x21, x6, #8
630 | WORD $0x8b050006 // add x6, x0, x5
631 | WORD $0x8b1500c7 // add x7, x6, x21
632 | WORD $0x910020e7 // add x7, x7, #8
633 | WORD $0x927ef493 // and x19, x4, #0xfffffffffffffffc
634 | WORD $0x8b1301f4 // add x20, x15, x19
635 | WORD $0x8b1500b5 // add x21, x5, x21
636 | WORD $0x14000004 // b LBB7_7
637 |
638 | BB7_6: // next pointer in b (index x2); after the last one move on to the next chunk
639 | WORD $0x91000442 // add x2, x2, #1
640 | WORD $0xeb0b005f // cmp x2, x11
641 | WORD $0x54fffc20 // b.eq LBB7_3
642 |
643 | BB7_7: // x22 = b[x2]; use the scalar loop if the chunk has < 4 words or overlaps the a chunk
644 | WORD $0xf8627836 // ldr x22, [x1, x2, lsl #3]
645 | WORD $0xaa0f03f8 // mov x24, x15
646 | WORD $0xf100109f // cmp x4, #4
647 | WORD $0x540002e3 // b.lo LBB7_12
648 | WORD $0x8b0502d7 // add x23, x22, x5
649 | WORD $0x8b1502d8 // add x24, x22, x21
650 | WORD $0x91002318 // add x24, x24, #8
651 | WORD $0xeb1800df // cmp x6, x24
652 | WORD $0xfa4732e2 // ccmp x23, x7, #2, lo
653 | WORD $0xaa0f03f8 // mov x24, x15
654 | WORD $0x54000203 // b.lo LBB7_12
655 | WORD $0x8b0e02d7 // add x23, x22, x14
656 | WORD $0xaa0c03f8 // mov x24, x12
657 | WORD $0xaa0303f9 // mov x25, x3
658 |
659 | BB7_10: // vector loop, 4 words per iteration
660 | WORD $0xad7f86e0 // ldp q0, q1, [x23, #-16]
661 | WORD $0xad7f8f02 // ldp q2, q3, [x24, #-16]
662 | WORD $0x6e201c40 // eor.16b v0, v2, v0
663 | WORD $0x6e211c61 // eor.16b v1, v3, v1
664 | WORD $0xad3f8700 // stp q0, q1, [x24, #-16]
665 | WORD $0x910082f7 // add x23, x23, #32
666 | WORD $0x91008318 // add x24, x24, #32
667 | WORD $0xf1001339 // subs x25, x25, #4
668 | WORD $0x54ffff01 // b.ne LBB7_10
669 | WORD $0xaa1403f8 // mov x24, x20
670 | WORD $0xeb13009f // cmp x4, x19
671 | WORD $0x54fffc80 // b.eq LBB7_6
672 |
673 | BB7_12: // scalar setup for the words of this chunk the vector loop did not cover
674 | WORD $0xcb180237 // sub x23, x17, x24
675 | WORD $0xd37df319 // lsl x25, x24, #3
676 | WORD $0x8b190018 // add x24, x0, x25
677 | WORD $0x8b1902d6 // add x22, x22, x25
678 |
679 | BB7_13: // scalar loop, one word per iteration
680 | WORD $0xf84086d9 // ldr x25, [x22], #8
681 | WORD $0xf940031a // ldr x26, [x24]
682 | WORD $0xca190359 // eor x25, x26, x25
683 | WORD $0xf8008719 // str x25, [x24], #8
684 | WORD $0xf10006f7 // subs x23, x23, #1
685 | WORD $0x54ffff61 // b.ne LBB7_13
686 | WORD $0x17ffffd9 // b LBB7_6
687 |
688 | BB7_14: // epilogue: reload the spilled registers and return
689 | WORD $0xa9447bfd // ldp x29, x30, [sp, #64] ; 16-byte Folded Reload
690 | WORD $0xa9434ff4 // ldp x20, x19, [sp, #48] ; 16-byte Folded Reload
691 | WORD $0xa94257f6 // ldp x22, x21, [sp, #32] ; 16-byte Folded Reload
692 | WORD $0xa9415ff8 // ldp x24, x23, [sp, #16] ; 16-byte Folded Reload
693 | WORD $0xa8c567fa // ldp x26, x25, [sp], #80 ; 16-byte Folded Reload
694 | WORD $0xd65f03c0 // ret
695 |
696 | TEXT ·_count(SB), $0-32 // *result = number of set bits in the size 64-bit words starting at a
697 | MOVD a+0(FP), R0
698 | MOVD size+8(FP), R1
699 | MOVD result+16(FP), R2
700 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill
701 | WORD $0x910003fd // mov x29, sp
702 | WORD $0xb40000c1 // cbz x1, LBB8_3
703 | WORD $0xf100103f // cmp x1, #4
704 | WORD $0x540000c2 // b.hs LBB8_4
705 | WORD $0xd2800008 // mov x8, #0
706 | WORD $0xd2800009 // mov x9, #0
707 | WORD $0x14000019 // b LBB8_7
708 |
709 | BB8_3: // size == 0: the total is 0
710 | WORD $0xd2800009 // mov x9, #0
711 | WORD $0x14000020 // b LBB8_9
712 |
713 | BB8_4: // vector setup: x8 = size & ~3, two zeroed accumulators (v0, v1)
714 | WORD $0x927ef428 // and x8, x1, #0xfffffffffffffffc
715 | WORD $0x91004009 // add x9, x0, #16
716 | WORD $0x6f00e400 // movi.2d v0, #0000000000000000
717 | WORD $0xaa0803ea // mov x10, x8
718 | WORD $0x6f00e401 // movi.2d v1, #0000000000000000
719 |
720 | BB8_5: // vector loop: popcount 4 words per iteration; afterwards the accumulators are reduced into x9
721 | WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16]
722 | WORD $0x4e205842 // cnt.16b v2, v2
723 | WORD $0x6e202842 // uaddlp.8h v2, v2
724 | WORD $0x6e602842 // uaddlp.4s v2, v2
725 | WORD $0x4e205863 // cnt.16b v3, v3
726 | WORD $0x6e202863 // uaddlp.8h v3, v3
727 | WORD $0x6e602863 // uaddlp.4s v3, v3
728 | WORD $0x6ea06840 // uadalp.2d v0, v2
729 | WORD $0x6ea06861 // uadalp.2d v1, v3
730 | WORD $0x91008129 // add x9, x9, #32
731 | WORD $0xf100114a // subs x10, x10, #4
732 | WORD $0x54fffea1 // b.ne LBB8_5
733 | WORD $0x4ee08420 // add.2d v0, v1, v0
734 | WORD $0x5ef1b800 // addp.2d d0, v0
735 | WORD $0x9e660009 // fmov x9, d0
736 | WORD $0xeb01011f // cmp x8, x1
737 | WORD $0x54000140 // b.eq LBB8_9
738 |
739 | BB8_7: // scalar setup for the remaining size - x8 words
740 | WORD $0x8b080c0a // add x10, x0, x8, lsl #3
741 | WORD $0xcb080028 // sub x8, x1, x8
742 |
743 | BB8_8: // scalar loop: popcount one word and add it to x9
744 | WORD $0xfc408540 // ldr d0, [x10], #8
745 | WORD $0x0e205800 // cnt.8b v0, v0
746 | WORD $0x2e303800 // uaddlv.8b h0, v0
747 | WORD $0x1e26000b // fmov w11, s0
748 | WORD $0x8b090169 // add x9, x11, x9
749 | WORD $0xf1000508 // subs x8, x8, #1
750 | WORD $0x54ffff41 // b.ne LBB8_8
751 |
752 | BB8_9: // store the total to *result, restore x29/x30 and return
753 | WORD $0xf9000049 // str x9, [x2]
754 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16 ; 16-byte Folded Reload
755 | WORD $0xd65f03c0 // ret
756 |
--------------------------------------------------------------------------------
/simd_avx.go:
--------------------------------------------------------------------------------
1 | //go:build !noasm && amd64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
4 | package bitmap
5 |
6 | import "unsafe"
7 |
8 | //go:nosplit
9 | //go:noescape
10 | func _and(a unsafe.Pointer, b unsafe.Pointer, n uint64) // simd_avx.s: its AVX loop ANDs 32-byte blocks of b into a; n presumably counts 64-bit words — TODO confirm
11 |
12 | //go:nosplit
13 | //go:noescape
14 | func _andn(a unsafe.Pointer, b unsafe.Pointer, n uint64) // implemented in simd_avx.s; presumably a[i] &^= b[i] over n 64-bit words — TODO confirm against the assembly
15 |
16 | //go:nosplit
17 | //go:noescape
18 | func _or(a unsafe.Pointer, b unsafe.Pointer, n uint64) // implemented in simd_avx.s; presumably a[i] |= b[i] over n 64-bit words — TODO confirm against the assembly
19 |
// _xor performs an in-place bitwise XOR: a[i] ^= b[i] for every i in [0, n),
// where a and b are addressed as 64-bit words and n is the number of words.
// The implementation is provided in assembly (no Go body).
//
//go:nosplit
//go:noescape
func _xor(a unsafe.Pointer, b unsafe.Pointer, n uint64)
23 |
// _and_many ANDs several source bitmaps into a, in place.
//
// The assembly reads b as an array of pointers, one per source bitmap, and
// unpacks dims as: low 32 bits = number of 64-bit words to process, high 32
// bits = number of source bitmaps. For every source j and word i it performs
// a[i] &= b[j][i]. Nothing is done when the word count is zero.
//
//go:nosplit
//go:noescape
func _and_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64)
27 |
// _andn_many AND-NOTs several source bitmaps into a, in place.
//
// The assembly reads b as an array of pointers, one per source bitmap, and
// unpacks dims as: low 32 bits = number of 64-bit words to process, high 32
// bits = number of source bitmaps. For every source j and word i it performs
// a[i] &= ^b[j][i]. Nothing is done when the word count is zero.
//
//go:nosplit
//go:noescape
func _andn_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64)
31 |
// _or_many ORs several source bitmaps into a, in place.
//
// The assembly reads b as an array of pointers, one per source bitmap, and
// unpacks dims as: low 32 bits = number of 64-bit words to process, high 32
// bits = number of source bitmaps. For every source j and word i it performs
// a[i] |= b[j][i]. Nothing is done when the word count is zero.
//
//go:nosplit
//go:noescape
func _or_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64)
35 |
// _xor_many XORs several source bitmaps into a, in place.
//
// The assembly reads b as an array of pointers, one per source bitmap, and
// unpacks dims as: low 32 bits = number of 64-bit words to process, high 32
// bits = number of source bitmaps. For every source j and word i it performs
// a[i] ^= b[j][i]. Nothing is done when the word count is zero.
//
//go:nosplit
//go:noescape
func _xor_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64)
39 |
// _count sums the population count of the size 64-bit words starting at a and
// stores the 64-bit total through result. A size of zero stores 0.
// The implementation is provided in assembly (no Go body).
//
//go:nosplit
//go:noescape
func _count(a unsafe.Pointer, size uint64, result unsafe.Pointer)
43 |
--------------------------------------------------------------------------------
/simd_avx.s:
--------------------------------------------------------------------------------
1 | //go:build !noasm && amd64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
// _and computes a[i] &= b[i] for i in [0, n) over 64-bit words.
// Arguments: a+0(FP) destination pointer, b+8(FP) source pointer, n+16(FP) word count.
// With n >= 4 and non-overlapping n-word ranges, four words are processed per
// iteration using 256-bit loads/stores; all other words use the scalar loop.
// NOTE(review): the frame spec declares 32 argument bytes while only three 8-byte
// arguments (offsets 0, 8, 16) are read; confirm this is the generator's intent.
TEXT ·_and(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ n+16(FP), DX
	// Compiler-emitted prologue: save rbp and align rsp.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	// n == 0: nothing to do.
	WORD $0x8548; BYTE $0xd2 // test rdx, rdx
	JE LBB0_10
	// Fewer than 4 words: scalar loop starting at index 0.
	LONG $0x04fa8348 // cmp rdx, 4
	JAE LBB0_3
	WORD $0xc031 // xor eax, eax
	JMP LBB0_9

// Overlap check: take the vector path only if b+8n <= a or a+8n <= b.
LBB0_3:
	LONG $0xd6048d48 // lea rax, [rsi + 8*rdx]
	WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
	JBE LBB0_6
	LONG $0xd7048d48 // lea rax, [rdi + 8*rdx]
	WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
	JBE LBB0_6
	WORD $0xc031 // xor eax, eax
	JMP LBB0_9

// rax = n rounded down to a multiple of 4 (vectorised word count); rcx = word index.
LBB0_6:
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0xfce08348 // and rax, -4
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words (32 bytes) per iteration.
LBB0_7:
	LONG $0x0410fcc5; BYTE $0xcf // vmovups ymm0, ymmword ptr [rdi + 8*rcx]
	LONG $0x0454fcc5; BYTE $0xce // vandps ymm0, ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xcf // vmovups ymmword ptr [rdi + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
	JNE LBB0_7
	// Done if n was a multiple of 4; otherwise fall through to the scalar tail.
	WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
	JE LBB0_10

// Scalar loop: one word per iteration, from index rax up to n.
LBB0_9:
	LONG $0xc60c8b48 // mov rcx, qword ptr [rsi + 8*rax]
	LONG $0xc70c2148 // and qword ptr [rdi + 8*rax], rcx
	WORD $0xff48; BYTE $0xc0 // inc rax
	WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
	JNE LBB0_9

// Epilogue: restore rsp/rbp, clear upper ymm state, return.
LBB0_10:
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
55 |
// _andn computes a[i] &= ^b[i] for i in [0, n) over 64-bit words.
// Arguments: a+0(FP) destination pointer, b+8(FP) source pointer, n+16(FP) word count.
// With n >= 4 and non-overlapping n-word ranges, four words are processed per
// iteration using 256-bit loads/stores; all other words use the scalar loop.
// NOTE(review): the frame spec declares 32 argument bytes while only three 8-byte
// arguments (offsets 0, 8, 16) are read; confirm this is the generator's intent.
TEXT ·_andn(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ n+16(FP), DX
	// Compiler-emitted prologue: save rbp and align rsp.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	// n == 0: nothing to do.
	WORD $0x8548; BYTE $0xd2 // test rdx, rdx
	JE LBB1_10
	// Fewer than 4 words: scalar loop starting at index 0.
	LONG $0x04fa8348 // cmp rdx, 4
	JAE LBB1_3
	WORD $0xc031 // xor eax, eax
	JMP LBB1_9

// Overlap check: take the vector path only if b+8n <= a or a+8n <= b.
LBB1_3:
	LONG $0xd6048d48 // lea rax, [rsi + 8*rdx]
	WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
	JBE LBB1_6
	LONG $0xd7048d48 // lea rax, [rdi + 8*rdx]
	WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
	JBE LBB1_6
	WORD $0xc031 // xor eax, eax
	JMP LBB1_9

// rax = n rounded down to a multiple of 4 (vectorised word count); rcx = word index.
LBB1_6:
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0xfce08348 // and rax, -4
	WORD $0xc931 // xor ecx, ecx

// Vector loop: loads b first because vandnps inverts its first source operand.
LBB1_7:
	LONG $0x0410fcc5; BYTE $0xce // vmovups ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0455fcc5; BYTE $0xcf // vandnps ymm0, ymm0, ymmword ptr [rdi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xcf // vmovups ymmword ptr [rdi + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
	JNE LBB1_7
	// Done if n was a multiple of 4; otherwise fall through to the scalar tail.
	WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
	JE LBB1_10

// Scalar loop: one word per iteration, from index rax up to n.
LBB1_9:
	LONG $0xc60c8b48 // mov rcx, qword ptr [rsi + 8*rax]
	WORD $0xf748; BYTE $0xd1 // not rcx
	LONG $0xc70c2148 // and qword ptr [rdi + 8*rax], rcx
	WORD $0xff48; BYTE $0xc0 // inc rax
	WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
	JNE LBB1_9

// Epilogue: restore rsp/rbp, clear upper ymm state, return.
LBB1_10:
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
108 |
// _or computes a[i] |= b[i] for i in [0, n) over 64-bit words.
// Arguments: a+0(FP) destination pointer, b+8(FP) source pointer, n+16(FP) word count.
// With n >= 4 and non-overlapping n-word ranges, four words are processed per
// iteration using 256-bit loads/stores; all other words use the scalar loop.
// NOTE(review): the frame spec declares 32 argument bytes while only three 8-byte
// arguments (offsets 0, 8, 16) are read; confirm this is the generator's intent.
TEXT ·_or(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ n+16(FP), DX
	// Compiler-emitted prologue: save rbp and align rsp.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	// n == 0: nothing to do.
	WORD $0x8548; BYTE $0xd2 // test rdx, rdx
	JE LBB2_10
	// Fewer than 4 words: scalar loop starting at index 0.
	LONG $0x04fa8348 // cmp rdx, 4
	JAE LBB2_3
	WORD $0xc031 // xor eax, eax
	JMP LBB2_9

// Overlap check: take the vector path only if b+8n <= a or a+8n <= b.
LBB2_3:
	LONG $0xd6048d48 // lea rax, [rsi + 8*rdx]
	WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
	JBE LBB2_6
	LONG $0xd7048d48 // lea rax, [rdi + 8*rdx]
	WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
	JBE LBB2_6
	WORD $0xc031 // xor eax, eax
	JMP LBB2_9

// rax = n rounded down to a multiple of 4 (vectorised word count); rcx = word index.
LBB2_6:
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0xfce08348 // and rax, -4
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words (32 bytes) per iteration.
LBB2_7:
	LONG $0x0410fcc5; BYTE $0xcf // vmovups ymm0, ymmword ptr [rdi + 8*rcx]
	LONG $0x0456fcc5; BYTE $0xce // vorps ymm0, ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xcf // vmovups ymmword ptr [rdi + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
	JNE LBB2_7
	// Done if n was a multiple of 4; otherwise fall through to the scalar tail.
	WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
	JE LBB2_10

// Scalar loop: one word per iteration, from index rax up to n.
LBB2_9:
	LONG $0xc60c8b48 // mov rcx, qword ptr [rsi + 8*rax]
	LONG $0xc70c0948 // or qword ptr [rdi + 8*rax], rcx
	WORD $0xff48; BYTE $0xc0 // inc rax
	WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
	JNE LBB2_9

// Epilogue: restore rsp/rbp, clear upper ymm state, return.
LBB2_10:
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
160 |
// _xor computes a[i] ^= b[i] for i in [0, n) over 64-bit words.
// Arguments: a+0(FP) destination pointer, b+8(FP) source pointer, n+16(FP) word count.
// With n >= 4 and non-overlapping n-word ranges, four words are processed per
// iteration using 256-bit loads/stores; all other words use the scalar loop.
// NOTE(review): the frame spec declares 32 argument bytes while only three 8-byte
// arguments (offsets 0, 8, 16) are read; confirm this is the generator's intent.
TEXT ·_xor(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ n+16(FP), DX
	// Compiler-emitted prologue: save rbp and align rsp.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	// n == 0: nothing to do.
	WORD $0x8548; BYTE $0xd2 // test rdx, rdx
	JE LBB3_10
	// Fewer than 4 words: scalar loop starting at index 0.
	LONG $0x04fa8348 // cmp rdx, 4
	JAE LBB3_3
	WORD $0xc031 // xor eax, eax
	JMP LBB3_9

// Overlap check: take the vector path only if b+8n <= a or a+8n <= b.
LBB3_3:
	LONG $0xd6048d48 // lea rax, [rsi + 8*rdx]
	WORD $0x3948; BYTE $0xf8 // cmp rax, rdi
	JBE LBB3_6
	LONG $0xd7048d48 // lea rax, [rdi + 8*rdx]
	WORD $0x3948; BYTE $0xf0 // cmp rax, rsi
	JBE LBB3_6
	WORD $0xc031 // xor eax, eax
	JMP LBB3_9

// rax = n rounded down to a multiple of 4 (vectorised word count); rcx = word index.
LBB3_6:
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0xfce08348 // and rax, -4
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words (32 bytes) per iteration.
LBB3_7:
	LONG $0x0410fcc5; BYTE $0xcf // vmovups ymm0, ymmword ptr [rdi + 8*rcx]
	LONG $0x0457fcc5; BYTE $0xce // vxorps ymm0, ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xcf // vmovups ymmword ptr [rdi + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3948; BYTE $0xc8 // cmp rax, rcx
	JNE LBB3_7
	// Done if n was a multiple of 4; otherwise fall through to the scalar tail.
	WORD $0x3948; BYTE $0xd0 // cmp rax, rdx
	JE LBB3_10

// Scalar loop: one word per iteration, from index rax up to n.
LBB3_9:
	LONG $0xc60c8b48 // mov rcx, qword ptr [rsi + 8*rax]
	LONG $0xc70c3148 // xor qword ptr [rdi + 8*rax], rcx
	WORD $0xff48; BYTE $0xc0 // inc rax
	WORD $0x3948; BYTE $0xc2 // cmp rdx, rax
	JNE LBB3_9

// Epilogue: restore rsp/rbp, clear upper ymm state, return.
LBB3_10:
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
212 |
// _and_many computes a[i] &= b[j][i] for every source bitmap j and word i.
// Arguments: a+0(FP) destination pointer, b+8(FP) pointer to an array of source
// pointers, dims+16(FP) packed as low 32 bits = word count, high 32 bits = number
// of source bitmaps.
// The destination is walked in chunks of 512 words (4096 bytes); within a chunk
// every source is applied in turn, using 256-bit operations when the chunk has
// at least 4 words and does not overlap the source, and a scalar loop otherwise.
// Stack slots used below: [rsp+8] chunk index, [rsp+16] -512*chunk index,
// [rsp+24] chunk byte offset, [rsp+32] total words, [rsp+40] source count,
// [rsp+48] next chunk start, [rsp+56] 512*(chunk index+1), [rsp+64] scalar tail
// start index, [rsp+72] vectorised words in chunk, [rsp+80] byte offset of chunk
// end, [rsp+88] address of chunk end in a, [rsp+96] byte offset of chunk start.
// NOTE(review): six registers are pushed and 104 bytes reserved below SP although
// the declared frame size is $0, and the frame spec declares 32 argument bytes
// while three 8-byte arguments are read; confirm both are acceptable.
TEXT ·_and_many(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ dims+16(FP), DX
	// Prologue: save callee-saved registers and reserve locals.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	WORD $0x5741 // push r15
	WORD $0x5641 // push r14
	WORD $0x5541 // push r13
	WORD $0x5441 // push r12
	BYTE $0x53 // push rbx
	LONG $0xf8e48348 // and rsp, -8
	LONG $0x68ec8348 // sub rsp, 104
	// rbx = low 32 bits of dims (word count); return if zero.
	LONG $0xffffffbb; BYTE $0xff // mov ebx, 4294967295
	WORD $0x2148; BYTE $0xd3 // and rbx, rdx
	JE LBB4_14
	// rax = high 32 bits of dims (source count); r12 = that count, or 1 when it is below 2.
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0x20e8c148 // shr rax, 32
	LONG $0x02f88348 // cmp rax, 2
	LONG $0x0001bc41; WORD $0x0000 // mov r12d, 1
	LONG $0x24448948; BYTE $0x28 // mov qword ptr [rsp + 40], rax
	LONG $0xe0430f4c // cmovae r12, rax
	// Initialise chunk state: rcx = 512, rax = a, r9 = 0, counters zeroed.
	LONG $0x000200b9; BYTE $0x00 // mov ecx, 512
	WORD $0xc031 // xor eax, eax
	LONG $0x24448948; BYTE $0x18 // mov qword ptr [rsp + 24], rax
	WORD $0x8948; BYTE $0xf8 // mov rax, rdi
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x10 // mov qword ptr [rsp + 16], rdx
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x08 // mov qword ptr [rsp + 8], rdx
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	LONG $0x245c8948; BYTE $0x20 // mov qword ptr [rsp + 32], rbx
	JMP LBB4_3

// Advance to the next 512-word chunk; finish once its start reaches the word count.
LBB4_2:
	LONG $0x2444ff48; BYTE $0x08 // inc qword ptr [rsp + 8]
	LONG $0x244c8b48; BYTE $0x38 // mov rcx, qword ptr [rsp + 56]
	LONG $0x00c18148; WORD $0x0002; BYTE $0x00 // add rcx, 512
	QUAD $0xfffe001024448148; BYTE $0xff // add qword ptr [rsp + 16], -512
	QUAD $0x0010001824448148; BYTE $0x00 // add qword ptr [rsp + 24], 4096
	LONG $0x10000548; WORD $0x0000 // add rax, 4096
	LONG $0x245c8b48; BYTE $0x20 // mov rbx, qword ptr [rsp + 32]
	LONG $0x244c8b4c; BYTE $0x30 // mov r9, qword ptr [rsp + 48]
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB4_14

// Set up the current chunk: r9 = start index, rbx = end index (clamped to the
// word count), r8 = words in chunk, r10 = r8 rounded down to a multiple of 4.
LBB4_3:
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	WORD $0x8949; BYTE $0xda // mov r10, rbx
	LONG $0x244c8948; BYTE $0x38 // mov qword ptr [rsp + 56], rcx
	LONG $0xd1420f4c // cmovb r10, rcx
	LONG $0x00898d49; WORD $0x0002; BYTE $0x00 // lea rcx, [r9 + 512]
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	LONG $0x244c8948; BYTE $0x30 // mov qword ptr [rsp + 48], rcx
	LONG $0xd9420f48 // cmovb rbx, rcx
	// No source bitmaps: skip straight to the next chunk.
	LONG $0x247c8348; WORD $0x0028 // cmp qword ptr [rsp + 40], 0
	JE LBB4_2
	LONG $0x2454034c; BYTE $0x10 // add r10, qword ptr [rsp + 16]
	LONG $0xfce28349 // and r10, -4
	LONG $0x247c8b4c; BYTE $0x08 // mov r15, qword ptr [rsp + 8]
	WORD $0x894c; BYTE $0xf9 // mov rcx, r15
	LONG $0x09e1c148 // shl rcx, 9
	WORD $0x8949; BYTE $0xd8 // mov r8, rbx
	WORD $0x2949; BYTE $0xc8 // sub r8, rcx
	LONG $0x0ce7c149 // shl r15, 12
	LONG $0x3f1c8d4e // lea r11, [rdi + r15]
	LONG $0x247c894c; BYTE $0x60 // mov qword ptr [rsp + 96], r15
	LONG $0xc73c8d4f // lea r15, [r15 + 8*r8]
	LONG $0x3f0c8d49 // lea rcx, [r15 + rdi]
	LONG $0x244c8948; BYTE $0x58 // mov qword ptr [rsp + 88], rcx
	WORD $0x894c; BYTE $0xc1 // mov rcx, r8
	LONG $0xfce18348 // and rcx, -4
	LONG $0x244c8948; BYTE $0x48 // mov qword ptr [rsp + 72], rcx
	WORD $0x014c; BYTE $0xc9 // add rcx, r9
	LONG $0x244c8948; BYTE $0x40 // mov qword ptr [rsp + 64], rcx
	// r14 = index of the source bitmap being applied.
	WORD $0x3145; BYTE $0xf6 // xor r14d, r14d
	LONG $0x247c894c; BYTE $0x50 // mov qword ptr [rsp + 80], r15
	JMP LBB4_5

// Next source bitmap; move to the next chunk after the last one.
LBB4_12:
	WORD $0xff49; BYTE $0xc6 // inc r14
	WORD $0x394d; BYTE $0xe6 // cmp r14, r12
	JE LBB4_2

// Apply source r14 to the current chunk: rdx = b[r14].
LBB4_5:
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB4_12
	LONG $0xf6148b4a // mov rdx, qword ptr [rsi + 8*r14]
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	// Fewer than 4 words in the chunk: scalar only.
	LONG $0x04f88349 // cmp r8, 4
	JB LBB4_13
	// Overlap check between the chunk in a and the same chunk in the source.
	LONG $0x3a0c8d4a // lea rcx, [rdx + r15]
	WORD $0x3949; BYTE $0xcb // cmp r11, rcx
	JAE LBB4_9
	LONG $0x244c8b48; BYTE $0x60 // mov rcx, qword ptr [rsp + 96]
	WORD $0x0148; BYTE $0xd1 // add rcx, rdx
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	LONG $0x244c3b48; BYTE $0x58 // cmp rcx, qword ptr [rsp + 88]
	JB LBB4_13

// Vector path: rsi is temporarily repurposed as the source chunk pointer
// (original rsi kept in r11, original r11 kept in r15).
LBB4_9:
	WORD $0x894d; BYTE $0xdf // mov r15, r11
	WORD $0x8949; BYTE $0xf3 // mov r11, rsi
	LONG $0x244c8b48; BYTE $0x18 // mov rcx, qword ptr [rsp + 24]
	LONG $0x0a348d48 // lea rsi, [rdx + rcx]
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words per iteration, r10 words in total.
LBB4_10:
	LONG $0x0410fcc5; BYTE $0xc8 // vmovups ymm0, ymmword ptr [rax + 8*rcx]
	LONG $0x0454fcc5; BYTE $0xce // vandps ymm0, ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xc8 // vmovups ymmword ptr [rax + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3949; BYTE $0xca // cmp r10, rcx
	JNE LBB4_10
	// Restore rsi/r11/r15; r13 = first index not covered by the vector loop.
	LONG $0x246c8b4c; BYTE $0x40 // mov r13, qword ptr [rsp + 64]
	LONG $0x24443b4c; BYTE $0x48 // cmp r8, qword ptr [rsp + 72]
	WORD $0x894c; BYTE $0xde // mov rsi, r11
	WORD $0x894d; BYTE $0xfb // mov r11, r15
	LONG $0x247c8b4c; BYTE $0x50 // mov r15, qword ptr [rsp + 80]
	JE LBB4_12

// Scalar loop: absolute word index r13 runs up to the chunk end in rbx.
LBB4_13:
	LONG $0xea0c8b4a // mov rcx, qword ptr [rdx + 8*r13]
	LONG $0xef0c214a // and qword ptr [rdi + 8*r13], rcx
	WORD $0xff49; BYTE $0xc5 // inc r13
	WORD $0x3949; BYTE $0xdd // cmp r13, rbx
	JB LBB4_13
	JMP LBB4_12

// Epilogue: restore callee-saved registers, clear upper ymm state, return.
LBB4_14:
	LONG $0xd8658d48 // lea rsp, [rbp - 40]
	BYTE $0x5b // pop rbx
	WORD $0x5c41 // pop r12
	WORD $0x5d41 // pop r13
	WORD $0x5e41 // pop r14
	WORD $0x5f41 // pop r15
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
352 |
// _andn_many computes a[i] &= ^b[j][i] for every source bitmap j and word i.
// Arguments: a+0(FP) destination pointer, b+8(FP) pointer to an array of source
// pointers, dims+16(FP) packed as low 32 bits = word count, high 32 bits = number
// of source bitmaps.
// The destination is walked in chunks of 512 words (4096 bytes); within a chunk
// every source is applied in turn, using 256-bit operations when the chunk has
// at least 4 words and does not overlap the source, and a scalar loop otherwise.
// Stack slots used below: [rsp+8] chunk index, [rsp+16] -512*chunk index,
// [rsp+24] chunk byte offset, [rsp+32] total words, [rsp+40] source count,
// [rsp+48] next chunk start, [rsp+56] 512*(chunk index+1), [rsp+64] scalar tail
// start index, [rsp+72] vectorised words in chunk, [rsp+80] byte offset of chunk
// end, [rsp+88] address of chunk end in a, [rsp+96] byte offset of chunk start.
// NOTE(review): six registers are pushed and 104 bytes reserved below SP although
// the declared frame size is $0, and the frame spec declares 32 argument bytes
// while three 8-byte arguments are read; confirm both are acceptable.
TEXT ·_andn_many(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ dims+16(FP), DX
	// Prologue: save callee-saved registers and reserve locals.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	WORD $0x5741 // push r15
	WORD $0x5641 // push r14
	WORD $0x5541 // push r13
	WORD $0x5441 // push r12
	BYTE $0x53 // push rbx
	LONG $0xf8e48348 // and rsp, -8
	LONG $0x68ec8348 // sub rsp, 104
	// rbx = low 32 bits of dims (word count); return if zero.
	LONG $0xffffffbb; BYTE $0xff // mov ebx, 4294967295
	WORD $0x2148; BYTE $0xd3 // and rbx, rdx
	JE LBB5_14
	// rax = high 32 bits of dims (source count); r12 = that count, or 1 when it is below 2.
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0x20e8c148 // shr rax, 32
	LONG $0x02f88348 // cmp rax, 2
	LONG $0x0001bc41; WORD $0x0000 // mov r12d, 1
	LONG $0x24448948; BYTE $0x28 // mov qword ptr [rsp + 40], rax
	LONG $0xe0430f4c // cmovae r12, rax
	// Initialise chunk state: rcx = 512, rax = a, r9 = 0, counters zeroed.
	LONG $0x000200b9; BYTE $0x00 // mov ecx, 512
	WORD $0xc031 // xor eax, eax
	LONG $0x24448948; BYTE $0x18 // mov qword ptr [rsp + 24], rax
	WORD $0x8948; BYTE $0xf8 // mov rax, rdi
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x10 // mov qword ptr [rsp + 16], rdx
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x08 // mov qword ptr [rsp + 8], rdx
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	LONG $0x245c8948; BYTE $0x20 // mov qword ptr [rsp + 32], rbx
	JMP LBB5_3

// Advance to the next 512-word chunk; finish once its start reaches the word count.
LBB5_2:
	LONG $0x2444ff48; BYTE $0x08 // inc qword ptr [rsp + 8]
	LONG $0x244c8b48; BYTE $0x38 // mov rcx, qword ptr [rsp + 56]
	LONG $0x00c18148; WORD $0x0002; BYTE $0x00 // add rcx, 512
	QUAD $0xfffe001024448148; BYTE $0xff // add qword ptr [rsp + 16], -512
	QUAD $0x0010001824448148; BYTE $0x00 // add qword ptr [rsp + 24], 4096
	LONG $0x10000548; WORD $0x0000 // add rax, 4096
	LONG $0x245c8b48; BYTE $0x20 // mov rbx, qword ptr [rsp + 32]
	LONG $0x244c8b4c; BYTE $0x30 // mov r9, qword ptr [rsp + 48]
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB5_14

// Set up the current chunk: r9 = start index, rbx = end index (clamped to the
// word count), r8 = words in chunk, r11 = r8 rounded down to a multiple of 4.
LBB5_3:
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	WORD $0x8949; BYTE $0xdb // mov r11, rbx
	LONG $0x244c8948; BYTE $0x38 // mov qword ptr [rsp + 56], rcx
	LONG $0xd9420f4c // cmovb r11, rcx
	LONG $0x00898d49; WORD $0x0002; BYTE $0x00 // lea rcx, [r9 + 512]
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	LONG $0x244c8948; BYTE $0x30 // mov qword ptr [rsp + 48], rcx
	LONG $0xd9420f48 // cmovb rbx, rcx
	// No source bitmaps: skip straight to the next chunk.
	LONG $0x247c8348; WORD $0x0028 // cmp qword ptr [rsp + 40], 0
	JE LBB5_2
	LONG $0x245c034c; BYTE $0x10 // add r11, qword ptr [rsp + 16]
	LONG $0xfce38349 // and r11, -4
	LONG $0x247c8b4c; BYTE $0x08 // mov r15, qword ptr [rsp + 8]
	WORD $0x894c; BYTE $0xf9 // mov rcx, r15
	LONG $0x09e1c148 // shl rcx, 9
	WORD $0x8949; BYTE $0xd8 // mov r8, rbx
	WORD $0x2949; BYTE $0xc8 // sub r8, rcx
	LONG $0x0ce7c149 // shl r15, 12
	LONG $0x3f148d4e // lea r10, [rdi + r15]
	LONG $0x247c894c; BYTE $0x60 // mov qword ptr [rsp + 96], r15
	LONG $0xc73c8d4f // lea r15, [r15 + 8*r8]
	LONG $0x3f0c8d49 // lea rcx, [r15 + rdi]
	LONG $0x244c8948; BYTE $0x58 // mov qword ptr [rsp + 88], rcx
	WORD $0x894c; BYTE $0xc1 // mov rcx, r8
	LONG $0xfce18348 // and rcx, -4
	LONG $0x244c8948; BYTE $0x48 // mov qword ptr [rsp + 72], rcx
	WORD $0x014c; BYTE $0xc9 // add rcx, r9
	LONG $0x244c8948; BYTE $0x40 // mov qword ptr [rsp + 64], rcx
	// r14 = index of the source bitmap being applied.
	WORD $0x3145; BYTE $0xf6 // xor r14d, r14d
	LONG $0x247c894c; BYTE $0x50 // mov qword ptr [rsp + 80], r15
	JMP LBB5_5

// Next source bitmap; move to the next chunk after the last one.
LBB5_12:
	WORD $0xff49; BYTE $0xc6 // inc r14
	WORD $0x394d; BYTE $0xe6 // cmp r14, r12
	JE LBB5_2

// Apply source r14 to the current chunk: rdx = b[r14].
LBB5_5:
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB5_12
	LONG $0xf6148b4a // mov rdx, qword ptr [rsi + 8*r14]
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	// Fewer than 4 words in the chunk: scalar only.
	LONG $0x04f88349 // cmp r8, 4
	JB LBB5_13
	// Overlap check between the chunk in a and the same chunk in the source.
	LONG $0x3a0c8d4a // lea rcx, [rdx + r15]
	WORD $0x3949; BYTE $0xca // cmp r10, rcx
	JAE LBB5_9
	LONG $0x244c8b48; BYTE $0x60 // mov rcx, qword ptr [rsp + 96]
	WORD $0x0148; BYTE $0xd1 // add rcx, rdx
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	LONG $0x244c3b48; BYTE $0x58 // cmp rcx, qword ptr [rsp + 88]
	JB LBB5_13

// Vector path: rsi is temporarily repurposed as the source chunk pointer
// (original rsi kept in r10, original r10 kept in r15).
LBB5_9:
	WORD $0x894d; BYTE $0xd7 // mov r15, r10
	WORD $0x8949; BYTE $0xf2 // mov r10, rsi
	LONG $0x244c8b48; BYTE $0x18 // mov rcx, qword ptr [rsp + 24]
	LONG $0x0a348d48 // lea rsi, [rdx + rcx]
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words per iteration, r11 words in total. The source is loaded
// first because vandnps inverts its first source operand.
LBB5_10:
	LONG $0x0410fcc5; BYTE $0xce // vmovups ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0455fcc5; BYTE $0xc8 // vandnps ymm0, ymm0, ymmword ptr [rax + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xc8 // vmovups ymmword ptr [rax + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3949; BYTE $0xcb // cmp r11, rcx
	JNE LBB5_10
	// Restore rsi/r10/r15; r13 = first index not covered by the vector loop.
	LONG $0x246c8b4c; BYTE $0x40 // mov r13, qword ptr [rsp + 64]
	LONG $0x24443b4c; BYTE $0x48 // cmp r8, qword ptr [rsp + 72]
	WORD $0x894c; BYTE $0xd6 // mov rsi, r10
	WORD $0x894d; BYTE $0xfa // mov r10, r15
	LONG $0x247c8b4c; BYTE $0x50 // mov r15, qword ptr [rsp + 80]
	JE LBB5_12

// Scalar loop: absolute word index r13 runs up to the chunk end in rbx.
LBB5_13:
	LONG $0xea0c8b4a // mov rcx, qword ptr [rdx + 8*r13]
	WORD $0xf748; BYTE $0xd1 // not rcx
	LONG $0xef0c214a // and qword ptr [rdi + 8*r13], rcx
	WORD $0xff49; BYTE $0xc5 // inc r13
	WORD $0x3949; BYTE $0xdd // cmp r13, rbx
	JB LBB5_13
	JMP LBB5_12

// Epilogue: restore callee-saved registers, clear upper ymm state, return.
LBB5_14:
	LONG $0xd8658d48 // lea rsp, [rbp - 40]
	BYTE $0x5b // pop rbx
	WORD $0x5c41 // pop r12
	WORD $0x5d41 // pop r13
	WORD $0x5e41 // pop r14
	WORD $0x5f41 // pop r15
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
493 |
// _or_many computes a[i] |= b[j][i] for every source bitmap j and word i.
// Arguments: a+0(FP) destination pointer, b+8(FP) pointer to an array of source
// pointers, dims+16(FP) packed as low 32 bits = word count, high 32 bits = number
// of source bitmaps.
// The destination is walked in chunks of 512 words (4096 bytes); within a chunk
// every source is applied in turn, using 256-bit operations when the chunk has
// at least 4 words and does not overlap the source, and a scalar loop otherwise.
// Stack slots used below: [rsp+8] chunk index, [rsp+16] -512*chunk index,
// [rsp+24] chunk byte offset, [rsp+32] total words, [rsp+40] source count,
// [rsp+48] next chunk start, [rsp+56] 512*(chunk index+1), [rsp+64] scalar tail
// start index, [rsp+72] vectorised words in chunk, [rsp+80] byte offset of chunk
// end, [rsp+88] address of chunk end in a, [rsp+96] byte offset of chunk start.
// NOTE(review): six registers are pushed and 104 bytes reserved below SP although
// the declared frame size is $0, and the frame spec declares 32 argument bytes
// while three 8-byte arguments are read; confirm both are acceptable.
TEXT ·_or_many(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ dims+16(FP), DX
	// Prologue: save callee-saved registers and reserve locals.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	WORD $0x5741 // push r15
	WORD $0x5641 // push r14
	WORD $0x5541 // push r13
	WORD $0x5441 // push r12
	BYTE $0x53 // push rbx
	LONG $0xf8e48348 // and rsp, -8
	LONG $0x68ec8348 // sub rsp, 104
	// rbx = low 32 bits of dims (word count); return if zero.
	LONG $0xffffffbb; BYTE $0xff // mov ebx, 4294967295
	WORD $0x2148; BYTE $0xd3 // and rbx, rdx
	JE LBB6_14
	// rax = high 32 bits of dims (source count); r12 = that count, or 1 when it is below 2.
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0x20e8c148 // shr rax, 32
	LONG $0x02f88348 // cmp rax, 2
	LONG $0x0001bc41; WORD $0x0000 // mov r12d, 1
	LONG $0x24448948; BYTE $0x28 // mov qword ptr [rsp + 40], rax
	LONG $0xe0430f4c // cmovae r12, rax
	// Initialise chunk state: rcx = 512, rax = a, r9 = 0, counters zeroed.
	LONG $0x000200b9; BYTE $0x00 // mov ecx, 512
	WORD $0xc031 // xor eax, eax
	LONG $0x24448948; BYTE $0x18 // mov qword ptr [rsp + 24], rax
	WORD $0x8948; BYTE $0xf8 // mov rax, rdi
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x10 // mov qword ptr [rsp + 16], rdx
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x08 // mov qword ptr [rsp + 8], rdx
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	LONG $0x245c8948; BYTE $0x20 // mov qword ptr [rsp + 32], rbx
	JMP LBB6_3

// Advance to the next 512-word chunk; finish once its start reaches the word count.
LBB6_2:
	LONG $0x2444ff48; BYTE $0x08 // inc qword ptr [rsp + 8]
	LONG $0x244c8b48; BYTE $0x38 // mov rcx, qword ptr [rsp + 56]
	LONG $0x00c18148; WORD $0x0002; BYTE $0x00 // add rcx, 512
	QUAD $0xfffe001024448148; BYTE $0xff // add qword ptr [rsp + 16], -512
	QUAD $0x0010001824448148; BYTE $0x00 // add qword ptr [rsp + 24], 4096
	LONG $0x10000548; WORD $0x0000 // add rax, 4096
	LONG $0x245c8b48; BYTE $0x20 // mov rbx, qword ptr [rsp + 32]
	LONG $0x244c8b4c; BYTE $0x30 // mov r9, qword ptr [rsp + 48]
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB6_14

// Set up the current chunk: r9 = start index, rbx = end index (clamped to the
// word count), r8 = words in chunk, r10 = r8 rounded down to a multiple of 4.
LBB6_3:
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	WORD $0x8949; BYTE $0xda // mov r10, rbx
	LONG $0x244c8948; BYTE $0x38 // mov qword ptr [rsp + 56], rcx
	LONG $0xd1420f4c // cmovb r10, rcx
	LONG $0x00898d49; WORD $0x0002; BYTE $0x00 // lea rcx, [r9 + 512]
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	LONG $0x244c8948; BYTE $0x30 // mov qword ptr [rsp + 48], rcx
	LONG $0xd9420f48 // cmovb rbx, rcx
	// No source bitmaps: skip straight to the next chunk.
	LONG $0x247c8348; WORD $0x0028 // cmp qword ptr [rsp + 40], 0
	JE LBB6_2
	LONG $0x2454034c; BYTE $0x10 // add r10, qword ptr [rsp + 16]
	LONG $0xfce28349 // and r10, -4
	LONG $0x247c8b4c; BYTE $0x08 // mov r15, qword ptr [rsp + 8]
	WORD $0x894c; BYTE $0xf9 // mov rcx, r15
	LONG $0x09e1c148 // shl rcx, 9
	WORD $0x8949; BYTE $0xd8 // mov r8, rbx
	WORD $0x2949; BYTE $0xc8 // sub r8, rcx
	LONG $0x0ce7c149 // shl r15, 12
	LONG $0x3f1c8d4e // lea r11, [rdi + r15]
	LONG $0x247c894c; BYTE $0x60 // mov qword ptr [rsp + 96], r15
	LONG $0xc73c8d4f // lea r15, [r15 + 8*r8]
	LONG $0x3f0c8d49 // lea rcx, [r15 + rdi]
	LONG $0x244c8948; BYTE $0x58 // mov qword ptr [rsp + 88], rcx
	WORD $0x894c; BYTE $0xc1 // mov rcx, r8
	LONG $0xfce18348 // and rcx, -4
	LONG $0x244c8948; BYTE $0x48 // mov qword ptr [rsp + 72], rcx
	WORD $0x014c; BYTE $0xc9 // add rcx, r9
	LONG $0x244c8948; BYTE $0x40 // mov qword ptr [rsp + 64], rcx
	// r14 = index of the source bitmap being applied.
	WORD $0x3145; BYTE $0xf6 // xor r14d, r14d
	LONG $0x247c894c; BYTE $0x50 // mov qword ptr [rsp + 80], r15
	JMP LBB6_5

// Next source bitmap; move to the next chunk after the last one.
LBB6_12:
	WORD $0xff49; BYTE $0xc6 // inc r14
	WORD $0x394d; BYTE $0xe6 // cmp r14, r12
	JE LBB6_2

// Apply source r14 to the current chunk: rdx = b[r14].
LBB6_5:
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB6_12
	LONG $0xf6148b4a // mov rdx, qword ptr [rsi + 8*r14]
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	// Fewer than 4 words in the chunk: scalar only.
	LONG $0x04f88349 // cmp r8, 4
	JB LBB6_13
	// Overlap check between the chunk in a and the same chunk in the source.
	LONG $0x3a0c8d4a // lea rcx, [rdx + r15]
	WORD $0x3949; BYTE $0xcb // cmp r11, rcx
	JAE LBB6_9
	LONG $0x244c8b48; BYTE $0x60 // mov rcx, qword ptr [rsp + 96]
	WORD $0x0148; BYTE $0xd1 // add rcx, rdx
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	LONG $0x244c3b48; BYTE $0x58 // cmp rcx, qword ptr [rsp + 88]
	JB LBB6_13

// Vector path: rsi is temporarily repurposed as the source chunk pointer
// (original rsi kept in r11, original r11 kept in r15).
LBB6_9:
	WORD $0x894d; BYTE $0xdf // mov r15, r11
	WORD $0x8949; BYTE $0xf3 // mov r11, rsi
	LONG $0x244c8b48; BYTE $0x18 // mov rcx, qword ptr [rsp + 24]
	LONG $0x0a348d48 // lea rsi, [rdx + rcx]
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words per iteration, r10 words in total.
LBB6_10:
	LONG $0x0410fcc5; BYTE $0xc8 // vmovups ymm0, ymmword ptr [rax + 8*rcx]
	LONG $0x0456fcc5; BYTE $0xce // vorps ymm0, ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xc8 // vmovups ymmword ptr [rax + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3949; BYTE $0xca // cmp r10, rcx
	JNE LBB6_10
	// Restore rsi/r11/r15; r13 = first index not covered by the vector loop.
	LONG $0x246c8b4c; BYTE $0x40 // mov r13, qword ptr [rsp + 64]
	LONG $0x24443b4c; BYTE $0x48 // cmp r8, qword ptr [rsp + 72]
	WORD $0x894c; BYTE $0xde // mov rsi, r11
	WORD $0x894d; BYTE $0xfb // mov r11, r15
	LONG $0x247c8b4c; BYTE $0x50 // mov r15, qword ptr [rsp + 80]
	JE LBB6_12

// Scalar loop: absolute word index r13 runs up to the chunk end in rbx.
LBB6_13:
	LONG $0xea0c8b4a // mov rcx, qword ptr [rdx + 8*r13]
	LONG $0xef0c094a // or qword ptr [rdi + 8*r13], rcx
	WORD $0xff49; BYTE $0xc5 // inc r13
	WORD $0x3949; BYTE $0xdd // cmp r13, rbx
	JB LBB6_13
	JMP LBB6_12

// Epilogue: restore callee-saved registers, clear upper ymm state, return.
LBB6_14:
	LONG $0xd8658d48 // lea rsp, [rbp - 40]
	BYTE $0x5b // pop rbx
	WORD $0x5c41 // pop r12
	WORD $0x5d41 // pop r13
	WORD $0x5e41 // pop r14
	WORD $0x5f41 // pop r15
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
633 |
// _xor_many computes a[i] ^= b[j][i] for every source bitmap j and word i.
// Arguments: a+0(FP) destination pointer, b+8(FP) pointer to an array of source
// pointers, dims+16(FP) packed as low 32 bits = word count, high 32 bits = number
// of source bitmaps.
// The destination is walked in chunks of 512 words (4096 bytes); within a chunk
// every source is applied in turn, using 256-bit operations when the chunk has
// at least 4 words and does not overlap the source, and a scalar loop otherwise.
// Stack slots used below: [rsp+8] chunk index, [rsp+16] -512*chunk index,
// [rsp+24] chunk byte offset, [rsp+32] total words, [rsp+40] source count,
// [rsp+48] next chunk start, [rsp+56] 512*(chunk index+1), [rsp+64] scalar tail
// start index, [rsp+72] vectorised words in chunk, [rsp+80] byte offset of chunk
// end, [rsp+88] address of chunk end in a, [rsp+96] byte offset of chunk start.
// NOTE(review): six registers are pushed and 104 bytes reserved below SP although
// the declared frame size is $0, and the frame spec declares 32 argument bytes
// while three 8-byte arguments are read; confirm both are acceptable.
TEXT ·_xor_many(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ dims+16(FP), DX
	// Prologue: save callee-saved registers and reserve locals.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	WORD $0x5741 // push r15
	WORD $0x5641 // push r14
	WORD $0x5541 // push r13
	WORD $0x5441 // push r12
	BYTE $0x53 // push rbx
	LONG $0xf8e48348 // and rsp, -8
	LONG $0x68ec8348 // sub rsp, 104
	// rbx = low 32 bits of dims (word count); return if zero.
	LONG $0xffffffbb; BYTE $0xff // mov ebx, 4294967295
	WORD $0x2148; BYTE $0xd3 // and rbx, rdx
	JE LBB7_14
	// rax = high 32 bits of dims (source count); r12 = that count, or 1 when it is below 2.
	WORD $0x8948; BYTE $0xd0 // mov rax, rdx
	LONG $0x20e8c148 // shr rax, 32
	LONG $0x02f88348 // cmp rax, 2
	LONG $0x0001bc41; WORD $0x0000 // mov r12d, 1
	LONG $0x24448948; BYTE $0x28 // mov qword ptr [rsp + 40], rax
	LONG $0xe0430f4c // cmovae r12, rax
	// Initialise chunk state: rcx = 512, rax = a, r9 = 0, counters zeroed.
	LONG $0x000200b9; BYTE $0x00 // mov ecx, 512
	WORD $0xc031 // xor eax, eax
	LONG $0x24448948; BYTE $0x18 // mov qword ptr [rsp + 24], rax
	WORD $0x8948; BYTE $0xf8 // mov rax, rdi
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x10 // mov qword ptr [rsp + 16], rdx
	WORD $0xd231 // xor edx, edx
	LONG $0x24548948; BYTE $0x08 // mov qword ptr [rsp + 8], rdx
	WORD $0x3145; BYTE $0xc9 // xor r9d, r9d
	LONG $0x245c8948; BYTE $0x20 // mov qword ptr [rsp + 32], rbx
	JMP LBB7_3

// Advance to the next 512-word chunk; finish once its start reaches the word count.
LBB7_2:
	LONG $0x2444ff48; BYTE $0x08 // inc qword ptr [rsp + 8]
	LONG $0x244c8b48; BYTE $0x38 // mov rcx, qword ptr [rsp + 56]
	LONG $0x00c18148; WORD $0x0002; BYTE $0x00 // add rcx, 512
	QUAD $0xfffe001024448148; BYTE $0xff // add qword ptr [rsp + 16], -512
	QUAD $0x0010001824448148; BYTE $0x00 // add qword ptr [rsp + 24], 4096
	LONG $0x10000548; WORD $0x0000 // add rax, 4096
	LONG $0x245c8b48; BYTE $0x20 // mov rbx, qword ptr [rsp + 32]
	LONG $0x244c8b4c; BYTE $0x30 // mov r9, qword ptr [rsp + 48]
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB7_14

// Set up the current chunk: r9 = start index, rbx = end index (clamped to the
// word count), r8 = words in chunk, r10 = r8 rounded down to a multiple of 4.
LBB7_3:
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	WORD $0x8949; BYTE $0xda // mov r10, rbx
	LONG $0x244c8948; BYTE $0x38 // mov qword ptr [rsp + 56], rcx
	LONG $0xd1420f4c // cmovb r10, rcx
	LONG $0x00898d49; WORD $0x0002; BYTE $0x00 // lea rcx, [r9 + 512]
	WORD $0x3948; BYTE $0xd9 // cmp rcx, rbx
	LONG $0x244c8948; BYTE $0x30 // mov qword ptr [rsp + 48], rcx
	LONG $0xd9420f48 // cmovb rbx, rcx
	// No source bitmaps: skip straight to the next chunk.
	LONG $0x247c8348; WORD $0x0028 // cmp qword ptr [rsp + 40], 0
	JE LBB7_2
	LONG $0x2454034c; BYTE $0x10 // add r10, qword ptr [rsp + 16]
	LONG $0xfce28349 // and r10, -4
	LONG $0x247c8b4c; BYTE $0x08 // mov r15, qword ptr [rsp + 8]
	WORD $0x894c; BYTE $0xf9 // mov rcx, r15
	LONG $0x09e1c148 // shl rcx, 9
	WORD $0x8949; BYTE $0xd8 // mov r8, rbx
	WORD $0x2949; BYTE $0xc8 // sub r8, rcx
	LONG $0x0ce7c149 // shl r15, 12
	LONG $0x3f1c8d4e // lea r11, [rdi + r15]
	LONG $0x247c894c; BYTE $0x60 // mov qword ptr [rsp + 96], r15
	LONG $0xc73c8d4f // lea r15, [r15 + 8*r8]
	LONG $0x3f0c8d49 // lea rcx, [r15 + rdi]
	LONG $0x244c8948; BYTE $0x58 // mov qword ptr [rsp + 88], rcx
	WORD $0x894c; BYTE $0xc1 // mov rcx, r8
	LONG $0xfce18348 // and rcx, -4
	LONG $0x244c8948; BYTE $0x48 // mov qword ptr [rsp + 72], rcx
	WORD $0x014c; BYTE $0xc9 // add rcx, r9
	LONG $0x244c8948; BYTE $0x40 // mov qword ptr [rsp + 64], rcx
	// r14 = index of the source bitmap being applied.
	WORD $0x3145; BYTE $0xf6 // xor r14d, r14d
	LONG $0x247c894c; BYTE $0x50 // mov qword ptr [rsp + 80], r15
	JMP LBB7_5

// Next source bitmap; move to the next chunk after the last one.
LBB7_12:
	WORD $0xff49; BYTE $0xc6 // inc r14
	WORD $0x394d; BYTE $0xe6 // cmp r14, r12
	JE LBB7_2

// Apply source r14 to the current chunk: rdx = b[r14].
LBB7_5:
	WORD $0x3949; BYTE $0xd9 // cmp r9, rbx
	JAE LBB7_12
	LONG $0xf6148b4a // mov rdx, qword ptr [rsi + 8*r14]
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	// Fewer than 4 words in the chunk: scalar only.
	LONG $0x04f88349 // cmp r8, 4
	JB LBB7_13
	// Overlap check between the chunk in a and the same chunk in the source.
	LONG $0x3a0c8d4a // lea rcx, [rdx + r15]
	WORD $0x3949; BYTE $0xcb // cmp r11, rcx
	JAE LBB7_9
	LONG $0x244c8b48; BYTE $0x60 // mov rcx, qword ptr [rsp + 96]
	WORD $0x0148; BYTE $0xd1 // add rcx, rdx
	WORD $0x894d; BYTE $0xcd // mov r13, r9
	LONG $0x244c3b48; BYTE $0x58 // cmp rcx, qword ptr [rsp + 88]
	JB LBB7_13

// Vector path: rsi is temporarily repurposed as the source chunk pointer
// (original rsi kept in r11, original r11 kept in r15).
LBB7_9:
	WORD $0x894d; BYTE $0xdf // mov r15, r11
	WORD $0x8949; BYTE $0xf3 // mov r11, rsi
	LONG $0x244c8b48; BYTE $0x18 // mov rcx, qword ptr [rsp + 24]
	LONG $0x0a348d48 // lea rsi, [rdx + rcx]
	WORD $0xc931 // xor ecx, ecx

// Vector loop: 4 words per iteration, r10 words in total.
LBB7_10:
	LONG $0x0410fcc5; BYTE $0xc8 // vmovups ymm0, ymmword ptr [rax + 8*rcx]
	LONG $0x0457fcc5; BYTE $0xce // vxorps ymm0, ymm0, ymmword ptr [rsi + 8*rcx]
	LONG $0x0411fcc5; BYTE $0xc8 // vmovups ymmword ptr [rax + 8*rcx], ymm0
	LONG $0x04c18348 // add rcx, 4
	WORD $0x3949; BYTE $0xca // cmp r10, rcx
	JNE LBB7_10
	// Restore rsi/r11/r15; r13 = first index not covered by the vector loop.
	LONG $0x246c8b4c; BYTE $0x40 // mov r13, qword ptr [rsp + 64]
	LONG $0x24443b4c; BYTE $0x48 // cmp r8, qword ptr [rsp + 72]
	WORD $0x894c; BYTE $0xde // mov rsi, r11
	WORD $0x894d; BYTE $0xfb // mov r11, r15
	LONG $0x247c8b4c; BYTE $0x50 // mov r15, qword ptr [rsp + 80]
	JE LBB7_12

// Scalar loop: absolute word index r13 runs up to the chunk end in rbx.
LBB7_13:
	LONG $0xea0c8b4a // mov rcx, qword ptr [rdx + 8*r13]
	LONG $0xef0c314a // xor qword ptr [rdi + 8*r13], rcx
	WORD $0xff49; BYTE $0xc5 // inc r13
	WORD $0x3949; BYTE $0xdd // cmp r13, rbx
	JB LBB7_13
	JMP LBB7_12

// Epilogue: restore callee-saved registers, clear upper ymm state, return.
LBB7_14:
	LONG $0xd8658d48 // lea rsp, [rbp - 40]
	BYTE $0x5b // pop rbx
	WORD $0x5c41 // pop r12
	WORD $0x5d41 // pop r13
	WORD $0x5e41 // pop r14
	WORD $0x5f41 // pop r15
	BYTE $0x5d // pop rbp
	WORD $0xf8c5; BYTE $0x77 // vzeroupper
	BYTE $0xc3 // ret
773 |
// _count sums popcnt over the size 64-bit words at a and stores the 64-bit
// total through result.
// Arguments: a+0(FP) word pointer, size+8(FP) word count, result+16(FP) output pointer.
// NOTE(review): the frame spec declares 32 argument bytes while only three 8-byte
// arguments (offsets 0, 8, 16) are read; confirm this is the generator's intent.
TEXT ·_count(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ size+8(FP), SI
	MOVQ result+16(FP), DX
	// Compiler-emitted prologue: save rbp and align rsp.
	BYTE $0x55 // push rbp
	WORD $0x8948; BYTE $0xe5 // mov rbp, rsp
	LONG $0xf8e48348 // and rsp, -8
	// size == 0: store a zero total.
	WORD $0x8548; BYTE $0xf6 // test rsi, rsi
	JE LBB8_1
	// rcx = word index, r8 = running total.
	WORD $0xc931 // xor ecx, ecx
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

// One popcnt per 64-bit word.
LBB8_4:
	LONG $0xb80f48f3; WORD $0xcf04 // popcnt rax, qword ptr [rdi + 8*rcx]
	WORD $0x0149; BYTE $0xc0 // add r8, rax
	WORD $0xff48; BYTE $0xc1 // inc rcx
	WORD $0x3948; BYTE $0xce // cmp rsi, rcx
	JNE LBB8_4
	JMP LBB8_2

// Empty input: total is zero.
LBB8_1:
	WORD $0x3145; BYTE $0xc0 // xor r8d, r8d

// Write the total to *result and return.
LBB8_2:
	WORD $0x894c; BYTE $0x02 // mov qword ptr [rdx], r8
	WORD $0x8948; BYTE $0xec // mov rsp, rbp
	BYTE $0x5d // pop rbp
	BYTE $0xc3 // ret
802 |
--------------------------------------------------------------------------------
/simd_avx512.go:
--------------------------------------------------------------------------------
1 | //go:build !noasm && amd64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
4 | package bitmap
5 |
6 | import "unsafe"
7 |
8 | //go:nosplit
9 | //go:noescape
10 | func _and_avx512(a unsafe.Pointer, b unsafe.Pointer, n uint64) // declaration only, body lives outside this file; presumably a[i] &= b[i] over n 64-bit words — TODO confirm against the assembly
11 |
12 | //go:nosplit
13 | //go:noescape
14 | func _andn_avx512(a unsafe.Pointer, b unsafe.Pointer, n uint64) // declaration only, body lives outside this file; presumably a[i] &^= b[i] over n 64-bit words — TODO confirm against the assembly
15 |
16 | //go:nosplit
17 | //go:noescape
18 | func _or_avx512(a unsafe.Pointer, b unsafe.Pointer, n uint64) // declaration only, body lives outside this file; presumably a[i] |= b[i] over n 64-bit words — TODO confirm against the assembly
19 |
20 | //go:nosplit
21 | //go:noescape
22 | func _xor_avx512(a unsafe.Pointer, b unsafe.Pointer, n uint64) // declaration only, body lives outside this file; presumably a[i] ^= b[i] over n 64-bit words — TODO confirm against the assembly
23 |
24 | //go:nosplit
25 | //go:noescape
26 | func _and_many_avx512(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // declaration only, body lives outside this file; presumably ANDs every bitmap referenced through b into a, with dims packing the sizes — TODO confirm against the assembly
27 |
28 | //go:nosplit
29 | //go:noescape
30 | func _andn_many_avx512(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // declaration only, body lives outside this file; presumably AND-NOTs every bitmap referenced through b into a, with dims packing the sizes — TODO confirm against the assembly
31 |
32 | //go:nosplit
33 | //go:noescape
34 | func _or_many_avx512(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // declaration only, body lives outside this file; presumably ORs every bitmap referenced through b into a, with dims packing the sizes — TODO confirm against the assembly
35 |
36 | //go:nosplit
37 | //go:noescape
38 | func _xor_many_avx512(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // declaration only, body lives outside this file; presumably XORs every bitmap referenced through b into a, with dims packing the sizes — TODO confirm against the assembly
39 |
--------------------------------------------------------------------------------
/simd_generic.go:
--------------------------------------------------------------------------------
1 | // Copyright (c) Roman Atachiants and contributors. All rights reserved.
2 | // Licensed under the MIT license. See LICENSE file in the project root for details.
3 |
4 | package bitmap
5 |
6 | import "math/bits"
7 |
8 | // count returns the total number of one bits across every word of arr.
9 | func count(arr []uint64) int {
10 | 	total := 0
11 | 	for _, word := range arr {
12 | 		total += bits.OnesCount64(word)
13 | 	}
14 | 	return total
15 | }
16 |
17 | // and intersects the first upper words of a with other and then with every bitmap in extra, in place.
18 | func and(a Bitmap, upper int, other Bitmap, extra []Bitmap) {
19 | 	for w := 0; w < upper; w++ {
20 | 		a[w] &= other[w]
21 | 	}
22 |
23 | 	for _, src := range extra {
24 | 		for w := 0; w < upper; w++ {
25 | 			a[w] &= src[w]
26 | 		}
27 | 	}
28 | }
29 |
30 | // andn clears from the first upper words of a every bit set in other and then in each bitmap of extra, in place.
31 | func andn(a Bitmap, upper int, other Bitmap, extra []Bitmap) {
32 | 	for w := 0; w < upper; w++ {
33 | 		a[w] &^= other[w]
34 | 	}
35 |
36 | 	for _, src := range extra {
37 | 		for w := 0; w < upper; w++ {
38 | 			a[w] &^= src[w]
39 | 		}
40 | 	}
41 | }
42 |
43 | // or merges every word of other, and then of each bitmap in extra, into a in place (union).
44 | func or(a Bitmap, other Bitmap, extra []Bitmap) {
45 | 	for w, word := range other {
46 | 		a[w] |= word
47 | 	}
48 |
49 | 	for _, src := range extra {
50 | 		for w, word := range src {
51 | 			a[w] |= word
52 | 		}
53 | 	}
54 | }
55 |
56 | // xor toggles in a every bit set in other, and then in each bitmap of extra, in place (symmetric difference).
57 | func xor(a Bitmap, other Bitmap, extra []Bitmap) {
58 | 	for w, word := range other {
59 | 		a[w] ^= word
60 | 	}
61 |
62 | 	for _, src := range extra {
63 | 		for w, word := range src {
64 | 			a[w] ^= word
65 | 		}
66 | 	}
67 | }
68 |
--------------------------------------------------------------------------------
/simd_neon.go:
--------------------------------------------------------------------------------
1 | //go:build !noasm && !darwin && arm64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
4 | package bitmap
5 |
6 | import "unsafe"
7 |
8 | //go:nosplit
9 | //go:noescape
10 | func _and(a unsafe.Pointer, b unsafe.Pointer, n uint64) // assembly routine: a[i] &= b[i] for each of the n 64-bit words, in place
11 |
12 | //go:nosplit
13 | //go:noescape
14 | func _andn(a unsafe.Pointer, b unsafe.Pointer, n uint64) // assembly routine: a[i] &^= b[i] for each of the n 64-bit words, in place
15 |
16 | //go:nosplit
17 | //go:noescape
18 | func _or(a unsafe.Pointer, b unsafe.Pointer, n uint64) // assembly routine: a[i] |= b[i] for each of the n 64-bit words, in place
19 |
20 | //go:nosplit
21 | //go:noescape
22 | func _xor(a unsafe.Pointer, b unsafe.Pointer, n uint64) // assembly routine: a[i] ^= b[i] for each of the n 64-bit words, in place
23 |
24 | //go:nosplit
25 | //go:noescape
26 | func _and_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // assembly routine: ANDs into a every source whose pointer is stored in the array at b; dims = word count (low 32 bits) | source count (high 32 bits)
27 |
28 | //go:nosplit
29 | //go:noescape
30 | func _andn_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // assembly routine: AND-NOTs into a every source whose pointer is stored in the array at b; dims = word count (low 32 bits) | source count (high 32 bits)
31 |
32 | //go:nosplit
33 | //go:noescape
34 | func _or_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // assembly routine: ORs into a every source whose pointer is stored in the array at b; dims = word count (low 32 bits) | source count (high 32 bits)
35 |
36 | //go:nosplit
37 | //go:noescape
38 | func _xor_many(a unsafe.Pointer, b unsafe.Pointer, dims uint64) // assembly routine: XORs into a every source whose pointer is stored in the array at b; dims = word count (low 32 bits) | source count (high 32 bits)
39 |
40 | //go:nosplit
41 | //go:noescape
42 | func _count(a unsafe.Pointer, size uint64, result unsafe.Pointer) // assembly routine: writes to *result the number of set bits in the size 64-bit words starting at a
43 |
--------------------------------------------------------------------------------
/simd_neon.s:
--------------------------------------------------------------------------------
1 | //go:build !noasm && !darwin && arm64
2 | // AUTO-GENERATED BY GOCC -- DO NOT EDIT
3 |
4 | TEXT ·_and(SB), $0-32 // _and(a, b, n): a[i] &= b[i] for each of the n 64-bit words, in place
5 | MOVD a+0(FP), R0 // x0 = a: destination and left operand
6 | MOVD b+8(FP), R1 // x1 = b: right operand
7 | MOVD n+16(FP), R2 // x2 = n: number of 64-bit words
8 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
9 | WORD $0x910003fd // mov x29, sp
10 | WORD $0xb40002a2 // cbz x2, .LBB0_7 ; n == 0: return
11 | WORD $0xf100105f // cmp x2, #4
12 | WORD $0x54000103 // b.lo .LBB0_4 ; n < 4: scalar loop only
13 | WORD $0xd37df048 // lsl x8, x2, #3 ; x8 = n * 8 (bytes)
14 | WORD $0x8b080029 // add x9, x1, x8 ; x9 = end of b
15 | WORD $0xeb00013f // cmp x9, x0
16 | WORD $0x54000229 // b.ls .LBB0_8 ; b ends at or before a: no overlap, vector path
17 | WORD $0x8b080008 // add x8, x0, x8 ; x8 = end of a
18 | WORD $0xeb01011f // cmp x8, x1
19 | WORD $0x540001c9 // b.ls .LBB0_8 ; a ends at or before b: no overlap, vector path
20 |
21 | LBB0_4: // ranges overlap or n < 4: scalar loop from word 0
22 | WORD $0xaa1f03e8 // mov x8, xzr
23 |
24 | LBB0_5: // scalar setup; x8 = index of the first word still to process
25 | WORD $0xd37df10a // lsl x10, x8, #3
26 | WORD $0xcb080049 // sub x9, x2, x8 ; x9 = words remaining
27 | WORD $0x8b0a0008 // add x8, x0, x10 ; x8 = &a[index]
28 | WORD $0x8b0a002a // add x10, x1, x10 ; x10 = &b[index]
29 |
30 | LBB0_6: // scalar loop: one word per iteration
31 | WORD $0xf840854b // ldr x11, [x10], #8
32 | WORD $0xf940010c // ldr x12, [x8]
33 | WORD $0xf1000529 // subs x9, x9, #1
34 | WORD $0x8a0b018b // and x11, x12, x11 ; a word AND b word
35 | WORD $0xf800850b // str x11, [x8], #8
36 | WORD $0x54ffff61 // b.ne .LBB0_6
37 |
38 | LBB0_7: // restore the frame record and return
39 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
40 | WORD $0xd65f03c0 // ret
41 |
42 | LBB0_8: // vector setup
43 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc ; x8 = n rounded down to a multiple of 4
44 | WORD $0x91004029 // add x9, x1, #16
45 | WORD $0x9100400a // add x10, x0, #16
46 | WORD $0xaa0803eb // mov x11, x8
47 |
48 | LBB0_9: // vector loop: 4 words (two 128-bit registers) per iteration
49 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
50 | WORD $0x91008129 // add x9, x9, #32
51 | WORD $0xf100116b // subs x11, x11, #4
52 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
53 | WORD $0x4e201c40 // and v0.16b, v2.16b, v0.16b
54 | WORD $0x4e211c61 // and v1.16b, v3.16b, v1.16b
55 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
56 | WORD $0x9100814a // add x10, x10, #32
57 | WORD $0x54ffff01 // b.ne .LBB0_9
58 | WORD $0xeb02011f // cmp x8, x2
59 | WORD $0x54fffe00 // b.eq .LBB0_7 ; n is a multiple of 4: done
60 | WORD $0x17ffffe5 // b .LBB0_5 ; leftover n % 4 words go through the scalar loop
61 |
62 | TEXT ·_andn(SB), $0-32 // _andn(a, b, n): a[i] = a[i] AND NOT b[i] for each of the n 64-bit words, in place
63 | MOVD a+0(FP), R0 // x0 = a: destination and left operand
64 | MOVD b+8(FP), R1 // x1 = b: bits to clear
65 | MOVD n+16(FP), R2 // x2 = n: number of 64-bit words
66 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
67 | WORD $0x910003fd // mov x29, sp
68 | WORD $0xb40002a2 // cbz x2, .LBB1_7 ; n == 0: return
69 | WORD $0xf100105f // cmp x2, #4
70 | WORD $0x54000103 // b.lo .LBB1_4 ; n < 4: scalar loop only
71 | WORD $0xd37df048 // lsl x8, x2, #3 ; x8 = n * 8 (bytes)
72 | WORD $0x8b080029 // add x9, x1, x8 ; x9 = end of b
73 | WORD $0xeb00013f // cmp x9, x0
74 | WORD $0x54000229 // b.ls .LBB1_8 ; b ends at or before a: no overlap, vector path
75 | WORD $0x8b080008 // add x8, x0, x8 ; x8 = end of a
76 | WORD $0xeb01011f // cmp x8, x1
77 | WORD $0x540001c9 // b.ls .LBB1_8 ; a ends at or before b: no overlap, vector path
78 |
79 | LBB1_4: // ranges overlap or n < 4: scalar loop from word 0
80 | WORD $0xaa1f03e8 // mov x8, xzr
81 |
82 | LBB1_5: // scalar setup; x8 = index of the first word still to process
83 | WORD $0xd37df10a // lsl x10, x8, #3
84 | WORD $0xcb080049 // sub x9, x2, x8 ; x9 = words remaining
85 | WORD $0x8b0a0008 // add x8, x0, x10 ; x8 = &a[index]
86 | WORD $0x8b0a002a // add x10, x1, x10 ; x10 = &b[index]
87 |
88 | LBB1_6: // scalar loop: one word per iteration
89 | WORD $0xf840854b // ldr x11, [x10], #8
90 | WORD $0xf940010c // ldr x12, [x8]
91 | WORD $0xf1000529 // subs x9, x9, #1
92 | WORD $0x8a2b018b // bic x11, x12, x11 ; a word AND NOT b word
93 | WORD $0xf800850b // str x11, [x8], #8
94 | WORD $0x54ffff61 // b.ne .LBB1_6
95 |
96 | LBB1_7: // restore the frame record and return
97 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
98 | WORD $0xd65f03c0 // ret
99 |
100 | LBB1_8: // vector setup
101 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc ; x8 = n rounded down to a multiple of 4
102 | WORD $0x91004029 // add x9, x1, #16
103 | WORD $0x9100400a // add x10, x0, #16
104 | WORD $0xaa0803eb // mov x11, x8
105 |
106 | LBB1_9: // vector loop: 4 words (two 128-bit registers) per iteration
107 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
108 | WORD $0x91008129 // add x9, x9, #32
109 | WORD $0xf100116b // subs x11, x11, #4
110 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
111 | WORD $0x4e601c40 // bic v0.16b, v2.16b, v0.16b
112 | WORD $0x4e611c61 // bic v1.16b, v3.16b, v1.16b
113 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
114 | WORD $0x9100814a // add x10, x10, #32
115 | WORD $0x54ffff01 // b.ne .LBB1_9
116 | WORD $0xeb02011f // cmp x8, x2
117 | WORD $0x54fffe00 // b.eq .LBB1_7 ; n is a multiple of 4: done
118 | WORD $0x17ffffe5 // b .LBB1_5 ; leftover n % 4 words go through the scalar loop
119 |
120 | TEXT ·_or(SB), $0-32 // _or(a, b, n): a[i] |= b[i] for each of the n 64-bit words, in place
121 | MOVD a+0(FP), R0 // x0 = a: destination and left operand
122 | MOVD b+8(FP), R1 // x1 = b: right operand
123 | MOVD n+16(FP), R2 // x2 = n: number of 64-bit words
124 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
125 | WORD $0x910003fd // mov x29, sp
126 | WORD $0xb40002a2 // cbz x2, .LBB2_7 ; n == 0: return
127 | WORD $0xf100105f // cmp x2, #4
128 | WORD $0x54000103 // b.lo .LBB2_4 ; n < 4: scalar loop only
129 | WORD $0xd37df048 // lsl x8, x2, #3 ; x8 = n * 8 (bytes)
130 | WORD $0x8b080029 // add x9, x1, x8 ; x9 = end of b
131 | WORD $0xeb00013f // cmp x9, x0
132 | WORD $0x54000229 // b.ls .LBB2_8 ; b ends at or before a: no overlap, vector path
133 | WORD $0x8b080008 // add x8, x0, x8 ; x8 = end of a
134 | WORD $0xeb01011f // cmp x8, x1
135 | WORD $0x540001c9 // b.ls .LBB2_8 ; a ends at or before b: no overlap, vector path
136 |
137 | LBB2_4: // ranges overlap or n < 4: scalar loop from word 0
138 | WORD $0xaa1f03e8 // mov x8, xzr
139 |
140 | LBB2_5: // scalar setup; x8 = index of the first word still to process
141 | WORD $0xd37df10a // lsl x10, x8, #3
142 | WORD $0xcb080049 // sub x9, x2, x8 ; x9 = words remaining
143 | WORD $0x8b0a0008 // add x8, x0, x10 ; x8 = &a[index]
144 | WORD $0x8b0a002a // add x10, x1, x10 ; x10 = &b[index]
145 |
146 | LBB2_6: // scalar loop: one word per iteration
147 | WORD $0xf840854b // ldr x11, [x10], #8
148 | WORD $0xf940010c // ldr x12, [x8]
149 | WORD $0xf1000529 // subs x9, x9, #1
150 | WORD $0xaa0b018b // orr x11, x12, x11 ; a word OR b word
151 | WORD $0xf800850b // str x11, [x8], #8
152 | WORD $0x54ffff61 // b.ne .LBB2_6
153 |
154 | LBB2_7: // restore the frame record and return
155 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
156 | WORD $0xd65f03c0 // ret
157 |
158 | LBB2_8: // vector setup
159 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc ; x8 = n rounded down to a multiple of 4
160 | WORD $0x91004029 // add x9, x1, #16
161 | WORD $0x9100400a // add x10, x0, #16
162 | WORD $0xaa0803eb // mov x11, x8
163 |
164 | LBB2_9: // vector loop: 4 words (two 128-bit registers) per iteration
165 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
166 | WORD $0x91008129 // add x9, x9, #32
167 | WORD $0xf100116b // subs x11, x11, #4
168 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
169 | WORD $0x4ea01c40 // orr v0.16b, v2.16b, v0.16b
170 | WORD $0x4ea11c61 // orr v1.16b, v3.16b, v1.16b
171 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
172 | WORD $0x9100814a // add x10, x10, #32
173 | WORD $0x54ffff01 // b.ne .LBB2_9
174 | WORD $0xeb02011f // cmp x8, x2
175 | WORD $0x54fffe00 // b.eq .LBB2_7 ; n is a multiple of 4: done
176 | WORD $0x17ffffe5 // b .LBB2_5 ; leftover n % 4 words go through the scalar loop
177 |
178 | TEXT ·_xor(SB), $0-32 // _xor(a, b, n): a[i] ^= b[i] for each of the n 64-bit words, in place
179 | MOVD a+0(FP), R0 // x0 = a: destination and left operand
180 | MOVD b+8(FP), R1 // x1 = b: right operand
181 | MOVD n+16(FP), R2 // x2 = n: number of 64-bit words
182 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
183 | WORD $0x910003fd // mov x29, sp
184 | WORD $0xb40002a2 // cbz x2, .LBB3_7 ; n == 0: return
185 | WORD $0xf100105f // cmp x2, #4
186 | WORD $0x54000103 // b.lo .LBB3_4 ; n < 4: scalar loop only
187 | WORD $0xd37df048 // lsl x8, x2, #3 ; x8 = n * 8 (bytes)
188 | WORD $0x8b080029 // add x9, x1, x8 ; x9 = end of b
189 | WORD $0xeb00013f // cmp x9, x0
190 | WORD $0x54000229 // b.ls .LBB3_8 ; b ends at or before a: no overlap, vector path
191 | WORD $0x8b080008 // add x8, x0, x8 ; x8 = end of a
192 | WORD $0xeb01011f // cmp x8, x1
193 | WORD $0x540001c9 // b.ls .LBB3_8 ; a ends at or before b: no overlap, vector path
194 |
195 | LBB3_4: // ranges overlap or n < 4: scalar loop from word 0
196 | WORD $0xaa1f03e8 // mov x8, xzr
197 |
198 | LBB3_5: // scalar setup; x8 = index of the first word still to process
199 | WORD $0xd37df10a // lsl x10, x8, #3
200 | WORD $0xcb080049 // sub x9, x2, x8 ; x9 = words remaining
201 | WORD $0x8b0a0008 // add x8, x0, x10 ; x8 = &a[index]
202 | WORD $0x8b0a002a // add x10, x1, x10 ; x10 = &b[index]
203 |
204 | LBB3_6: // scalar loop: one word per iteration
205 | WORD $0xf840854b // ldr x11, [x10], #8
206 | WORD $0xf940010c // ldr x12, [x8]
207 | WORD $0xf1000529 // subs x9, x9, #1
208 | WORD $0xca0b018b // eor x11, x12, x11 ; a word XOR b word
209 | WORD $0xf800850b // str x11, [x8], #8
210 | WORD $0x54ffff61 // b.ne .LBB3_6
211 |
212 | LBB3_7: // restore the frame record and return
213 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
214 | WORD $0xd65f03c0 // ret
215 |
216 | LBB3_8: // vector setup
217 | WORD $0x927ef448 // and x8, x2, #0xfffffffffffffffc ; x8 = n rounded down to a multiple of 4
218 | WORD $0x91004029 // add x9, x1, #16
219 | WORD $0x9100400a // add x10, x0, #16
220 | WORD $0xaa0803eb // mov x11, x8
221 |
222 | LBB3_9: // vector loop: 4 words (two 128-bit registers) per iteration
223 | WORD $0xad7f8520 // ldp q0, q1, [x9, #-16]
224 | WORD $0x91008129 // add x9, x9, #32
225 | WORD $0xf100116b // subs x11, x11, #4
226 | WORD $0xad7f8d42 // ldp q2, q3, [x10, #-16]
227 | WORD $0x6e201c40 // eor v0.16b, v2.16b, v0.16b
228 | WORD $0x6e211c61 // eor v1.16b, v3.16b, v1.16b
229 | WORD $0xad3f8540 // stp q0, q1, [x10, #-16]
230 | WORD $0x9100814a // add x10, x10, #32
231 | WORD $0x54ffff01 // b.ne .LBB3_9
232 | WORD $0xeb02011f // cmp x8, x2
233 | WORD $0x54fffe00 // b.eq .LBB3_7 ; n is a multiple of 4: done
234 | WORD $0x17ffffe5 // b .LBB3_5 ; leftover n % 4 words go through the scalar loop
235 |
236 | TEXT ·_and_many(SB), $0-32 // _and_many(a, b, dims): ANDs into a every source whose pointer is stored at b, working in 512-word chunks; dims = word count (low 32 bits) | source count (high 32 bits)
237 | MOVD a+0(FP), R0 // x0 = a: destination words
238 | MOVD b+8(FP), R1 // x1 = b: array of source pointers
239 | MOVD dims+16(FP), R2 // x2 = dims: packed sizes
240 | WORD $0xa9bb7bfd // stp x29, x30, [sp, #-80]!
241 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff ; x8 = word count (low 32 bits of dims)
242 | WORD $0xf9000bf9 // str x25, [sp, #16]
243 | WORD $0xa9025ff8 // stp x24, x23, [sp, #32]
244 | WORD $0x910003fd // mov x29, sp
245 | WORD $0xa90357f6 // stp x22, x21, [sp, #48]
246 | WORD $0xa9044ff4 // stp x20, x19, [sp, #64]
247 | WORD $0x54000a20 // b.eq .LBB4_14 ; no words: return
248 | WORD $0xd360fc4b // lsr x11, x2, #32 ; x11 = source count (high 32 bits of dims)
249 | WORD $0xb40009eb // cbz x11, .LBB4_14 ; no sources: return
250 | WORD $0xf100057f // cmp x11, #1
251 | WORD $0xaa1f03e9 // mov x9, xzr
252 | WORD $0xaa1f03ea // mov x10, xzr ; x10 = chunk number
253 | WORD $0xaa1f03ef // mov x15, xzr ; x15 = first word index of the current chunk
254 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi ; x11 = max(x11, 1)
255 | WORD $0x9100400c // add x12, x0, #16
256 | WORD $0x5280400d // mov w13, #512 ; x13 = upper word bound of the current chunk before clamping
257 | WORD $0x5280020e // mov w14, #16
258 | WORD $0x14000009 // b .LBB4_4
259 |
260 | LBB4_3: // advance to the next 512-word chunk
261 | WORD $0x910801ad // add x13, x13, #512
262 | WORD $0x9100054a // add x10, x10, #1
263 | WORD $0xd1080129 // sub x9, x9, #512
264 | WORD $0x914005ce // add x14, x14, #1, lsl #12
265 | WORD $0x9140058c // add x12, x12, #1, lsl #12
266 | WORD $0xaa1003ef // mov x15, x16
267 | WORD $0xeb08021f // cmp x16, x8
268 | WORD $0x540007c2 // b.hs .LBB4_14 ; every word processed: return
269 |
270 | LBB4_4: // chunk start
271 | WORD $0xeb0801bf // cmp x13, x8
272 | WORD $0x910801f0 // add x16, x15, #512 ; x16 = first word index of the following chunk
273 | WORD $0x9a8831b1 // csel x17, x13, x8, lo ; x17 = min(x13, word count) = end of this chunk
274 | WORD $0xeb08021f // cmp x16, x8
275 | WORD $0x9a883212 // csel x18, x16, x8, lo
276 | WORD $0xeb1201ff // cmp x15, x18
277 | WORD $0x54fffe42 // b.hs .LBB4_3
278 | WORD $0xcb0a2622 // sub x2, x17, x10, lsl #9 ; x2 = words in this chunk
279 | WORD $0xd374cd43 // lsl x3, x10, #12 ; x3 = byte offset of this chunk
280 | WORD $0xd37df046 // lsl x6, x2, #3
281 | WORD $0x8b030004 // add x4, x0, x3 ; x4 = start of the destination chunk
282 | WORD $0xd10020d4 // sub x20, x6, #8
283 | WORD $0x8b090225 // add x5, x17, x9
284 | WORD $0x8b140086 // add x6, x4, x20
285 | WORD $0x927ef447 // and x7, x2, #0xfffffffffffffffc ; x7 = chunk words rounded down to a multiple of 4
286 | WORD $0xaa1f03f2 // mov x18, xzr ; x18 = source index
287 | WORD $0x927ef4a5 // and x5, x5, #0xfffffffffffffffc
288 | WORD $0x910020c6 // add x6, x6, #8 ; x6 = end of the destination chunk
289 | WORD $0x8b0701f3 // add x19, x15, x7
290 | WORD $0x8b140074 // add x20, x3, x20
291 | WORD $0x14000004 // b .LBB4_7
292 |
293 | LBB4_6: // next source for the same chunk
294 | WORD $0x91000652 // add x18, x18, #1
295 | WORD $0xeb0b025f // cmp x18, x11
296 | WORD $0x54fffc20 // b.eq .LBB4_3 ; all sources applied to this chunk
297 |
298 | LBB4_7: // choose the vector or scalar path for source x18
299 | WORD $0xf8727835 // ldr x21, [x1, x18, lsl #3] ; x21 = pointer to source x18
300 | WORD $0xaa0f03f7 // mov x23, x15
301 | WORD $0xf100105f // cmp x2, #4
302 | WORD $0x540002e3 // b.lo .LBB4_12 ; chunk shorter than 4 words: scalar
303 | WORD $0x8b1402b6 // add x22, x21, x20
304 | WORD $0x8b0302b7 // add x23, x21, x3
305 | WORD $0x910022d6 // add x22, x22, #8
306 | WORD $0xeb16009f // cmp x4, x22
307 | WORD $0xfa4632e2 // ccmp x23, x6, #2, lo
308 | WORD $0xaa0f03f7 // mov x23, x15
309 | WORD $0x54000203 // b.lo .LBB4_12 ; destination and source chunks overlap: scalar
310 | WORD $0x8b0e02b6 // add x22, x21, x14
311 | WORD $0xaa0c03f7 // mov x23, x12
312 | WORD $0xaa0503f8 // mov x24, x5
313 |
314 | LBB4_10: // vector loop: 4 words (two 128-bit registers) per iteration
315 | WORD $0xad7f86c0 // ldp q0, q1, [x22, #-16]
316 | WORD $0xf1001318 // subs x24, x24, #4
317 | WORD $0x910082d6 // add x22, x22, #32
318 | WORD $0xad7f8ee2 // ldp q2, q3, [x23, #-16]
319 | WORD $0x4e201c40 // and v0.16b, v2.16b, v0.16b
320 | WORD $0x4e211c61 // and v1.16b, v3.16b, v1.16b
321 | WORD $0xad3f86e0 // stp q0, q1, [x23, #-16]
322 | WORD $0x910082f7 // add x23, x23, #32
323 | WORD $0x54ffff01 // b.ne .LBB4_10
324 | WORD $0xaa1303f7 // mov x23, x19
325 | WORD $0xeb07005f // cmp x2, x7
326 | WORD $0x54fffc80 // b.eq .LBB4_6 ; chunk length is a multiple of 4: no tail
327 |
328 | LBB4_12: // scalar setup from word index x23
329 | WORD $0xd37df2f8 // lsl x24, x23, #3
330 | WORD $0xcb170236 // sub x22, x17, x23 ; x22 = words remaining in this chunk
331 | WORD $0x8b180017 // add x23, x0, x24
332 | WORD $0x8b1802b5 // add x21, x21, x24
333 |
334 | LBB4_13: // scalar loop: one word per iteration
335 | WORD $0xf84086b8 // ldr x24, [x21], #8
336 | WORD $0xf94002f9 // ldr x25, [x23]
337 | WORD $0xf10006d6 // subs x22, x22, #1
338 | WORD $0x8a180338 // and x24, x25, x24 ; destination word AND source word
339 | WORD $0xf80086f8 // str x24, [x23], #8
340 | WORD $0x54ffff61 // b.ne .LBB4_13
341 | WORD $0x17ffffd9 // b .LBB4_6
342 |
343 | LBB4_14: // restore the saved registers and return
344 | WORD $0xa9444ff4 // ldp x20, x19, [sp, #64]
345 | WORD $0xa94357f6 // ldp x22, x21, [sp, #48]
346 | WORD $0xa9425ff8 // ldp x24, x23, [sp, #32]
347 | WORD $0xf9400bf9 // ldr x25, [sp, #16]
348 | WORD $0xa8c57bfd // ldp x29, x30, [sp], #80
349 | WORD $0xd65f03c0 // ret
350 |
351 | TEXT ·_andn_many(SB), $0-32 // _andn_many(a, b, dims): AND-NOTs into a every source whose pointer is stored at b, working in 512-word chunks; dims = word count (low 32 bits) | source count (high 32 bits)
352 | MOVD a+0(FP), R0 // x0 = a: destination words
353 | MOVD b+8(FP), R1 // x1 = b: array of source pointers
354 | MOVD dims+16(FP), R2 // x2 = dims: packed sizes
355 | WORD $0xa9bb7bfd // stp x29, x30, [sp, #-80]!
356 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff ; x8 = word count (low 32 bits of dims)
357 | WORD $0xf9000bf9 // str x25, [sp, #16]
358 | WORD $0xa9025ff8 // stp x24, x23, [sp, #32]
359 | WORD $0x910003fd // mov x29, sp
360 | WORD $0xa90357f6 // stp x22, x21, [sp, #48]
361 | WORD $0xa9044ff4 // stp x20, x19, [sp, #64]
362 | WORD $0x54000a20 // b.eq .LBB5_14 ; no words: return
363 | WORD $0xd360fc4b // lsr x11, x2, #32 ; x11 = source count (high 32 bits of dims)
364 | WORD $0xb40009eb // cbz x11, .LBB5_14 ; no sources: return
365 | WORD $0xf100057f // cmp x11, #1
366 | WORD $0xaa1f03e9 // mov x9, xzr
367 | WORD $0xaa1f03ea // mov x10, xzr ; x10 = chunk number
368 | WORD $0xaa1f03ef // mov x15, xzr ; x15 = first word index of the current chunk
369 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi ; x11 = max(x11, 1)
370 | WORD $0x9100400c // add x12, x0, #16
371 | WORD $0x5280400d // mov w13, #512 ; x13 = upper word bound of the current chunk before clamping
372 | WORD $0x5280020e // mov w14, #16
373 | WORD $0x14000009 // b .LBB5_4
374 |
375 | LBB5_3: // advance to the next 512-word chunk
376 | WORD $0x910801ad // add x13, x13, #512
377 | WORD $0x9100054a // add x10, x10, #1
378 | WORD $0xd1080129 // sub x9, x9, #512
379 | WORD $0x914005ce // add x14, x14, #1, lsl #12
380 | WORD $0x9140058c // add x12, x12, #1, lsl #12
381 | WORD $0xaa1003ef // mov x15, x16
382 | WORD $0xeb08021f // cmp x16, x8
383 | WORD $0x540007c2 // b.hs .LBB5_14 ; every word processed: return
384 |
385 | LBB5_4: // chunk start
386 | WORD $0xeb0801bf // cmp x13, x8
387 | WORD $0x910801f0 // add x16, x15, #512 ; x16 = first word index of the following chunk
388 | WORD $0x9a8831b1 // csel x17, x13, x8, lo ; x17 = min(x13, word count) = end of this chunk
389 | WORD $0xeb08021f // cmp x16, x8
390 | WORD $0x9a883212 // csel x18, x16, x8, lo
391 | WORD $0xeb1201ff // cmp x15, x18
392 | WORD $0x54fffe42 // b.hs .LBB5_3
393 | WORD $0xcb0a2622 // sub x2, x17, x10, lsl #9 ; x2 = words in this chunk
394 | WORD $0xd374cd43 // lsl x3, x10, #12 ; x3 = byte offset of this chunk
395 | WORD $0xd37df046 // lsl x6, x2, #3
396 | WORD $0x8b030004 // add x4, x0, x3 ; x4 = start of the destination chunk
397 | WORD $0xd10020d4 // sub x20, x6, #8
398 | WORD $0x8b090225 // add x5, x17, x9
399 | WORD $0x8b140086 // add x6, x4, x20
400 | WORD $0x927ef447 // and x7, x2, #0xfffffffffffffffc ; x7 = chunk words rounded down to a multiple of 4
401 | WORD $0xaa1f03f2 // mov x18, xzr ; x18 = source index
402 | WORD $0x927ef4a5 // and x5, x5, #0xfffffffffffffffc
403 | WORD $0x910020c6 // add x6, x6, #8 ; x6 = end of the destination chunk
404 | WORD $0x8b0701f3 // add x19, x15, x7
405 | WORD $0x8b140074 // add x20, x3, x20
406 | WORD $0x14000004 // b .LBB5_7
407 |
408 | LBB5_6: // next source for the same chunk
409 | WORD $0x91000652 // add x18, x18, #1
410 | WORD $0xeb0b025f // cmp x18, x11
411 | WORD $0x54fffc20 // b.eq .LBB5_3 ; all sources applied to this chunk
412 |
413 | LBB5_7: // choose the vector or scalar path for source x18
414 | WORD $0xf8727835 // ldr x21, [x1, x18, lsl #3] ; x21 = pointer to source x18
415 | WORD $0xaa0f03f7 // mov x23, x15
416 | WORD $0xf100105f // cmp x2, #4
417 | WORD $0x540002e3 // b.lo .LBB5_12 ; chunk shorter than 4 words: scalar
418 | WORD $0x8b1402b6 // add x22, x21, x20
419 | WORD $0x8b0302b7 // add x23, x21, x3
420 | WORD $0x910022d6 // add x22, x22, #8
421 | WORD $0xeb16009f // cmp x4, x22
422 | WORD $0xfa4632e2 // ccmp x23, x6, #2, lo
423 | WORD $0xaa0f03f7 // mov x23, x15
424 | WORD $0x54000203 // b.lo .LBB5_12 ; destination and source chunks overlap: scalar
425 | WORD $0x8b0e02b6 // add x22, x21, x14
426 | WORD $0xaa0c03f7 // mov x23, x12
427 | WORD $0xaa0503f8 // mov x24, x5
428 |
429 | LBB5_10: // vector loop: 4 words (two 128-bit registers) per iteration
430 | WORD $0xad7f86c0 // ldp q0, q1, [x22, #-16]
431 | WORD $0xf1001318 // subs x24, x24, #4
432 | WORD $0x910082d6 // add x22, x22, #32
433 | WORD $0xad7f8ee2 // ldp q2, q3, [x23, #-16]
434 | WORD $0x4e601c40 // bic v0.16b, v2.16b, v0.16b
435 | WORD $0x4e611c61 // bic v1.16b, v3.16b, v1.16b
436 | WORD $0xad3f86e0 // stp q0, q1, [x23, #-16]
437 | WORD $0x910082f7 // add x23, x23, #32
438 | WORD $0x54ffff01 // b.ne .LBB5_10
439 | WORD $0xaa1303f7 // mov x23, x19
440 | WORD $0xeb07005f // cmp x2, x7
441 | WORD $0x54fffc80 // b.eq .LBB5_6 ; chunk length is a multiple of 4: no tail
442 |
443 | LBB5_12: // scalar setup from word index x23
444 | WORD $0xd37df2f8 // lsl x24, x23, #3
445 | WORD $0xcb170236 // sub x22, x17, x23 ; x22 = words remaining in this chunk
446 | WORD $0x8b180017 // add x23, x0, x24
447 | WORD $0x8b1802b5 // add x21, x21, x24
448 |
449 | LBB5_13: // scalar loop: one word per iteration
450 | WORD $0xf84086b8 // ldr x24, [x21], #8
451 | WORD $0xf94002f9 // ldr x25, [x23]
452 | WORD $0xf10006d6 // subs x22, x22, #1
453 | WORD $0x8a380338 // bic x24, x25, x24 ; destination word AND NOT source word
454 | WORD $0xf80086f8 // str x24, [x23], #8
455 | WORD $0x54ffff61 // b.ne .LBB5_13
456 | WORD $0x17ffffd9 // b .LBB5_6
457 |
458 | LBB5_14: // restore the saved registers and return
459 | WORD $0xa9444ff4 // ldp x20, x19, [sp, #64]
460 | WORD $0xa94357f6 // ldp x22, x21, [sp, #48]
461 | WORD $0xa9425ff8 // ldp x24, x23, [sp, #32]
462 | WORD $0xf9400bf9 // ldr x25, [sp, #16]
463 | WORD $0xa8c57bfd // ldp x29, x30, [sp], #80
464 | WORD $0xd65f03c0 // ret
465 |
466 | TEXT ·_or_many(SB), $0-32 // _or_many(a, b, dims): ORs into a every source whose pointer is stored at b, working in 512-word chunks; dims = word count (low 32 bits) | source count (high 32 bits)
467 | MOVD a+0(FP), R0 // x0 = a: destination words
468 | MOVD b+8(FP), R1 // x1 = b: array of source pointers
469 | MOVD dims+16(FP), R2 // x2 = dims: packed sizes
470 | WORD $0xa9bb7bfd // stp x29, x30, [sp, #-80]!
471 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff ; x8 = word count (low 32 bits of dims)
472 | WORD $0xf9000bf9 // str x25, [sp, #16]
473 | WORD $0xa9025ff8 // stp x24, x23, [sp, #32]
474 | WORD $0x910003fd // mov x29, sp
475 | WORD $0xa90357f6 // stp x22, x21, [sp, #48]
476 | WORD $0xa9044ff4 // stp x20, x19, [sp, #64]
477 | WORD $0x54000a20 // b.eq .LBB6_14 ; no words: return
478 | WORD $0xd360fc4b // lsr x11, x2, #32 ; x11 = source count (high 32 bits of dims)
479 | WORD $0xb40009eb // cbz x11, .LBB6_14 ; no sources: return
480 | WORD $0xf100057f // cmp x11, #1
481 | WORD $0xaa1f03e9 // mov x9, xzr
482 | WORD $0xaa1f03ea // mov x10, xzr ; x10 = chunk number
483 | WORD $0xaa1f03ef // mov x15, xzr ; x15 = first word index of the current chunk
484 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi ; x11 = max(x11, 1)
485 | WORD $0x9100400c // add x12, x0, #16
486 | WORD $0x5280400d // mov w13, #512 ; x13 = upper word bound of the current chunk before clamping
487 | WORD $0x5280020e // mov w14, #16
488 | WORD $0x14000009 // b .LBB6_4
489 |
490 | LBB6_3: // advance to the next 512-word chunk
491 | WORD $0x910801ad // add x13, x13, #512
492 | WORD $0x9100054a // add x10, x10, #1
493 | WORD $0xd1080129 // sub x9, x9, #512
494 | WORD $0x914005ce // add x14, x14, #1, lsl #12
495 | WORD $0x9140058c // add x12, x12, #1, lsl #12
496 | WORD $0xaa1003ef // mov x15, x16
497 | WORD $0xeb08021f // cmp x16, x8
498 | WORD $0x540007c2 // b.hs .LBB6_14 ; every word processed: return
499 |
500 | LBB6_4: // chunk start
501 | WORD $0xeb0801bf // cmp x13, x8
502 | WORD $0x910801f0 // add x16, x15, #512 ; x16 = first word index of the following chunk
503 | WORD $0x9a8831b1 // csel x17, x13, x8, lo ; x17 = min(x13, word count) = end of this chunk
504 | WORD $0xeb08021f // cmp x16, x8
505 | WORD $0x9a883212 // csel x18, x16, x8, lo
506 | WORD $0xeb1201ff // cmp x15, x18
507 | WORD $0x54fffe42 // b.hs .LBB6_3
508 | WORD $0xcb0a2622 // sub x2, x17, x10, lsl #9 ; x2 = words in this chunk
509 | WORD $0xd374cd43 // lsl x3, x10, #12 ; x3 = byte offset of this chunk
510 | WORD $0xd37df046 // lsl x6, x2, #3
511 | WORD $0x8b030004 // add x4, x0, x3 ; x4 = start of the destination chunk
512 | WORD $0xd10020d4 // sub x20, x6, #8
513 | WORD $0x8b090225 // add x5, x17, x9
514 | WORD $0x8b140086 // add x6, x4, x20
515 | WORD $0x927ef447 // and x7, x2, #0xfffffffffffffffc ; x7 = chunk words rounded down to a multiple of 4
516 | WORD $0xaa1f03f2 // mov x18, xzr ; x18 = source index
517 | WORD $0x927ef4a5 // and x5, x5, #0xfffffffffffffffc
518 | WORD $0x910020c6 // add x6, x6, #8 ; x6 = end of the destination chunk
519 | WORD $0x8b0701f3 // add x19, x15, x7
520 | WORD $0x8b140074 // add x20, x3, x20
521 | WORD $0x14000004 // b .LBB6_7
522 |
523 | LBB6_6: // next source for the same chunk
524 | WORD $0x91000652 // add x18, x18, #1
525 | WORD $0xeb0b025f // cmp x18, x11
526 | WORD $0x54fffc20 // b.eq .LBB6_3 ; all sources applied to this chunk
527 |
528 | LBB6_7: // choose the vector or scalar path for source x18
529 | WORD $0xf8727835 // ldr x21, [x1, x18, lsl #3] ; x21 = pointer to source x18
530 | WORD $0xaa0f03f7 // mov x23, x15
531 | WORD $0xf100105f // cmp x2, #4
532 | WORD $0x540002e3 // b.lo .LBB6_12 ; chunk shorter than 4 words: scalar
533 | WORD $0x8b1402b6 // add x22, x21, x20
534 | WORD $0x8b0302b7 // add x23, x21, x3
535 | WORD $0x910022d6 // add x22, x22, #8
536 | WORD $0xeb16009f // cmp x4, x22
537 | WORD $0xfa4632e2 // ccmp x23, x6, #2, lo
538 | WORD $0xaa0f03f7 // mov x23, x15
539 | WORD $0x54000203 // b.lo .LBB6_12 ; destination and source chunks overlap: scalar
540 | WORD $0x8b0e02b6 // add x22, x21, x14
541 | WORD $0xaa0c03f7 // mov x23, x12
542 | WORD $0xaa0503f8 // mov x24, x5
543 |
544 | LBB6_10: // vector loop: 4 words (two 128-bit registers) per iteration
545 | WORD $0xad7f86c0 // ldp q0, q1, [x22, #-16]
546 | WORD $0xf1001318 // subs x24, x24, #4
547 | WORD $0x910082d6 // add x22, x22, #32
548 | WORD $0xad7f8ee2 // ldp q2, q3, [x23, #-16]
549 | WORD $0x4ea01c40 // orr v0.16b, v2.16b, v0.16b
550 | WORD $0x4ea11c61 // orr v1.16b, v3.16b, v1.16b
551 | WORD $0xad3f86e0 // stp q0, q1, [x23, #-16]
552 | WORD $0x910082f7 // add x23, x23, #32
553 | WORD $0x54ffff01 // b.ne .LBB6_10
554 | WORD $0xaa1303f7 // mov x23, x19
555 | WORD $0xeb07005f // cmp x2, x7
556 | WORD $0x54fffc80 // b.eq .LBB6_6 ; chunk length is a multiple of 4: no tail
557 |
558 | LBB6_12: // scalar setup from word index x23
559 | WORD $0xd37df2f8 // lsl x24, x23, #3
560 | WORD $0xcb170236 // sub x22, x17, x23 ; x22 = words remaining in this chunk
561 | WORD $0x8b180017 // add x23, x0, x24
562 | WORD $0x8b1802b5 // add x21, x21, x24
563 |
564 | LBB6_13: // scalar loop: one word per iteration
565 | WORD $0xf84086b8 // ldr x24, [x21], #8
566 | WORD $0xf94002f9 // ldr x25, [x23]
567 | WORD $0xf10006d6 // subs x22, x22, #1
568 | WORD $0xaa180338 // orr x24, x25, x24 ; destination word OR source word
569 | WORD $0xf80086f8 // str x24, [x23], #8
570 | WORD $0x54ffff61 // b.ne .LBB6_13
571 | WORD $0x17ffffd9 // b .LBB6_6
572 |
573 | LBB6_14: // restore the saved registers and return
574 | WORD $0xa9444ff4 // ldp x20, x19, [sp, #64]
575 | WORD $0xa94357f6 // ldp x22, x21, [sp, #48]
576 | WORD $0xa9425ff8 // ldp x24, x23, [sp, #32]
577 | WORD $0xf9400bf9 // ldr x25, [sp, #16]
578 | WORD $0xa8c57bfd // ldp x29, x30, [sp], #80
579 | WORD $0xd65f03c0 // ret
580 |
581 | TEXT ·_xor_many(SB), $0-32 // _xor_many(a, b, dims): XORs into a every source whose pointer is stored at b, working in 512-word chunks; dims = word count (low 32 bits) | source count (high 32 bits)
582 | MOVD a+0(FP), R0 // x0 = a: destination words
583 | MOVD b+8(FP), R1 // x1 = b: array of source pointers
584 | MOVD dims+16(FP), R2 // x2 = dims: packed sizes
585 | WORD $0xa9bb7bfd // stp x29, x30, [sp, #-80]!
586 | WORD $0xf2407c48 // ands x8, x2, #0xffffffff ; x8 = word count (low 32 bits of dims)
587 | WORD $0xf9000bf9 // str x25, [sp, #16]
588 | WORD $0xa9025ff8 // stp x24, x23, [sp, #32]
589 | WORD $0x910003fd // mov x29, sp
590 | WORD $0xa90357f6 // stp x22, x21, [sp, #48]
591 | WORD $0xa9044ff4 // stp x20, x19, [sp, #64]
592 | WORD $0x54000a20 // b.eq .LBB7_14 ; no words: return
593 | WORD $0xd360fc4b // lsr x11, x2, #32 ; x11 = source count (high 32 bits of dims)
594 | WORD $0xb40009eb // cbz x11, .LBB7_14 ; no sources: return
595 | WORD $0xf100057f // cmp x11, #1
596 | WORD $0xaa1f03e9 // mov x9, xzr
597 | WORD $0xaa1f03ea // mov x10, xzr ; x10 = chunk number
598 | WORD $0xaa1f03ef // mov x15, xzr ; x15 = first word index of the current chunk
599 | WORD $0x9a9f856b // csinc x11, x11, xzr, hi ; x11 = max(x11, 1)
600 | WORD $0x9100400c // add x12, x0, #16
601 | WORD $0x5280400d // mov w13, #512 ; x13 = upper word bound of the current chunk before clamping
602 | WORD $0x5280020e // mov w14, #16
603 | WORD $0x14000009 // b .LBB7_4
604 |
605 | LBB7_3: // advance to the next 512-word chunk
606 | WORD $0x910801ad // add x13, x13, #512
607 | WORD $0x9100054a // add x10, x10, #1
608 | WORD $0xd1080129 // sub x9, x9, #512
609 | WORD $0x914005ce // add x14, x14, #1, lsl #12
610 | WORD $0x9140058c // add x12, x12, #1, lsl #12
611 | WORD $0xaa1003ef // mov x15, x16
612 | WORD $0xeb08021f // cmp x16, x8
613 | WORD $0x540007c2 // b.hs .LBB7_14 ; every word processed: return
614 |
615 | LBB7_4: // chunk start
616 | WORD $0xeb0801bf // cmp x13, x8
617 | WORD $0x910801f0 // add x16, x15, #512 ; x16 = first word index of the following chunk
618 | WORD $0x9a8831b1 // csel x17, x13, x8, lo ; x17 = min(x13, word count) = end of this chunk
619 | WORD $0xeb08021f // cmp x16, x8
620 | WORD $0x9a883212 // csel x18, x16, x8, lo
621 | WORD $0xeb1201ff // cmp x15, x18
622 | WORD $0x54fffe42 // b.hs .LBB7_3
623 | WORD $0xcb0a2622 // sub x2, x17, x10, lsl #9 ; x2 = words in this chunk
624 | WORD $0xd374cd43 // lsl x3, x10, #12 ; x3 = byte offset of this chunk
625 | WORD $0xd37df046 // lsl x6, x2, #3
626 | WORD $0x8b030004 // add x4, x0, x3 ; x4 = start of the destination chunk
627 | WORD $0xd10020d4 // sub x20, x6, #8
628 | WORD $0x8b090225 // add x5, x17, x9
629 | WORD $0x8b140086 // add x6, x4, x20
630 | WORD $0x927ef447 // and x7, x2, #0xfffffffffffffffc ; x7 = chunk words rounded down to a multiple of 4
631 | WORD $0xaa1f03f2 // mov x18, xzr ; x18 = source index
632 | WORD $0x927ef4a5 // and x5, x5, #0xfffffffffffffffc
633 | WORD $0x910020c6 // add x6, x6, #8 ; x6 = end of the destination chunk
634 | WORD $0x8b0701f3 // add x19, x15, x7
635 | WORD $0x8b140074 // add x20, x3, x20
636 | WORD $0x14000004 // b .LBB7_7
637 |
638 | LBB7_6: // next source for the same chunk
639 | WORD $0x91000652 // add x18, x18, #1
640 | WORD $0xeb0b025f // cmp x18, x11
641 | WORD $0x54fffc20 // b.eq .LBB7_3 ; all sources applied to this chunk
642 |
643 | LBB7_7: // choose the vector or scalar path for source x18
644 | WORD $0xf8727835 // ldr x21, [x1, x18, lsl #3] ; x21 = pointer to source x18
645 | WORD $0xaa0f03f7 // mov x23, x15
646 | WORD $0xf100105f // cmp x2, #4
647 | WORD $0x540002e3 // b.lo .LBB7_12 ; chunk shorter than 4 words: scalar
648 | WORD $0x8b1402b6 // add x22, x21, x20
649 | WORD $0x8b0302b7 // add x23, x21, x3
650 | WORD $0x910022d6 // add x22, x22, #8
651 | WORD $0xeb16009f // cmp x4, x22
652 | WORD $0xfa4632e2 // ccmp x23, x6, #2, lo
653 | WORD $0xaa0f03f7 // mov x23, x15
654 | WORD $0x54000203 // b.lo .LBB7_12 ; destination and source chunks overlap: scalar
655 | WORD $0x8b0e02b6 // add x22, x21, x14
656 | WORD $0xaa0c03f7 // mov x23, x12
657 | WORD $0xaa0503f8 // mov x24, x5
658 |
659 | LBB7_10: // vector loop: 4 words (two 128-bit registers) per iteration
660 | WORD $0xad7f86c0 // ldp q0, q1, [x22, #-16]
661 | WORD $0xf1001318 // subs x24, x24, #4
662 | WORD $0x910082d6 // add x22, x22, #32
663 | WORD $0xad7f8ee2 // ldp q2, q3, [x23, #-16]
664 | WORD $0x6e201c40 // eor v0.16b, v2.16b, v0.16b
665 | WORD $0x6e211c61 // eor v1.16b, v3.16b, v1.16b
666 | WORD $0xad3f86e0 // stp q0, q1, [x23, #-16]
667 | WORD $0x910082f7 // add x23, x23, #32
668 | WORD $0x54ffff01 // b.ne .LBB7_10
669 | WORD $0xaa1303f7 // mov x23, x19
670 | WORD $0xeb07005f // cmp x2, x7
671 | WORD $0x54fffc80 // b.eq .LBB7_6 ; chunk length is a multiple of 4: no tail
672 |
673 | LBB7_12: // scalar setup from word index x23
674 | WORD $0xd37df2f8 // lsl x24, x23, #3
675 | WORD $0xcb170236 // sub x22, x17, x23 ; x22 = words remaining in this chunk
676 | WORD $0x8b180017 // add x23, x0, x24
677 | WORD $0x8b1802b5 // add x21, x21, x24
678 |
679 | LBB7_13: // scalar loop: one word per iteration
680 | WORD $0xf84086b8 // ldr x24, [x21], #8
681 | WORD $0xf94002f9 // ldr x25, [x23]
682 | WORD $0xf10006d6 // subs x22, x22, #1
683 | WORD $0xca180338 // eor x24, x25, x24 ; destination word XOR source word
684 | WORD $0xf80086f8 // str x24, [x23], #8
685 | WORD $0x54ffff61 // b.ne .LBB7_13
686 | WORD $0x17ffffd9 // b .LBB7_6
687 |
688 | LBB7_14: // restore the saved registers and return
689 | WORD $0xa9444ff4 // ldp x20, x19, [sp, #64]
690 | WORD $0xa94357f6 // ldp x22, x21, [sp, #48]
691 | WORD $0xa9425ff8 // ldp x24, x23, [sp, #32]
692 | WORD $0xf9400bf9 // ldr x25, [sp, #16]
693 | WORD $0xa8c57bfd // ldp x29, x30, [sp], #80
694 | WORD $0xd65f03c0 // ret
695 |
696 | TEXT ·_count(SB), $0-32 // _count(a, size, result): stores in *result the number of set bits in the size 64-bit words starting at a
697 | MOVD a+0(FP), R0 // x0 = a: base address of the words to scan
698 | MOVD size+8(FP), R1 // x1 = size: number of 64-bit words
699 | MOVD result+16(FP), R2 // x2 = result: address that receives the 64-bit total
700 | WORD $0xa9bf7bfd // stp x29, x30, [sp, #-16]!
701 | WORD $0x910003fd // mov x29, sp
702 | WORD $0xb40000c1 // cbz x1, .LBB8_3 ; size == 0: total is 0
703 | WORD $0xf100103f // cmp x1, #4
704 | WORD $0x540000c2 // b.hs .LBB8_4 ; at least 4 words: vector path
705 | WORD $0xaa1f03e8 // mov x8, xzr ; x8 = first word index for the scalar loop
706 | WORD $0xaa1f03e9 // mov x9, xzr ; x9 = running total
707 | WORD $0x14000019 // b .LBB8_7
708 |
709 | LBB8_3: // empty input
710 | WORD $0xaa1f03e9 // mov x9, xzr
711 | WORD $0x14000020 // b .LBB8_9
712 |
713 | LBB8_4: // vector setup
714 | WORD $0x927ef428 // and x8, x1, #0xfffffffffffffffc ; x8 = size rounded down to a multiple of 4
715 | WORD $0x91004009 // add x9, x0, #16
716 | WORD $0x6f00e400 // movi v0.2d, #0000000000000000
717 | WORD $0xaa0803ea // mov x10, x8
718 | WORD $0x6f00e401 // movi v1.2d, #0000000000000000
719 |
720 | LBB8_5: // vector loop: per-byte bit counts of 4 words, widened pairwise and accumulated into v0/v1 as 64-bit lanes
721 | WORD $0xad7f8d22 // ldp q2, q3, [x9, #-16]
722 | WORD $0x91008129 // add x9, x9, #32
723 | WORD $0xf100114a // subs x10, x10, #4
724 | WORD $0x4e205842 // cnt v2.16b, v2.16b
725 | WORD $0x4e205863 // cnt v3.16b, v3.16b
726 | WORD $0x6e202842 // uaddlp v2.8h, v2.16b
727 | WORD $0x6e202863 // uaddlp v3.8h, v3.16b
728 | WORD $0x6e602842 // uaddlp v2.4s, v2.8h
729 | WORD $0x6e602863 // uaddlp v3.4s, v3.8h
730 | WORD $0x6ea06840 // uadalp v0.2d, v2.4s
731 | WORD $0x6ea06861 // uadalp v1.2d, v3.4s
732 | WORD $0x54fffea1 // b.ne .LBB8_5
733 | WORD $0x4ee08420 // add v0.2d, v1.2d, v0.2d ; merge the two accumulators
734 | WORD $0xeb01011f // cmp x8, x1
735 | WORD $0x5ef1b800 // addp d0, v0.2d ; add the two 64-bit lanes
736 | WORD $0x9e660009 // fmov x9, d0 ; x9 = total so far
737 | WORD $0x54000140 // b.eq .LBB8_9 ; size is a multiple of 4: done
738 |
739 | LBB8_7: // scalar setup for the words from index x8 onward
740 | WORD $0x8b080c0a // add x10, x0, x8, lsl #3
741 | WORD $0xcb080028 // sub x8, x1, x8 ; x8 = words remaining
742 |
743 | LBB8_8: // scalar loop: one word per iteration
744 | WORD $0xfc408540 // ldr d0, [x10], #8
745 | WORD $0xf1000508 // subs x8, x8, #1
746 | WORD $0x0e205800 // cnt v0.8b, v0.8b
747 | WORD $0x2e303800 // uaddlv h0, v0.8b
748 | WORD $0x1e26000b // fmov w11, s0
749 | WORD $0x8b090169 // add x9, x11, x9
750 | WORD $0x54ffff41 // b.ne .LBB8_8
751 |
752 | LBB8_9: // store the total and return
753 | WORD $0xf9000049 // str x9, [x2]
754 | WORD $0xa8c17bfd // ldp x29, x30, [sp], #16
755 | WORD $0xd65f03c0 // ret
756 |
--------------------------------------------------------------------------------