├── LICENSE
├── README.md
├── examples_test.go
├── go.mod
├── logic.go
├── logic_test.go
├── math.go
├── math_test.go
└── utils.go


/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 Daniel T
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | ![adc63e1f-a7e3-4272-b3d4-8e60f88c9b92](https://github.com/user-attachments/assets/3a6af901-d95e-46eb-9c1c-9d395fea8739)
  2 | 
  3 | # `swar`: Process `[]byte` quicker!
  4 | 
  5 | **Process 8 bytes at a time using an old technique called SIMD Within A Register (SWAR).**
  6 | 
  7 | [![Go Reference](https://pkg.go.dev/badge/github.com/dans-stuff/swar.svg)](https://pkg.go.dev/github.com/dans-stuff/swar) [![Go Report Card](https://goreportcard.com/badge/github.com/dans-stuff/swar)](https://goreportcard.com/report/github.com/dans-stuff/swar) 
  8 | 
  9 | - 🚀 **Up to 6x faster** than optimized byte-by-byte code
 10 | - 🔌 **Zero dependencies** - no CGO or assembly required
 11 | - 🧩 **Dead simple API** - works with your existing code
 12 | - ⚡ **Fully portable** - runs anywhere Go runs
 13 | 
 14 | ```go
 15 | chunks, remainder := swar.BytesToLanes(text)
 16 | for _, chunk := range chunks {
 17 |     // We can work with 8 bytes at a time!
 18 |     matches := swar.HighBitWhereEqual(chunk, spaces)
 19 | }
 20 | ```
 21 | 
 22 | ## Installation
 23 | 
 24 | ```bash
 25 | go get github.com/dans-stuff/swar@latest
 26 | ```
 27 | 
 28 | ## Core Operations
 29 | 
 30 | | Category | Operations | Use Cases |
 31 | |----------|------------|-----------|
 32 | | **Comparison** | Equal, Less, Greater | Pattern matching, thresholds |
 33 | | **Math** | Add, Subtract, Min/Max, Average | Signal processing, stats |
 34 | | **Bit Ops** | Swap nibbles, Reverse bits, Count ones | Encoding, hashing |
 35 | | **Selection** | Branchless conditional select | Transformations |
 36 | 
 37 | ## Real Performance
 38 | 
 39 | | Operation | Standard Go | SWAR | Speedup |
 40 | |-----------|-------------|------|---------|
 41 | | Count character occurrences | 19.30 ns | 7.58 ns | **2.55x** |
 42 | | Find uppercase letters | 31.41 ns | 20.32 ns | **1.55x** |
 43 | | Convert case | 61.96 ns | 31.53 ns | **1.96x** |
 44 | | Detect anomalies | 6.99 ns | 4.17 ns | **1.68x** |
 45 | 
 46 | ## Full Example: Character Counter
 47 | 
 48 | This example counts spaces in a string. For short strings it even outperforms stdlib `bytes.Count`, which is written in assembly! Find more examples in the `examples_test.go` file.
 49 | 
 50 | ```go
 51 | package main
 52 | 
 53 | import (
 54 |     "fmt"
 55 |     "github.com/dans-stuff/swar"
 56 | )
 57 | 
 58 | func main() {
 59 |     text := []byte("Hello, World!")
 60 |     
 61 |     // Process in 8-byte chunks
 62 |     lanes, remainder := swar.BytesToLanes(text)
 63 |     
 64 |     // Find spaces in parallel
 65 |     spaces := swar.Dupe(' ')
 66 |     count := 0
 67 |     
 68 |     for _, lane := range lanes {
 69 |         // Sets high bit in bytes equal to space
 70 |         matches := swar.HighBitWhereEqual(lane, spaces)
 71 |         // Count matches
 72 |         count += bits.OnesCount64(matches >> 7)
 73 |     }
 74 |     
 75 |     // Process any leftover bytes
 76 |     for _, c := range text[remainder:] {
 77 |         if c == ' ' {
 78 |             count++
 79 |         }
 80 |     }
 81 |     
 82 |     fmt.Printf("Found %d spaces\n", count)
 83 | }
 84 | ```
 85 | 
 86 | ## Perfect For
 87 | 
 88 | - **Text Processing**: UTF-8 validation, parser tokenization
 89 | - **Network Protocols**: Header parsing, packet filtering
 90 | - **Image Processing**: Thresholding, pixel transformations
 91 | - **Data Analysis**: Time series anomaly detection
 92 | 
 93 | ## How It Works
 94 | 
 95 | SWAR treats a 64-bit integer as 8 parallel lanes, using clever bit manipulation to perform the same operation on all bytes simultaneously without branching.
 96 | 
 97 | ## License & Contributing
 98 | 
 99 | MIT Licensed. [Contributions](https://github.com/dans-stuff/swar/fork) welcome!
100 | 


--------------------------------------------------------------------------------
/examples_test.go:
--------------------------------------------------------------------------------
  1 | package swar
  2 | 
  3 | import (
  4 | 	"bytes"
  5 | 	"math/bits"
  6 | 	"testing"
  7 | )
  8 | 
  9 | // This file contains examples of how to use SWAR.
 10 | // It also includes reference implementations to show what each operation is doing.
 11 | 
 12 | // Sample text for benchmark tests - contains spaces and mixed case for testing
 13 | var lotsOfBytes = []byte("Allo Zorld! I am NOT yelling, but I am using SWAR!")
 14 | 
// BenchmarkUsageCount counts the spaces in lotsOfBytes in two ways and verifies
// that each sub-benchmark finds exactly 10 spaces per pass over the input.
//
//   - BestNaive compares every byte against ' ' one at a time.
//   - SWAR splits the input with BytesToLanes, marks the matching bytes of each
//     chunk with HighBitWhereEqual, and tallies the marks with bits.OnesCount64.
//     The bytes from index unused onward are counted with a plain loop.
func BenchmarkUsageCount(b *testing.B) {
	b.Run("BestNaive", func(b *testing.B) {
		count := 0
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			for _, c := range lotsOfBytes {
				if c == ' ' {
					count++
				}
			}
		}
		// count accumulates across all b.N passes, hence the multiplication.
		if count != 10*b.N {
			b.Errorf("Expected %d, got %d", 10*b.N, count)
		}
	})

	b.Run("SWAR", func(b *testing.B) {
		// Comparison operand for HighBitWhereEqual, built once outside the timed loop.
		spaces := Dupe(' ')
		count := 0
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			chunks, unused := BytesToLanes(lotsOfBytes)
			for _, chunk := range chunks {
				// One set bit per matching byte, so the popcount is the match count.
				count += bits.OnesCount64(HighBitWhereEqual(chunk, spaces))
			}
			// Tail bytes that were not covered by chunks.
			for _, c := range lotsOfBytes[unused:] {
				if c == ' ' {
					count++
				}
			}
		}
		if count != 10*b.N {
			b.Errorf("Expected %d, got %d", 10*b.N, count)
		}
	})
}
 57 | 
// BenchmarkUsageVisitCaps sums the index of every byte of lotsOfBytes that lies
// in 'A'..'Z' and verifies that the sum is 291 per pass over the input.
//
//   - BestNaive range-checks each byte individually.
//   - SWAR marks bytes that are both greater than 'A'-1 and less than 'Z'+1 in
//     each chunk, then turns the marks into positions via ExtractLowBits and
//     Lookup.OnesPositions. The bytes from index unused onward are handled with
//     a plain loop.
func BenchmarkUsageVisitCaps(b *testing.B) {
	b.Run("BestNaive", func(b *testing.B) {
		sum := 0
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			// The inner i shadows the iteration counter; it is the byte index.
			for i, c := range lotsOfBytes {
				if c >= 'A' && c <= 'Z' {
					sum += i
				}
			}
		}
		// sum accumulates across all b.N passes, so compare the per-pass value.
		if sum/b.N != 291 {
			b.Errorf("Expected 291, got %d", sum/b.N)
		}
	})

	b.Run("SWAR", func(b *testing.B) {
		// Exclusive bounds: a byte is a capital when firstCapital < byte < lastCapital.
		firstCapital, lastCapital := Dupe('A'-1), Dupe('Z'+1)
		sum := 0
		b.ResetTimer()

		for i := 0; i < b.N; i++ {
			chunks, unused := BytesToLanes(lotsOfBytes)

			for idx, chunk := range chunks {
				// High bit set in every byte that satisfies both comparisons.
				caps := HighBitWhereGreater(chunk, firstCapital) & HighBitWhereLess(chunk, lastCapital)
				// Shift each mark from bit 7 to bit 0 of its byte before extracting.
				matches := ExtractLowBits(caps >> 7)
				// NOTE(review): OnesPositions presumably maps the extracted bits to
				// the in-chunk byte offsets of the marks; confirm in utils.go.
				offsets := Lookup.OnesPositions[matches]
				for _, v := range offsets {
					// idx*8 converts the in-chunk offset to an index into lotsOfBytes.
					sum += v + idx*8
				}
			}

			// Tail bytes that were not covered by chunks.
			for i, c := range lotsOfBytes[unused:] {
				if c >= 'A' && c <= 'Z' {
					sum += unused + i
				}
			}
		}

		if sum/b.N != 291 {
			b.Errorf("Expected 291, got %d", sum/b.N)
		}
	})
}
108 | 
// BenchmarkUsageUppercase converts lotsOfBytes to upper case in two ways and
// verifies the result of the final iteration against the expected string.
//
//   - BestNaive calls bytes.ToUpper.
//   - SWAR marks bytes that are greater than 'a'-1 and less than 'z'+1 in each
//     chunk, subtracts 32 from every byte, and uses SelectByLowBit to keep the
//     subtracted value only where the mark is set. The bytes from index unused
//     onward are converted with a plain loop.
func BenchmarkUsageUppercase(b *testing.B) {
	b.Run("BestNaive", func(b *testing.B) {

		out := []byte{}
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			out = bytes.ToUpper(lotsOfBytes)
		}
		if string(out) != "ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!" {
			b.Errorf("Expected 'ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!', got '%s'", string(out))
		}
	})

	b.Run("SWAR", func(b *testing.B) {
		out := make([]byte, len(lotsOfBytes))
		b.ResetTimer()
		for i := 0; i < b.N; i++ {
			// 32 is the distance between a lower-case ASCII letter and its capital.
			inc := Dupe(32)
			// Exclusive bounds: a byte is lower case when firstLower < byte < lastLower.
			firstLower, lastLower := Dupe('a'-1), Dupe('z'+1)

			// A fresh output buffer is allocated on every iteration, inside the timed region.
			out = make([]byte, len(lotsOfBytes))
			// NOTE(review): writes to outLanes are expected to land in out, i.e.
			// BytesToLanes presumably returns a view rather than a copy; confirm in utils.go.
			outLanes, _ := BytesToLanes(out)
			chunks, unused := BytesToLanes(lotsOfBytes)
			for idx, chunk := range chunks {
				lowercases := HighBitWhereGreater(chunk, firstLower) & HighBitWhereLess(chunk, lastLower)
				allUpper := SubtractBytesWithWrapping(chunk, inc)
				// Shift each mark from bit 7 to bit 0 to form the selection mask.
				outLanes[idx] = SelectByLowBit(allUpper, chunk, lowercases>>7)
			}
			// Tail bytes that were not covered by chunks.
			for i, c := range lotsOfBytes[unused:] {
				if c >= 'a' && c <= 'z' {
					out[unused+i] = c - 32
				} else {
					out[unused+i] = c
				}
			}
		}
		if string(out) != "ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!" {
			b.Errorf("Expected 'ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!', got '%s'", string(out))
		}
	})
}
154 | 
155 | // BenchmarkUsageAnomalies demonstrates using SWAR for anomaly detection in time series data.
156 | // This benchmark shows how SWAR enables efficient detection of unusual patterns or outliers
157 | // by processing multiple values simultaneously and using parallel threshold comparison,
158 | // which is critical for real-time monitoring and alerting systems.
159 | func BenchmarkUsageAnomalies(b *testing.B) {
160 | 
161 | 	b.Run("BestNaive", func(b *testing.B) {
162 | 		currentTemps := []byte{0, 0, 0, 0, 0, 0, 0, 0}
163 | 		averageTemps := []byte{0, 0, 0, 0, 0, 0, 0, 0}
164 | 		anomalies := 0
165 | 		threshold := int(2) // going above 2 in one step is an anomaly
166 | 
167 | 		b.ResetTimer()
168 | 		for i := 0; i < b.N; i++ {
169 | 			currentTemps[i%8] += 1 // normal behaviour
170 | 			if i%81 == 0 {
171 | 				currentTemps[i%8] = 0 // simulate an anomaly
172 | 			}
173 | 
174 | 			for j, avg := range averageTemps {
175 | 				averageTemps[j] = byte((int(avg) + int(currentTemps[j])) / 2)
176 | 				delta := int(currentTemps[j]) - int(avg)
177 | 				if delta > threshold || delta < -threshold {
178 | 					averageTemps[j] = currentTemps[j]
179 | 					anomalies++
180 | 				}
181 | 			}
182 | 		}
183 | 		if anomalies != b.N/81 {
184 | 			b.Errorf("Expected %d (%d/81) anomalies, got %d", b.N/81, b.N, anomalies)
185 | 		}
186 | 	})
187 | 
188 | 	b.Run("SWAR", func(b *testing.B) {
189 | 		currentTemps := []byte{0, 0, 0, 0, 0, 0, 0, 0}
190 | 
191 | 		currentLane, _ := BytesToLanes(currentTemps)
192 | 		averageTemps, _ := BytesToLanes([]byte{0, 0, 0, 0, 0, 0, 0, 0})
193 | 		anomalies := 0
194 | 		threshold := Dupe(2) // going above 2 in one step is an anomaly
195 | 
196 | 		b.ResetTimer()
197 | 		for i := 0; i < b.N; i++ {
198 | 			currentTemps[i%8] += 1 // normal behaviour
199 | 			if i%81 == 0 {
200 | 				currentTemps[i%8] = 0 // simulate an anomaly
201 | 			}
202 | 
203 | 			averageTemps[0] = AverageBytes(currentLane[0], averageTemps[0])
204 | 			delta := AbsoluteDifferenceBetweenBytes(currentLane[0], averageTemps[0])
205 | 			overThreshold := HighBitWhereGreater(delta, threshold)
206 | 			if overThreshold != 0 {
207 | 				averageTemps[0] = currentLane[0]
208 | 				anomalies++
209 | 			}
210 | 		}
211 | 		if anomalies != b.N/81 {
212 | 			b.Errorf("Expected %d (%d/81) anomalies, got %d", b.N/81, b.N, anomalies)
213 | 		}
214 | 	})
215 | }
216 | 
217 | // TestToBytes verifies that our endian-aware byte conversion works correctly.
218 | // This test ensures that bytes are extracted in the right order when converting
219 | // from a 64-bit integer to an 8-byte array, which is fundamental for the correctness
220 | // of all other operations that depend on byte manipulation.
221 | func TestToBytes(t *testing.T) {
222 | 	in := uint64(0x000000bfe5bd580c)
223 | 	expected := []byte{0x00, 0x00, 0x00, 0xbf, 0xe5, 0xbd, 0x58, 0x0c}
224 | 	out := toBytes(in)
225 | 	if len(out) != len(expected) {
226 | 		t.Errorf("Expected length %d, got %d", len(expected), len(out))
227 | 	}
228 | 	for i := 0; i < len(expected); i++ {
229 | 		if out[i] != expected[i] {
230 | 			t.Errorf("Expected byte %d to be %x, got %x", i, expected[i], out[i])
231 | 		}
232 | 	}
233 | }
234 | 
// toBytes splits v into its eight bytes in big-endian order: index 0 receives
// bits 63..56 and index 7 receives bits 7..0.
func toBytes(v uint64) [8]byte {
	var out [8]byte
	// Peel bytes off the low end and fill the array back to front.
	for i := len(out) - 1; i >= 0; i-- {
		out[i] = byte(v)
		v >>= 8
	}
	return out
}
244 | 
// fromBytes is the inverse of toBytes: it assembles a uint64 from eight bytes
// given in big-endian order (index 0 becomes the most significant byte).
func fromBytes(b [8]byte) uint64 {
	var v uint64
	for _, x := range b {
		// Make room for the next byte at the low end.
		v = v<<8 | uint64(x)
	}
	return v
}
253 | 
// byteFromLowBits packs the lowest bit of each of the eight bytes into a single
// byte. b[0] supplies bit 7 of the result and b[7] supplies bit 0.
func byteFromLowBits(b [8]byte) uint8 {
	var packed uint8
	for _, x := range b {
		// Earlier bytes are shifted further left, so b[0] ends up in bit 7.
		packed = packed<<1 | x&1
	}
	return packed
}
282 | 
// addWrapBytes adds bb to ba byte by byte. Each sum wraps around modulo 256.
func addWrapBytes(ba, bb [8]byte) [8]byte {
	for i, x := range bb {
		ba[i] += x
	}
	return ba
}
294 | 
// minBytes returns, for each byte position, the smaller of ba and bb.
func minBytes(ba, bb [8]byte) [8]byte {
	for i := range ba {
		if x := bb[i]; x < ba[i] {
			ba[i] = x
		}
	}
	return ba
}
322 | 
// maxBytes returns, for each byte position, the larger of ba and bb.
func maxBytes(ba, bb [8]byte) [8]byte {
	for i := range ba {
		if x := bb[i]; x > ba[i] {
			ba[i] = x
		}
	}
	return ba
}
350 | 
// averageBytes returns, for each byte position, the mean of ba and bb rounded
// down. The sum is formed in a wider type so that it cannot overflow.
func averageBytes(ba, bb [8]byte) [8]byte {
	for i := range ba {
		sum := uint16(ba[i]) + uint16(bb[i])
		ba[i] = byte(sum >> 1)
	}
	return ba
}
362 | 
// swapNibbles exchanges the upper and lower four bits of every byte.
func swapNibbles(b [8]byte) [8]byte {
	for i, x := range b {
		// Shifting a byte discards the bits that leave it, so no masking is needed.
		b[i] = x<<4 | x>>4
	}
	return b
}
374 | 
// reverseBits reverses the bit order inside every byte. The bytes themselves
// keep their positions.
func reverseBits(b [8]byte) [8]byte {
	for i, x := range b {
		b[i] = bits.Reverse8(x)
	}
	return b
}
386 | 
// popcountPerByte replaces every byte with the number of one bits it contains
// (a value from 0 to 8).
func popcountPerByte(b [8]byte) [8]byte {
	for i, x := range b {
		b[i] = byte(bits.OnesCount8(x))
	}
	return b
}
398 | 
// addSatBytes adds bb to ba byte by byte, clamping each sum at 0xFF instead of
// letting it wrap around.
func addSatBytes(ba, bb [8]byte) [8]byte {
	for i := range ba {
		sum := ba[i] + bb[i]
		// A wrapped sum is smaller than either operand; that signals overflow.
		if sum < ba[i] {
			sum = 0xFF
		}
		ba[i] = sum
	}
	return ba
}
442 | 
// selectByLowBits builds a result byte by byte: where the mask byte in ma is
// non-zero the byte comes from ba, otherwise it comes from bb.
func selectByLowBits(ba, bb, ma [8]byte) [8]byte {
	// Start from bb and overwrite only the selected positions.
	out := bb
	for i, m := range ma {
		if m != 0 {
			out[i] = ba[i]
		}
	}
	return out
}
487 | 
// subBytesWrap subtracts bb from aa byte by byte. Each difference wraps around
// modulo 256.
func subBytesWrap(aa, bb [8]byte) [8]byte {
	for i, x := range bb {
		aa[i] -= x
	}
	return aa
}
499 | 
// subBytesSat subtracts bb from aa byte by byte, clamping each difference at 0
// instead of letting it wrap around.
func subBytesSat(aa, bb [8]byte) [8]byte {
	for i := range aa {
		if aa[i] > bb[i] {
			aa[i] -= bb[i]
		} else {
			// Covers both aa < bb (clamped) and aa == bb (difference is 0 anyway).
			aa[i] = 0
		}
	}
	return aa
}
543 | 
// addBytesSat adds bb to aa byte by byte, clamping each sum at 0xFF. The sum is
// formed in a wider type and clamped before it is narrowed back to a byte.
func addBytesSat(aa, bb [8]byte) [8]byte {
	for i := range aa {
		sum := int(aa[i]) + int(bb[i])
		if sum > 0xFF {
			sum = 0xFF
		}
		aa[i] = byte(sum)
	}
	return aa
}
595 | 
// absDiffBytes returns, for each byte position, the absolute difference between
// aa and bb.
func absDiffBytes(aa, bb [8]byte) [8]byte {
	for i := range aa {
		hi, lo := aa[i], bb[i]
		// Order the pair so the subtraction can never go below zero.
		if hi < lo {
			hi, lo = lo, hi
		}
		aa[i] = hi - lo
	}
	return aa
}
639 | 
// highBitWhereLess returns 0x80 in every byte position where b is strictly less
// than c, and 0 in every other position.
func highBitWhereLess(b [8]byte, c [8]byte) [8]byte {
	var out [8]byte
	for i := range b {
		if b[i] < c[i] {
			out[i] = 0x80
		}
	}
	return out
}
683 | 
// highBitWhereGreater returns 0x80 in every byte position where b is strictly
// greater than c, and 0 in every other position.
func highBitWhereGreater(b [8]byte, c [8]byte) [8]byte {
	var out [8]byte
	for i := range b {
		if b[i] > c[i] {
			out[i] = 0x80
		}
	}
	return out
}
727 | 
// highBitWhereEqual returns 0x80 in every byte position where b equals c, and 0
// in every other position.
func highBitWhereEqual(b [8]byte, c [8]byte) [8]byte {
	var out [8]byte
	for i := range b {
		if b[i] == c[i] {
			out[i] = 0x80
		}
	}
	return out
}
771 | 
772 | func TestSWARFunctionsRef(t *testing.T) {
773 | 	for n := uint64(0); n < 0x_FF_FF_FF_FF_FF; n = (n*12 + 13) / 11 {
774 | 		nA := toBytes(n)
775 | 		if a, b := SwapByteHalves(n), swapNibbles(nA); a != fromBytes(b) {
776 | 			t.Errorf("SwapByteHalves(0x%016x) = 0x%016x; want 0x%016x", n, a, fromBytes(b))
777 | 		}
778 | 		if a, b := ReverseEachByte(n), reverseBits(nA); a != fromBytes(b) {
779 | 			t.Errorf("ReverseEachByte(0x%016x) = 0x%016x; want 0x%016x", n, a, fromBytes(b))
780 | 		}
781 | 		if a, b := CountOnesPerByte(n), popcountPerByte(nA); a != fromBytes(b) {
782 | 			t.Errorf("CountOnesPerByte(0x%016x) = 0x%016x; want 0x%016x", n, a, fromBytes(b))
783 | 		}
784 | 		if a, b := ExtractLowBits(n&LowBits), byteFromLowBits(toBytes(n&LowBits)); a != b {
785 | 			t.Errorf("ExtractLowBits(0x%016x) = 0b%08b; want 0b%08b", n, a, b)
786 | 		}
787 | 
788 | 		m := n ^ 0x0000005351952b76
789 | 		mA := toBytes(m)
790 | 		if a, b := SelectSmallerBytes(n, m), minBytes(nA, mA); a != fromBytes(b) {
791 | 			t.Errorf("SelectSmallerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
792 | 		}
793 | 		if a, b := SelectLargerBytes(n, m), maxBytes(nA, mA); a != fromBytes(b) {
794 | 			t.Errorf("SelectLargerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
795 | 		}
796 | 		if a, b := AverageBytes(n, m), averageBytes(nA, mA); a != fromBytes(b) {
797 | 			t.Errorf("AverageBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
798 | 		}
799 | 		if a, b := AbsoluteDifferenceBetweenBytes(n, m), absDiffBytes(nA, mA); a != fromBytes(b) {
800 | 			t.Errorf("AbsoluteDifferenceBetweenBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
801 | 		}
802 | 		if a, b := AddBytesWithWrapping(n, m), addWrapBytes(nA, mA); a != fromBytes(b) {
803 | 			t.Errorf("AddWrapBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
804 | 		}
805 | 		if a, b := SubtractBytesWithWrapping(n, m), subBytesWrap(nA, mA); a != fromBytes(b) {
806 | 			t.Errorf("SubtractBytesWithWrapping(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
807 | 		}
808 | 		if a, b := AddBytesWithMaximum(n, m), addSatBytes(nA, mA); a != fromBytes(b) {
809 | 			t.Errorf("AddBytesWithMaximum(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
810 | 		}
811 | 		if a, b := SubtractBytesWithMinimum(n, m), subBytesSat(nA, mA); a != fromBytes(b) {
812 | 			t.Errorf("SubtractBytesWithMinimum(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b))
813 | 		}
814 | 
815 | 		c := Dupe(byte(m % 0x_FE))
816 | 		if a, b := HighBitWhereLess(n, c), highBitWhereLess(nA, toBytes(c)); a != fromBytes(b) {
817 | 			t.Errorf("HighBitWhereLess(0x%016x, %2x) = 0x%016x; want 0x%016x", n, c, a, fromBytes(b))
818 | 		}
819 | 		if a, b := HighBitWhereGreater(n, c), highBitWhereGreater(nA, toBytes(c)); a != fromBytes(b) {
820 | 			t.Errorf("HighBitWhereGreater(0x%016x, %2x) = 0x%016x; want 0x%016x", n, c, a, fromBytes(b))
821 | 		}
822 | 		if a, b := HighBitWhereEqual(n, c), highBitWhereEqual(nA, toBytes(c)); a != fromBytes(b) {
823 | 			t.Errorf("HighBitWhereEqual(0x%016x, %2x) = 0x%016x; want 0x%016x", n, c, a, fromBytes(b))
824 | 		}
825 | 
826 | 		d := uint64(0x_01_00_01_01_00_00_01_00)
827 | 		dA := toBytes(d)
828 | 		if a, b := SelectByLowBit(n, m, d), selectByLowBits(nA, mA, dA); a != fromBytes(b) {
829 | 			t.Errorf("SelectByLowBit(0x%016x, 0x%016x, 0x%016x) = 0x%016x; want 0x%016x (%v)", n, m, d, a, fromBytes(b), dA)
830 | 		}
831 | 
832 | 		// t.Logf("Tested with n=0x%016x, m=0x%016x, c=%02x, d=0x%016x", n, m, c, d)
833 | 		// t.Logf("As arrays: n=%v, m=%v, d=%v", nA, mA, dA)
834 | 	}
835 | }
836 | 


--------------------------------------------------------------------------------
/go.mod:
--------------------------------------------------------------------------------
1 | module github.com/dans-stuff/swar
2 | 
3 | go 1.24.2
4 | 


--------------------------------------------------------------------------------
/logic.go:
--------------------------------------------------------------------------------
 1 | package swar
 2 | 
 3 | const (
 4 | 	// HighBits is a mask with the high bit set in all 8 bytes of a uint64
 5 | 	HighBits uint64 = 0x8080_8080_8080_8080
 6 | )
 7 | 
 8 | // HighBitWhereLess sets the high bit (0x80) in each byte where v < cm
 9 | // Enables parallel comparison of 8 bytes simultaneously
10 | func HighBitWhereLess(v, cm uint64) uint64 {
11 | 	d := (v | HighBits) - (cm &^ HighBits)
12 | 	sel := ((v & (v ^ cm)) | (d &^ (v ^ cm))) & HighBits
13 | 	hbit := sel ^ HighBits // 0x80 in each byte where v < cm
14 | 	return hbit & HighBits // 0x80 or 0x00 per lane
15 | }
16 | 
17 | // HighBitWhereGreater sets the high bit (0x80) in each byte where v > cm
18 | // Perfect for threshold detection across multiple values
19 | func HighBitWhereGreater(v, cm uint64) uint64 {
20 | 	d := (cm | HighBits) - (v &^ HighBits)
21 | 	sel := ((cm & (cm ^ v)) | (d &^ (cm ^ v))) & HighBits
22 | 	hbit := sel ^ HighBits // 0x80 in each byte where v > cm
23 | 	return hbit & HighBits // 0x80 or 0x00 per lane
24 | }
25 | 
26 | // HighBitWhereEqual sets the high bit (0x80) in each byte where v == cm
27 | // Ideal for pattern matching and finding specific values in data
28 | func HighBitWhereEqual(v, cm uint64) uint64 {
29 | 	x := v ^ cm
30 | 	y := ((x & 0x7F7F7F7F7F7F7F7F) + 0x7F7F7F7F7F7F7F7F) | x
31 | 	hi := ^y & HighBits  // 0x80 where x==0 (v==cm)
32 | 	return hi & HighBits // mask off other bits
33 | }
34 | 


--------------------------------------------------------------------------------
/logic_test.go:
--------------------------------------------------------------------------------
  1 | package swar
  2 | 
  3 | import (
  4 | 	"testing"
  5 | )
  6 | 
  7 | // TestHighBitWhereEqual verifies that the HighBitWhereEqual function correctly
  8 | // identifies bytes that match a comparison value. These tests are important because
  9 | // the SWAR technique uses non-intuitive bit manipulation that needs proper verification.
 10 | func TestHighBitWhereEqual(t *testing.T) {
 11 | 	run := func(v, c, want uint64) {
 12 | 		if got := HighBitWhereEqual(v, c); got != want {
 13 | 			t.Errorf("HighBitWhereEqual(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", v, c, got, want)
 14 | 		}
 15 | 	}
 16 | 
 17 | 	run(0x05, Dupe(5), 0x80)
 18 | 	run(0x04, Dupe(5), 0x00)
 19 | 	run(0x05_04, Dupe(5), 0x80_00)
 20 | 	run(0xFF_00, Dupe(0), 0x80_80_80_80_80_80_00_80)
 21 | }
 22 | 
 23 | // TestHighBitWhereLess verifies that the HighBitWhereLess function correctly identifies
 24 | // bytes less than a comparison value. This is crucial for threshold-based processing
 25 | // and range checks operating on multiple bytes in parallel.
 26 | func TestHighBitWhereLess(t *testing.T) {
 27 | 	run := func(v, c, want uint64) {
 28 | 		if got := HighBitWhereLess(v, c); got != want {
 29 | 			t.Errorf("HighBitWhereLess(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", v, c, got, want)
 30 | 		}
 31 | 	}
 32 | 
 33 | 	run(0x06, Dupe(5), 0x80_80_80_80_80_80_80_00)
 34 | 	run(0x04, Dupe(5), 0x80_80_80_80_80_80_80_80)
 35 | 	run(0x01_02_03_04_05_06_07_08, Dupe(5), 0x80_80_80_80_00_00_00_00)
 36 | }
 37 | 
 38 | // TestHighBitWhereGreater verifies that the HighBitWhereGreater function correctly
 39 | // identifies bytes greater than a comparison value. This functionality is essential for
 40 | // detecting outliers, anomalies, and values exceeding specified thresholds.
 41 | func TestHighBitWhereGreater(t *testing.T) {
 42 | 	run := func(v, c, want uint64) {
 43 | 		if got := HighBitWhereGreater(v, c); got != want {
 44 | 			t.Errorf("HighBitWhereGreater(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", v, c, got, want)
 45 | 		}
 46 | 	}
 47 | 
 48 | 	run(0x05, Dupe(5), 0x00)
 49 | 	run(0x06, Dupe(5), 0x80)
 50 | 	run(0xFF_04_05_06_00, Dupe(5), 0x80_00_00_80_00)
 51 | }
 52 | 
 53 | // TestSelectByLowBit verifies that values are correctly selected from a or b based on
 54 | // the corresponding mask bit. This branchless selection is critical for data-dependent
 55 | // operations where conditional logic would otherwise harm performance.
 56 | func TestSelectByLowBit(t *testing.T) {
 57 | 	run := func(a, b, mask, want uint64) {
 58 | 		if got := SelectByLowBit(a, b, mask); got != want {
 59 | 			t.Errorf("SelectByLowBit(0x%016x, 0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, mask, got, want)
 60 | 		}
 61 | 	}
 62 | 
 63 | 	run(0x11_11_11_11, 0x22_22_22_22, 0x01_00_01_00, 0x11_22_11_22)
 64 | }
 65 | 
 66 | // TestMinBytes verifies that our parallel minimum function correctly selects the smaller
 67 | // of two values for each byte position. This is essential for applications like image processing
 68 | // where per-pixel minimum operations affect visual outcomes.
 69 | func TestMinBytes(t *testing.T) {
 70 | 	run := func(a, b, want uint64) {
 71 | 		if got := SelectSmallerBytes(a, b); got != want {
 72 | 			t.Errorf("SelectSmallerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want)
 73 | 		}
 74 | 	}
 75 | 
 76 | 	run(0x01_02_03_04_05_06_07_08, 0x05_04_03_02_01_00_09_0A, 0x01_02_03_02_01_00_07_08)
 77 | 	run(0x0000_0000_0000_0004, 0x1234_5678_90AB_CDEB, 0x0000_0000_0000_0004)
 78 | }
 79 | 
 80 | // TestMaxBytes verifies that our parallel maximum function correctly selects the larger
 81 | // of two values for each byte position. This is critical for algorithms like feature extraction
 82 | // and signal peak detection where maintaining maximum values is required.
 83 | func TestMaxBytes(t *testing.T) {
 84 | 	run := func(a, b, want uint64) {
 85 | 		if got := SelectLargerBytes(a, b); got != want {
 86 | 			t.Errorf("SelectLargerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want)
 87 | 		}
 88 | 	}
 89 | 
 90 | 	run(0x01_02_03_04_05_06_07_08, 0x05_04_03_02_01_00_09_0A, 0x05_04_03_04_05_06_09_0A)
 91 | 	run(0x04, 0xEB, 0xEB)
 92 | 	run(0x01, 0x02, 0x02)
 93 | }
 94 | 
 95 | // TestSwapNibbles verifies that our nibble-swapping function correctly exchanges the high
 96 | // and low 4 bits of each byte. This transformation is important for BCD encoding/decoding
 97 | // and certain data format conversions that rely on nibble-level manipulations.
 98 | func TestSwapNibbles(t *testing.T) {
 99 | 	run := func(s, want uint64) {
100 | 		if got := SwapByteHalves(s); got != want {
101 | 			t.Errorf("SwapByteHalves(0x%016x) = 0x%016x; want 0x%016x", s, got, want)
102 | 		}
103 | 	}
104 | 
105 | 	run(0xF0_F0_F0_F0_F0_F0_F0_F0, 0x0F_0F_0F_0F_0F_0F_0F_0F)
106 | }
107 | 
108 | // TestReverseBits verifies that our bit-reversal function correctly reverses the order
109 | // of bits within each byte. This is crucial for operations like endianness conversion
110 | // and certain data transformations that depend on bit-level mirroring.
111 | func TestReverseBits(t *testing.T) {
112 | 	run := func(v, want uint64) {
113 | 		if got := ReverseEachByte(v); got != want {
114 | 			t.Errorf("ReverseEachByte(0x%016x) = 0x%016x; want 0x%016x", v, got, want)
115 | 		}
116 | 	}
117 | 
118 | 	run(0x01_02_04_08_10_20_40_80, 0x80_40_20_10_08_04_02_01)
119 | 	run(0b01001000_11100001_11000011_11110000, 0b00010010_10000111_11000011_00001111)
120 | }
121 | 
122 | // TestPopcountPerByte verifies that our parallel population count correctly counts the
123 | // set bits in each byte. This functionality is essential for feature extraction, hamming
124 | // distance calculation, and statistical analysis of binary data.
125 | func TestPopcountPerByte(t *testing.T) {
126 | 	run := func(p, want uint64) {
127 | 		if got := CountOnesPerByte(p); got != want {
128 | 			t.Errorf("CountOnesPerByte(0x%016x) = 0x%016x; want 0x%016x", p, got, want)
129 | 		}
130 | 	}
131 | 
132 | 	run(0x0F_F0_55_AA_00_FF_33_CC, 0x04_04_04_04_00_08_04_04)
133 | }
134 | 


--------------------------------------------------------------------------------
/math.go:
--------------------------------------------------------------------------------
  1 | package swar
  2 | 
const (
	// mEven selects the even-numbered bytes of a uint64 (bytes 0, 2, 4 and 6,
	// counting from the least significant byte).
	mEven uint64 = 0x00FF_00FF_00FF_00FF
	// mOdd selects the odd-numbered bytes of a uint64 (bytes 1, 3, 5 and 7,
	// counting from the least significant byte).
	mOdd uint64 = 0xFF00_FF00_FF00_FF00
	// laneNotHigh keeps the low 7 bits of every byte and clears each byte's
	// high bit (0x80).
	laneNotHigh uint64 = 0x7F7F_7F7F_7F7F_7F7F
)
 11 | 
 12 | // SubtractBytesWithWrapping performs byte-wise subtraction with wrapping
 13 | // Parallel subtraction across all 8 bytes with wrap-around behavior
 14 | func SubtractBytesWithWrapping(a, b uint64) uint64 {
 15 | 	return ((a | HighBits) - (b &^ HighBits)) ^ ((a ^ ^b) & HighBits)
 16 | }
 17 | 
 18 | // SubtractBytesWithMinimum performs byte-wise subtraction clamped at zero
 19 | // Provides saturating subtraction to prevent underflow in all 8 bytes
 20 | func SubtractBytesWithMinimum(a, b uint64) uint64 {
 21 | 	diff := ((a | HighBits) - (b &^ HighBits)) ^ ((a ^ ^b) & HighBits)
 22 | 	bo := ((^a & b) | ((^a | b) & diff)) & HighBits
 23 | 	return diff &^ ((bo >> 7) * 0xFF)
 24 | }
 25 | 
 26 | // AddBytesWithWrapping performs byte-wise addition with wrap-around
 27 | // Parallel addition across all 8 bytes with overflow wrapping to zero
 28 | func AddBytesWithWrapping(a, b uint64) uint64 {
 29 | 	sum := (a & laneNotHigh) + (b & laneNotHigh)
 30 | 	return sum ^ ((a ^ b) & HighBits)
 31 | }
 32 | 
 33 | // AddBytesWithMaximum performs byte-wise addition clamped at 255
 34 | // Saturating addition to prevent overflow in all 8 bytes
 35 | func AddBytesWithMaximum(a, b uint64) uint64 {
 36 | 	preSum := (a & laneNotHigh) + (b & laneNotHigh)
 37 | 	sum := preSum ^ ((a ^ b) & HighBits)
 38 | 	carry := ((a & b) | ((a | b) & ^sum)) & HighBits
 39 | 	return sum | (carry>>7)*0xFF
 40 | }
 41 | 
 42 | // AbsoluteDifferenceBetweenBytes calculates |a-b| for each byte
 43 | // Computes unsigned distances for metrics and signal processing
 44 | func AbsoluteDifferenceBetweenBytes(a, b uint64) uint64 {
 45 | 	d := a - b
 46 | 	borrow := ((^a & b) | ((^a | b) & d)) & HighBits
 47 | 	mask := (borrow >> 7) * 0xFF
 48 | 	n := (a &^ mask) | (b & mask)
 49 | 	m := (a & mask) | (b &^ mask)
 50 | 	return ((n | HighBits) - (m &^ HighBits)) ^ ((n ^ ^m) & HighBits)
 51 | }
 52 | 
 53 | // SelectSmallerBytes returns min(a,b) for each byte
 54 | // Efficient for clipping, filtering, and data preprocessing
 55 | func SelectSmallerBytes(a, b uint64) uint64 {
 56 | 	d := a - b
 57 | 	borrow := ((^a & b) | ((^a | b) & d)) & HighBits
 58 | 	mask := (borrow >> 7) * 0xFF
 59 | 	return (a & mask) | (b &^ mask)
 60 | }
 61 | 
 62 | // SelectLargerBytes returns max(a,b) for each byte
 63 | // Ideal for peak detection, ceiling operations, and filtering
 64 | func SelectLargerBytes(a, b uint64) uint64 {
 65 | 	d := a - b
 66 | 	borrow := ((^a & b) | ((^a | b) & d)) & HighBits
 67 | 	mask := (borrow >> 7) * 0xFF
 68 | 	return (a &^ mask) | (b & mask)
 69 | }
 70 | 
 71 | // AverageBytes calculates (a+b)/2 for each byte without overflow
 72 | // Perfect for signal processing, image manipulation, and smoothing
 73 | func AverageBytes(a, b uint64) uint64 {
 74 | 	common := a & b
 75 | 	diff := (a ^ b) & 0xFEFE_FEFE_FEFE_FEFE
 76 | 	return common + (diff >> 1)
 77 | }
 78 | 
 79 | // SwapByteHalves swaps the high and low nibbles in each byte
 80 | // Useful for BCD encoding/decoding and nibble-level transforms
 81 | func SwapByteHalves(v uint64) uint64 {
 82 | 	lo := v & 0x0F0F_0F0F_0F0F_0F0F
 83 | 	hi := v & 0xF0F0_F0F0_F0F0_F0F0
 84 | 	return (lo << 4) | (hi >> 4)
 85 | }
 86 | 
 87 | // ReverseEachByte reverses the bit order within each byte
 88 | // Useful for endianness conversion and bit-level manipulations
 89 | func ReverseEachByte(v uint64) uint64 {
 90 | 	x := ((v >> 1) & 0x5555555555555555) | ((v & 0x5555555555555555) << 1)
 91 | 	x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2)
 92 | 	x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4)
 93 | 	return x
 94 | }
 95 | 
 96 | // SelectByLowBit selects values from a or b based on mask bits
 97 | // Branchless selection between values based on conditions
 98 | func SelectByLowBit(a, b, mask uint64) uint64 {
 99 | 	byteMask := mask * 0xFF
100 | 	return (a & byteMask) | (b &^ byteMask)
101 | }
102 | 
// CountOnesPerByte returns, in each of the 8 bytes, the number of set bits
// (0 to 8) of the corresponding input byte.
func CountOnesPerByte(v uint64) uint64 {
	const (
		everyOtherBit = 0x5555_5555_5555_5555
		everyOtherTwo = 0x3333_3333_3333_3333
		lowNibbles    = 0x0F0F_0F0F_0F0F_0F0F
	)

	// Sum bits pairwise into 2-bit fields, then 4-bit fields, then bytes.
	pairs := v - (v>>1)&everyOtherBit
	quads := pairs&everyOtherTwo + (pairs>>2)&everyOtherTwo
	return (quads + quads>>4) & lowNibbles
}
110 | 


--------------------------------------------------------------------------------
/math_test.go:
--------------------------------------------------------------------------------
 1 | package swar
 2 | 
 3 | import (
 4 | 	"testing"
 5 | )
 6 | 
 7 | // TestAverageBytes verifies that our parallel averaging algorithm correctly calculates
 8 | // the mean of corresponding bytes. This ensures proper data smoothing and interpolation
 9 | // behavior when processing multiple values simultaneously.
10 | func TestAverageBytes(t *testing.T) {
11 | 	run := func(a, b, want uint64) {
12 | 		if got := AverageBytes(a, b); got != want {
13 | 			t.Errorf("AverageBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want)
14 | 		}
15 | 	}
16 | 
17 | 	run(0x01_10_40_FF, 0xFF_30_80_FD, 0x80_20_60_FE)
18 | 	run(0x04, 0x08, 0x06)
19 | 	run(0x10_DD, 0x30_FF, 0x20_EE)
20 | 	run(0x0004, 0xCDEB, 0x6677)
21 | 	run(0x01FE, 0xCC11, 0x6687)
22 | }
23 | 
24 | // TestAddSatBytes verifies that our saturating addition correctly clamps results to 0xFF
25 | // when overflow occurs. This is crucial for applications like image processing and signal
26 | // manipulation where preventing overflow is necessary for correct results.
27 | func TestAddSatBytes(t *testing.T) {
28 | 	run := func(a, b, want uint64) {
29 | 		if got := AddBytesWithMaximum(a, b); got != want {
30 | 			t.Errorf("AddBytesWithMaximum(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want)
31 | 		}
32 | 	}
33 | 
34 | 	run(0xFF_FE_FD, 0x01_01_01, 0xFF_FF_FE)
35 | 	run(0xFD_FC_FB, 0x03_03_03, 0xFF_FF_FE)
36 | }
37 | 
38 | // TestAddBytesWithWrapping ensures that our wrapping addition correctly handles overflow
39 | // by wrapping around to zero. This behavior is essential for certain algorithms like
40 | // checksums and hash functions where wrap-around arithmetic is expected and required.
41 | func TestAddBytesWithWrapping(t *testing.T) {
42 | 	run := func(a, b, want uint64) {
43 | 		if got := AddBytesWithWrapping(a, b); got != want {
44 | 			t.Errorf("AddBytesWithWrapping(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want)
45 | 		}
46 | 	}
47 | 
48 | 	run(0xFF_FE_FD, 0x01_01_01, 0x00_FF_FE)
49 | 	run(0xFD_FC_FB, 0x03_03_03, 0x00_FF_FE)
50 | 	run(0xF4_F9, 0x0F_01, 0x03_FA)
51 | 	run(0xFF_0F_FF, 0x01_F0_00, 0x00_FF_FF)
52 | }
53 | 


--------------------------------------------------------------------------------
/utils.go:
--------------------------------------------------------------------------------
 1 | package swar
 2 | 
 3 | import "unsafe"
 4 | 
const (
	// LowBits has only the lowest bit (0x01) of every byte set. Multiplying a
	// byte value by it copies that value into all 8 bytes (see Dupe).
	LowBits uint64 = 0x0101_0101_0101_0101
	// packMask is the multiplier ExtractLowBits uses to gather the lowest bit
	// of each byte into the top byte of the product.
	packMask uint64 = 0x0102_0408_1020_4080
)
11 | 
// BytesToLanes reinterprets the leading whole 8-byte groups of b as a
// []uint64 for SWAR processing.
//
// It returns the lanes and the index in b of the first byte not covered by
// them (len(b)/8*8); callers handle b[index:] byte by byte. The lanes share
// memory with b, so writes through either slice are visible through the
// other. Bytes map onto each uint64 in the machine's native byte order.
//
// A nil or empty b yields (nil, 0); a b shorter than 8 bytes yields an empty
// lane slice and 0.
//
// NOTE(review): the data pointer is converted to *uint64 without checking its
// alignment — presumably fine on the targeted platforms, confirm for
// architectures that fault on unaligned 64-bit access.
func BytesToLanes(b []byte) ([]uint64, int) {
	// &b[0] would panic on an empty slice, so bail out before taking it.
	if len(b) == 0 {
		return nil, 0
	}
	countChunks := len(b) / 8
	chunks := unsafe.Slice((*uint64)(unsafe.Pointer(&b[0])), countChunks)
	return chunks, countChunks * 8
}
19 | 
// LanesToBytes reinterprets lanes as a []byte of length len(lanes)*8 without
// copying. The result shares memory with lanes, so writes through either
// slice are visible through the other. Each uint64 is exposed in the
// machine's native byte order.
//
// A nil or empty lanes yields nil.
func LanesToBytes(lanes []uint64) []byte {
	// &lanes[0] would panic on an empty slice, so bail out before taking it.
	if len(lanes) == 0 {
		return nil
	}
	countBytes := len(lanes) * 8
	return unsafe.Slice((*byte)(unsafe.Pointer(&lanes[0])), countBytes)
}
27 | 
28 | // Dupe duplicates a byte across all 8 bytes of a uint64
29 | // Creates comparison values for parallel operations
30 | func Dupe(c byte) uint64 {
31 | 	return uint64(c) * LowBits
32 | }
33 | 
34 | // ExtractLowBits packs the low bit from each byte into a single byte
35 | // Compacts 8 comparison results into a single byte
36 | func ExtractLowBits(v uint64) byte {
37 | 	return byte((v * packMask) >> 56)
38 | }
39 | 
// IntToLanes returns the 8 bytes of i as an array, in the machine's native
// byte order, so individual lanes can be inspected or edited.
func IntToLanes(i uint64) [8]byte {
	// View the uint64's storage as 8 bytes and copy it out.
	view := (*[8]byte)(unsafe.Pointer(&i))
	return *view
}
45 | 
// LanesToInt assembles the 8 bytes of lanes into a uint64, reading them in
// the machine's native byte order.
func LanesToInt(lanes [8]byte) uint64 {
	// View the array's storage as a single uint64 and read it.
	view := (*uint64)(unsafe.Pointer(&lanes))
	return *view
}
51 | 
// Lookup holds tables that are computed once at package initialisation.
//
// OnesPositions[b] lists, in ascending order, the positions (0 = least
// significant) of the bits set in the byte value b. OnesPositions[0] is nil.
var Lookup = struct {
	OnesPositions [256][]int
}{
	OnesPositions: func() [256][]int {
		var table [256][]int
		for value := 0; value < len(table); value++ {
			// Peel bits off the bottom of value, recording each set position.
			for rest, pos := value, 0; rest != 0; rest, pos = rest>>1, pos+1 {
				if rest&1 != 0 {
					table[value] = append(table[value], pos)
				}
			}
		}
		return table
	}(),
}
67 | 


--------------------------------------------------------------------------------