├── LICENSE ├── README.md ├── examples_test.go ├── go.mod ├── logic.go ├── logic_test.go ├── math.go ├── math_test.go └── utils.go /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Daniel T 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![adc63e1f-a7e3-4272-b3d4-8e60f88c9b92](https://github.com/user-attachments/assets/3a6af901-d95e-46eb-9c1c-9d395fea8739) 2 | 3 | # `swar`: Processs `[]byte` quicker! 4 | 5 | **Process 8 bytes at a time using an old technique called Simd Within a Register.** 6 | 7 | [![Go Reference](https://pkg.go.dev/badge/github.com/dans-stuff/swar.svg)](https://pkg.go.dev/github.com/dans-stuff/swar) [![Go Report Card](https://goreportcard.com/badge/github.com/dans-stuff/swar)](https://goreportcard.com/report/github.com/dans-stuff/swar) 8 | 9 | - 🚀 **Up to 6x faster** than optimized byte-by-byte code 10 | - 🔌 **Zero dependencies** - no CGO or assembly required 11 | - 🧩 **Dead simple API** - works with your existing code 12 | - ⚡ **Fully portable** - runs anywhere Go runs 13 | 14 | ```go 15 | chunks, remainder := swar.BytesToLanes(text) 16 | for _, chunk := range chunks { 17 | // We can work with 8 bytes at a time! 18 | matches := swar.HighBitWhereEqual(chunk, spaces) 19 | } 20 | ``` 21 | 22 | ## Installation 23 | 24 | ```bash 25 | go get github.com/dans-stuff/swar@latest 26 | ``` 27 | 28 | ## Core Operations 29 | 30 | | Category | Operations | Use Cases | 31 | |----------|------------|-----------| 32 | | **Comparison** | Equal, Less, Greater | Pattern matching, thresholds | 33 | | **Math** | Add, Subtract, Min/Max, Average | Signal processing, stats | 34 | | **Bit Ops** | Swap nibbles, Reverse bits, Count ones | Encoding, hashing | 35 | | **Selection** | Branchless conditional select | Transformations | 36 | 37 | ## Real Performance 38 | 39 | | Operation | Standard Go | SWAR | Speedup | 40 | |-----------|-------------|------|---------| 41 | | Count character occurrences | 19.30 ns | 7.58 ns | **2.55x** | 42 | | Find uppercase letters | 31.41 ns | 20.32 ns | **1.55x** | 43 | | Convert case | 61.96 ns | 31.53 ns | **1.96x** | 44 | | Detect anomalies | 6.99 ns | 4.17 ns | **1.68x** | 45 | 46 | ## Full Example: Character Counter 47 | 48 | This example counts spaces in a string. For short strings it even outperforms stdlib `bytes.Count`, which is written in assembly! Find more examples in the `examples_test.go` file. 49 | 50 | ```go 51 | package main 52 | 53 | import ( 54 | "fmt" 55 | "github.com/dans-stuff/swar" 56 | ) 57 | 58 | func main() { 59 | text := []byte("Hello, World!") 60 | 61 | // Process in 8-byte chunks 62 | lanes, remainder := swar.BytesToLanes(text) 63 | 64 | // Find spaces in parallel 65 | spaces := swar.Dupe(' ') 66 | count := 0 67 | 68 | for _, lane := range lanes { 69 | // Sets high bit in bytes equal to space 70 | matches := swar.HighBitWhereEqual(lane, spaces) 71 | // Count matches 72 | count += bits.OnesCount64(matches >> 7) 73 | } 74 | 75 | // Process any leftover bytes 76 | for _, c := range text[remainder:] { 77 | if c == ' ' { 78 | count++ 79 | } 80 | } 81 | 82 | fmt.Printf("Found %d spaces\n", count) 83 | } 84 | ``` 85 | 86 | ## Perfect For 87 | 88 | - **Text Processing**: UTF-8 validation, parser tokenization 89 | - **Network Protocols**: Header parsing, packet filtering 90 | - **Image Processing**: Thresholding, pixel transformations 91 | - **Data Analysis**: Time series anomaly detection 92 | 93 | ## How It Works 94 | 95 | SWAR treats a 64-bit integer as 8 parallel lanes, using clever bit manipulation to perform the same operation on all bytes simultaneously without branching. 96 | 97 | ## License & Contributing 98 | 99 | MIT Licensed. [Contributions](https://github.com/dans-stuff/swar/fork) welcome! 100 | -------------------------------------------------------------------------------- /examples_test.go: -------------------------------------------------------------------------------- 1 | package swar 2 | 3 | import ( 4 | "bytes" 5 | "math/bits" 6 | "testing" 7 | ) 8 | 9 | // This file contains examples of how to use SWAR. 10 | // It also includes reference implementations to show what each operation is doing. 11 | 12 | // Sample text for benchmark tests - contains spaces and mixed case for testing 13 | var lotsOfBytes = []byte("Allo Zorld! I am NOT yelling, but I am using SWAR!") 14 | 15 | // BenchmarkUsageCount compares the performance of counting spaces using traditional 16 | // byte-by-byte scanning versus SWAR-based parallel comparison. This benchmark 17 | // demonstrates how SIMD-within-a-register can accelerate simple character counting, 18 | // which is useful in text processing applications. 19 | func BenchmarkUsageCount(b *testing.B) { 20 | b.Run("BestNaive", func(b *testing.B) { 21 | count := 0 22 | b.ResetTimer() 23 | 24 | for i := 0; i < b.N; i++ { 25 | for _, c := range lotsOfBytes { 26 | if c == ' ' { 27 | count++ 28 | } 29 | } 30 | } 31 | if count != 10*b.N { 32 | b.Errorf("Expected %d, got %d", 10*b.N, count) 33 | } 34 | }) 35 | 36 | b.Run("SWAR", func(b *testing.B) { 37 | spaces := Dupe(' ') 38 | count := 0 39 | b.ResetTimer() 40 | 41 | for i := 0; i < b.N; i++ { 42 | chunks, unused := BytesToLanes(lotsOfBytes) 43 | for _, chunk := range chunks { 44 | count += bits.OnesCount64(HighBitWhereEqual(chunk, spaces)) 45 | } 46 | for _, c := range lotsOfBytes[unused:] { 47 | if c == ' ' { 48 | count++ 49 | } 50 | } 51 | } 52 | if count != 10*b.N { 53 | b.Errorf("Expected %d, got %d", 10*b.N, count) 54 | } 55 | }) 56 | } 57 | 58 | // BenchmarkUsageVisitCaps compares traditional and SWAR approaches for finding and 59 | // processing uppercase letters in text. This benchmark demonstrates how SWAR enables 60 | // efficient filtering and position tracking in parallel, which is valuable for 61 | // text analysis and pattern matching applications. 62 | func BenchmarkUsageVisitCaps(b *testing.B) { 63 | b.Run("BestNaive", func(b *testing.B) { 64 | sum := 0 65 | b.ResetTimer() 66 | 67 | for i := 0; i < b.N; i++ { 68 | for i, c := range lotsOfBytes { 69 | if c >= 'A' && c <= 'Z' { 70 | sum += i 71 | } 72 | } 73 | } 74 | if sum/b.N != 291 { 75 | b.Errorf("Expected 291, got %d", sum/b.N) 76 | } 77 | }) 78 | 79 | b.Run("SWAR", func(b *testing.B) { 80 | firstCapital, lastCapital := Dupe('A'-1), Dupe('Z'+1) 81 | sum := 0 82 | b.ResetTimer() 83 | 84 | for i := 0; i < b.N; i++ { 85 | chunks, unused := BytesToLanes(lotsOfBytes) 86 | 87 | for idx, chunk := range chunks { 88 | caps := HighBitWhereGreater(chunk, firstCapital) & HighBitWhereLess(chunk, lastCapital) 89 | matches := ExtractLowBits(caps >> 7) 90 | offsets := Lookup.OnesPositions[matches] 91 | for _, v := range offsets { 92 | sum += v + idx*8 93 | } 94 | } 95 | 96 | for i, c := range lotsOfBytes[unused:] { 97 | if c >= 'A' && c <= 'Z' { 98 | sum += unused + i 99 | } 100 | } 101 | } 102 | 103 | if sum/b.N != 291 { 104 | b.Errorf("Expected 291, got %d", sum/b.N) 105 | } 106 | }) 107 | } 108 | 109 | // BenchmarkUsageUppercase compares standard library and SWAR approaches to converting 110 | // text to uppercase. This benchmark shows how SWAR enables high-performance text 111 | // transformation by applying character-level changes to multiple bytes in parallel, 112 | // which is important for text processing pipelines. 113 | func BenchmarkUsageUppercase(b *testing.B) { 114 | b.Run("BestNaive", func(b *testing.B) { 115 | 116 | out := []byte{} 117 | b.ResetTimer() 118 | for i := 0; i < b.N; i++ { 119 | out = bytes.ToUpper(lotsOfBytes) 120 | } 121 | if string(out) != "ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!" { 122 | b.Errorf("Expected 'ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!', got '%s'", string(out)) 123 | } 124 | }) 125 | 126 | b.Run("SWAR", func(b *testing.B) { 127 | out := make([]byte, len(lotsOfBytes)) 128 | b.ResetTimer() 129 | for i := 0; i < b.N; i++ { 130 | inc := Dupe(32) 131 | firstLower, lastLower := Dupe('a'-1), Dupe('z'+1) 132 | 133 | out = make([]byte, len(lotsOfBytes)) 134 | outLanes, _ := BytesToLanes(out) 135 | chunks, unused := BytesToLanes(lotsOfBytes) 136 | for idx, chunk := range chunks { 137 | lowercases := HighBitWhereGreater(chunk, firstLower) & HighBitWhereLess(chunk, lastLower) 138 | allUpper := SubtractBytesWithWrapping(chunk, inc) 139 | outLanes[idx] = SelectByLowBit(allUpper, chunk, lowercases>>7) 140 | } 141 | for i, c := range lotsOfBytes[unused:] { 142 | if c >= 'a' && c <= 'z' { 143 | out[unused+i] = c - 32 144 | } else { 145 | out[unused+i] = c 146 | } 147 | } 148 | } 149 | if string(out) != "ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!" { 150 | b.Errorf("Expected 'ALLO ZORLD! I AM NOT YELLING, BUT I AM USING SWAR!', got '%s'", string(out)) 151 | } 152 | }) 153 | } 154 | 155 | // BenchmarkUsageAnomalies demonstrates using SWAR for anomaly detection in time series data. 156 | // This benchmark shows how SWAR enables efficient detection of unusual patterns or outliers 157 | // by processing multiple values simultaneously and using parallel threshold comparison, 158 | // which is critical for real-time monitoring and alerting systems. 159 | func BenchmarkUsageAnomalies(b *testing.B) { 160 | 161 | b.Run("BestNaive", func(b *testing.B) { 162 | currentTemps := []byte{0, 0, 0, 0, 0, 0, 0, 0} 163 | averageTemps := []byte{0, 0, 0, 0, 0, 0, 0, 0} 164 | anomalies := 0 165 | threshold := int(2) // going above 2 in one step is an anomaly 166 | 167 | b.ResetTimer() 168 | for i := 0; i < b.N; i++ { 169 | currentTemps[i%8] += 1 // normal behaviour 170 | if i%81 == 0 { 171 | currentTemps[i%8] = 0 // simulate an anomaly 172 | } 173 | 174 | for j, avg := range averageTemps { 175 | averageTemps[j] = byte((int(avg) + int(currentTemps[j])) / 2) 176 | delta := int(currentTemps[j]) - int(avg) 177 | if delta > threshold || delta < -threshold { 178 | averageTemps[j] = currentTemps[j] 179 | anomalies++ 180 | } 181 | } 182 | } 183 | if anomalies != b.N/81 { 184 | b.Errorf("Expected %d (%d/81) anomalies, got %d", b.N/81, b.N, anomalies) 185 | } 186 | }) 187 | 188 | b.Run("SWAR", func(b *testing.B) { 189 | currentTemps := []byte{0, 0, 0, 0, 0, 0, 0, 0} 190 | 191 | currentLane, _ := BytesToLanes(currentTemps) 192 | averageTemps, _ := BytesToLanes([]byte{0, 0, 0, 0, 0, 0, 0, 0}) 193 | anomalies := 0 194 | threshold := Dupe(2) // going above 2 in one step is an anomaly 195 | 196 | b.ResetTimer() 197 | for i := 0; i < b.N; i++ { 198 | currentTemps[i%8] += 1 // normal behaviour 199 | if i%81 == 0 { 200 | currentTemps[i%8] = 0 // simulate an anomaly 201 | } 202 | 203 | averageTemps[0] = AverageBytes(currentLane[0], averageTemps[0]) 204 | delta := AbsoluteDifferenceBetweenBytes(currentLane[0], averageTemps[0]) 205 | overThreshold := HighBitWhereGreater(delta, threshold) 206 | if overThreshold != 0 { 207 | averageTemps[0] = currentLane[0] 208 | anomalies++ 209 | } 210 | } 211 | if anomalies != b.N/81 { 212 | b.Errorf("Expected %d (%d/81) anomalies, got %d", b.N/81, b.N, anomalies) 213 | } 214 | }) 215 | } 216 | 217 | // TestToBytes verifies that our endian-aware byte conversion works correctly. 218 | // This test ensures that bytes are extracted in the right order when converting 219 | // from a 64-bit integer to an 8-byte array, which is fundamental for the correctness 220 | // of all other operations that depend on byte manipulation. 221 | func TestToBytes(t *testing.T) { 222 | in := uint64(0x000000bfe5bd580c) 223 | expected := []byte{0x00, 0x00, 0x00, 0xbf, 0xe5, 0xbd, 0x58, 0x0c} 224 | out := toBytes(in) 225 | if len(out) != len(expected) { 226 | t.Errorf("Expected length %d, got %d", len(expected), len(out)) 227 | } 228 | for i := 0; i < len(expected); i++ { 229 | if out[i] != expected[i] { 230 | t.Errorf("Expected byte %d to be %x, got %x", i, expected[i], out[i]) 231 | } 232 | } 233 | } 234 | 235 | // helper: big-endian lanes 236 | func toBytes(v uint64) [8]byte { 237 | var b [8]byte 238 | for i := 0; i < 8; i++ { 239 | shift := uint((7 - i) * 8) 240 | b[i] = byte((v >> shift) & 0xFF) 241 | } 242 | return b 243 | } 244 | 245 | func fromBytes(b [8]byte) uint64 { 246 | var v uint64 247 | for i := 0; i < 8; i++ { 248 | shift := uint((7 - i) * 8) 249 | v |= uint64(b[i]) << shift 250 | } 251 | return v 252 | } 253 | 254 | func byteFromLowBits(b [8]byte) uint8 { 255 | var out uint8 256 | if b[0]&1 != 0 { 257 | out |= 1 << 7 258 | } 259 | if b[1]&1 != 0 { 260 | out |= 1 << 6 261 | } 262 | if b[2]&1 != 0 { 263 | out |= 1 << 5 264 | } 265 | if b[3]&1 != 0 { 266 | out |= 1 << 4 267 | } 268 | if b[4]&1 != 0 { 269 | out |= 1 << 3 270 | } 271 | if b[5]&1 != 0 { 272 | out |= 1 << 2 273 | } 274 | if b[6]&1 != 0 { 275 | out |= 1 << 1 276 | } 277 | if b[7]&1 != 0 { 278 | out |= 1 << 0 279 | } 280 | return out 281 | } 282 | 283 | func addWrapBytes(ba, bb [8]byte) [8]byte { 284 | ba[0] = ba[0] + bb[0] 285 | ba[1] = ba[1] + bb[1] 286 | ba[2] = ba[2] + bb[2] 287 | ba[3] = ba[3] + bb[3] 288 | ba[4] = ba[4] + bb[4] 289 | ba[5] = ba[5] + bb[5] 290 | ba[6] = ba[6] + bb[6] 291 | ba[7] = ba[7] + bb[7] 292 | return ba 293 | } 294 | 295 | func minBytes(ba, bb [8]byte) [8]byte { 296 | if bb[0] < ba[0] { 297 | ba[0] = bb[0] 298 | } 299 | if bb[1] < ba[1] { 300 | ba[1] = bb[1] 301 | } 302 | if bb[2] < ba[2] { 303 | ba[2] = bb[2] 304 | } 305 | if bb[3] < ba[3] { 306 | ba[3] = bb[3] 307 | } 308 | if bb[4] < ba[4] { 309 | ba[4] = bb[4] 310 | } 311 | if bb[5] < ba[5] { 312 | ba[5] = bb[5] 313 | } 314 | if bb[6] < ba[6] { 315 | ba[6] = bb[6] 316 | } 317 | if bb[7] < ba[7] { 318 | ba[7] = bb[7] 319 | } 320 | return ba 321 | } 322 | 323 | func maxBytes(ba, bb [8]byte) [8]byte { 324 | if bb[0] > ba[0] { 325 | ba[0] = bb[0] 326 | } 327 | if bb[1] > ba[1] { 328 | ba[1] = bb[1] 329 | } 330 | if bb[2] > ba[2] { 331 | ba[2] = bb[2] 332 | } 333 | if bb[3] > ba[3] { 334 | ba[3] = bb[3] 335 | } 336 | if bb[4] > ba[4] { 337 | ba[4] = bb[4] 338 | } 339 | if bb[5] > ba[5] { 340 | ba[5] = bb[5] 341 | } 342 | if bb[6] > ba[6] { 343 | ba[6] = bb[6] 344 | } 345 | if bb[7] > ba[7] { 346 | ba[7] = bb[7] 347 | } 348 | return ba 349 | } 350 | 351 | func averageBytes(ba, bb [8]byte) [8]byte { 352 | ba[0] = byte((int(ba[0]) + int(bb[0])) / 2) 353 | ba[1] = byte((int(ba[1]) + int(bb[1])) / 2) 354 | ba[2] = byte((int(ba[2]) + int(bb[2])) / 2) 355 | ba[3] = byte((int(ba[3]) + int(bb[3])) / 2) 356 | ba[4] = byte((int(ba[4]) + int(bb[4])) / 2) 357 | ba[5] = byte((int(ba[5]) + int(bb[5])) / 2) 358 | ba[6] = byte((int(ba[6]) + int(bb[6])) / 2) 359 | ba[7] = byte((int(ba[7]) + int(bb[7])) / 2) 360 | return ba 361 | } 362 | 363 | func swapNibbles(b [8]byte) [8]byte { 364 | b[0] = (b[0]&0x0F)<<4 | (b[0]&0xF0)>>4 365 | b[1] = (b[1]&0x0F)<<4 | (b[1]&0xF0)>>4 366 | b[2] = (b[2]&0x0F)<<4 | (b[2]&0xF0)>>4 367 | b[3] = (b[3]&0x0F)<<4 | (b[3]&0xF0)>>4 368 | b[4] = (b[4]&0x0F)<<4 | (b[4]&0xF0)>>4 369 | b[5] = (b[5]&0x0F)<<4 | (b[5]&0xF0)>>4 370 | b[6] = (b[6]&0x0F)<<4 | (b[6]&0xF0)>>4 371 | b[7] = (b[7]&0x0F)<<4 | (b[7]&0xF0)>>4 372 | return b 373 | } 374 | 375 | func reverseBits(b [8]byte) [8]byte { 376 | b[0] = bits.Reverse8(b[0]) 377 | b[1] = bits.Reverse8(b[1]) 378 | b[2] = bits.Reverse8(b[2]) 379 | b[3] = bits.Reverse8(b[3]) 380 | b[4] = bits.Reverse8(b[4]) 381 | b[5] = bits.Reverse8(b[5]) 382 | b[6] = bits.Reverse8(b[6]) 383 | b[7] = bits.Reverse8(b[7]) 384 | return b 385 | } 386 | 387 | func popcountPerByte(b [8]byte) [8]byte { 388 | b[0] = byte(bits.OnesCount8(b[0])) 389 | b[1] = byte(bits.OnesCount8(b[1])) 390 | b[2] = byte(bits.OnesCount8(b[2])) 391 | b[3] = byte(bits.OnesCount8(b[3])) 392 | b[4] = byte(bits.OnesCount8(b[4])) 393 | b[5] = byte(bits.OnesCount8(b[5])) 394 | b[6] = byte(bits.OnesCount8(b[6])) 395 | b[7] = byte(bits.OnesCount8(b[7])) 396 | return b 397 | } 398 | 399 | func addSatBytes(ba, bb [8]byte) [8]byte { 400 | if bb[0] > 0xFF-ba[0] { 401 | ba[0] = 0xFF 402 | } else { 403 | ba[0] += bb[0] 404 | } 405 | if bb[1] > 0xFF-ba[1] { 406 | ba[1] = 0xFF 407 | } else { 408 | ba[1] += bb[1] 409 | } 410 | if bb[2] > 0xFF-ba[2] { 411 | ba[2] = 0xFF 412 | } else { 413 | ba[2] += bb[2] 414 | } 415 | if bb[3] > 0xFF-ba[3] { 416 | ba[3] = 0xFF 417 | } else { 418 | ba[3] += bb[3] 419 | } 420 | if bb[4] > 0xFF-ba[4] { 421 | ba[4] = 0xFF 422 | } else { 423 | ba[4] += bb[4] 424 | } 425 | if bb[5] > 0xFF-ba[5] { 426 | ba[5] = 0xFF 427 | } else { 428 | ba[5] += bb[5] 429 | } 430 | if bb[6] > 0xFF-ba[6] { 431 | ba[6] = 0xFF 432 | } else { 433 | ba[6] += bb[6] 434 | } 435 | if bb[7] > 0xFF-ba[7] { 436 | ba[7] = 0xFF 437 | } else { 438 | ba[7] += bb[7] 439 | } 440 | return ba 441 | } 442 | 443 | func selectByLowBits(ba, bb, ma [8]byte) [8]byte { 444 | var out [8]byte 445 | if ma[0] != 0 { 446 | out[0] = ba[0] 447 | } else { 448 | out[0] = bb[0] 449 | } 450 | if ma[1] != 0 { 451 | out[1] = ba[1] 452 | } else { 453 | out[1] = bb[1] 454 | } 455 | if ma[2] != 0 { 456 | out[2] = ba[2] 457 | } else { 458 | out[2] = bb[2] 459 | } 460 | if ma[3] != 0 { 461 | out[3] = ba[3] 462 | } else { 463 | out[3] = bb[3] 464 | } 465 | if ma[4] != 0 { 466 | out[4] = ba[4] 467 | } else { 468 | out[4] = bb[4] 469 | } 470 | if ma[5] != 0 { 471 | out[5] = ba[5] 472 | } else { 473 | out[5] = bb[5] 474 | } 475 | if ma[6] != 0 { 476 | out[6] = ba[6] 477 | } else { 478 | out[6] = bb[6] 479 | } 480 | if ma[7] != 0 { 481 | out[7] = ba[7] 482 | } else { 483 | out[7] = bb[7] 484 | } 485 | return out 486 | } 487 | 488 | func subBytesWrap(aa, bb [8]byte) [8]byte { 489 | aa[0] = aa[0] - bb[0] 490 | aa[1] = aa[1] - bb[1] 491 | aa[2] = aa[2] - bb[2] 492 | aa[3] = aa[3] - bb[3] 493 | aa[4] = aa[4] - bb[4] 494 | aa[5] = aa[5] - bb[5] 495 | aa[6] = aa[6] - bb[6] 496 | aa[7] = aa[7] - bb[7] 497 | return aa 498 | } 499 | 500 | func subBytesSat(aa, bb [8]byte) [8]byte { 501 | if bb[0] > aa[0] { 502 | aa[0] = 0 503 | } else { 504 | aa[0] -= bb[0] 505 | } 506 | if bb[1] > aa[1] { 507 | aa[1] = 0 508 | } else { 509 | aa[1] -= bb[1] 510 | } 511 | if bb[2] > aa[2] { 512 | aa[2] = 0 513 | } else { 514 | aa[2] -= bb[2] 515 | } 516 | if bb[3] > aa[3] { 517 | aa[3] = 0 518 | } else { 519 | aa[3] -= bb[3] 520 | } 521 | if bb[4] > aa[4] { 522 | aa[4] = 0 523 | } else { 524 | aa[4] -= bb[4] 525 | } 526 | if bb[5] > aa[5] { 527 | aa[5] = 0 528 | } else { 529 | aa[5] -= bb[5] 530 | } 531 | if bb[6] > aa[6] { 532 | aa[6] = 0 533 | } else { 534 | aa[6] -= bb[6] 535 | } 536 | if bb[7] > aa[7] { 537 | aa[7] = 0 538 | } else { 539 | aa[7] -= bb[7] 540 | } 541 | return aa 542 | } 543 | 544 | func addBytesSat(aa, bb [8]byte) [8]byte { 545 | s0 := uint16(aa[0]) + uint16(bb[0]) 546 | if s0 > 255 { 547 | aa[0] = 0xFF 548 | } else { 549 | aa[0] = byte(s0) 550 | } 551 | s1 := uint16(aa[1]) + uint16(bb[1]) 552 | if s1 > 255 { 553 | aa[1] = 0xFF 554 | } else { 555 | aa[1] = byte(s1) 556 | } 557 | s2 := uint16(aa[2]) + uint16(bb[2]) 558 | if s2 > 255 { 559 | aa[2] = 0xFF 560 | } else { 561 | aa[2] = byte(s2) 562 | } 563 | s3 := uint16(aa[3]) + uint16(bb[3]) 564 | if s3 > 255 { 565 | aa[3] = 0xFF 566 | } else { 567 | aa[3] = byte(s3) 568 | } 569 | s4 := uint16(aa[4]) + uint16(bb[4]) 570 | if s4 > 255 { 571 | aa[4] = 0xFF 572 | } else { 573 | aa[4] = byte(s4) 574 | } 575 | s5 := uint16(aa[5]) + uint16(bb[5]) 576 | if s5 > 255 { 577 | aa[5] = 0xFF 578 | } else { 579 | aa[5] = byte(s5) 580 | } 581 | s6 := uint16(aa[6]) + uint16(bb[6]) 582 | if s6 > 255 { 583 | aa[6] = 0xFF 584 | } else { 585 | aa[6] = byte(s6) 586 | } 587 | s7 := uint16(aa[7]) + uint16(bb[7]) 588 | if s7 > 255 { 589 | aa[7] = 0xFF 590 | } else { 591 | aa[7] = byte(s7) 592 | } 593 | return aa 594 | } 595 | 596 | func absDiffBytes(aa, bb [8]byte) [8]byte { 597 | if aa[0] >= bb[0] { 598 | aa[0] = aa[0] - bb[0] 599 | } else { 600 | aa[0] = bb[0] - aa[0] 601 | } 602 | if aa[1] >= bb[1] { 603 | aa[1] = aa[1] - bb[1] 604 | } else { 605 | aa[1] = bb[1] - aa[1] 606 | } 607 | if aa[2] >= bb[2] { 608 | aa[2] = aa[2] - bb[2] 609 | } else { 610 | aa[2] = bb[2] - aa[2] 611 | } 612 | if aa[3] >= bb[3] { 613 | aa[3] = aa[3] - bb[3] 614 | } else { 615 | aa[3] = bb[3] - aa[3] 616 | } 617 | if aa[4] >= bb[4] { 618 | aa[4] = aa[4] - bb[4] 619 | } else { 620 | aa[4] = bb[4] - aa[4] 621 | } 622 | if aa[5] >= bb[5] { 623 | aa[5] = aa[5] - bb[5] 624 | } else { 625 | aa[5] = bb[5] - aa[5] 626 | } 627 | if aa[6] >= bb[6] { 628 | aa[6] = aa[6] - bb[6] 629 | } else { 630 | aa[6] = bb[6] - aa[6] 631 | } 632 | if aa[7] >= bb[7] { 633 | aa[7] = aa[7] - bb[7] 634 | } else { 635 | aa[7] = bb[7] - aa[7] 636 | } 637 | return aa 638 | } 639 | 640 | func highBitWhereLess(b [8]byte, c [8]byte) [8]byte { 641 | if b[0] < c[0] { 642 | b[0] = 0x80 643 | } else { 644 | b[0] = 0 645 | } 646 | if b[1] < c[1] { 647 | b[1] = 0x80 648 | } else { 649 | b[1] = 0 650 | } 651 | if b[2] < c[2] { 652 | b[2] = 0x80 653 | } else { 654 | b[2] = 0 655 | } 656 | if b[3] < c[3] { 657 | b[3] = 0x80 658 | } else { 659 | b[3] = 0 660 | } 661 | if b[4] < c[4] { 662 | b[4] = 0x80 663 | } else { 664 | b[4] = 0 665 | } 666 | if b[5] < c[5] { 667 | b[5] = 0x80 668 | } else { 669 | b[5] = 0 670 | } 671 | if b[6] < c[6] { 672 | b[6] = 0x80 673 | } else { 674 | b[6] = 0 675 | } 676 | if b[7] < c[7] { 677 | b[7] = 0x80 678 | } else { 679 | b[7] = 0 680 | } 681 | return b 682 | } 683 | 684 | func highBitWhereGreater(b [8]byte, c [8]byte) [8]byte { 685 | if b[0] > c[0] { 686 | b[0] = 0x80 687 | } else { 688 | b[0] = 0 689 | } 690 | if b[1] > c[1] { 691 | b[1] = 0x80 692 | } else { 693 | b[1] = 0 694 | } 695 | if b[2] > c[2] { 696 | b[2] = 0x80 697 | } else { 698 | b[2] = 0 699 | } 700 | if b[3] > c[3] { 701 | b[3] = 0x80 702 | } else { 703 | b[3] = 0 704 | } 705 | if b[4] > c[4] { 706 | b[4] = 0x80 707 | } else { 708 | b[4] = 0 709 | } 710 | if b[5] > c[5] { 711 | b[5] = 0x80 712 | } else { 713 | b[5] = 0 714 | } 715 | if b[6] > c[6] { 716 | b[6] = 0x80 717 | } else { 718 | b[6] = 0 719 | } 720 | if b[7] > c[7] { 721 | b[7] = 0x80 722 | } else { 723 | b[7] = 0 724 | } 725 | return b 726 | } 727 | 728 | func highBitWhereEqual(b [8]byte, c [8]byte) [8]byte { 729 | if b[0] == c[0] { 730 | b[0] = 0x80 731 | } else { 732 | b[0] = 0 733 | } 734 | if b[1] == c[1] { 735 | b[1] = 0x80 736 | } else { 737 | b[1] = 0 738 | } 739 | if b[2] == c[2] { 740 | b[2] = 0x80 741 | } else { 742 | b[2] = 0 743 | } 744 | if b[3] == c[3] { 745 | b[3] = 0x80 746 | } else { 747 | b[3] = 0 748 | } 749 | if b[4] == c[4] { 750 | b[4] = 0x80 751 | } else { 752 | b[4] = 0 753 | } 754 | if b[5] == c[5] { 755 | b[5] = 0x80 756 | } else { 757 | b[5] = 0 758 | } 759 | if b[6] == c[6] { 760 | b[6] = 0x80 761 | } else { 762 | b[6] = 0 763 | } 764 | if b[7] == c[7] { 765 | b[7] = 0x80 766 | } else { 767 | b[7] = 0 768 | } 769 | return b 770 | } 771 | 772 | func TestSWARFunctionsRef(t *testing.T) { 773 | for n := uint64(0); n < 0x_FF_FF_FF_FF_FF; n = (n*12 + 13) / 11 { 774 | nA := toBytes(n) 775 | if a, b := SwapByteHalves(n), swapNibbles(nA); a != fromBytes(b) { 776 | t.Errorf("SwapByteHalves(0x%016x) = 0x%016x; want 0x%016x", n, a, fromBytes(b)) 777 | } 778 | if a, b := ReverseEachByte(n), reverseBits(nA); a != fromBytes(b) { 779 | t.Errorf("ReverseEachByte(0x%016x) = 0x%016x; want 0x%016x", n, a, fromBytes(b)) 780 | } 781 | if a, b := CountOnesPerByte(n), popcountPerByte(nA); a != fromBytes(b) { 782 | t.Errorf("CountOnesPerByte(0x%016x) = 0x%016x; want 0x%016x", n, a, fromBytes(b)) 783 | } 784 | if a, b := ExtractLowBits(n&LowBits), byteFromLowBits(toBytes(n&LowBits)); a != b { 785 | t.Errorf("ExtractLowBits(0x%016x) = 0b%08b; want 0b%08b", n, a, b) 786 | } 787 | 788 | m := n ^ 0x0000005351952b76 789 | mA := toBytes(m) 790 | if a, b := SelectSmallerBytes(n, m), minBytes(nA, mA); a != fromBytes(b) { 791 | t.Errorf("SelectSmallerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 792 | } 793 | if a, b := SelectLargerBytes(n, m), maxBytes(nA, mA); a != fromBytes(b) { 794 | t.Errorf("SelectLargerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 795 | } 796 | if a, b := AverageBytes(n, m), averageBytes(nA, mA); a != fromBytes(b) { 797 | t.Errorf("AverageBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 798 | } 799 | if a, b := AbsoluteDifferenceBetweenBytes(n, m), absDiffBytes(nA, mA); a != fromBytes(b) { 800 | t.Errorf("AbsoluteDifferenceBetweenBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 801 | } 802 | if a, b := AddBytesWithWrapping(n, m), addWrapBytes(nA, mA); a != fromBytes(b) { 803 | t.Errorf("AddWrapBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 804 | } 805 | if a, b := SubtractBytesWithWrapping(n, m), subBytesWrap(nA, mA); a != fromBytes(b) { 806 | t.Errorf("SubtractBytesWithWrapping(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 807 | } 808 | if a, b := AddBytesWithMaximum(n, m), addSatBytes(nA, mA); a != fromBytes(b) { 809 | t.Errorf("AddBytesWithMaximum(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 810 | } 811 | if a, b := SubtractBytesWithMinimum(n, m), subBytesSat(nA, mA); a != fromBytes(b) { 812 | t.Errorf("SubtractBytesWithMinimum(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", n, m, a, fromBytes(b)) 813 | } 814 | 815 | c := Dupe(byte(m % 0x_FE)) 816 | if a, b := HighBitWhereLess(n, c), highBitWhereLess(nA, toBytes(c)); a != fromBytes(b) { 817 | t.Errorf("HighBitWhereLess(0x%016x, %2x) = 0x%016x; want 0x%016x", n, c, a, fromBytes(b)) 818 | } 819 | if a, b := HighBitWhereGreater(n, c), highBitWhereGreater(nA, toBytes(c)); a != fromBytes(b) { 820 | t.Errorf("HighBitWhereGreater(0x%016x, %2x) = 0x%016x; want 0x%016x", n, c, a, fromBytes(b)) 821 | } 822 | if a, b := HighBitWhereEqual(n, c), highBitWhereEqual(nA, toBytes(c)); a != fromBytes(b) { 823 | t.Errorf("HighBitWhereEqual(0x%016x, %2x) = 0x%016x; want 0x%016x", n, c, a, fromBytes(b)) 824 | } 825 | 826 | d := uint64(0x_01_00_01_01_00_00_01_00) 827 | dA := toBytes(d) 828 | if a, b := SelectByLowBit(n, m, d), selectByLowBits(nA, mA, dA); a != fromBytes(b) { 829 | t.Errorf("SelectByLowBit(0x%016x, 0x%016x, 0x%016x) = 0x%016x; want 0x%016x (%v)", n, m, d, a, fromBytes(b), dA) 830 | } 831 | 832 | // t.Logf("Tested with n=0x%016x, m=0x%016x, c=%02x, d=0x%016x", n, m, c, d) 833 | // t.Logf("As arrays: n=%v, m=%v, d=%v", nA, mA, dA) 834 | } 835 | } 836 | -------------------------------------------------------------------------------- /go.mod: -------------------------------------------------------------------------------- 1 | module github.com/dans-stuff/swar 2 | 3 | go 1.24.2 4 | -------------------------------------------------------------------------------- /logic.go: -------------------------------------------------------------------------------- 1 | package swar 2 | 3 | const ( 4 | // HighBits is a mask with the high bit set in all 8 bytes of a uint64 5 | HighBits uint64 = 0x8080_8080_8080_8080 6 | ) 7 | 8 | // HighBitWhereLess sets the high bit (0x80) in each byte where v < cm 9 | // Enables parallel comparison of 8 bytes simultaneously 10 | func HighBitWhereLess(v, cm uint64) uint64 { 11 | d := (v | HighBits) - (cm &^ HighBits) 12 | sel := ((v & (v ^ cm)) | (d &^ (v ^ cm))) & HighBits 13 | hbit := sel ^ HighBits // 0x80 in each byte where v < cm 14 | return hbit & HighBits // 0x80 or 0x00 per lane 15 | } 16 | 17 | // HighBitWhereGreater sets the high bit (0x80) in each byte where v > cm 18 | // Perfect for threshold detection across multiple values 19 | func HighBitWhereGreater(v, cm uint64) uint64 { 20 | d := (cm | HighBits) - (v &^ HighBits) 21 | sel := ((cm & (cm ^ v)) | (d &^ (cm ^ v))) & HighBits 22 | hbit := sel ^ HighBits // 0x80 in each byte where v > cm 23 | return hbit & HighBits // 0x80 or 0x00 per lane 24 | } 25 | 26 | // HighBitWhereEqual sets the high bit (0x80) in each byte where v == cm 27 | // Ideal for pattern matching and finding specific values in data 28 | func HighBitWhereEqual(v, cm uint64) uint64 { 29 | x := v ^ cm 30 | y := ((x & 0x7F7F7F7F7F7F7F7F) + 0x7F7F7F7F7F7F7F7F) | x 31 | hi := ^y & HighBits // 0x80 where x==0 (v==cm) 32 | return hi & HighBits // mask off other bits 33 | } 34 | -------------------------------------------------------------------------------- /logic_test.go: -------------------------------------------------------------------------------- 1 | package swar 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // TestHighBitWhereEqual verifies that the HighBitWhereEqual function correctly 8 | // identifies bytes that match a comparison value. These tests are important because 9 | // the SWAR technique uses non-intuitive bit manipulation that needs proper verification. 10 | func TestHighBitWhereEqual(t *testing.T) { 11 | run := func(v, c, want uint64) { 12 | if got := HighBitWhereEqual(v, c); got != want { 13 | t.Errorf("HighBitWhereEqual(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", v, c, got, want) 14 | } 15 | } 16 | 17 | run(0x05, Dupe(5), 0x80) 18 | run(0x04, Dupe(5), 0x00) 19 | run(0x05_04, Dupe(5), 0x80_00) 20 | run(0xFF_00, Dupe(0), 0x80_80_80_80_80_80_00_80) 21 | } 22 | 23 | // TestHighBitWhereLess verifies that the HighBitWhereLess function correctly identifies 24 | // bytes less than a comparison value. This is crucial for threshold-based processing 25 | // and range checks operating on multiple bytes in parallel. 26 | func TestHighBitWhereLess(t *testing.T) { 27 | run := func(v, c, want uint64) { 28 | if got := HighBitWhereLess(v, c); got != want { 29 | t.Errorf("HighBitWhereLess(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", v, c, got, want) 30 | } 31 | } 32 | 33 | run(0x06, Dupe(5), 0x80_80_80_80_80_80_80_00) 34 | run(0x04, Dupe(5), 0x80_80_80_80_80_80_80_80) 35 | run(0x01_02_03_04_05_06_07_08, Dupe(5), 0x80_80_80_80_00_00_00_00) 36 | } 37 | 38 | // TestHighBitWhereGreater verifies that the HighBitWhereGreater function correctly 39 | // identifies bytes greater than a comparison value. This functionality is essential for 40 | // detecting outliers, anomalies, and values exceeding specified thresholds. 41 | func TestHighBitWhereGreater(t *testing.T) { 42 | run := func(v, c, want uint64) { 43 | if got := HighBitWhereGreater(v, c); got != want { 44 | t.Errorf("HighBitWhereGreater(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", v, c, got, want) 45 | } 46 | } 47 | 48 | run(0x05, Dupe(5), 0x00) 49 | run(0x06, Dupe(5), 0x80) 50 | run(0xFF_04_05_06_00, Dupe(5), 0x80_00_00_80_00) 51 | } 52 | 53 | // TestSelectByLowBit verifies that values are correctly selected from a or b based on 54 | // the corresponding mask bit. This branchless selection is critical for data-dependent 55 | // operations where conditional logic would otherwise harm performance. 56 | func TestSelectByLowBit(t *testing.T) { 57 | run := func(a, b, mask, want uint64) { 58 | if got := SelectByLowBit(a, b, mask); got != want { 59 | t.Errorf("SelectByLowBit(0x%016x, 0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, mask, got, want) 60 | } 61 | } 62 | 63 | run(0x11_11_11_11, 0x22_22_22_22, 0x01_00_01_00, 0x11_22_11_22) 64 | } 65 | 66 | // TestMinBytes verifies that our parallel minimum function correctly selects the smaller 67 | // of two values for each byte position. This is essential for applications like image processing 68 | // where per-pixel minimum operations affect visual outcomes. 69 | func TestMinBytes(t *testing.T) { 70 | run := func(a, b, want uint64) { 71 | if got := SelectSmallerBytes(a, b); got != want { 72 | t.Errorf("SelectSmallerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want) 73 | } 74 | } 75 | 76 | run(0x01_02_03_04_05_06_07_08, 0x05_04_03_02_01_00_09_0A, 0x01_02_03_02_01_00_07_08) 77 | run(0x0000_0000_0000_0004, 0x1234_5678_90AB_CDEB, 0x0000_0000_0000_0004) 78 | } 79 | 80 | // TestMaxBytes verifies that our parallel maximum function correctly selects the larger 81 | // of two values for each byte position. This is critical for algorithms like feature extraction 82 | // and signal peak detection where maintaining maximum values is required. 83 | func TestMaxBytes(t *testing.T) { 84 | run := func(a, b, want uint64) { 85 | if got := SelectLargerBytes(a, b); got != want { 86 | t.Errorf("SelectLargerBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want) 87 | } 88 | } 89 | 90 | run(0x01_02_03_04_05_06_07_08, 0x05_04_03_02_01_00_09_0A, 0x05_04_03_04_05_06_09_0A) 91 | run(0x04, 0xEB, 0xEB) 92 | run(0x01, 0x02, 0x02) 93 | } 94 | 95 | // TestSwapNibbles verifies that our nibble-swapping function correctly exchanges the high 96 | // and low 4 bits of each byte. This transformation is important for BCD encoding/decoding 97 | // and certain data format conversions that rely on nibble-level manipulations. 98 | func TestSwapNibbles(t *testing.T) { 99 | run := func(s, want uint64) { 100 | if got := SwapByteHalves(s); got != want { 101 | t.Errorf("SwapByteHalves(0x%016x) = 0x%016x; want 0x%016x", s, got, want) 102 | } 103 | } 104 | 105 | run(0xF0_F0_F0_F0_F0_F0_F0_F0, 0x0F_0F_0F_0F_0F_0F_0F_0F) 106 | } 107 | 108 | // TestReverseBits verifies that our bit-reversal function correctly reverses the order 109 | // of bits within each byte. This is crucial for operations like endianness conversion 110 | // and certain data transformations that depend on bit-level mirroring. 111 | func TestReverseBits(t *testing.T) { 112 | run := func(v, want uint64) { 113 | if got := ReverseEachByte(v); got != want { 114 | t.Errorf("ReverseEachByte(0x%016x) = 0x%016x; want 0x%016x", v, got, want) 115 | } 116 | } 117 | 118 | run(0x01_02_04_08_10_20_40_80, 0x80_40_20_10_08_04_02_01) 119 | run(0b01001000_11100001_11000011_11110000, 0b00010010_10000111_11000011_00001111) 120 | } 121 | 122 | // TestPopcountPerByte verifies that our parallel population count correctly counts the 123 | // set bits in each byte. This functionality is essential for feature extraction, hamming 124 | // distance calculation, and statistical analysis of binary data. 125 | func TestPopcountPerByte(t *testing.T) { 126 | run := func(p, want uint64) { 127 | if got := CountOnesPerByte(p); got != want { 128 | t.Errorf("CountOnesPerByte(0x%016x) = 0x%016x; want 0x%016x", p, got, want) 129 | } 130 | } 131 | 132 | run(0x0F_F0_55_AA_00_FF_33_CC, 0x04_04_04_04_00_08_04_04) 133 | } 134 | -------------------------------------------------------------------------------- /math.go: -------------------------------------------------------------------------------- 1 | package swar 2 | 3 | const ( 4 | // mEven selects even bytes in a uint64 5 | mEven uint64 = 0x00FF_00FF_00FF_00FF 6 | // mOdd selects odd bytes in a uint64 7 | mOdd uint64 = 0xFF00_FF00_FF00_FF00 8 | // laneNotHigh masks all bits except the high bit in each byte 9 | laneNotHigh uint64 = 0x7F7F_7F7F_7F7F_7F7F 10 | ) 11 | 12 | // SubtractBytesWithWrapping performs byte-wise subtraction with wrapping 13 | // Parallel subtraction across all 8 bytes with wrap-around behavior 14 | func SubtractBytesWithWrapping(a, b uint64) uint64 { 15 | return ((a | HighBits) - (b &^ HighBits)) ^ ((a ^ ^b) & HighBits) 16 | } 17 | 18 | // SubtractBytesWithMinimum performs byte-wise subtraction clamped at zero 19 | // Provides saturating subtraction to prevent underflow in all 8 bytes 20 | func SubtractBytesWithMinimum(a, b uint64) uint64 { 21 | diff := ((a | HighBits) - (b &^ HighBits)) ^ ((a ^ ^b) & HighBits) 22 | bo := ((^a & b) | ((^a | b) & diff)) & HighBits 23 | return diff &^ ((bo >> 7) * 0xFF) 24 | } 25 | 26 | // AddBytesWithWrapping performs byte-wise addition with wrap-around 27 | // Parallel addition across all 8 bytes with overflow wrapping to zero 28 | func AddBytesWithWrapping(a, b uint64) uint64 { 29 | sum := (a & laneNotHigh) + (b & laneNotHigh) 30 | return sum ^ ((a ^ b) & HighBits) 31 | } 32 | 33 | // AddBytesWithMaximum performs byte-wise addition clamped at 255 34 | // Saturating addition to prevent overflow in all 8 bytes 35 | func AddBytesWithMaximum(a, b uint64) uint64 { 36 | preSum := (a & laneNotHigh) + (b & laneNotHigh) 37 | sum := preSum ^ ((a ^ b) & HighBits) 38 | carry := ((a & b) | ((a | b) & ^sum)) & HighBits 39 | return sum | (carry>>7)*0xFF 40 | } 41 | 42 | // AbsoluteDifferenceBetweenBytes calculates |a-b| for each byte 43 | // Computes unsigned distances for metrics and signal processing 44 | func AbsoluteDifferenceBetweenBytes(a, b uint64) uint64 { 45 | d := a - b 46 | borrow := ((^a & b) | ((^a | b) & d)) & HighBits 47 | mask := (borrow >> 7) * 0xFF 48 | n := (a &^ mask) | (b & mask) 49 | m := (a & mask) | (b &^ mask) 50 | return ((n | HighBits) - (m &^ HighBits)) ^ ((n ^ ^m) & HighBits) 51 | } 52 | 53 | // SelectSmallerBytes returns min(a,b) for each byte 54 | // Efficient for clipping, filtering, and data preprocessing 55 | func SelectSmallerBytes(a, b uint64) uint64 { 56 | d := a - b 57 | borrow := ((^a & b) | ((^a | b) & d)) & HighBits 58 | mask := (borrow >> 7) * 0xFF 59 | return (a & mask) | (b &^ mask) 60 | } 61 | 62 | // SelectLargerBytes returns max(a,b) for each byte 63 | // Ideal for peak detection, ceiling operations, and filtering 64 | func SelectLargerBytes(a, b uint64) uint64 { 65 | d := a - b 66 | borrow := ((^a & b) | ((^a | b) & d)) & HighBits 67 | mask := (borrow >> 7) * 0xFF 68 | return (a &^ mask) | (b & mask) 69 | } 70 | 71 | // AverageBytes calculates (a+b)/2 for each byte without overflow 72 | // Perfect for signal processing, image manipulation, and smoothing 73 | func AverageBytes(a, b uint64) uint64 { 74 | common := a & b 75 | diff := (a ^ b) & 0xFEFE_FEFE_FEFE_FEFE 76 | return common + (diff >> 1) 77 | } 78 | 79 | // SwapByteHalves swaps the high and low nibbles in each byte 80 | // Useful for BCD encoding/decoding and nibble-level transforms 81 | func SwapByteHalves(v uint64) uint64 { 82 | lo := v & 0x0F0F_0F0F_0F0F_0F0F 83 | hi := v & 0xF0F0_F0F0_F0F0_F0F0 84 | return (lo << 4) | (hi >> 4) 85 | } 86 | 87 | // ReverseEachByte reverses the bit order within each byte 88 | // Useful for endianness conversion and bit-level manipulations 89 | func ReverseEachByte(v uint64) uint64 { 90 | x := ((v >> 1) & 0x5555555555555555) | ((v & 0x5555555555555555) << 1) 91 | x = ((x >> 2) & 0x3333333333333333) | ((x & 0x3333333333333333) << 2) 92 | x = ((x >> 4) & 0x0F0F0F0F0F0F0F0F) | ((x & 0x0F0F0F0F0F0F0F0F) << 4) 93 | return x 94 | } 95 | 96 | // SelectByLowBit selects values from a or b based on mask bits 97 | // Branchless selection between values based on conditions 98 | func SelectByLowBit(a, b, mask uint64) uint64 { 99 | byteMask := mask * 0xFF 100 | return (a & byteMask) | (b &^ byteMask) 101 | } 102 | 103 | // CountOnesPerByte counts set bits in each byte 104 | // Parallel population count for hamming distance and feature extraction 105 | func CountOnesPerByte(v uint64) uint64 { 106 | m1 := v - ((v >> 1) & 0x5555_5555_5555_5555) 107 | m2 := (m1 & 0x3333_3333_3333_3333) + ((m1 >> 2) & 0x3333_3333_3333_3333) 108 | return (m2 + (m2 >> 4)) & 0x0F0F_0F0F_0F0F_0F0F 109 | } 110 | -------------------------------------------------------------------------------- /math_test.go: -------------------------------------------------------------------------------- 1 | package swar 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | // TestAverageBytes verifies that our parallel averaging algorithm correctly calculates 8 | // the mean of corresponding bytes. This ensures proper data smoothing and interpolation 9 | // behavior when processing multiple values simultaneously. 10 | func TestAverageBytes(t *testing.T) { 11 | run := func(a, b, want uint64) { 12 | if got := AverageBytes(a, b); got != want { 13 | t.Errorf("AverageBytes(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want) 14 | } 15 | } 16 | 17 | run(0x01_10_40_FF, 0xFF_30_80_FD, 0x80_20_60_FE) 18 | run(0x04, 0x08, 0x06) 19 | run(0x10_DD, 0x30_FF, 0x20_EE) 20 | run(0x0004, 0xCDEB, 0x6677) 21 | run(0x01FE, 0xCC11, 0x6687) 22 | } 23 | 24 | // TestAddSatBytes verifies that our saturating addition correctly clamps results to 0xFF 25 | // when overflow occurs. This is crucial for applications like image processing and signal 26 | // manipulation where preventing overflow is necessary for correct results. 27 | func TestAddSatBytes(t *testing.T) { 28 | run := func(a, b, want uint64) { 29 | if got := AddBytesWithMaximum(a, b); got != want { 30 | t.Errorf("AddBytesWithMaximum(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want) 31 | } 32 | } 33 | 34 | run(0xFF_FE_FD, 0x01_01_01, 0xFF_FF_FE) 35 | run(0xFD_FC_FB, 0x03_03_03, 0xFF_FF_FE) 36 | } 37 | 38 | // TestAddBytesWithWrapping ensures that our wrapping addition correctly handles overflow 39 | // by wrapping around to zero. This behavior is essential for certain algorithms like 40 | // checksums and hash functions where wrap-around arithmetic is expected and required. 41 | func TestAddBytesWithWrapping(t *testing.T) { 42 | run := func(a, b, want uint64) { 43 | if got := AddBytesWithWrapping(a, b); got != want { 44 | t.Errorf("AddBytesWithWrapping(0x%016x, 0x%016x) = 0x%016x; want 0x%016x", a, b, got, want) 45 | } 46 | } 47 | 48 | run(0xFF_FE_FD, 0x01_01_01, 0x00_FF_FE) 49 | run(0xFD_FC_FB, 0x03_03_03, 0x00_FF_FE) 50 | run(0xF4_F9, 0x0F_01, 0x03_FA) 51 | run(0xFF_0F_FF, 0x01_F0_00, 0x00_FF_FF) 52 | } 53 | -------------------------------------------------------------------------------- /utils.go: -------------------------------------------------------------------------------- 1 | package swar 2 | 3 | import "unsafe" 4 | 5 | const ( 6 | // LowBits has the lowest bit set in each byte for value duplication 7 | LowBits uint64 = 0x0101_0101_0101_0101 8 | // packMask packs low bits from each byte into a single byte 9 | packMask uint64 = 0x0102_0408_1020_4080 10 | ) 11 | 12 | // BytesToLanes converts a []byte to []uint64 for SWAR processing 13 | // Returns uint64 lanes and index where unused bytes begin 14 | func BytesToLanes(b []byte) ([]uint64, int) { 15 | countChunks := len(b) / 8 16 | chunks := unsafe.Slice((*uint64)(unsafe.Pointer(&b[0])), countChunks) 17 | return chunks, countChunks * 8 18 | } 19 | 20 | // LanesToBytes converts []uint64 back to []byte 21 | // Zero-copy conversion for optimal performance 22 | func LanesToBytes(lanes []uint64) []byte { 23 | countBytes := len(lanes) * 8 24 | bytes := unsafe.Slice((*byte)(unsafe.Pointer(&lanes[0])), countBytes) 25 | return bytes 26 | } 27 | 28 | // Dupe duplicates a byte across all 8 bytes of a uint64 29 | // Creates comparison values for parallel operations 30 | func Dupe(c byte) uint64 { 31 | return uint64(c) * LowBits 32 | } 33 | 34 | // ExtractLowBits packs the low bit from each byte into a single byte 35 | // Compacts 8 comparison results into a single byte 36 | func ExtractLowBits(v uint64) byte { 37 | return byte((v * packMask) >> 56) 38 | } 39 | 40 | // IntToLanes converts a uint64 to an 8-byte array 41 | // Access individual bytes for mixed SWAR/byte-level operations 42 | func IntToLanes(i uint64) [8]byte { 43 | return *(*[8]byte)(unsafe.Pointer(&i)) 44 | } 45 | 46 | // LanesToInt converts an 8-byte array to uint64 47 | // Zero-copy conversion from byte-level to SWAR format 48 | func LanesToInt(lanes [8]byte) uint64 { 49 | return *(*uint64)(unsafe.Pointer(&lanes)) 50 | } 51 | 52 | // Lookup provides precomputed data for optimized operations 53 | // OnesPositions maps byte values to positions of their set bits 54 | var Lookup = struct { 55 | OnesPositions [256][]int 56 | }{ 57 | func() (res [256][]int) { 58 | for b := range res { 59 | for i := 0; i < 8; i++ { 60 | if b>>i&1 == 1 { 61 | res[b] = append(res[b], i) 62 | } 63 | } 64 | } 65 | return 66 | }()} 67 | --------------------------------------------------------------------------------