├── .gitattributes
├── .gitignore
├── LICENSE
├── README.md
├── avx2_amd64.s
├── nosimd.go
├── sse2_amd64.s
├── xor.go
├── xor_amd64.go
├── xor_other.go
└── xor_test.go
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.s linguist-language=go
2 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Binaries for programs and plugins
2 | *.exe
3 | *.dll
4 | *.so
5 | *.dylib
6 | 
7 | # Test binary, build with `go test -c`
8 | *.test
9 | 
10 | # Output of the go coverage tool, specifically when used with LiteIDE
11 | *.out
12 | 
13 | # Project-local glide cache, RE: https://github.com/Masterminds/glide/issues/736
14 | .glide/
15 | /backup/
16 | /backup2/
17 | /.idea
18 | /backup3/
19 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2017 Temple3x
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # XOR
2 | 
3 | XOR code engine in pure Go
4 | 
5 | More than 50 GB/s per core
6 | 
7 | Moved to: https://github.com/templexxx/xorsimd
8 | 
9 | ## Introduction
10 | 
11 | 1. Uses SIMD (SSE2 or AVX2) to speed up the XOR calculation
12 | 2. ...
13 | 
14 | ## Installation
15 | To get the package, use the standard:
16 | ```bash
17 | go get github.com/templexxx/xor
18 | ```
19 | 
20 | ## Documentation
21 | 
22 | See the associated [GoDoc](http://godoc.org/github.com/templexxx/xor)
23 | 
24 | 
25 | ## Performance
26 | 
27 | Performance depends mainly on:
28 | 
29 | 1. SIMD extension
30 | 2. the unit size of each worker
31 | 3. hardware (CPU, RAM, etc.)
32 | 
33 | Example results from my mid-2014 MacBook (i5-4278U, 2.6 GHz, 2 physical cores); shard sizes range from 1KB to 16MB.
34 | ```
35 | speed = (shards * size) / cost
36 | ```
37 | | data_shards | shard_size | speed (MB/s) |
38 | |-------------|------------|--------------|
39 | | 2 | 1KB | 64127.95 |
40 | | 2 | 1400B | 59657.55 |
41 | | 2 | 16KB | 35370.84 |
42 | | 2 | 16MB | 12128.95 |
43 | | 5 | 1KB | 78837.33 |
44 | | 5 | 1400B | 58054.89 |
45 | | 5 | 16KB | 50161.19 |
46 | | 5 | 16MB | 12750.41 |
47 | 
48 | ## Who is using this?
49 | 
50 | 1. 
https://github.com/xtaci/kcp-go -- A Production-Grade Reliable-UDP Library for golang 51 | -------------------------------------------------------------------------------- /avx2_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // addr of mem 4 | #define DST BX 5 | #define SRC SI 6 | #define SRC0 TMP4 7 | #define SRC1 TMP5 8 | 9 | // loop args 10 | // num of vect 11 | #define VECT CX 12 | #define LEN DX 13 | // pos of matrix 14 | #define POS R8 15 | 16 | // tmp store 17 | // num of vect or ... 18 | #define TMP1 R9 19 | // pos of matrix or ... 20 | #define TMP2 R10 21 | // store addr of data/parity or ... 22 | #define TMP3 R11 23 | #define TMP4 R12 24 | #define TMP5 R13 25 | #define TMP6 R14 26 | 27 | // func bytesAVX2mini(dst, src0, src1 []byte, size int) 28 | TEXT ·bytesAVX2mini(SB), NOSPLIT, $0 29 | MOVQ len+72(FP), LEN 30 | CMPQ LEN, $0 31 | JE ret 32 | MOVQ dst+0(FP), DST 33 | MOVQ src0+24(FP), SRC0 34 | MOVQ src1+48(FP), SRC1 35 | TESTQ $31, LEN 36 | JNZ not_aligned 37 | 38 | aligned: 39 | MOVQ $0, POS 40 | 41 | loop32b: 42 | VMOVDQU (SRC0)(POS*1), Y0 43 | VPXOR (SRC1)(POS*1), Y0, Y0 44 | VMOVDQU Y0, (DST)(POS*1) 45 | ADDQ $32, POS 46 | CMPQ LEN, POS 47 | JNE loop32b 48 | VZEROUPPER 49 | RET 50 | 51 | loop_1b: 52 | MOVB -1(SRC0)(LEN*1), TMP1 53 | MOVB -1(SRC1)(LEN*1), TMP2 54 | XORB TMP1, TMP2 55 | MOVB TMP2, -1(DST)(LEN*1) 56 | SUBQ $1, LEN 57 | TESTQ $7, LEN 58 | JNZ loop_1b 59 | CMPQ LEN, $0 60 | JE ret 61 | TESTQ $31, LEN 62 | JZ aligned 63 | 64 | not_aligned: 65 | TESTQ $7, LEN 66 | JNE loop_1b 67 | MOVQ LEN, TMP1 68 | ANDQ $31, TMP1 69 | 70 | loop_8b: 71 | MOVQ -8(SRC0)(LEN*1), TMP2 72 | MOVQ -8(SRC1)(LEN*1), TMP3 73 | XORQ TMP2, TMP3 74 | MOVQ TMP3, -8(DST)(LEN*1) 75 | SUBQ $8, LEN 76 | SUBQ $8, TMP1 77 | JG loop_8b 78 | 79 | CMPQ LEN, $32 80 | JGE aligned 81 | RET 82 | 83 | ret: 84 | RET 85 | 86 | // func bytesAVX2small(dst, src0, src1 []byte, size int) 87 | TEXT ·bytesAVX2small(SB), NOSPLIT, $0 88 | MOVQ len+72(FP), LEN 89 | CMPQ LEN, $0 90 | JE ret 91 | MOVQ dst+0(FP), DST 92 | MOVQ src0+24(FP), SRC0 93 | MOVQ src1+48(FP), SRC1 94 | TESTQ $127, LEN 95 | JNZ not_aligned 96 | 97 | aligned: 98 | MOVQ $0, POS 99 | 100 | loop128b: 101 | VMOVDQU (SRC0)(POS*1), Y0 102 | VMOVDQU 32(SRC0)(POS*1), Y1 103 | VMOVDQU 64(SRC0)(POS*1), Y2 104 | VMOVDQU 96(SRC0)(POS*1), Y3 105 | VPXOR (SRC1)(POS*1), Y0, Y0 106 | VPXOR 32(SRC1)(POS*1), Y1, Y1 107 | VPXOR 64(SRC1)(POS*1), Y2, Y2 108 | VPXOR 96(SRC1)(POS*1), Y3, Y3 109 | VMOVDQU Y0, (DST)(POS*1) 110 | VMOVDQU Y1, 32(DST)(POS*1) 111 | VMOVDQU Y2, 64(DST)(POS*1) 112 | VMOVDQU Y3, 96(DST)(POS*1) 113 | 114 | ADDQ $128, POS 115 | CMPQ LEN, POS 116 | JNE loop128b 117 | VZEROUPPER 118 | RET 119 | 120 | loop_1b: 121 | MOVB -1(SRC0)(LEN*1), TMP1 122 | MOVB -1(SRC1)(LEN*1), TMP2 123 | XORB TMP1, TMP2 124 | MOVB TMP2, -1(DST)(LEN*1) 125 | SUBQ $1, LEN 126 | TESTQ $7, LEN 127 | JNZ loop_1b 128 | CMPQ LEN, $0 129 | JE ret 130 | TESTQ $127, LEN 131 | JZ aligned 132 | 133 | not_aligned: 134 | TESTQ $7, LEN 135 | JNE loop_1b 136 | MOVQ LEN, TMP1 137 | ANDQ $127, TMP1 138 | 139 | loop_8b: 140 | MOVQ -8(SRC0)(LEN*1), TMP2 141 | MOVQ -8(SRC1)(LEN*1), TMP3 142 | XORQ TMP2, TMP3 143 | MOVQ TMP3, -8(DST)(LEN*1) 144 | SUBQ $8, LEN 145 | SUBQ $8, TMP1 146 | JG loop_8b 147 | 148 | CMPQ LEN, $128 149 | JGE aligned 150 | RET 151 | 152 | ret: 153 | RET 154 | 155 | // func bytesAVX2big(dst, src0, src1 []byte, size int) 156 | TEXT ·bytesAVX2big(SB), NOSPLIT, $0 157 | MOVQ len+72(FP), LEN 158 | CMPQ 
LEN, $0 159 | JE ret 160 | MOVQ dst+0(FP), DST 161 | MOVQ src0+24(FP), SRC0 162 | MOVQ src1+48(FP), SRC1 163 | TESTQ $127, LEN 164 | JNZ not_aligned 165 | 166 | aligned: 167 | MOVQ $0, POS 168 | 169 | loop128b: 170 | VMOVDQU (SRC0)(POS*1), Y0 171 | VMOVDQU 32(SRC0)(POS*1), Y1 172 | VMOVDQU 64(SRC0)(POS*1), Y2 173 | VMOVDQU 96(SRC0)(POS*1), Y3 174 | VPXOR (SRC1)(POS*1), Y0, Y0 175 | VPXOR 32(SRC1)(POS*1), Y1, Y1 176 | VPXOR 64(SRC1)(POS*1), Y2, Y2 177 | VPXOR 96(SRC1)(POS*1), Y3, Y3 178 | LONG $0xe77da1c4; WORD $0x0304 179 | LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 180 | LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 181 | LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 182 | 183 | ADDQ $128, POS 184 | CMPQ LEN, POS 185 | JNE loop128b 186 | SFENCE 187 | VZEROUPPER 188 | RET 189 | 190 | loop_1b: 191 | MOVB -1(SRC0)(LEN*1), TMP1 192 | MOVB -1(SRC1)(LEN*1), TMP2 193 | XORB TMP1, TMP2 194 | MOVB TMP2, -1(DST)(LEN*1) 195 | SUBQ $1, LEN 196 | TESTQ $7, LEN 197 | JNZ loop_1b 198 | CMPQ LEN, $0 199 | JE ret 200 | TESTQ $127, LEN 201 | JZ aligned 202 | 203 | not_aligned: 204 | TESTQ $7, LEN 205 | JNE loop_1b 206 | MOVQ LEN, TMP1 207 | ANDQ $127, TMP1 208 | 209 | loop_8b: 210 | MOVQ -8(SRC0)(LEN*1), TMP2 211 | MOVQ -8(SRC1)(LEN*1), TMP3 212 | XORQ TMP2, TMP3 213 | MOVQ TMP3, -8(DST)(LEN*1) 214 | SUBQ $8, LEN 215 | SUBQ $8, TMP1 216 | JG loop_8b 217 | 218 | CMPQ LEN, $128 219 | JGE aligned 220 | RET 221 | 222 | ret: 223 | RET 224 | 225 | // func matrixAVX2small(dst []byte, src [][]byte) 226 | TEXT ·matrixAVX2small(SB), NOSPLIT, $0 227 | MOVQ dst+0(FP), DST 228 | MOVQ src+24(FP), SRC 229 | MOVQ vec+32(FP), VECT 230 | MOVQ len+8(FP), LEN 231 | TESTQ $127, LEN 232 | JNZ not_aligned 233 | 234 | aligned: 235 | MOVQ $0, POS 236 | 237 | loop128b: 238 | MOVQ VECT, TMP1 239 | SUBQ $2, TMP1 240 | MOVQ $0, TMP2 241 | MOVQ (SRC)(TMP2*1), TMP3 242 | MOVQ TMP3, TMP4 243 | VMOVDQU (TMP3)(POS*1), Y0 244 | VMOVDQU 32(TMP4)(POS*1), Y1 245 | VMOVDQU 64(TMP3)(POS*1), Y2 246 | VMOVDQU 96(TMP4)(POS*1), Y3 247 | 248 | next_vect: 249 | ADDQ $24, TMP2 250 | MOVQ (SRC)(TMP2*1), TMP3 251 | MOVQ TMP3, TMP4 252 | VMOVDQU (TMP3)(POS*1), Y4 253 | VMOVDQU 32(TMP4)(POS*1), Y5 254 | VMOVDQU 64(TMP3)(POS*1), Y6 255 | VMOVDQU 96(TMP4)(POS*1), Y7 256 | VPXOR Y4, Y0, Y0 257 | VPXOR Y5, Y1, Y1 258 | VPXOR Y6, Y2, Y2 259 | VPXOR Y7, Y3, Y3 260 | SUBQ $1, TMP1 261 | JGE next_vect 262 | 263 | VMOVDQU Y0, (DST)(POS*1) 264 | VMOVDQU Y1, 32(DST)(POS*1) 265 | VMOVDQU Y2, 64(DST)(POS*1) 266 | VMOVDQU Y3, 96(DST)(POS*1) 267 | 268 | ADDQ $128, POS 269 | CMPQ LEN, POS 270 | JNE loop128b 271 | VZEROUPPER 272 | RET 273 | 274 | loop_1b: 275 | MOVQ VECT, TMP1 276 | MOVQ $0, TMP2 277 | MOVQ (SRC)(TMP2*1), TMP3 278 | SUBQ $2, TMP1 279 | MOVB -1(TMP3)(LEN*1), TMP5 280 | 281 | next_vect_1b: 282 | ADDQ $24, TMP2 283 | MOVQ (SRC)(TMP2*1), TMP3 284 | MOVB -1(TMP3)(LEN*1), TMP6 285 | XORB TMP6, TMP5 286 | SUBQ $1, TMP1 287 | JGE next_vect_1b 288 | 289 | MOVB TMP5, -1(DST)(LEN*1) 290 | SUBQ $1, LEN 291 | TESTQ $7, LEN 292 | JNZ loop_1b 293 | 294 | CMPQ LEN, $0 295 | JE ret 296 | TESTQ $127, LEN 297 | JZ aligned 298 | 299 | not_aligned: 300 | TESTQ $7, LEN 301 | JNE loop_1b 302 | MOVQ LEN, TMP4 303 | ANDQ $127, TMP4 304 | 305 | loop_8b: 306 | MOVQ VECT, TMP1 307 | MOVQ $0, TMP2 308 | MOVQ (SRC)(TMP2*1), TMP3 309 | SUBQ $2, TMP1 310 | MOVQ -8(TMP3)(LEN*1), TMP5 311 | 312 | next_vect_8b: 313 | ADDQ $24, TMP2 314 | MOVQ (SRC)(TMP2*1), TMP3 315 | MOVQ -8(TMP3)(LEN*1), TMP6 316 | XORQ TMP6, TMP5 317 | SUBQ $1, TMP1 318 | JGE next_vect_8b 319 | 320 | MOVQ TMP5, -8(DST)(LEN*1) 
321 | SUBQ $8, LEN 322 | SUBQ $8, TMP4 323 | JG loop_8b 324 | 325 | CMPQ LEN, $128 326 | JGE aligned 327 | RET 328 | 329 | ret: 330 | RET 331 | 332 | // func matrixAVX2big(dst []byte, src [][]byte) 333 | TEXT ·matrixAVX2big(SB), NOSPLIT, $0 334 | MOVQ dst+0(FP), DST 335 | MOVQ src+24(FP), SRC 336 | MOVQ vec+32(FP), VECT 337 | MOVQ len+8(FP), LEN 338 | TESTQ $127, LEN 339 | JNZ not_aligned 340 | 341 | aligned: 342 | MOVQ $0, POS 343 | 344 | loop128b: 345 | MOVQ VECT, TMP1 346 | SUBQ $2, TMP1 347 | MOVQ $0, TMP2 348 | MOVQ (SRC)(TMP2*1), TMP3 349 | MOVQ TMP3, TMP4 350 | VMOVDQU (TMP3)(POS*1), Y0 351 | VMOVDQU 32(TMP4)(POS*1), Y1 352 | VMOVDQU 64(TMP3)(POS*1), Y2 353 | VMOVDQU 96(TMP4)(POS*1), Y3 354 | 355 | next_vect: 356 | ADDQ $24, TMP2 357 | MOVQ (SRC)(TMP2*1), TMP3 358 | MOVQ TMP3, TMP4 359 | VMOVDQU (TMP3)(POS*1), Y4 360 | VMOVDQU 32(TMP4)(POS*1), Y5 361 | VMOVDQU 64(TMP3)(POS*1), Y6 362 | VMOVDQU 96(TMP4)(POS*1), Y7 363 | VPXOR Y4, Y0, Y0 364 | VPXOR Y5, Y1, Y1 365 | VPXOR Y6, Y2, Y2 366 | VPXOR Y7, Y3, Y3 367 | SUBQ $1, TMP1 368 | JGE next_vect 369 | 370 | LONG $0xe77da1c4; WORD $0x0304 // VMOVNTDQ go1.8 has 371 | LONG $0xe77da1c4; WORD $0x034c; BYTE $0x20 372 | LONG $0xe77da1c4; WORD $0x0354; BYTE $0x40 373 | LONG $0xe77da1c4; WORD $0x035c; BYTE $0x60 374 | 375 | ADDQ $128, POS 376 | CMPQ LEN, POS 377 | JNE loop128b 378 | VZEROUPPER 379 | RET 380 | 381 | loop_1b: 382 | MOVQ VECT, TMP1 383 | MOVQ $0, TMP2 384 | MOVQ (SRC)(TMP2*1), TMP3 385 | SUBQ $2, TMP1 386 | MOVB -1(TMP3)(LEN*1), TMP5 387 | 388 | next_vect_1b: 389 | ADDQ $24, TMP2 390 | MOVQ (SRC)(TMP2*1), TMP3 391 | MOVB -1(TMP3)(LEN*1), TMP6 392 | XORB TMP6, TMP5 393 | SUBQ $1, TMP1 394 | JGE next_vect_1b 395 | 396 | MOVB TMP5, -1(DST)(LEN*1) 397 | SUBQ $1, LEN 398 | TESTQ $7, LEN 399 | JNZ loop_1b 400 | 401 | CMPQ LEN, $0 402 | JE ret 403 | TESTQ $127, LEN 404 | JZ aligned 405 | 406 | not_aligned: 407 | TESTQ $7, LEN 408 | JNE loop_1b 409 | MOVQ LEN, TMP4 410 | ANDQ $127, TMP4 411 | 412 | loop_8b: 413 | MOVQ VECT, TMP1 414 | MOVQ $0, TMP2 415 | MOVQ (SRC)(TMP2*1), TMP3 416 | SUBQ $2, TMP1 417 | MOVQ -8(TMP3)(LEN*1), TMP5 418 | 419 | next_vect_8b: 420 | ADDQ $24, TMP2 421 | MOVQ (SRC)(TMP2*1), TMP3 422 | MOVQ -8(TMP3)(LEN*1), TMP6 423 | XORQ TMP6, TMP5 424 | SUBQ $1, TMP1 425 | JGE next_vect_8b 426 | 427 | MOVQ TMP5, -8(DST)(LEN*1) 428 | SUBQ $8, LEN 429 | SUBQ $8, TMP4 430 | JG loop_8b 431 | 432 | CMPQ LEN, $128 433 | JGE aligned 434 | RET 435 | 436 | ret: 437 | RET 438 | 439 | -------------------------------------------------------------------------------- /nosimd.go: -------------------------------------------------------------------------------- 1 | // Copyright 2013 The Go Authors. All rights reserved. 2 | // Use of this source code is governed by a BSD-style 3 | // license that can be found in the LICENSE file. 4 | 5 | package xor 6 | 7 | import ( 8 | "runtime" 9 | "unsafe" 10 | ) 11 | 12 | const wordSize = int(unsafe.Sizeof(uintptr(0))) 13 | const supportsUnaligned = runtime.GOARCH == "386" || runtime.GOARCH == "amd64" || runtime.GOARCH == "ppc64" || runtime.GOARCH == "ppc64le" || runtime.GOARCH == "s390x" 14 | 15 | // xor the bytes in a and b. The destination is assumed to have enough space. 16 | func bytesNoSIMD(dst, a, b []byte, size int) { 17 | if supportsUnaligned { 18 | fastXORBytes(dst, a, b, size) 19 | } else { 20 | // TODO(hanwen): if (dst, a, b) have common alignment 21 | // we could still try fastXORBytes. 
It is not clear 22 | // how often this happens, and it's only worth it if 23 | // the block encryption itself is hardware 24 | // accelerated. 25 | safeXORBytes(dst, a, b, size) 26 | } 27 | } 28 | 29 | // split slice for cache-friendly 30 | const unitSize = 16 * 1024 31 | 32 | func matrixNoSIMD(dst []byte, src [][]byte) { 33 | size := len(src[0]) 34 | start := 0 35 | do := unitSize 36 | for start < size { 37 | end := start + do 38 | if end <= size { 39 | partNoSIMD(start, end, dst, src) 40 | start = start + do 41 | } else { 42 | partNoSIMD(start, size, dst, src) 43 | start = size 44 | } 45 | } 46 | } 47 | 48 | // split vect will improve performance with big data by reducing cache pollution 49 | func partNoSIMD(start, end int, dst []byte, src [][]byte) { 50 | bytesNoSIMD(dst[start:end], src[0][start:end], src[1][start:end], end-start) 51 | for i := 2; i < len(src); i++ { 52 | bytesNoSIMD(dst[start:end], dst[start:end], src[i][start:end], end-start) 53 | } 54 | } 55 | 56 | // fastXORBytes xor in bulk. It only works on architectures that 57 | // support unaligned read/writes. 58 | func fastXORBytes(dst, a, b []byte, n int) { 59 | w := n / wordSize 60 | if w > 0 { 61 | wordBytes := w * wordSize 62 | fastXORWords(dst[:wordBytes], a[:wordBytes], b[:wordBytes]) 63 | } 64 | for i := n - n%wordSize; i < n; i++ { 65 | dst[i] = a[i] ^ b[i] 66 | } 67 | } 68 | 69 | func safeXORBytes(dst, a, b []byte, n int) { 70 | ex := n % 8 71 | for i := 0; i < ex; i++ { 72 | dst[i] = a[i] ^ b[i] 73 | } 74 | 75 | for i := ex; i < n; i += 8 { 76 | _dst := dst[i : i+8] 77 | _a := a[i : i+8] 78 | _b := b[i : i+8] 79 | _dst[0] = _a[0] ^ _b[0] 80 | _dst[1] = _a[1] ^ _b[1] 81 | _dst[2] = _a[2] ^ _b[2] 82 | _dst[3] = _a[3] ^ _b[3] 83 | 84 | _dst[4] = _a[4] ^ _b[4] 85 | _dst[5] = _a[5] ^ _b[5] 86 | _dst[6] = _a[6] ^ _b[6] 87 | _dst[7] = _a[7] ^ _b[7] 88 | } 89 | } 90 | 91 | // fastXORWords XORs multiples of 4 or 8 bytes (depending on architecture.) 92 | // The arguments are assumed to be of equal length. 93 | func fastXORWords(dst, a, b []byte) { 94 | dw := *(*[]uintptr)(unsafe.Pointer(&dst)) 95 | aw := *(*[]uintptr)(unsafe.Pointer(&a)) 96 | bw := *(*[]uintptr)(unsafe.Pointer(&b)) 97 | n := len(b) / wordSize 98 | ex := n % 8 99 | for i := 0; i < ex; i++ { 100 | dw[i] = aw[i] ^ bw[i] 101 | } 102 | 103 | for i := ex; i < n; i += 8 { 104 | _dw := dw[i : i+8] 105 | _aw := aw[i : i+8] 106 | _bw := bw[i : i+8] 107 | _dw[0] = _aw[0] ^ _bw[0] 108 | _dw[1] = _aw[1] ^ _bw[1] 109 | _dw[2] = _aw[2] ^ _bw[2] 110 | _dw[3] = _aw[3] ^ _bw[3] 111 | _dw[4] = _aw[4] ^ _bw[4] 112 | _dw[5] = _aw[5] ^ _bw[5] 113 | _dw[6] = _aw[6] ^ _bw[6] 114 | _dw[7] = _aw[7] ^ _bw[7] 115 | } 116 | } 117 | -------------------------------------------------------------------------------- /sse2_amd64.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // addr of mem 4 | #define DST BX 5 | #define SRC SI 6 | #define SRC0 TMP4 7 | #define SRC1 TMP5 8 | 9 | // loop args 10 | // num of vect 11 | #define VECT CX 12 | #define LEN DX 13 | // pos of matrix 14 | #define POS R8 15 | 16 | // tmp store 17 | // num of vect or ... 18 | #define TMP1 R9 19 | // pos of matrix or ... 20 | #define TMP2 R10 21 | // store addr of data/parity or ... 
22 | #define TMP3 R11 23 | #define TMP4 R12 24 | #define TMP5 R13 25 | #define TMP6 R14 26 | 27 | // func bytesSrc0(dst, src0, src1 []byte) 28 | TEXT ·xorSrc0(SB), NOSPLIT, $0 29 | MOVQ len+32(FP), LEN 30 | CMPQ LEN, $0 31 | JE ret 32 | MOVQ dst+0(FP), DST 33 | MOVQ src0+24(FP), SRC0 34 | MOVQ src1+48(FP), SRC1 35 | TESTQ $15, LEN 36 | JNZ not_aligned 37 | 38 | aligned: 39 | MOVQ $0, POS 40 | 41 | loop16b: 42 | MOVOU (SRC0)(POS*1), X0 43 | XORPD (SRC1)(POS*1), X0 44 | MOVOU X0, (DST)(POS*1) 45 | ADDQ $16, POS 46 | CMPQ LEN, POS 47 | JNE loop16b 48 | RET 49 | 50 | loop_1b: 51 | MOVB -1(SRC0)(LEN*1), TMP1 52 | MOVB -1(SRC1)(LEN*1), TMP2 53 | XORB TMP1, TMP2 54 | MOVB TMP2, -1(DST)(LEN*1) 55 | SUBQ $1, LEN 56 | TESTQ $7, LEN 57 | JNZ loop_1b 58 | CMPQ LEN, $0 59 | JE ret 60 | TESTQ $15, LEN 61 | JZ aligned 62 | 63 | not_aligned: 64 | TESTQ $7, LEN 65 | JNE loop_1b 66 | MOVQ LEN, TMP1 67 | ANDQ $15, TMP1 68 | 69 | loop_8b: 70 | MOVQ -8(SRC0)(LEN*1), TMP2 71 | MOVQ -8(SRC1)(LEN*1), TMP3 72 | XORQ TMP2, TMP3 73 | MOVQ TMP3, -8(DST)(LEN*1) 74 | SUBQ $8, LEN 75 | SUBQ $8, TMP1 76 | JG loop_8b 77 | 78 | CMPQ LEN, $16 79 | JGE aligned 80 | RET 81 | 82 | ret: 83 | RET 84 | 85 | // func bytesSrc1(dst, src0, src1 []byte) 86 | TEXT ·xorSrc1(SB), NOSPLIT, $0 87 | MOVQ len+56(FP), LEN 88 | CMPQ LEN, $0 89 | JE ret 90 | MOVQ dst+0(FP), DST 91 | MOVQ src0+24(FP), SRC0 92 | MOVQ src1+48(FP), SRC1 93 | TESTQ $15, LEN 94 | JNZ not_aligned 95 | 96 | aligned: 97 | MOVQ $0, POS 98 | 99 | loop16b: 100 | MOVOU (SRC0)(POS*1), X0 101 | XORPD (SRC1)(POS*1), X0 102 | MOVOU X0, (DST)(POS*1) 103 | ADDQ $16, POS 104 | CMPQ LEN, POS 105 | JNE loop16b 106 | RET 107 | 108 | loop_1b: 109 | MOVB -1(SRC0)(LEN*1), TMP1 110 | MOVB -1(SRC1)(LEN*1), TMP2 111 | XORB TMP1, TMP2 112 | MOVB TMP2, -1(DST)(LEN*1) 113 | SUBQ $1, LEN 114 | TESTQ $7, LEN 115 | JNZ loop_1b 116 | CMPQ LEN, $0 117 | JE ret 118 | TESTQ $15, LEN 119 | JZ aligned 120 | 121 | not_aligned: 122 | TESTQ $7, LEN 123 | JNE loop_1b 124 | MOVQ LEN, TMP1 125 | ANDQ $15, TMP1 126 | 127 | loop_8b: 128 | MOVQ -8(SRC0)(LEN*1), TMP2 129 | MOVQ -8(SRC1)(LEN*1), TMP3 130 | XORQ TMP2, TMP3 131 | MOVQ TMP3, -8(DST)(LEN*1) 132 | SUBQ $8, LEN 133 | SUBQ $8, TMP1 134 | JG loop_8b 135 | 136 | CMPQ LEN, $16 137 | JGE aligned 138 | RET 139 | 140 | ret: 141 | RET 142 | 143 | // func bytesSSE2mini(dst, src0, src1 []byte, size int) 144 | TEXT ·bytesSSE2mini(SB), NOSPLIT, $0 145 | MOVQ len+72(FP), LEN 146 | CMPQ LEN, $0 147 | JE ret 148 | MOVQ dst+0(FP), DST 149 | MOVQ src0+24(FP), SRC0 150 | MOVQ src1+48(FP), SRC1 151 | TESTQ $15, LEN 152 | JNZ not_aligned 153 | 154 | aligned: 155 | MOVQ $0, POS 156 | 157 | loop16b: 158 | MOVOU (SRC0)(POS*1), X0 159 | XORPD (SRC1)(POS*1), X0 160 | 161 | // MOVOU (SRC1)(POS*1), X4 162 | // PXOR X4, X0 163 | MOVOU X0, (DST)(POS*1) 164 | ADDQ $16, POS 165 | CMPQ LEN, POS 166 | JNE loop16b 167 | RET 168 | 169 | loop_1b: 170 | MOVB -1(SRC0)(LEN*1), TMP1 171 | MOVB -1(SRC1)(LEN*1), TMP2 172 | XORB TMP1, TMP2 173 | MOVB TMP2, -1(DST)(LEN*1) 174 | SUBQ $1, LEN 175 | TESTQ $7, LEN 176 | JNZ loop_1b 177 | CMPQ LEN, $0 178 | JE ret 179 | TESTQ $15, LEN 180 | JZ aligned 181 | 182 | not_aligned: 183 | TESTQ $7, LEN 184 | JNE loop_1b 185 | MOVQ LEN, TMP1 186 | ANDQ $15, TMP1 187 | 188 | loop_8b: 189 | MOVQ -8(SRC0)(LEN*1), TMP2 190 | MOVQ -8(SRC1)(LEN*1), TMP3 191 | XORQ TMP2, TMP3 192 | MOVQ TMP3, -8(DST)(LEN*1) 193 | SUBQ $8, LEN 194 | SUBQ $8, TMP1 195 | JG loop_8b 196 | 197 | CMPQ LEN, $16 198 | JGE aligned 199 | RET 200 | 201 | ret: 202 | RET 203 | 204 | // func 
bytesSSE2small(dst, src0, src1 []byte, size int) 205 | TEXT ·bytesSSE2small(SB), NOSPLIT, $0 206 | MOVQ len+72(FP), LEN 207 | CMPQ LEN, $0 208 | JE ret 209 | MOVQ dst+0(FP), DST 210 | MOVQ src0+24(FP), SRC0 211 | MOVQ src1+48(FP), SRC1 212 | TESTQ $63, LEN 213 | JNZ not_aligned 214 | 215 | aligned: 216 | MOVQ $0, POS 217 | 218 | loop64b: 219 | MOVOU (SRC0)(POS*1), X0 220 | MOVOU 16(SRC0)(POS*1), X1 221 | MOVOU 32(SRC0)(POS*1), X2 222 | MOVOU 48(SRC0)(POS*1), X3 223 | 224 | MOVOU (SRC1)(POS*1), X4 225 | MOVOU 16(SRC1)(POS*1), X5 226 | MOVOU 32(SRC1)(POS*1), X6 227 | MOVOU 48(SRC1)(POS*1), X7 228 | 229 | PXOR X4, X0 230 | PXOR X5, X1 231 | PXOR X6, X2 232 | PXOR X7, X3 233 | 234 | MOVOU X0, (DST)(POS*1) 235 | MOVOU X1, 16(DST)(POS*1) 236 | MOVOU X2, 32(DST)(POS*1) 237 | MOVOU X3, 48(DST)(POS*1) 238 | 239 | ADDQ $64, POS 240 | CMPQ LEN, POS 241 | JNE loop64b 242 | RET 243 | 244 | loop_1b: 245 | MOVB -1(SRC0)(LEN*1), TMP1 246 | MOVB -1(SRC1)(LEN*1), TMP2 247 | XORB TMP1, TMP2 248 | MOVB TMP2, -1(DST)(LEN*1) 249 | SUBQ $1, LEN 250 | TESTQ $7, LEN 251 | JNZ loop_1b 252 | CMPQ LEN, $0 253 | JE ret 254 | TESTQ $63, LEN 255 | JZ aligned 256 | 257 | not_aligned: 258 | TESTQ $7, LEN 259 | JNE loop_1b 260 | MOVQ LEN, TMP1 261 | ANDQ $63, TMP1 262 | 263 | loop_8b: 264 | MOVQ -8(SRC0)(LEN*1), TMP2 265 | MOVQ -8(SRC1)(LEN*1), TMP3 266 | XORQ TMP2, TMP3 267 | MOVQ TMP3, -8(DST)(LEN*1) 268 | SUBQ $8, LEN 269 | SUBQ $8, TMP1 270 | JG loop_8b 271 | 272 | CMPQ LEN, $64 273 | JGE aligned 274 | RET 275 | 276 | ret: 277 | RET 278 | 279 | // func bytesSSE2big(dst, src0, src1 []byte, size int) 280 | TEXT ·bytesSSE2big(SB), NOSPLIT, $0 281 | MOVQ len+72(FP), LEN 282 | CMPQ LEN, $0 283 | JE ret 284 | MOVQ dst+0(FP), DST 285 | MOVQ src0+24(FP), SRC0 286 | MOVQ src1+48(FP), SRC1 287 | TESTQ $63, LEN 288 | JNZ not_aligned 289 | 290 | aligned: 291 | MOVQ $0, POS 292 | 293 | loop64b: 294 | MOVOU (SRC0)(POS*1), X0 295 | MOVOU 16(SRC0)(POS*1), X1 296 | MOVOU 32(SRC0)(POS*1), X2 297 | MOVOU 48(SRC0)(POS*1), X3 298 | 299 | MOVOU (SRC1)(POS*1), X4 300 | MOVOU 16(SRC1)(POS*1), X5 301 | MOVOU 32(SRC1)(POS*1), X6 302 | MOVOU 48(SRC1)(POS*1), X7 303 | 304 | PXOR X4, X0 305 | PXOR X5, X1 306 | PXOR X6, X2 307 | PXOR X7, X3 308 | 309 | LONG $0xe70f4266; WORD $0x0304 // MOVNTDQ 310 | LONG $0xe70f4266; WORD $0x034c; BYTE $0x10 311 | LONG $0xe70f4266; WORD $0x0354; BYTE $0x20 312 | LONG $0xe70f4266; WORD $0x035c; BYTE $0x30 313 | 314 | ADDQ $64, POS 315 | CMPQ LEN, POS 316 | JNE loop64b 317 | RET 318 | 319 | loop_1b: 320 | MOVB -1(SRC0)(LEN*1), TMP1 321 | MOVB -1(SRC1)(LEN*1), TMP2 322 | XORB TMP1, TMP2 323 | MOVB TMP2, -1(DST)(LEN*1) 324 | SUBQ $1, LEN 325 | TESTQ $7, LEN 326 | JNZ loop_1b 327 | CMPQ LEN, $0 328 | JE ret 329 | TESTQ $63, LEN 330 | JZ aligned 331 | 332 | not_aligned: 333 | TESTQ $7, LEN 334 | JNE loop_1b 335 | MOVQ LEN, TMP1 336 | ANDQ $63, TMP1 337 | 338 | loop_8b: 339 | MOVQ -8(SRC0)(LEN*1), TMP2 340 | MOVQ -8(SRC1)(LEN*1), TMP3 341 | XORQ TMP2, TMP3 342 | MOVQ TMP3, -8(DST)(LEN*1) 343 | SUBQ $8, LEN 344 | SUBQ $8, TMP1 345 | JG loop_8b 346 | 347 | CMPQ LEN, $64 348 | JGE aligned 349 | RET 350 | 351 | ret: 352 | RET 353 | 354 | // func matrixSSE2small(dst []byte, src [][]byte) 355 | TEXT ·matrixSSE2small(SB), NOSPLIT, $0 356 | MOVQ dst+0(FP), DST 357 | MOVQ src+24(FP), SRC 358 | MOVQ vec+32(FP), VECT 359 | MOVQ len+8(FP), LEN 360 | TESTQ $63, LEN 361 | JNZ not_aligned 362 | 363 | aligned: 364 | MOVQ $0, POS 365 | 366 | loop64b: 367 | MOVQ VECT, TMP1 368 | SUBQ $2, TMP1 369 | MOVQ $0, TMP2 370 | MOVQ (SRC)(TMP2*1), 
TMP3 371 | MOVQ TMP3, TMP4 372 | MOVOU (TMP3)(POS*1), X0 373 | MOVOU 16(TMP4)(POS*1), X1 374 | MOVOU 32(TMP3)(POS*1), X2 375 | MOVOU 48(TMP4)(POS*1), X3 376 | 377 | next_vect: 378 | ADDQ $24, TMP2 379 | MOVQ (SRC)(TMP2*1), TMP3 380 | MOVQ TMP3, TMP4 381 | MOVOU (TMP3)(POS*1), X4 382 | MOVOU 16(TMP4)(POS*1), X5 383 | MOVOU 32(TMP3)(POS*1), X6 384 | MOVOU 48(TMP4)(POS*1), X7 385 | PXOR X4, X0 386 | PXOR X5, X1 387 | PXOR X6, X2 388 | PXOR X7, X3 389 | SUBQ $1, TMP1 390 | JGE next_vect 391 | 392 | MOVOU X0, (DST)(POS*1) 393 | MOVOU X1, 16(DST)(POS*1) 394 | MOVOU X2, 32(DST)(POS*1) 395 | MOVOU X3, 48(DST)(POS*1) 396 | 397 | ADDQ $64, POS 398 | CMPQ LEN, POS 399 | JNE loop64b 400 | RET 401 | 402 | loop_1b: 403 | MOVQ VECT, TMP1 404 | MOVQ $0, TMP2 405 | MOVQ (SRC)(TMP2*1), TMP3 406 | SUBQ $2, TMP1 407 | MOVB -1(TMP3)(LEN*1), TMP5 408 | 409 | next_vect_1b: 410 | ADDQ $24, TMP2 411 | MOVQ (SRC)(TMP2*1), TMP3 412 | MOVB -1(TMP3)(LEN*1), TMP6 413 | XORB TMP6, TMP5 414 | SUBQ $1, TMP1 415 | JGE next_vect_1b 416 | 417 | MOVB TMP5, -1(DST)(LEN*1) 418 | SUBQ $1, LEN 419 | TESTQ $7, LEN 420 | JNZ loop_1b 421 | 422 | CMPQ LEN, $0 423 | JE ret 424 | TESTQ $63, LEN 425 | JZ aligned 426 | 427 | not_aligned: 428 | TESTQ $7, LEN 429 | JNE loop_1b 430 | MOVQ LEN, TMP4 431 | ANDQ $63, TMP4 432 | 433 | loop_8b: 434 | MOVQ VECT, TMP1 435 | MOVQ $0, TMP2 436 | MOVQ (SRC)(TMP2*1), TMP3 437 | SUBQ $2, TMP1 438 | MOVQ -8(TMP3)(LEN*1), TMP5 439 | 440 | next_vect_8b: 441 | ADDQ $24, TMP2 442 | MOVQ (SRC)(TMP2*1), TMP3 443 | MOVQ -8(TMP3)(LEN*1), TMP6 444 | XORQ TMP6, TMP5 445 | SUBQ $1, TMP1 446 | JGE next_vect_8b 447 | 448 | MOVQ TMP5, -8(DST)(LEN*1) 449 | SUBQ $8, LEN 450 | SUBQ $8, TMP4 451 | JG loop_8b 452 | 453 | CMPQ LEN, $64 454 | JGE aligned 455 | RET 456 | 457 | ret: 458 | RET 459 | 460 | // func matrixSSE2big(dst []byte, src [][]byte) 461 | TEXT ·matrixSSE2big(SB), NOSPLIT, $0 462 | MOVQ dst+0(FP), DST 463 | MOVQ src+24(FP), SRC 464 | MOVQ vec+32(FP), VECT 465 | MOVQ len+8(FP), LEN 466 | TESTQ $63, LEN 467 | JNZ not_aligned 468 | 469 | aligned: 470 | MOVQ $0, POS 471 | 472 | loop64b: 473 | MOVQ VECT, TMP1 474 | SUBQ $2, TMP1 475 | MOVQ $0, TMP2 476 | MOVQ (SRC)(TMP2*1), TMP3 477 | MOVQ TMP3, TMP4 478 | MOVOU (TMP3)(POS*1), X0 479 | MOVOU 16(TMP4)(POS*1), X1 480 | MOVOU 32(TMP3)(POS*1), X2 481 | MOVOU 48(TMP4)(POS*1), X3 482 | 483 | next_vect: 484 | ADDQ $24, TMP2 485 | MOVQ (SRC)(TMP2*1), TMP3 486 | MOVQ TMP3, TMP4 487 | MOVOU (TMP3)(POS*1), X4 488 | MOVOU 16(TMP4)(POS*1), X5 489 | MOVOU 32(TMP3)(POS*1), X6 490 | MOVOU 48(TMP4)(POS*1), X7 491 | PXOR X4, X0 492 | PXOR X5, X1 493 | PXOR X6, X2 494 | PXOR X7, X3 495 | SUBQ $1, TMP1 496 | JGE next_vect 497 | 498 | LONG $0xe70f4266; WORD $0x0304 499 | LONG $0xe70f4266; WORD $0x034c; BYTE $0x10 500 | LONG $0xe70f4266; WORD $0x0354; BYTE $0x20 501 | LONG $0xe70f4266; WORD $0x035c; BYTE $0x30 502 | 503 | ADDQ $64, POS 504 | CMPQ LEN, POS 505 | JNE loop64b 506 | RET 507 | 508 | loop_1b: 509 | MOVQ VECT, TMP1 510 | MOVQ $0, TMP2 511 | MOVQ (SRC)(TMP2*1), TMP3 512 | SUBQ $2, TMP1 513 | MOVB -1(TMP3)(LEN*1), TMP5 514 | 515 | next_vect_1b: 516 | ADDQ $24, TMP2 517 | MOVQ (SRC)(TMP2*1), TMP3 518 | MOVB -1(TMP3)(LEN*1), TMP6 519 | XORB TMP6, TMP5 520 | SUBQ $1, TMP1 521 | JGE next_vect_1b 522 | 523 | MOVB TMP5, -1(DST)(LEN*1) 524 | SUBQ $1, LEN 525 | TESTQ $7, LEN 526 | JNZ loop_1b 527 | 528 | CMPQ LEN, $0 529 | JE ret 530 | TESTQ $63, LEN 531 | JZ aligned 532 | 533 | not_aligned: 534 | TESTQ $7, LEN 535 | JNE loop_1b 536 | MOVQ LEN, TMP4 537 | ANDQ $63, TMP4 538 | 539 | 
loop_8b: 540 | MOVQ VECT, TMP1 541 | MOVQ $0, TMP2 542 | MOVQ (SRC)(TMP2*1), TMP3 543 | SUBQ $2, TMP1 544 | MOVQ -8(TMP3)(LEN*1), TMP5 545 | 546 | next_vect_8b: 547 | ADDQ $24, TMP2 548 | MOVQ (SRC)(TMP2*1), TMP3 549 | MOVQ -8(TMP3)(LEN*1), TMP6 550 | XORQ TMP6, TMP5 551 | SUBQ $1, TMP1 552 | JGE next_vect_8b 553 | 554 | MOVQ TMP5, -8(DST)(LEN*1) 555 | SUBQ $8, LEN 556 | SUBQ $8, TMP4 557 | JG loop_8b 558 | 559 | CMPQ LEN, $64 560 | JGE aligned 561 | RET 562 | 563 | ret: 564 | RET 565 | 566 | TEXT ·hasSSE2(SB), NOSPLIT, $0 567 | XORQ AX, AX 568 | INCL AX 569 | CPUID 570 | SHRQ $26, DX 571 | ANDQ $1, DX 572 | MOVB DX, ret+0(FP) 573 | RET 574 | 575 | -------------------------------------------------------------------------------- /xor.go: -------------------------------------------------------------------------------- 1 | package xor 2 | 3 | // SIMD Extensions 4 | const ( 5 | none = iota 6 | avx2 7 | // first introduced by Intel with the initial version of the Pentium 4 in 2001 8 | // so I think we can assume all amd64 has sse2 9 | sse2 10 | ) 11 | 12 | var extension = none 13 | 14 | // Bytes : chose the shortest one as xor size 15 | // it's better to use it for big data ( > 64bytes ) 16 | func Bytes(dst, src0, src1 []byte) { 17 | size := len(dst) 18 | if size > len(src0) { 19 | size = len(src0) 20 | } 21 | if size > len(src1) { 22 | size = len(src1) 23 | } 24 | xorBytes(dst, src0, src1, size) 25 | } 26 | 27 | // BytesSameLen : all slice's length must be equal 28 | // cut size branch, save time for small data 29 | func BytesSameLen(dst, src0, src1 []byte) { 30 | xorSrc1(dst, src0, src1) 31 | } 32 | 33 | // BytesSrc0 : src1 >= src0, dst >= src0 34 | // xor src0's len bytes 35 | func BytesSrc0(dst, src0, src1 []byte) { 36 | xorSrc0(dst, src0, src1) 37 | } 38 | 39 | // BytesSrc1 : src0 >= src1, dst >= src1 40 | // xor src1's len bytes 41 | func BytesSrc1(dst, src0, src1 []byte) { 42 | xorSrc1(dst, src0, src1) 43 | } 44 | 45 | // Matrix : all slice's length must be equal && != 0 46 | // len(src) must >= 2 47 | func Matrix(dst []byte, src [][]byte) { 48 | xorMatrix(dst, src) 49 | } 50 | -------------------------------------------------------------------------------- /xor_amd64.go: -------------------------------------------------------------------------------- 1 | package xor 2 | 3 | import "github.com/templexxx/cpufeat" 4 | 5 | func init() { 6 | getEXT() 7 | } 8 | 9 | func getEXT() { 10 | if cpufeat.X86.HasAVX2 { 11 | extension = avx2 12 | } else { 13 | extension = sse2 14 | } 15 | return 16 | } 17 | 18 | func xorBytes(dst, src0, src1 []byte, size int) { 19 | switch extension { 20 | case avx2: 21 | bytesAVX2(dst, src0, src1, size) 22 | default: 23 | bytesSSE2(dst, src0, src1, size) 24 | } 25 | } 26 | 27 | // non-temporal hint store 28 | const nontmp = 8 * 1024 29 | const avx2loopsize = 128 30 | 31 | func bytesAVX2(dst, src0, src1 []byte, size int) { 32 | if size < avx2loopsize { 33 | bytesAVX2mini(dst, src0, src1, size) 34 | } else if size >= avx2loopsize && size <= nontmp { 35 | bytesAVX2small(dst, src0, src1, size) 36 | } else { 37 | bytesAVX2big(dst, src0, src1, size) 38 | } 39 | } 40 | 41 | const sse2loopsize = 64 42 | 43 | func bytesSSE2(dst, src0, src1 []byte, size int) { 44 | if size < sse2loopsize { 45 | bytesSSE2mini(dst, src0, src1, size) 46 | } else if size >= sse2loopsize && size <= nontmp { 47 | bytesSSE2small(dst, src0, src1, size) 48 | } else { 49 | bytesSSE2big(dst, src0, src1, size) 50 | } 51 | } 52 | 53 | func xorMatrix(dst []byte, src [][]byte) { 54 | switch extension { 55 | 
case avx2: 56 | matrixAVX2(dst, src) 57 | default: 58 | matrixSSE2(dst, src) 59 | } 60 | } 61 | 62 | func matrixAVX2(dst []byte, src [][]byte) { 63 | size := len(dst) 64 | if size > nontmp { 65 | matrixAVX2big(dst, src) 66 | } else { 67 | matrixAVX2small(dst, src) 68 | } 69 | } 70 | 71 | func matrixSSE2(dst []byte, src [][]byte) { 72 | size := len(dst) 73 | if size > nontmp { 74 | matrixSSE2big(dst, src) 75 | } else { 76 | matrixSSE2small(dst, src) 77 | } 78 | } 79 | 80 | //go:noescape 81 | func xorSrc0(dst, src0, src1 []byte) 82 | 83 | //go:noescape 84 | func xorSrc1(dst, src0, src1 []byte) 85 | 86 | //go:noescape 87 | func bytesAVX2mini(dst, src0, src1 []byte, size int) 88 | 89 | //go:noescape 90 | func bytesAVX2big(dst, src0, src1 []byte, size int) 91 | 92 | //go:noescape 93 | func bytesAVX2small(dst, src0, src1 []byte, size int) 94 | 95 | //go:noescape 96 | func bytesSSE2mini(dst, src0, src1 []byte, size int) 97 | 98 | //go:noescape 99 | func bytesSSE2small(dst, src0, src1 []byte, size int) 100 | 101 | //go:noescape 102 | func bytesSSE2big(dst, src0, src1 []byte, size int) 103 | 104 | //go:noescape 105 | func matrixAVX2small(dst []byte, src [][]byte) 106 | 107 | //go:noescape 108 | func matrixAVX2big(dst []byte, src [][]byte) 109 | 110 | //go:noescape 111 | func matrixSSE2small(dst []byte, src [][]byte) 112 | 113 | //go:noescape 114 | func matrixSSE2big(dst []byte, src [][]byte) 115 | 116 | //go:noescape 117 | func hasAVX2() bool 118 | 119 | //go:noescape 120 | func hasSSE2() bool 121 | -------------------------------------------------------------------------------- /xor_other.go: -------------------------------------------------------------------------------- 1 | // +build !amd64 noasm 2 | 3 | package xor 4 | 5 | func xorBytes(dst, src0, src1 []byte, size int) { 6 | bytesNoSIMD(dst, src0, src1, size) 7 | } 8 | 9 | func xorMatrix(dst []byte, src [][]byte) { 10 | matrixNoSIMD(dst, src) 11 | } 12 | 13 | func xorSrc0(dst, src0, src1 []byte) { 14 | bytesNoSIMD(dst, src0, src1, len(src0)) 15 | } 16 | 17 | func xorSrc1(dst, src0, src1 []byte) { 18 | bytesNoSIMD(dst, src0, src1, len(src1)) 19 | } 20 | -------------------------------------------------------------------------------- /xor_test.go: -------------------------------------------------------------------------------- 1 | package xor 2 | 3 | import ( 4 | "bytes" 5 | "math/rand" 6 | "testing" 7 | ) 8 | 9 | func TestVerifyBytesNoSIMD(t *testing.T) { 10 | for i := 1; i <= unitSize+16+2; i++ { 11 | if !verifyBytesNoSIMD(i) { 12 | t.Fatal("xor fault ", "size:", i) 13 | } 14 | } 15 | } 16 | 17 | func verifyBytesNoSIMD(size int) bool { 18 | dst := make([]byte, size) 19 | src0 := make([]byte, size) 20 | src1 := make([]byte, size) 21 | expect := make([]byte, size) 22 | rand.Seed(7) 23 | fillRandom(src0) 24 | rand.Seed(8) 25 | fillRandom(src1) 26 | for i := 0; i < size; i++ { 27 | expect[i] = src0[i] ^ src1[i] 28 | } 29 | xorBytes(dst, src0, src1, size) 30 | return bytes.Equal(expect, dst) 31 | } 32 | 33 | func TestVerifyBytes(t *testing.T) { 34 | for i := 1; i <= unitSize+16+2; i++ { 35 | if !verifyBytes(i) { 36 | t.Fatal("xor fault ", "size:", i) 37 | } 38 | } 39 | } 40 | 41 | func verifyBytes(size int) bool { 42 | dst := make([]byte, size) 43 | src0 := make([]byte, size) 44 | src1 := make([]byte, size) 45 | expect := make([]byte, size) 46 | rand.Seed(7) 47 | fillRandom(src0) 48 | rand.Seed(8) 49 | fillRandom(src1) 50 | for i := 0; i < size; i++ { 51 | expect[i] = src0[i] ^ src1[i] 52 | } 53 | xorBytes(dst, src0, src1, size) 54 | return 
bytes.Equal(expect, dst) 55 | } 56 | 57 | func TestVerifyBytesSrc1(t *testing.T) { 58 | for i := 1; i <= unitSize+16+2; i++ { 59 | if !verifyBytesSrc1(i) { 60 | t.Fatal("xor fault ", "size:", i) 61 | } 62 | } 63 | } 64 | func verifyBytesSrc1(size int) bool { 65 | dst := make([]byte, size) 66 | src0 := make([]byte, size) 67 | src1 := make([]byte, size) 68 | expect := make([]byte, size) 69 | rand.Seed(7) 70 | fillRandom(src0) 71 | rand.Seed(8) 72 | fillRandom(src1) 73 | for i := 0; i < size; i++ { 74 | expect[i] = src0[i] ^ src1[i] 75 | } 76 | xorSrc0(dst, src0, src1) 77 | return bytes.Equal(expect, dst) 78 | } 79 | 80 | func TestVerifyMatrixNoSIMD(t *testing.T) { 81 | for i := 1; i <= unitSize+16+2; i++ { 82 | if !verifyMatrixNoSIMD(i) { 83 | t.Fatal("xor fault ", "size:", i) 84 | } 85 | } 86 | } 87 | 88 | func verifyMatrixNoSIMD(size int) bool { 89 | numSRC := 3 90 | dst := make([]byte, size) 91 | expect := make([]byte, size) 92 | src := make([][]byte, numSRC) 93 | for i := 0; i < numSRC; i++ { 94 | src[i] = make([]byte, size) 95 | rand.Seed(int64(i)) 96 | fillRandom(src[i]) 97 | } 98 | for i := 0; i < size; i++ { 99 | expect[i] = src[0][i] ^ src[1][i] 100 | } 101 | for i := 2; i < numSRC; i++ { 102 | for j := 0; j < size; j++ { 103 | expect[j] ^= src[i][j] 104 | } 105 | } 106 | matrixNoSIMD(dst, src) 107 | return bytes.Equal(expect, dst) 108 | } 109 | 110 | func TestVerifyMatrix(t *testing.T) { 111 | for i := 1; i <= unitSize+16+2; i++ { 112 | if !verifyMatrix(i) { 113 | t.Fatal("xor fault ", "size:", i) 114 | } 115 | } 116 | } 117 | 118 | func verifyMatrix(size int) bool { 119 | numSRC := 3 120 | dst := make([]byte, size) 121 | expect := make([]byte, size) 122 | src := make([][]byte, numSRC) 123 | for i := 0; i < numSRC; i++ { 124 | src[i] = make([]byte, size) 125 | rand.Seed(int64(i)) 126 | fillRandom(src[i]) 127 | } 128 | for i := 0; i < size; i++ { 129 | expect[i] = src[0][i] ^ src[1][i] 130 | } 131 | for i := 2; i < numSRC; i++ { 132 | for j := 0; j < size; j++ { 133 | expect[j] ^= src[i][j] 134 | } 135 | } 136 | xorMatrix(dst, src) 137 | return bytes.Equal(expect, dst) 138 | } 139 | 140 | func BenchmarkBytesNoSIMDx12B(b *testing.B) { 141 | benchmarkBytesNoSIMD(b, 12) 142 | } 143 | func BenchmarkBytes12B(b *testing.B) { 144 | benchmarkBytesMini(b, 12) 145 | } 146 | func BenchmarkBytesNoSIMD16B(b *testing.B) { 147 | benchmarkBytesNoSIMD(b, 16) 148 | } 149 | func BenchmarkBytes16B(b *testing.B) { 150 | benchmarkBytesMini(b, 16) 151 | } 152 | func BenchmarkBytesNoSIMD24B(b *testing.B) { 153 | benchmarkBytesNoSIMD(b, 24) 154 | } 155 | func BenchmarkBytes24B(b *testing.B) { 156 | benchmarkBytesMini(b, 24) 157 | } 158 | func BenchmarkBytesNoSIMD32B(b *testing.B) { 159 | benchmarkBytesNoSIMD(b, 32) 160 | } 161 | func BenchmarkBytes32B(b *testing.B) { 162 | benchmarkBytesMini(b, 32) 163 | } 164 | func benchmarkBytesMini(b *testing.B, size int) { 165 | src0 := make([]byte, size) 166 | src1 := make([]byte, size) 167 | dst := make([]byte, size) 168 | rand.Seed(int64(0)) 169 | fillRandom(src0) 170 | rand.Seed(int64(1)) 171 | fillRandom(src1) 172 | BytesSrc1(dst, src0, src1) 173 | b.SetBytes(int64(size) * 2) 174 | b.ResetTimer() 175 | for i := 0; i < b.N; i++ { 176 | BytesSrc1(dst, src0, src1) 177 | } 178 | } 179 | 180 | func BenchmarkBytesNoSIMD1K(b *testing.B) { 181 | benchmarkBytesNoSIMD(b, 1024) 182 | } 183 | func BenchmarkBytesNoSIMD16K(b *testing.B) { 184 | benchmarkBytesNoSIMD(b, 16*1024) 185 | } 186 | func BenchmarkBytesNoSIMD16M(b *testing.B) { 187 | benchmarkBytesNoSIMD(b, 16*1024*1024) 
188 | } 189 | func benchmarkBytesNoSIMD(b *testing.B, size int) { 190 | src1 := make([]byte, size) 191 | src2 := make([]byte, size) 192 | dst := make([]byte, size) 193 | rand.Seed(int64(0)) 194 | fillRandom(src1) 195 | rand.Seed(int64(1)) 196 | fillRandom(src2) 197 | bytesNoSIMD(dst, src1, src2, size) 198 | b.SetBytes(int64(size) * 2) 199 | b.ResetTimer() 200 | for i := 0; i < b.N; i++ { 201 | bytesNoSIMD(dst, src1, src2, size) 202 | } 203 | } 204 | 205 | func BenchmarkBytes1K(b *testing.B) { 206 | benchmarkBytes(b, 1024) 207 | } 208 | func BenchmarkBytes16K(b *testing.B) { 209 | benchmarkBytes(b, 16*1024) 210 | } 211 | func BenchmarkBytes16M(b *testing.B) { 212 | benchmarkBytes(b, 16*1024*1024) 213 | } 214 | 215 | // compare with bytes 216 | func BenchmarkMatrix2x1K(b *testing.B) { 217 | benchmarkMatrix(b, 2, 1024) 218 | } 219 | func BenchmarkMatrix2x16K(b *testing.B) { 220 | benchmarkMatrix(b, 2, 16*1024) 221 | } 222 | func BenchmarkMatrix2x16M(b *testing.B) { 223 | benchmarkMatrix(b, 2, 16*1024*1024) 224 | } 225 | func benchmarkBytes(b *testing.B, size int) { 226 | src1 := make([]byte, size) 227 | src2 := make([]byte, size) 228 | dst := make([]byte, size) 229 | rand.Seed(int64(0)) 230 | fillRandom(src1) 231 | rand.Seed(int64(1)) 232 | fillRandom(src2) 233 | xorBytes(dst, src1, src2, size) 234 | b.SetBytes(int64(size) * 2) 235 | b.ResetTimer() 236 | for i := 0; i < b.N; i++ { 237 | xorBytes(dst, src1, src2, size) 238 | } 239 | } 240 | 241 | func BenchmarkMatrixNoSIMD5x1K(b *testing.B) { 242 | benchmarkMatrixNoSIMD(b, 5, 1024) 243 | } 244 | func BenchmarkMatrixNoSIMD5x16K(b *testing.B) { 245 | benchmarkMatrixNoSIMD(b, 5, 16*1024) 246 | } 247 | func BenchmarkMatrixNoSIMD5x16M(b *testing.B) { 248 | benchmarkMatrixNoSIMD(b, 5, 16*1024*1024) 249 | } 250 | func benchmarkMatrixNoSIMD(b *testing.B, numSRC, size int) { 251 | src := make([][]byte, numSRC) 252 | dst := make([]byte, size) 253 | for i := 0; i < numSRC; i++ { 254 | rand.Seed(int64(i)) 255 | src[i] = make([]byte, size) 256 | fillRandom(src[i]) 257 | } 258 | matrixNoSIMD(dst, src) 259 | b.SetBytes(int64(size * numSRC)) 260 | b.ResetTimer() 261 | for i := 0; i < b.N; i++ { 262 | matrixNoSIMD(dst, src) 263 | } 264 | } 265 | 266 | func BenchmarkMatrix5x1K(b *testing.B) { 267 | benchmarkMatrix(b, 5, 1024) 268 | } 269 | func BenchmarkMatrix5x16K(b *testing.B) { 270 | benchmarkMatrix(b, 5, 16*1024) 271 | } 272 | func BenchmarkMatrix5x16M(b *testing.B) { 273 | benchmarkMatrix(b, 5, 16*1024*1024) 274 | } 275 | func benchmarkMatrix(b *testing.B, numSRC, size int) { 276 | src := make([][]byte, numSRC) 277 | dst := make([]byte, size) 278 | for i := 0; i < numSRC; i++ { 279 | rand.Seed(int64(i)) 280 | src[i] = make([]byte, size) 281 | fillRandom(src[i]) 282 | } 283 | xorMatrix(dst, src) 284 | b.SetBytes(int64(size * numSRC)) 285 | b.ResetTimer() 286 | for i := 0; i < b.N; i++ { 287 | xorMatrix(dst, src) 288 | } 289 | } 290 | 291 | func fillRandom(p []byte) { 292 | for i := 0; i < len(p); i += 7 { 293 | val := rand.Int63() 294 | for j := 0; i+j < len(p) && j < 7; j++ { 295 | p[i+j] = byte(val) 296 | val >>= 8 297 | } 298 | } 299 | } 300 | --------------------------------------------------------------------------------
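For quick reference, below is a minimal usage sketch of the public API defined in xor.go above (`Bytes` and `Matrix`). It is not part of the repository: the import path follows the README, and the buffer size and fill values are arbitrary illustration choices.

```go
package main

import (
	"fmt"

	"github.com/templexxx/xor"
)

func main() {
	size := 64 * 1024 // arbitrary example size

	src0 := make([]byte, size)
	src1 := make([]byte, size)
	src2 := make([]byte, size)
	dst := make([]byte, size)
	for i := 0; i < size; i++ {
		src0[i] = byte(i)
		src1[i] = byte(i * 7)
		src2[i] = byte(i * 13)
	}

	// Bytes XORs two slices into dst; the shortest of the three lengths
	// is used as the XOR size (see xor.Bytes in xor.go).
	xor.Bytes(dst, src0, src1)
	fmt.Println(dst[0] == src0[0]^src1[0]) // true

	// Matrix XORs two or more equal-length slices into dst
	// (len(src) must be >= 2, all lengths equal and non-zero).
	xor.Matrix(dst, [][]byte{src0, src1, src2})
	fmt.Println(dst[0] == src0[0]^src1[0]^src2[0]) // true
}
```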