├── README ├── mem.s ├── memcsa15.s ├── pospopcnt_test.go ├── reference.go ├── reg.s ├── regcsa15.s ├── regcsa3.s ├── regcsa7.s └── scalar.s /README: -------------------------------------------------------------------------------- 1 | An 8 bit positional population count implementation for Go using AVX-2. 2 | 3 | The algorithm gathers corresponding bits of a the 32 bytes in an AVX 4 | register using vpmovmskb and then performs scalar popcnt instructions 5 | to compute the populations. The 8 counters are kept in 8 general 6 | purpose registers for the duration of the algorithm. This algorithm 7 | achieves up to 11 GB/s on Haswell and Skylake. 8 | 9 | An enhanced implementation performs one CSA step to reduce 96 bytes into 10 | 64 before adding the results to counters with interleaved dependency 11 | chains. This one achieves up to 16 GB/s. 12 | 13 | This code is the result of a Stack Overflow question: https://stackoverflow.com/q/63248047/417501 14 | 15 | This code is an experiment and should not be integrated into third party 16 | programs. Use http://github.com/fuzxxl/pospop instead. 17 | 18 | Benchmark results on a Intel(R) Xeon(R) W-2133 CPU @ 3.60GHz processor. 19 | 20 | goos: linux 21 | goarch: amd64 22 | BenchmarkReference/10-12 14835181 80.6 ns/op 124.12 MB/s 23 | BenchmarkReference/32-12 4425584 258 ns/op 124.24 MB/s 24 | BenchmarkReference/1000-12 126886 7919 ns/op 126.28 MB/s 25 | BenchmarkReference/2000-12 69042 15809 ns/op 126.51 MB/s 26 | BenchmarkReference/4000-12 36360 31707 ns/op 126.16 MB/s 27 | BenchmarkReference/10000-12 15001 78828 ns/op 126.86 MB/s 28 | BenchmarkReference/100000-12 1291 795464 ns/op 125.71 MB/s 29 | BenchmarkReference/10000000-12 14 78161869 ns/op 127.94 MB/s 30 | BenchmarkReference/1000000000-12 1 7779490681 ns/op 128.54 MB/s 31 | BenchmarkScalarReg/10-12 49062314 23.5 ns/op 425.79 MB/s 32 | BenchmarkScalarReg/32-12 16471662 72.1 ns/op 443.77 MB/s 33 | BenchmarkScalarReg/1000-12 530245 2242 ns/op 445.94 MB/s 34 | BenchmarkScalarReg/2000-12 267555 4471 ns/op 447.28 MB/s 35 | BenchmarkScalarReg/4000-12 133797 8931 ns/op 447.90 MB/s 36 | BenchmarkScalarReg/10000-12 50118 22347 ns/op 447.49 MB/s 37 | BenchmarkScalarReg/100000-12 5326 222991 ns/op 448.45 MB/s 38 | BenchmarkScalarReg/10000000-12 51 21961648 ns/op 455.34 MB/s 39 | BenchmarkScalarReg/1000000000-12 1 2198404416 ns/op 454.88 MB/s 40 | BenchmarkScalarMem/10-12 33757770 35.9 ns/op 278.25 MB/s 41 | BenchmarkScalarMem/32-12 10092933 104 ns/op 306.44 MB/s 42 | BenchmarkScalarMem/1000-12 375811 3195 ns/op 312.95 MB/s 43 | BenchmarkScalarMem/2000-12 188787 6351 ns/op 314.93 MB/s 44 | BenchmarkScalarMem/4000-12 82556 12662 ns/op 315.90 MB/s 45 | BenchmarkScalarMem/10000-12 36529 31582 ns/op 316.63 MB/s 46 | BenchmarkScalarMem/100000-12 3664 317268 ns/op 315.19 MB/s 47 | BenchmarkScalarMem/10000000-12 37 31625600 ns/op 316.20 MB/s 48 | BenchmarkScalarMem/1000000000-12 1 3160044812 ns/op 316.45 MB/s 49 | BenchmarkReg/10-12 48229350 24.0 ns/op 417.40 MB/s 50 | BenchmarkReg/32-12 185803699 6.14 ns/op 5212.18 MB/s 51 | BenchmarkReg/1000-12 9914378 108 ns/op 9299.97 MB/s 52 | BenchmarkReg/2000-12 5322217 211 ns/op 9469.33 MB/s 53 | BenchmarkReg/4000-12 3261402 356 ns/op 11235.73 MB/s 54 | BenchmarkReg/10000-12 1305704 888 ns/op 11256.88 MB/s 55 | BenchmarkReg/100000-12 116280 8679 ns/op 11522.49 MB/s 56 | BenchmarkReg/10000000-12 1310 899563 ns/op 11116.51 MB/s 57 | BenchmarkReg/1000000000-12 13 90355989 ns/op 11067.33 MB/s 58 | BenchmarkRegCSA3/10-12 48295509 24.1 ns/op 415.67 MB/s 59 | BenchmarkRegCSA3/32-12 192285118 6.23 ns/op 5133.08 MB/s 60 | BenchmarkRegCSA3/1000-12 14713641 79.8 ns/op 12530.09 MB/s 61 | BenchmarkRegCSA3/2000-12 6915092 160 ns/op 12474.22 MB/s 62 | BenchmarkRegCSA3/4000-12 4544103 249 ns/op 16076.66 MB/s 63 | BenchmarkRegCSA3/10000-12 1852636 643 ns/op 15544.93 MB/s 64 | BenchmarkRegCSA3/100000-12 197012 6080 ns/op 16447.29 MB/s 65 | BenchmarkRegCSA3/10000000-12 1712 677164 ns/op 14767.46 MB/s 66 | BenchmarkRegCSA3/1000000000-12 16 69779618 ns/op 14330.83 MB/s 67 | BenchmarkRegCSA7/10-12 47901508 23.9 ns/op 418.06 MB/s 68 | BenchmarkRegCSA7/32-12 186971718 6.26 ns/op 5113.49 MB/s 69 | BenchmarkRegCSA7/1000-12 16944604 70.8 ns/op 14120.96 MB/s 70 | BenchmarkRegCSA7/2000-12 7842502 140 ns/op 14311.54 MB/s 71 | BenchmarkRegCSA7/4000-12 5453896 202 ns/op 19824.29 MB/s 72 | BenchmarkRegCSA7/10000-12 2285653 512 ns/op 19520.84 MB/s 73 | BenchmarkRegCSA7/100000-12 250798 4740 ns/op 21095.40 MB/s 74 | BenchmarkRegCSA7/10000000-12 2007 584689 ns/op 17103.11 MB/s 75 | BenchmarkRegCSA7/1000000000-12 18 66597449 ns/op 15015.59 MB/s 76 | BenchmarkMem/10-12 32649992 35.7 ns/op 280.11 MB/s 77 | BenchmarkMem/32-12 191536897 6.00 ns/op 5336.23 MB/s 78 | BenchmarkMem/1000-12 7814206 138 ns/op 7243.51 MB/s 79 | BenchmarkMem/2000-12 4420798 259 ns/op 7710.20 MB/s 80 | BenchmarkMem/4000-12 2801730 409 ns/op 9790.36 MB/s 81 | BenchmarkMem/10000-12 1090711 1075 ns/op 9304.78 MB/s 82 | BenchmarkMem/100000-12 100048 10439 ns/op 9579.30 MB/s 83 | BenchmarkMem/10000000-12 1106 1078156 ns/op 9275.10 MB/s 84 | BenchmarkMem/1000000000-12 10 111021710 ns/op 9007.25 MB/s 85 | PASS 86 | ok pospopcnt 125.448s 87 | -------------------------------------------------------------------------------- /mem.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // func PospopcntMem(counts *[8]int32, buf []byte) 4 | TEXT ·PospopcntMem(SB),NOSPLIT,$0-32 5 | MOVQ counts+0(FP), DI 6 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 7 | MOVQ buf_len+16(FP), CX // CX = len(buf) 8 | 9 | SUBQ $32, CX // pre-subtract 32 bit from CX 10 | JL scalar 11 | 12 | vector: VMOVDQU (SI), Y0 // load 32 bytes from buf 13 | PREFETCHT1 384(SI) 14 | ADDQ $32, SI // advance SI past them 15 | 16 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 17 | POPCNTL AX, AX // count population of AX 18 | ADDL AX, 4*7(DI) // add to counter 19 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 20 | 21 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 22 | POPCNTL AX, AX // count population of AX 23 | ADDL AX, 4*6(DI) // add to counter 24 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 25 | 26 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 27 | POPCNTL AX, AX // count population of AX 28 | ADDL AX, 4*5(DI) // add to counter 29 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 30 | 31 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 32 | POPCNTL AX, AX // count population of AX 33 | ADDL AX, 4*4(DI) // add to counter 34 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 35 | 36 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 37 | POPCNTL AX, AX // count population of AX 38 | ADDL AX, 4*3(DI) // add to counter 39 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 40 | 41 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 42 | POPCNTL AX, AX // count population of AX 43 | ADDL AX, 4*2(DI) // add to counter 44 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 45 | 46 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 47 | POPCNTL AX, AX // count population of AX 48 | ADDL AX, 4*1(DI) // add to counter 49 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 50 | 51 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 52 | POPCNTL AX, AX // count population of AX 53 | ADDL AX, 4*0(DI) // add to counter 54 | 55 | SUBQ $32, CX 56 | JGE vector // repeat as long as bytes are left 57 | 58 | scalar: ADDQ $32, CX // undo last subtraction 59 | JE done // if CX=0, there's nothing left 60 | 61 | loop: MOVBLZX (SI), AX // load a byte from buf 62 | INCQ SI // advance past it 63 | 64 | SHRL $1, AX 65 | ADCL $0, 4*0(DI) // add it to the counters 66 | 67 | SHRL $1, AX 68 | ADCL $0, 4*1(DI) // add it to the counters 69 | 70 | SHRL $1, AX 71 | ADCL $0, 4*2(DI) // add it to the counters 72 | 73 | SHRL $1, AX 74 | ADCL $0, 4*3(DI) // add it to the counters 75 | 76 | SHRL $1, AX 77 | ADCL $0, 4*4(DI) // add it to the counters 78 | 79 | SHRL $1, AX 80 | ADCL $0, 4*5(DI) // add it to the counters 81 | 82 | SHRL $1, AX 83 | ADCL $0, 4*6(DI) // add it to the counters 84 | 85 | SHRL $1, AX 86 | ADCL $0, 4*7(DI) // add it to the counters 87 | 88 | DECQ CX // mark this byte as done 89 | JNE loop // and proceed if any bytes are left 90 | 91 | done: VZEROUPPER // restore SSE-compatibility 92 | RET 93 | -------------------------------------------------------------------------------- /memcsa15.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | #define CSA(A, B, C, D) \ 4 | VPAND A, B, D \ 5 | VPXOR A, B, A \ 6 | VPAND A, C, B \ 7 | VPXOR A, C, A \ 8 | VPOR B, D, B 9 | 10 | // func PospopcntMemCSA15(counts *[8]int32, buf []byte) 11 | TEXT ·PospopcntMemCSA15(SB),NOSPLIT,$0-32 12 | MOVQ counts+0(FP), DI 13 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 14 | MOVQ buf_len+16(FP), CX // CX = len(buf) 15 | 16 | SUBQ $15*32, CX // pre-decrement CX 17 | JL vec3 18 | 19 | vec15: VMOVDQU 0*32(SI), Y0 // load 480 bytes from buf into Y0--Y14 20 | VMOVDQU 1*32(SI), Y1 21 | VMOVDQU 2*32(SI), Y2 22 | CSA(Y0, Y1, Y2, Y15) 23 | 24 | VMOVDQU 3*32(SI), Y3 25 | VMOVDQU 4*32(SI), Y4 26 | VMOVDQU 5*32(SI), Y5 27 | CSA(Y3, Y4, Y5, Y15) 28 | 29 | VMOVDQU 6*32(SI), Y6 30 | VMOVDQU 7*32(SI), Y7 31 | VMOVDQU 8*32(SI), Y8 32 | CSA(Y6, Y7, Y8, Y15) 33 | 34 | VMOVDQU 9*32(SI), Y9 35 | VMOVDQU 10*32(SI), Y10 36 | VMOVDQU 11*32(SI), Y11 37 | CSA(Y9, Y10, Y11, Y15) 38 | 39 | VMOVDQU 12*32(SI), Y12 40 | VMOVDQU 13*32(SI), Y13 41 | VMOVDQU 14*32(SI), Y14 42 | CSA(Y12, Y13, Y14, Y15) 43 | 44 | ADDQ $15*32, SI 45 | #define D 48 46 | PREFETCHT0 (D+ 0)*32(SI) 47 | PREFETCHT0 (D+ 2)*32(SI) 48 | PREFETCHT0 (D+ 4)*32(SI) 49 | PREFETCHT0 (D+ 6)*32(SI) 50 | PREFETCHT0 (D+ 8)*32(SI) 51 | PREFETCHT0 (D+10)*32(SI) 52 | PREFETCHT0 (D+12)*32(SI) 53 | PREFETCHT0 (D+14)*32(SI) 54 | 55 | CSA(Y0, Y3, Y6, Y15) 56 | CSA(Y1, Y4, Y7, Y15) 57 | CSA(Y0, Y9, Y12, Y15) 58 | CSA(Y1, Y3, Y10, Y15) 59 | CSA(Y1, Y9, Y13, Y15) 60 | CSA(Y3, Y4, Y9, Y15) 61 | 62 | VPMOVMSKB Y3, AX 63 | VPADDB Y3, Y3, Y3 64 | POPCNTL AX, AX 65 | VPMOVMSKB Y4, BX 66 | VPADDB Y4, Y4, Y4 67 | POPCNTL BX, BX 68 | LEAL (AX)(BX*2), AX 69 | VPMOVMSKB Y0, BX 70 | VPADDB Y0, Y0, Y0 71 | POPCNTL BX, BX 72 | VPMOVMSKB Y1, DX 73 | VPADDB Y1, Y1, Y1 74 | POPCNTL DX, DX 75 | LEAL (BX)(DX*2), BX 76 | LEAL (BX)(AX*4), AX 77 | ADDL AX, 7*4(DI) 78 | 79 | VPMOVMSKB Y3, AX 80 | VPADDB Y3, Y3, Y3 81 | POPCNTL AX, AX 82 | VPMOVMSKB Y4, BX 83 | VPADDB Y4, Y4, Y4 84 | POPCNTL BX, BX 85 | LEAL (AX)(BX*2), AX 86 | VPMOVMSKB Y0, BX 87 | VPADDB Y0, Y0, Y0 88 | POPCNTL BX, BX 89 | VPMOVMSKB Y1, DX 90 | VPADDB Y1, Y1, Y1 91 | POPCNTL DX, DX 92 | LEAL (BX)(DX*2), BX 93 | LEAL (BX)(AX*4), AX 94 | ADDL AX, 6*4(DI) 95 | 96 | VPMOVMSKB Y3, AX 97 | VPADDB Y3, Y3, Y3 98 | POPCNTL AX, AX 99 | VPMOVMSKB Y4, BX 100 | VPADDB Y4, Y4, Y4 101 | POPCNTL BX, BX 102 | LEAL (AX)(BX*2), AX 103 | VPMOVMSKB Y0, BX 104 | VPADDB Y0, Y0, Y0 105 | POPCNTL BX, BX 106 | VPMOVMSKB Y1, DX 107 | VPADDB Y1, Y1, Y1 108 | POPCNTL DX, DX 109 | LEAL (BX)(DX*2), BX 110 | LEAL (BX)(AX*4), AX 111 | ADDL AX, 5*4(DI) 112 | 113 | VPMOVMSKB Y3, AX 114 | VPADDB Y3, Y3, Y3 115 | POPCNTL AX, AX 116 | VPMOVMSKB Y4, BX 117 | VPADDB Y4, Y4, Y4 118 | POPCNTL BX, BX 119 | LEAL (AX)(BX*2), AX 120 | VPMOVMSKB Y0, BX 121 | VPADDB Y0, Y0, Y0 122 | POPCNTL BX, BX 123 | VPMOVMSKB Y1, DX 124 | VPADDB Y1, Y1, Y1 125 | POPCNTL DX, DX 126 | LEAL (BX)(DX*2), BX 127 | LEAL (BX)(AX*4), AX 128 | ADDL AX, 4*4(DI) 129 | 130 | VPMOVMSKB Y3, AX 131 | VPADDB Y3, Y3, Y3 132 | POPCNTL AX, AX 133 | VPMOVMSKB Y4, BX 134 | VPADDB Y4, Y4, Y4 135 | POPCNTL BX, BX 136 | LEAL (AX)(BX*2), AX 137 | VPMOVMSKB Y0, BX 138 | VPADDB Y0, Y0, Y0 139 | POPCNTL BX, BX 140 | VPMOVMSKB Y1, DX 141 | VPADDB Y1, Y1, Y1 142 | POPCNTL DX, DX 143 | LEAL (BX)(DX*2), BX 144 | LEAL (BX)(AX*4), AX 145 | ADDL AX, 3*4(DI) 146 | 147 | VPMOVMSKB Y3, AX 148 | VPADDB Y3, Y3, Y3 149 | POPCNTL AX, AX 150 | VPMOVMSKB Y4, BX 151 | VPADDB Y4, Y4, Y4 152 | POPCNTL BX, BX 153 | LEAL (AX)(BX*2), AX 154 | VPMOVMSKB Y0, BX 155 | VPADDB Y0, Y0, Y0 156 | POPCNTL BX, BX 157 | VPMOVMSKB Y1, DX 158 | VPADDB Y1, Y1, Y1 159 | POPCNTL DX, DX 160 | LEAL (BX)(DX*2), BX 161 | LEAL (BX)(AX*4), AX 162 | ADDL AX, 2*4(DI) 163 | 164 | VPMOVMSKB Y3, AX 165 | VPADDB Y3, Y3, Y3 166 | POPCNTL AX, AX 167 | VPMOVMSKB Y4, BX 168 | VPADDB Y4, Y4, Y4 169 | POPCNTL BX, BX 170 | LEAL (AX)(BX*2), AX 171 | VPMOVMSKB Y0, BX 172 | VPADDB Y0, Y0, Y0 173 | POPCNTL BX, BX 174 | VPMOVMSKB Y1, DX 175 | VPADDB Y1, Y1, Y1 176 | POPCNTL DX, DX 177 | LEAL (BX)(DX*2), BX 178 | LEAL (BX)(AX*4), AX 179 | ADDL AX, 1*4(DI) 180 | 181 | VPMOVMSKB Y3, AX 182 | POPCNTL AX, AX 183 | VPMOVMSKB Y4, BX 184 | POPCNTL BX, BX 185 | LEAL (AX)(BX*2), AX 186 | VPMOVMSKB Y0, BX 187 | POPCNTL BX, BX 188 | VPMOVMSKB Y1, DX 189 | POPCNTL DX, DX 190 | LEAL (BX)(DX*2), BX 191 | LEAL (BX)(AX*4), AX 192 | ADDL AX, 0*4(DI) 193 | 194 | SUBQ $15*32, CX 195 | JGE vec15 // repeat as long as bytes are left 196 | 197 | vec3: ADDQ $15*32, CX // undo last subtraction 198 | JZ end // skip load/store if not needed 199 | 200 | // load counts into register R8--R15 201 | MOVL 4*0(DI), R8 202 | MOVL 4*1(DI), R9 203 | MOVL 4*2(DI), R10 204 | MOVL 4*3(DI), R11 205 | MOVL 4*4(DI), R12 206 | MOVL 4*5(DI), R13 207 | MOVL 4*6(DI), R14 208 | MOVL 4*7(DI), R15 209 | 210 | SUBQ $32, CX // pre-subtract 32 bit from CX 211 | JL scalar 212 | 213 | vector: VMOVDQU (SI), Y0 // load 32 bytes from buf 214 | ADDQ $32, SI // advance SI past them 215 | 216 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 217 | POPCNTL AX, AX // count population of AX 218 | ADDL AX, R15 // add to counter 219 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 220 | 221 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 222 | POPCNTL AX, AX // count population of AX 223 | ADDL AX, R14 // add to counter 224 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 225 | 226 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 227 | POPCNTL AX, AX // count population of AX 228 | ADDL AX, R13 // add to counter 229 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 230 | 231 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 232 | POPCNTL AX, AX // count population of AX 233 | ADDL AX, R12 // add to counter 234 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 235 | 236 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 237 | POPCNTL AX, AX // count population of AX 238 | ADDL AX, R11 // add to counter 239 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 240 | 241 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 242 | POPCNTL AX, AX // count population of AX 243 | ADDL AX, R10 // add to counter 244 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 245 | 246 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 247 | POPCNTL AX, AX // count population of AX 248 | ADDL AX, R9 // add to counter 249 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 250 | 251 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 252 | POPCNTL AX, AX // count population of AX 253 | ADDL AX, R8 // add to counter 254 | 255 | SUBQ $32, CX 256 | JGE vector // repeat as long as bytes are left 257 | 258 | scalar: ADDQ $32, CX // undo last subtraction 259 | JE done // if CX=0, there's nothing left 260 | 261 | loop: MOVBLZX (SI), AX // load a byte from buf 262 | INCQ SI // advance past it 263 | 264 | SHRL $1, AX // is bit 0 set? 265 | ADCL $0, R8 // add it to R8 266 | 267 | SHRL $1, AX // is bit 0 set? 268 | ADCL $0, R9 // add it to R9 269 | 270 | SHRL $1, AX // is bit 0 set? 271 | ADCL $0, R10 // add it to R10 272 | 273 | SHRL $1, AX // is bit 0 set? 274 | ADCL $0, R11 // add it to R11 275 | 276 | SHRL $1, AX // is bit 0 set? 277 | ADCL $0, R12 // add it to R12 278 | 279 | SHRL $1, AX // is bit 0 set? 280 | ADCL $0, R13 // add it to R13 281 | 282 | SHRL $1, AX // is bit 0 set? 283 | ADCL $0, R14 // add it to R14 284 | 285 | SHRL $1, AX // is bit 0 set? 286 | ADCL $0, R15 // add it to R15 287 | 288 | DECQ CX // mark this byte as done 289 | JNE loop // and proceed if any bytes are left 290 | 291 | // write R8--R15 back to counts 292 | done: MOVL R8, 4*0(DI) 293 | MOVL R9, 4*1(DI) 294 | MOVL R10, 4*2(DI) 295 | MOVL R11, 4*3(DI) 296 | MOVL R12, 4*4(DI) 297 | MOVL R13, 4*5(DI) 298 | MOVL R14, 4*6(DI) 299 | MOVL R15, 4*7(DI) 300 | 301 | end: VZEROUPPER // restore SSE-compatibility 302 | RET 303 | -------------------------------------------------------------------------------- /pospopcnt_test.go: -------------------------------------------------------------------------------- 1 | package pospopcnt 2 | 3 | import "math/rand" 4 | import "testing" 5 | import "strconv" 6 | 7 | // test sizes 8 | var testSizes = []int{ 10, 32, 1000, 2000, 4000, 10000, 100000, 10000000, 1000000000 } 9 | 10 | func TestScalarReg(t *testing.T) { 11 | testHarness(PospopcntScalarReg, t) 12 | } 13 | 14 | func TestScalarMem(t *testing.T) { 15 | testHarness(PospopcntScalarMem, t) 16 | } 17 | 18 | func TestReg(t *testing.T) { 19 | testHarness(PospopcntReg, t) 20 | } 21 | 22 | func TestRegCSA3(t *testing.T) { 23 | testHarness(PospopcntRegCSA3, t) 24 | } 25 | 26 | func TestRegCSA7(t *testing.T) { 27 | testHarness(PospopcntRegCSA7, t) 28 | } 29 | 30 | func TestRegCSA15(t *testing.T) { 31 | testHarness(PospopcntRegCSA15, t) 32 | } 33 | 34 | func TestMem(t *testing.T) { 35 | testHarness(PospopcntMem, t) 36 | } 37 | 38 | func TestMemCSA15(t *testing.T) { 39 | testHarness(PospopcntMemCSA15, t) 40 | } 41 | 42 | func BenchmarkReference(b *testing.B) { 43 | outerHarness(PospopcntReference, b) 44 | } 45 | 46 | func BenchmarkScalarReg(b *testing.B) { 47 | outerHarness(PospopcntScalarReg, b) 48 | } 49 | 50 | func BenchmarkScalarMem(b *testing.B) { 51 | outerHarness(PospopcntScalarMem, b) 52 | } 53 | 54 | func BenchmarkReg(b *testing.B) { 55 | outerHarness(PospopcntReg, b) 56 | } 57 | 58 | func BenchmarkRegCSA3(b *testing.B) { 59 | outerHarness(PospopcntRegCSA3, b) 60 | } 61 | 62 | func BenchmarkRegCSA7(b *testing.B) { 63 | outerHarness(PospopcntRegCSA7, b) 64 | } 65 | 66 | func BenchmarkRegCSA15(b *testing.B) { 67 | outerHarness(PospopcntRegCSA15, b) 68 | } 69 | 70 | func BenchmarkMemCSA15(b *testing.B) { 71 | outerHarness(PospopcntMemCSA15, b) 72 | } 73 | 74 | func BenchmarkMem(b *testing.B) { 75 | outerHarness(PospopcntMem, b) 76 | } 77 | 78 | 79 | // test harness: make sure the function does the same thing as the reference. 80 | func testHarness(pospopcnt func(*[8]int32, []byte), t *testing.T) { 81 | t.Helper() 82 | 83 | buf := make([]byte, 12345) // an intentionally odd nmber 84 | rand.Read(buf) 85 | 86 | refCounts := [8]int32{1234112, 12341234, 5635635, 423452345, 2345232, 32452345, 23452452, 2342542,} 87 | testCounts := refCounts 88 | 89 | PospopcntReference(&refCounts, buf) 90 | pospopcnt(&testCounts, buf) 91 | 92 | if refCounts != testCounts { 93 | t.Error("long counts don't match") 94 | } 95 | 96 | buf = []byte{55} 97 | 98 | PospopcntReference(&refCounts, buf) 99 | pospopcnt(&testCounts, buf) 100 | 101 | if refCounts != testCounts { 102 | t.Error("single byte counts don't match") 103 | } 104 | } 105 | 106 | // outer harness: run benchmarks on pospopcnt for various data sizes. 107 | func outerHarness(pospopcnt func(*[8]int32, []byte), b *testing.B) { 108 | for i := range testSizes { 109 | b.Run(strconv.Itoa(testSizes[i]), func(b *testing.B) { 110 | innerHarness(pospopcnt, b, testSizes[i]) 111 | }) 112 | } 113 | } 114 | 115 | // inner harness: benchmark harness for one test at one data size 116 | func innerHarness(pospopcnt func(*[8]int32, []byte), b *testing.B, n int) { 117 | b.Helper() 118 | 119 | if n <= 0 { 120 | b.Errorf("buffer size must be positive: %d", n) 121 | } 122 | 123 | b.SetBytes(int64(n)) 124 | 125 | buf := make([]byte, n) 126 | rand.Read(buf) 127 | 128 | var counts [8]int32 129 | 130 | b.ResetTimer() 131 | for i := 0; i < b.N; i++ { 132 | pospopcnt(&counts, buf) 133 | } 134 | } 135 | -------------------------------------------------------------------------------- /reference.go: -------------------------------------------------------------------------------- 1 | package pospopcnt 2 | 3 | // vectorised positional popcount with counters in registers 4 | func PospopcntReg(counts *[8]int32, buf []byte) 5 | 6 | // vectorised positional popcount with counters in registers 7 | // and 3-way CSA reduction 8 | func PospopcntRegCSA3(counts *[8]int32, buf []byte) 9 | 10 | // vectorised positional popcount with counters in registers 11 | // and 7-way CSA reduction 12 | func PospopcntRegCSA7(counts *[8]int32, buf []byte) 13 | 14 | // vectorised positional popcount with counters in registers 15 | // and 15-way CSA reduction 16 | func PospopcntRegCSA15(counts *[8]int32, buf []byte) 17 | 18 | // vectorised positional popcount with counters in memory 19 | // and 15-way CSA reduction 20 | func PospopcntMemCSA15(counts *[8]int32, buf []byte) 21 | 22 | // vectorised positional popcount with counters in memory 23 | func PospopcntMem(counts *[8]int32, buf []byte) 24 | 25 | // scalar positional popcount with counters in registers 26 | func PospopcntScalarReg(counts *[8]int32, buf []byte) 27 | 28 | // scalar positional popcount with counters in memory 29 | func PospopcntScalarMem(counts *[8]int32, buf []byte) 30 | 31 | // positional popcount reference implementation 32 | func PospopcntReference(counts *[8]int32, buf []byte) { 33 | for i := 0; i < len(buf); i++ { 34 | for j := 0; j < 8; j++ { 35 | (*counts)[j] += int32(buf[i]) >> j & 1 36 | } 37 | } 38 | } 39 | -------------------------------------------------------------------------------- /reg.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // func PospopcntReg(counts *[8]int32, buf []byte) 4 | TEXT ·PospopcntReg(SB),NOSPLIT,$0-32 5 | MOVQ counts+0(FP), DI 6 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 7 | MOVQ buf_len+16(FP), CX // CX = len(buf) 8 | 9 | // load counts into register R8--R15 10 | MOVL 4*0(DI), R8 11 | MOVL 4*1(DI), R9 12 | MOVL 4*2(DI), R10 13 | MOVL 4*3(DI), R11 14 | MOVL 4*4(DI), R12 15 | MOVL 4*5(DI), R13 16 | MOVL 4*6(DI), R14 17 | MOVL 4*7(DI), R15 18 | 19 | SUBQ $32, CX // pre-subtract 32 bit from CX 20 | JL scalar 21 | 22 | vector: VMOVDQU (SI), Y0 // load 32 bytes from buf 23 | PREFETCHT0 384(SI) // prefetch some data 24 | ADDQ $32, SI // advance SI past them 25 | 26 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 27 | POPCNTL AX, AX // count population of AX 28 | ADDL AX, R15 // add to counter 29 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 30 | 31 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 32 | POPCNTL AX, AX // count population of AX 33 | ADDL AX, R14 // add to counter 34 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 35 | 36 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 37 | POPCNTL AX, AX // count population of AX 38 | ADDL AX, R13 // add to counter 39 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 40 | 41 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 42 | POPCNTL AX, AX // count population of AX 43 | ADDL AX, R12 // add to counter 44 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 45 | 46 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 47 | POPCNTL AX, AX // count population of AX 48 | ADDL AX, R11 // add to counter 49 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 50 | 51 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 52 | POPCNTL AX, AX // count population of AX 53 | ADDL AX, R10 // add to counter 54 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 55 | 56 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 57 | POPCNTL AX, AX // count population of AX 58 | ADDL AX, R9 // add to counter 59 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 60 | 61 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 62 | POPCNTL AX, AX // count population of AX 63 | ADDL AX, R8 // add to counter 64 | 65 | SUBQ $32, CX 66 | JGE vector // repeat as long as bytes are left 67 | 68 | scalar: ADDQ $32, CX // undo last subtraction 69 | JE done // if CX=0, there's nothing left 70 | 71 | loop: MOVBLZX (SI), AX // load a byte from buf 72 | INCQ SI // advance past it 73 | 74 | SHRL $1, AX // CF=LSB, shift byte to the right 75 | ADCL $0, R8 // add CF to R8 76 | 77 | SHRL $1, AX 78 | ADCL $0, R9 // add CF to R9 79 | 80 | SHRL $1, AX 81 | ADCL $0, R10 // add CF to R10 82 | 83 | SHRL $1, AX 84 | ADCL $0, R11 // add CF to R11 85 | 86 | SHRL $1, AX 87 | ADCL $0, R12 // add CF to R12 88 | 89 | SHRL $1, AX 90 | ADCL $0, R13 // add CF to R13 91 | 92 | SHRL $1, AX 93 | ADCL $0, R14 // add CF to R14 94 | 95 | SHRL $1, AX 96 | ADCL $0, R15 // add CF to R15 97 | 98 | DECQ CX // mark this byte as done 99 | JNE loop // and proceed if any bytes are left 100 | 101 | // write R8--R15 back to counts 102 | done: MOVL R8, 4*0(DI) 103 | MOVL R9, 4*1(DI) 104 | MOVL R10, 4*2(DI) 105 | MOVL R11, 4*3(DI) 106 | MOVL R12, 4*4(DI) 107 | MOVL R13, 4*5(DI) 108 | MOVL R14, 4*6(DI) 109 | MOVL R15, 4*7(DI) 110 | 111 | VZEROUPPER // restore SSE-compatibility 112 | RET 113 | -------------------------------------------------------------------------------- /regcsa15.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | #define CSA(A, B, C, D) \ 4 | VPAND A, B, D \ 5 | VPXOR A, B, A \ 6 | VPAND A, C, B \ 7 | VPXOR A, C, A \ 8 | VPOR B, D, B 9 | 10 | // func PospopcntRegCSA15(counts *[8]int32, buf []byte) 11 | TEXT ·PospopcntRegCSA15(SB),NOSPLIT,$0-32 12 | MOVQ counts+0(FP), DI 13 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 14 | MOVQ buf_len+16(FP), CX // CX = len(buf) 15 | 16 | // load counts into register R8--R15 17 | MOVL 4*0(DI), R8 18 | MOVL 4*1(DI), R9 19 | MOVL 4*2(DI), R10 20 | MOVL 4*3(DI), R11 21 | MOVL 4*4(DI), R12 22 | MOVL 4*5(DI), R13 23 | MOVL 4*6(DI), R14 24 | MOVL 4*7(DI), R15 25 | 26 | SUBQ $15*32, CX // pre-decrement CX 27 | JL vec3 28 | 29 | vec15: VMOVDQU 0*32(SI), Y0 // load 480 bytes from buf into Y0--Y14 30 | VMOVDQU 1*32(SI), Y1 31 | VMOVDQU 2*32(SI), Y2 32 | CSA(Y0, Y1, Y2, Y15) 33 | 34 | VMOVDQU 3*32(SI), Y3 35 | VMOVDQU 4*32(SI), Y4 36 | VMOVDQU 5*32(SI), Y5 37 | CSA(Y3, Y4, Y5, Y15) 38 | 39 | VMOVDQU 6*32(SI), Y6 40 | VMOVDQU 7*32(SI), Y7 41 | VMOVDQU 8*32(SI), Y8 42 | CSA(Y6, Y7, Y8, Y15) 43 | 44 | VMOVDQU 9*32(SI), Y9 45 | VMOVDQU 10*32(SI), Y10 46 | VMOVDQU 11*32(SI), Y11 47 | CSA(Y9, Y10, Y11, Y15) 48 | 49 | VMOVDQU 12*32(SI), Y12 50 | VMOVDQU 13*32(SI), Y13 51 | VMOVDQU 14*32(SI), Y14 52 | CSA(Y12, Y13, Y14, Y15) 53 | 54 | ADDQ $15*32, SI 55 | #define D 48 56 | PREFETCHT0 (D+ 0)*32(SI) 57 | PREFETCHT0 (D+ 2)*32(SI) 58 | PREFETCHT0 (D+ 4)*32(SI) 59 | PREFETCHT0 (D+ 6)*32(SI) 60 | PREFETCHT0 (D+ 8)*32(SI) 61 | PREFETCHT0 (D+10)*32(SI) 62 | PREFETCHT0 (D+12)*32(SI) 63 | PREFETCHT0 (D+14)*32(SI) 64 | 65 | CSA(Y0, Y3, Y6, Y15) 66 | CSA(Y1, Y4, Y7, Y15) 67 | CSA(Y0, Y9, Y12, Y15) 68 | CSA(Y1, Y3, Y10, Y15) 69 | CSA(Y1, Y9, Y13, Y15) 70 | CSA(Y3, Y4, Y9, Y15) 71 | 72 | VPMOVMSKB Y3, AX 73 | VPADDB Y3, Y3, Y3 74 | POPCNTL AX, AX 75 | VPMOVMSKB Y4, BX 76 | VPADDB Y4, Y4, Y4 77 | POPCNTL BX, BX 78 | LEAL (AX)(BX*2), AX 79 | VPMOVMSKB Y0, BX 80 | VPADDB Y0, Y0, Y0 81 | POPCNTL BX, BX 82 | VPMOVMSKB Y1, DX 83 | VPADDB Y1, Y1, Y1 84 | POPCNTL DX, DX 85 | LEAL (BX)(DX*2), BX 86 | LEAL (BX)(AX*4), AX 87 | ADDL AX, R15 88 | 89 | VPMOVMSKB Y3, AX 90 | VPADDB Y3, Y3, Y3 91 | POPCNTL AX, AX 92 | VPMOVMSKB Y4, BX 93 | VPADDB Y4, Y4, Y4 94 | POPCNTL BX, BX 95 | LEAL (AX)(BX*2), AX 96 | VPMOVMSKB Y0, BX 97 | VPADDB Y0, Y0, Y0 98 | POPCNTL BX, BX 99 | VPMOVMSKB Y1, DX 100 | VPADDB Y1, Y1, Y1 101 | POPCNTL DX, DX 102 | LEAL (BX)(DX*2), BX 103 | LEAL (BX)(AX*4), AX 104 | ADDL AX, R14 105 | 106 | VPMOVMSKB Y3, AX 107 | VPADDB Y3, Y3, Y3 108 | POPCNTL AX, AX 109 | VPMOVMSKB Y4, BX 110 | VPADDB Y4, Y4, Y4 111 | POPCNTL BX, BX 112 | LEAL (AX)(BX*2), AX 113 | VPMOVMSKB Y0, BX 114 | VPADDB Y0, Y0, Y0 115 | POPCNTL BX, BX 116 | VPMOVMSKB Y1, DX 117 | VPADDB Y1, Y1, Y1 118 | POPCNTL DX, DX 119 | LEAL (BX)(DX*2), BX 120 | LEAL (BX)(AX*4), AX 121 | ADDL AX, R13 122 | 123 | VPMOVMSKB Y3, AX 124 | VPADDB Y3, Y3, Y3 125 | POPCNTL AX, AX 126 | VPMOVMSKB Y4, BX 127 | VPADDB Y4, Y4, Y4 128 | POPCNTL BX, BX 129 | LEAL (AX)(BX*2), AX 130 | VPMOVMSKB Y0, BX 131 | VPADDB Y0, Y0, Y0 132 | POPCNTL BX, BX 133 | VPMOVMSKB Y1, DX 134 | VPADDB Y1, Y1, Y1 135 | POPCNTL DX, DX 136 | LEAL (BX)(DX*2), BX 137 | LEAL (BX)(AX*4), AX 138 | ADDL AX, R12 139 | 140 | VPMOVMSKB Y3, AX 141 | VPADDB Y3, Y3, Y3 142 | POPCNTL AX, AX 143 | VPMOVMSKB Y4, BX 144 | VPADDB Y4, Y4, Y4 145 | POPCNTL BX, BX 146 | LEAL (AX)(BX*2), AX 147 | VPMOVMSKB Y0, BX 148 | VPADDB Y0, Y0, Y0 149 | POPCNTL BX, BX 150 | VPMOVMSKB Y1, DX 151 | VPADDB Y1, Y1, Y1 152 | POPCNTL DX, DX 153 | LEAL (BX)(DX*2), BX 154 | LEAL (BX)(AX*4), AX 155 | ADDL AX, R11 156 | 157 | VPMOVMSKB Y3, AX 158 | VPADDB Y3, Y3, Y3 159 | POPCNTL AX, AX 160 | VPMOVMSKB Y4, BX 161 | VPADDB Y4, Y4, Y4 162 | POPCNTL BX, BX 163 | LEAL (AX)(BX*2), AX 164 | VPMOVMSKB Y0, BX 165 | VPADDB Y0, Y0, Y0 166 | POPCNTL BX, BX 167 | VPMOVMSKB Y1, DX 168 | VPADDB Y1, Y1, Y1 169 | POPCNTL DX, DX 170 | LEAL (BX)(DX*2), BX 171 | LEAL (BX)(AX*4), AX 172 | ADDL AX, R10 173 | 174 | VPMOVMSKB Y3, AX 175 | VPADDB Y3, Y3, Y3 176 | POPCNTL AX, AX 177 | VPMOVMSKB Y4, BX 178 | VPADDB Y4, Y4, Y4 179 | POPCNTL BX, BX 180 | LEAL (AX)(BX*2), AX 181 | VPMOVMSKB Y0, BX 182 | VPADDB Y0, Y0, Y0 183 | POPCNTL BX, BX 184 | VPMOVMSKB Y1, DX 185 | VPADDB Y1, Y1, Y1 186 | POPCNTL DX, DX 187 | LEAL (BX)(DX*2), BX 188 | LEAL (BX)(AX*4), AX 189 | ADDL AX, R9 190 | 191 | VPMOVMSKB Y3, AX 192 | POPCNTL AX, AX 193 | VPMOVMSKB Y4, BX 194 | POPCNTL BX, BX 195 | LEAL (AX)(BX*2), AX 196 | VPMOVMSKB Y0, BX 197 | POPCNTL BX, BX 198 | VPMOVMSKB Y1, DX 199 | POPCNTL DX, DX 200 | LEAL (BX)(DX*2), BX 201 | LEAL (BX)(AX*4), AX 202 | ADDL AX, R8 203 | 204 | SUBQ $15*32, CX 205 | JGE vec15 // repeat as long as bytes are left 206 | 207 | vec3: ADDQ $15*32, CX // undo last subtraction 208 | SUBQ $32, CX // pre-subtract 32 bit from CX 209 | JL scalar 210 | 211 | vector: VMOVDQU (SI), Y0 // load 32 bytes from buf 212 | ADDQ $32, SI // advance SI past them 213 | 214 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 215 | POPCNTL AX, AX // count population of AX 216 | ADDL AX, R15 // add to counter 217 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 218 | 219 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 220 | POPCNTL AX, AX // count population of AX 221 | ADDL AX, R14 // add to counter 222 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 223 | 224 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 225 | POPCNTL AX, AX // count population of AX 226 | ADDL AX, R13 // add to counter 227 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 228 | 229 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 230 | POPCNTL AX, AX // count population of AX 231 | ADDL AX, R12 // add to counter 232 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 233 | 234 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 235 | POPCNTL AX, AX // count population of AX 236 | ADDL AX, R11 // add to counter 237 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 238 | 239 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 240 | POPCNTL AX, AX // count population of AX 241 | ADDL AX, R10 // add to counter 242 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 243 | 244 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 245 | POPCNTL AX, AX // count population of AX 246 | ADDL AX, R9 // add to counter 247 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 248 | 249 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 250 | POPCNTL AX, AX // count population of AX 251 | ADDL AX, R8 // add to counter 252 | 253 | SUBQ $32, CX 254 | JGE vector // repeat as long as bytes are left 255 | 256 | scalar: ADDQ $32, CX // undo last subtraction 257 | JE done // if CX=0, there's nothing left 258 | 259 | loop: MOVBLZX (SI), AX // load a byte from buf 260 | INCQ SI // advance past it 261 | 262 | SHRL $1, AX // is bit 0 set? 263 | ADCL $0, R8 // add it to R8 264 | 265 | SHRL $1, AX // is bit 0 set? 266 | ADCL $0, R9 // add it to R9 267 | 268 | SHRL $1, AX // is bit 0 set? 269 | ADCL $0, R10 // add it to R10 270 | 271 | SHRL $1, AX // is bit 0 set? 272 | ADCL $0, R11 // add it to R11 273 | 274 | SHRL $1, AX // is bit 0 set? 275 | ADCL $0, R12 // add it to R12 276 | 277 | SHRL $1, AX // is bit 0 set? 278 | ADCL $0, R13 // add it to R13 279 | 280 | SHRL $1, AX // is bit 0 set? 281 | ADCL $0, R14 // add it to R14 282 | 283 | SHRL $1, AX // is bit 0 set? 284 | ADCL $0, R15 // add it to R15 285 | 286 | DECQ CX // mark this byte as done 287 | JNE loop // and proceed if any bytes are left 288 | 289 | // write R8--R15 back to counts 290 | done: MOVL R8, 4*0(DI) 291 | MOVL R9, 4*1(DI) 292 | MOVL R10, 4*2(DI) 293 | MOVL R11, 4*3(DI) 294 | MOVL R12, 4*4(DI) 295 | MOVL R13, 4*5(DI) 296 | MOVL R14, 4*6(DI) 297 | MOVL R15, 4*7(DI) 298 | 299 | VZEROUPPER // restore SSE-compatibility 300 | RET 301 | -------------------------------------------------------------------------------- /regcsa3.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // func PospopcntRegCSA(counts *[8]int32, buf []byte) 4 | TEXT ·PospopcntRegCSA3(SB),NOSPLIT,$0-32 5 | MOVQ counts+0(FP), DI 6 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 7 | MOVQ buf_len+16(FP), CX // CX = len(buf) 8 | 9 | // load counts into register R8--R15 10 | MOVL 4*0(DI), R8 11 | MOVL 4*1(DI), R9 12 | MOVL 4*2(DI), R10 13 | MOVL 4*3(DI), R11 14 | MOVL 4*4(DI), R12 15 | MOVL 4*5(DI), R13 16 | MOVL 4*6(DI), R14 17 | MOVL 4*7(DI), R15 18 | 19 | SUBQ $96, CX // pre-subtract 32 bit from CX 20 | JL vec 21 | 22 | vec3: VMOVDQU (SI), Y0 // load 96 bytes from buf into Y0--Y2 23 | VMOVDQU 32(SI), Y1 24 | VMOVDQU 64(SI), Y2 25 | ADDQ $96, SI // advance SI past them 26 | PREFETCHT0 10*32(SI) 27 | PREFETCHT0 12*32(SI) 28 | 29 | VPXOR Y0, Y1, Y3 // first adder: sum 30 | VPAND Y0, Y1, Y0 // first adder: carry out 31 | VPAND Y2, Y3, Y1 // second adder: carry out 32 | VPXOR Y2, Y3, Y2 // second adder: sum (full sum) 33 | VPOR Y0, Y1, Y0 // full adder: carry out 34 | 35 | VPMOVMSKB Y0, AX // MSB of carry out bytes 36 | VPMOVMSKB Y2, DX // MSB of sum bytes 37 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 38 | VPADDB Y2, Y2, Y2 // shift sum bytes left 39 | POPCNTL AX, AX // carry bytes population count 40 | POPCNTL DX, DX // sum bytes population count 41 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 42 | ADDL AX, R15 43 | 44 | VPMOVMSKB Y0, AX // MSB of carry out bytes 45 | VPMOVMSKB Y2, DX // MSB of sum bytes 46 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 47 | VPADDB Y2, Y2, Y2 // shift sum bytes left 48 | POPCNTL AX, AX // carry bytes population count 49 | POPCNTL DX, DX // sum bytes population count 50 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 51 | ADDL AX, R14 52 | 53 | VPMOVMSKB Y0, AX // MSB of carry out bytes 54 | VPMOVMSKB Y2, DX // MSB of sum bytes 55 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 56 | VPADDB Y2, Y2, Y2 // shift sum bytes left 57 | POPCNTL AX, AX // carry bytes population count 58 | POPCNTL DX, DX // sum bytes population count 59 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 60 | ADDL AX, R13 61 | 62 | VPMOVMSKB Y0, AX // MSB of carry out bytes 63 | VPMOVMSKB Y2, DX // MSB of sum bytes 64 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 65 | VPADDB Y2, Y2, Y2 // shift sum bytes left 66 | POPCNTL AX, AX // carry bytes population count 67 | POPCNTL DX, DX // sum bytes population count 68 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 69 | ADDL AX, R12 70 | 71 | VPMOVMSKB Y0, AX // MSB of carry out bytes 72 | VPMOVMSKB Y2, DX // MSB of sum bytes 73 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 74 | VPADDB Y2, Y2, Y2 // shift sum bytes left 75 | POPCNTL AX, AX // carry bytes population count 76 | POPCNTL DX, DX // sum bytes population count 77 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 78 | ADDL AX, R11 79 | 80 | VPMOVMSKB Y0, AX // MSB of carry out bytes 81 | VPMOVMSKB Y2, DX // MSB of sum bytes 82 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 83 | VPADDB Y2, Y2, Y2 // shift sum bytes left 84 | POPCNTL AX, AX // carry bytes population count 85 | POPCNTL DX, DX // sum bytes population count 86 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 87 | ADDL AX, R10 88 | 89 | VPMOVMSKB Y0, AX // MSB of carry out bytes 90 | VPMOVMSKB Y2, DX // MSB of sum bytes 91 | VPADDB Y0, Y0, Y0 // shift carry out bytes left 92 | VPADDB Y2, Y2, Y2 // shift sum bytes left 93 | POPCNTL AX, AX // carry bytes population count 94 | POPCNTL DX, DX // sum bytes population count 95 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 96 | ADDL AX, R9 97 | 98 | VPMOVMSKB Y0, AX // MSB of carry out bytes 99 | VPMOVMSKB Y2, DX // MSB of sum bytes 100 | POPCNTL AX, AX // carry bytes population count 101 | POPCNTL DX, DX // sum bytes population count 102 | LEAL (DX)(AX*2), AX // sum popcount plus 2x carry popcount 103 | ADDL AX, R8 104 | 105 | SUBQ $96, CX 106 | JGE vec3 // repeat as long as bytes are left 107 | 108 | vec: ADDQ $96, CX // undo last subtraction 109 | SUBQ $32, CX // pre-subtract 32 110 | JL scalar 111 | 112 | vector: VMOVDQU (SI), Y0 // load 32 bytes from buf 113 | ADDQ $32, SI // advance SI past them 114 | 115 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 116 | POPCNTL AX, AX // count population of AX 117 | ADDL AX, R15 // add to counter 118 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 119 | 120 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 121 | POPCNTL AX, AX // count population of AX 122 | ADDL AX, R14 // add to counter 123 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 124 | 125 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 126 | POPCNTL AX, AX // count population of AX 127 | ADDL AX, R13 // add to counter 128 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 129 | 130 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 131 | POPCNTL AX, AX // count population of AX 132 | ADDL AX, R12 // add to counter 133 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 134 | 135 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 136 | POPCNTL AX, AX // count population of AX 137 | ADDL AX, R11 // add to counter 138 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 139 | 140 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 141 | POPCNTL AX, AX // count population of AX 142 | ADDL AX, R10 // add to counter 143 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 144 | 145 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 146 | POPCNTL AX, AX // count population of AX 147 | ADDL AX, R9 // add to counter 148 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 149 | 150 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 151 | POPCNTL AX, AX // count population of AX 152 | ADDL AX, R8 // add to counter 153 | 154 | SUBQ $32, CX 155 | JGE vector // repeat as long as bytes are left 156 | 157 | scalar: ADDQ $32, CX // undo last subtraction 158 | JE done // if CX=0, there's nothing left 159 | loop: MOVBLZX (SI), AX // load a byte from buf 160 | INCQ SI // advance past it 161 | 162 | SHRL $1, AX // is bit 0 set? 163 | ADCL $0, R8 // add it to R8 164 | 165 | SHRL $1, AX // is bit 0 set? 166 | ADCL $0, R9 // add it to R9 167 | 168 | SHRL $1, AX // is bit 0 set? 169 | ADCL $0, R10 // add it to R10 170 | 171 | SHRL $1, AX // is bit 0 set? 172 | ADCL $0, R11 // add it to R11 173 | 174 | SHRL $1, AX // is bit 0 set? 175 | ADCL $0, R12 // add it to R12 176 | 177 | SHRL $1, AX // is bit 0 set? 178 | ADCL $0, R13 // add it to R13 179 | 180 | SHRL $1, AX // is bit 0 set? 181 | ADCL $0, R14 // add it to R14 182 | 183 | SHRL $1, AX // is bit 0 set? 184 | ADCL $0, R15 // add it to R15 185 | 186 | DECQ CX // mark this byte as done 187 | JNE loop // and proceed if any bytes are left 188 | 189 | // write R8--R15 back to counts 190 | done: MOVL R8, 4*0(DI) 191 | MOVL R9, 4*1(DI) 192 | MOVL R10, 4*2(DI) 193 | MOVL R11, 4*3(DI) 194 | MOVL R12, 4*4(DI) 195 | MOVL R13, 4*5(DI) 196 | MOVL R14, 4*6(DI) 197 | MOVL R15, 4*7(DI) 198 | 199 | VZEROUPPER // restore SSE-compatibility 200 | RET 201 | -------------------------------------------------------------------------------- /regcsa7.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | #define CSA(A, B, C, D) \ 4 | VPAND A, B, D \ 5 | VPXOR A, B, A \ 6 | VPAND A, C, B \ 7 | VPXOR A, C, A \ 8 | VPOR B, D, B 9 | 10 | // func PospopcntRegCSA7(counts *[8]int32, buf []byte) 11 | TEXT ·PospopcntRegCSA7(SB),NOSPLIT,$0-32 12 | MOVQ counts+0(FP), DI 13 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 14 | MOVQ buf_len+16(FP), CX // CX = len(buf) 15 | 16 | // load counts into register R8--R15 17 | MOVL 4*0(DI), R8 18 | MOVL 4*1(DI), R9 19 | MOVL 4*2(DI), R10 20 | MOVL 4*3(DI), R11 21 | MOVL 4*4(DI), R12 22 | MOVL 4*5(DI), R13 23 | MOVL 4*6(DI), R14 24 | MOVL 4*7(DI), R15 25 | 26 | SUBQ $7*32, CX // pre-decrement CX 27 | JL vec3 28 | 29 | vec7: VMOVDQU 0*32(SI), Y0 // load 224 bytes from buf into Y0--Y6 30 | VMOVDQU 1*32(SI), Y1 31 | VMOVDQU 2*32(SI), Y2 32 | CSA(Y0, Y1, Y2, Y7) // Y0 = 0^1^2, Y1 = [0+1+2] 33 | 34 | VMOVDQU 3*32(SI), Y3 35 | VMOVDQU 4*32(SI), Y4 36 | VMOVDQU 5*32(SI), Y5 37 | CSA(Y3, Y4, Y5, Y7) // Y3 = 3^4^5, Y4 = [3+4+5] 38 | 39 | VMOVDQU 6*32(SI), Y6 40 | CSA(Y0, Y3, Y6, Y7) // Y0 = 0^1^2^3^4^5^6, Y3 = [[0+1+2]+[3+4+5]+6] 41 | CSA(Y1, Y3, Y4, Y7) // Y1 = [0+1+2+3+4+5+6], Y3 = [[0+1+2+3+4+5+6] 42 | 43 | ADDQ $7*32, SI 44 | PREFETCHT0 16*32(SI) 45 | PREFETCHT0 18*32(SI) 46 | PREFETCHT0 20*32(SI) 47 | PREFETCHT0 22*32(SI) 48 | 49 | VPMOVMSKB Y0, AX 50 | VPADDB Y0, Y0, Y0 51 | POPCNTL AX, AX 52 | VPMOVMSKB Y1, BX 53 | VPADDB Y1, Y1, Y1 54 | POPCNTL BX, BX 55 | VPMOVMSKB Y3, DX 56 | VPADDB Y3, Y3, Y3 57 | POPCNTL DX, DX 58 | LEAL (AX)(BX*2), AX 59 | LEAL (AX)(DX*4), AX 60 | ADDL AX, R15 61 | 62 | VPMOVMSKB Y0, AX 63 | VPADDB Y0, Y0, Y0 64 | POPCNTL AX, AX 65 | VPMOVMSKB Y1, BX 66 | VPADDB Y1, Y1, Y1 67 | POPCNTL BX, BX 68 | VPMOVMSKB Y3, DX 69 | VPADDB Y3, Y3, Y3 70 | POPCNTL DX, DX 71 | LEAL (AX)(BX*2), AX 72 | LEAL (AX)(DX*4), AX 73 | ADDL AX, R14 74 | 75 | VPMOVMSKB Y0, AX 76 | VPADDB Y0, Y0, Y0 77 | POPCNTL AX, AX 78 | VPMOVMSKB Y1, BX 79 | VPADDB Y1, Y1, Y1 80 | POPCNTL BX, BX 81 | VPMOVMSKB Y3, DX 82 | VPADDB Y3, Y3, Y3 83 | POPCNTL DX, DX 84 | LEAL (AX)(BX*2), AX 85 | LEAL (AX)(DX*4), AX 86 | ADDL AX, R13 87 | 88 | VPMOVMSKB Y0, AX 89 | VPADDB Y0, Y0, Y0 90 | POPCNTL AX, AX 91 | VPMOVMSKB Y1, BX 92 | VPADDB Y1, Y1, Y1 93 | POPCNTL BX, BX 94 | VPMOVMSKB Y3, DX 95 | VPADDB Y3, Y3, Y3 96 | POPCNTL DX, DX 97 | LEAL (AX)(BX*2), AX 98 | LEAL (AX)(DX*4), AX 99 | ADDL AX, R12 100 | 101 | VPMOVMSKB Y0, AX 102 | VPADDB Y0, Y0, Y0 103 | POPCNTL AX, AX 104 | VPMOVMSKB Y1, BX 105 | VPADDB Y1, Y1, Y1 106 | POPCNTL BX, BX 107 | VPMOVMSKB Y3, DX 108 | VPADDB Y3, Y3, Y3 109 | POPCNTL DX, DX 110 | LEAL (AX)(BX*2), AX 111 | LEAL (AX)(DX*4), AX 112 | ADDL AX, R11 113 | 114 | VPMOVMSKB Y0, AX 115 | VPADDB Y0, Y0, Y0 116 | POPCNTL AX, AX 117 | VPMOVMSKB Y1, BX 118 | VPADDB Y1, Y1, Y1 119 | POPCNTL BX, BX 120 | VPMOVMSKB Y3, DX 121 | VPADDB Y3, Y3, Y3 122 | POPCNTL DX, DX 123 | LEAL (AX)(BX*2), AX 124 | LEAL (AX)(DX*4), AX 125 | ADDL AX, R10 126 | 127 | VPMOVMSKB Y0, AX 128 | VPADDB Y0, Y0, Y0 129 | POPCNTL AX, AX 130 | VPMOVMSKB Y1, BX 131 | VPADDB Y1, Y1, Y1 132 | POPCNTL BX, BX 133 | VPMOVMSKB Y3, DX 134 | VPADDB Y3, Y3, Y3 135 | POPCNTL DX, DX 136 | LEAL (AX)(BX*2), AX 137 | LEAL (AX)(DX*4), AX 138 | ADDL AX, R9 139 | 140 | VPMOVMSKB Y0, AX 141 | POPCNTL AX, AX 142 | VPMOVMSKB Y1, BX 143 | POPCNTL BX, BX 144 | VPMOVMSKB Y3, DX 145 | POPCNTL DX, DX 146 | LEAL (AX)(BX*2), AX 147 | LEAL (AX)(DX*4), AX 148 | ADDL AX, R8 149 | 150 | SUBQ $7*32, CX 151 | JGE vec7 // repeat as long as bytes are left 152 | 153 | vec3: ADDQ $7*32, CX // undo last subtraction 154 | SUBQ $32, CX // pre-subtract 32 bit from CX 155 | JL scalar 156 | 157 | vector: VMOVDQU (SI), Y0 // load 32 bytes from buf 158 | ADDQ $32, SI // advance SI past them 159 | 160 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 161 | POPCNTL AX, AX // count population of AX 162 | ADDL AX, R15 // add to counter 163 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 164 | 165 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 166 | POPCNTL AX, AX // count population of AX 167 | ADDL AX, R14 // add to counter 168 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 169 | 170 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 171 | POPCNTL AX, AX // count population of AX 172 | ADDL AX, R13 // add to counter 173 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 174 | 175 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 176 | POPCNTL AX, AX // count population of AX 177 | ADDL AX, R12 // add to counter 178 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 179 | 180 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 181 | POPCNTL AX, AX // count population of AX 182 | ADDL AX, R11 // add to counter 183 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 184 | 185 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 186 | POPCNTL AX, AX // count population of AX 187 | ADDL AX, R10 // add to counter 188 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 189 | 190 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 191 | POPCNTL AX, AX // count population of AX 192 | ADDL AX, R9 // add to counter 193 | VPADDD Y0, Y0, Y0 // shift Y0 left by one place 194 | 195 | VPMOVMSKB Y0, AX // move MSB of Y0 bytes to AX 196 | POPCNTL AX, AX // count population of AX 197 | ADDL AX, R8 // add to counter 198 | 199 | SUBQ $32, CX 200 | JGE vector // repeat as long as bytes are left 201 | 202 | scalar: ADDQ $32, CX // undo last subtraction 203 | JE done // if CX=0, there's nothing left 204 | 205 | loop: MOVBLZX (SI), AX // load a byte from buf 206 | INCQ SI // advance past it 207 | 208 | SHRL $1, AX // is bit 0 set? 209 | ADCL $0, R8 // add it to R8 210 | 211 | SHRL $1, AX // is bit 0 set? 212 | ADCL $0, R9 // add it to R9 213 | 214 | SHRL $1, AX // is bit 0 set? 215 | ADCL $0, R10 // add it to R10 216 | 217 | SHRL $1, AX // is bit 0 set? 218 | ADCL $0, R11 // add it to R11 219 | 220 | SHRL $1, AX // is bit 0 set? 221 | ADCL $0, R12 // add it to R12 222 | 223 | SHRL $1, AX // is bit 0 set? 224 | ADCL $0, R13 // add it to R13 225 | 226 | SHRL $1, AX // is bit 0 set? 227 | ADCL $0, R14 // add it to R14 228 | 229 | SHRL $1, AX // is bit 0 set? 230 | ADCL $0, R15 // add it to R15 231 | 232 | DECQ CX // mark this byte as done 233 | JNE loop // and proceed if any bytes are left 234 | 235 | // write R8--R15 back to counts 236 | done: MOVL R8, 4*0(DI) 237 | MOVL R9, 4*1(DI) 238 | MOVL R10, 4*2(DI) 239 | MOVL R11, 4*3(DI) 240 | MOVL R12, 4*4(DI) 241 | MOVL R13, 4*5(DI) 242 | MOVL R14, 4*6(DI) 243 | MOVL R15, 4*7(DI) 244 | 245 | VZEROUPPER // restore SSE-compatibility 246 | RET 247 | -------------------------------------------------------------------------------- /scalar.s: -------------------------------------------------------------------------------- 1 | #include "textflag.h" 2 | 3 | // func PospopcntScalarReg(counts *[8]int32, buf []byte) 4 | TEXT ·PospopcntScalarReg(SB),NOSPLIT,$0-32 5 | MOVQ counts+0(FP), DI 6 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 7 | MOVQ buf_len+16(FP), CX // CX = len(buf) 8 | 9 | // load counts into register R8--R15 10 | MOVL 4*0(DI), R8 11 | MOVL 4*1(DI), R9 12 | MOVL 4*2(DI), R10 13 | MOVL 4*3(DI), R11 14 | MOVL 4*4(DI), R12 15 | MOVL 4*5(DI), R13 16 | MOVL 4*6(DI), R14 17 | MOVL 4*7(DI), R15 18 | 19 | TESTQ CX, CX 20 | JE done // if CX=0, there's nothing left 21 | 22 | loop: MOVBLZX (SI), AX // load a byte from buf 23 | INCQ SI // advance past it 24 | 25 | SHRL $1, AX // CF=LSB, shift byte to the right 26 | ADCL $0, R8 // add CF to R8 27 | 28 | SHRL $1, AX 29 | ADCL $0, R9 // add CF to R9 30 | 31 | SHRL $1, AX 32 | ADCL $0, R10 // add CF to R10 33 | 34 | SHRL $1, AX 35 | ADCL $0, R11 // add CF to R11 36 | 37 | SHRL $1, AX 38 | ADCL $0, R12 // add CF to R12 39 | 40 | SHRL $1, AX 41 | ADCL $0, R13 // add CF to R13 42 | 43 | SHRL $1, AX 44 | ADCL $0, R14 // add CF to R14 45 | 46 | SHRL $1, AX 47 | ADCL $0, R15 // add CF to R15 48 | 49 | DECQ CX // mark this byte as done 50 | JNE loop // and procced if any bytes are left 51 | 52 | // write R8--R15 back to counts 53 | done: MOVL R8, 4*0(DI) 54 | MOVL R9, 4*1(DI) 55 | MOVL R10, 4*2(DI) 56 | MOVL R11, 4*3(DI) 57 | MOVL R12, 4*4(DI) 58 | MOVL R13, 4*5(DI) 59 | MOVL R14, 4*6(DI) 60 | MOVL R15, 4*7(DI) 61 | 62 | RET 63 | 64 | // func PospopcntScalarMem(counts *[8]int32, buf []byte) 65 | TEXT ·PospopcntScalarMem(SB),NOSPLIT,$0-32 66 | MOVQ counts+0(FP), DI 67 | MOVQ buf_base+8(FP), SI // SI = &buf[0] 68 | MOVQ buf_len+16(FP), CX // CX = len(buf) 69 | 70 | TESTQ CX, CX 71 | JE done // if CX=0, there's nothing left 72 | 73 | loop: MOVBLZX (SI), AX // load a byte from buf 74 | INCQ SI // advance past it 75 | 76 | SHRL $1, AX // CF=LSB, shift byte to the right 77 | ADCL $0, 4*0(DI) // add it to the counters 78 | 79 | SHRL $1, AX // CF=LSB, shift byte to the right 80 | ADCL $0, 4*1(DI) // add it to the counters 81 | 82 | SHRL $1, AX // CF=LSB, shift byte to the right 83 | ADCL $0, 4*2(DI) // add it to the counters 84 | 85 | SHRL $1, AX // CF=LSB, shift byte to the right 86 | ADCL $0, 4*3(DI) // add it to the counters 87 | 88 | SHRL $1, AX // CF=LSB, shift byte to the right 89 | ADCL $0, 4*4(DI) // add it to the counters 90 | 91 | SHRL $1, AX // CF=LSB, shift byte to the right 92 | ADCL $0, 4*5(DI) // add it to the counters 93 | 94 | SHRL $1, AX // CF=LSB, shift byte to the right 95 | ADCL $0, 4*6(DI) // add it to the counters 96 | 97 | SHRL $1, AX // CF=LSB, shift byte to the right 98 | ADCL $0, 4*7(DI) // add it to the counters 99 | 100 | DECQ CX // mark this byte as done 101 | JNE loop // and proceed if any bytes are left 102 | 103 | done: RET 104 | --------------------------------------------------------------------------------