├── README ├── highway_stub.go ├── bench_test.go ├── LICENSE ├── highway_test.go ├── highway.go ├── highway_amd64.s └── sum.py /README: -------------------------------------------------------------------------------- 1 | go-highway: Google's Highway Hash 2 | 3 | godoc: https://godoc.org/github.com/dgryski/go-highway 4 | -------------------------------------------------------------------------------- /highway_stub.go: -------------------------------------------------------------------------------- 1 | package highway 2 | 3 | //go:generate python -m peachpy.x86_64 sum.py -S -o highway_amd64.s -mabi=goasm 4 | //go:noescape 5 | 6 | func hashSSE(keys, init0, init1 *Lanes, p []byte) uint64 7 | -------------------------------------------------------------------------------- /bench_test.go: -------------------------------------------------------------------------------- 1 | package highway 2 | 3 | import "testing" 4 | 5 | var total uint64 6 | var buf = make([]byte, 8<<10) 7 | 8 | func BenchmarkHighway8(b *testing.B) { benchmarkHash(b, 8) } 9 | func BenchmarkHighway16(b *testing.B) { benchmarkHash(b, 16) } 10 | func BenchmarkHighway40(b *testing.B) { benchmarkHash(b, 40) } 11 | func BenchmarkHighway64(b *testing.B) { benchmarkHash(b, 64) } 12 | func BenchmarkHighway1K(b *testing.B) { benchmarkHash(b, 1024) } 13 | func BenchmarkHighway8K(b *testing.B) { benchmarkHash(b, 8192) } 14 | 15 | func benchmarkHash(b *testing.B, size int64) { 16 | b.SetBytes(size) 17 | bsz := buf[:size] 18 | total = 0 19 | keys := Lanes{} 20 | for i := 0; i < b.N; i++ { 21 | total += Hash(keys, bsz) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Damian Gryski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the 
"Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /highway_test.go: -------------------------------------------------------------------------------- 1 | package highway 2 | 3 | import ( 4 | "testing" 5 | ) 6 | 7 | func TestHighway(t *testing.T) { 8 | 9 | input := make([]byte, 64) 10 | 11 | var tests = []uint64{ 12 | 0x907a56de22c26e53, 0x7eab43aac7cddd78, 0xb8d0569ab0b53d62, 13 | 0x5c6befab8a463d80, 0xf205a46893007eda, 0x2b8a1668e4a94541, 14 | 0xbd4ccc325befca6f, 0x4d02ae1738f59482, 0xe1205108e55f3171, 15 | 0x32d2644ec77a1584, 0xf6e10acdb103a90b, 0xc3bbf4615b415c15, 16 | 0x243cc2040063fa9c, 0xa89a58ce65e641ff, 0x24b031a348455a23, 17 | 0x40793f86a449f33b, 0xcfab3489f97eb832, 0x19fe67d2c8c5c0e2, 18 | 0x04dd90a69c565cc2, 0x75d9518e2371c504, 0x38ad9b1141d3dd16, 19 | 0x0264432ccd8a70e0, 0xa9db5a6288683390, 0xd7b05492003f028c, 20 | 0x205f615aea59e51e, 0xeee0c89621052884, 0x1bfc1a93a7284f4f, 21 | 0x512175b5b70da91d, 0xf71f8976a0a2c639, 0xae093fef1f84e3e7, 22 | 0x22ca92b01161860f, 
0x9fc7007ccf035a68, 0xa0c964d9ecd580fc, 23 | 0x2c90f73ca03181fc, 0x185cf84e5691eb9e, 0x4fc1f5ef2752aa9b, 24 | 0xf5b7391a5e0a33eb, 0xb9b84b83b4e96c9c, 0x5e42fe712a5cd9b4, 25 | 0xa150f2f90c3f97dc, 0x7fa522d75e2d637d, 0x181ad0cc0dffd32b, 26 | 0x3889ed981e854028, 0xfb4297e8c586ee2d, 0x6d064a45bb28059c, 27 | 0x90563609b3ec860c, 0x7aa4fce94097c666, 0x1326bac06b911e08, 28 | 0xb926168d2b154f34, 0x9919848945b1948d, 0xa2a98fc534825ebe, 29 | 0xe9809095213ef0b6, 0x582e5483707bc0e9, 0x086e9414a88a6af5, 30 | 0xee86b98d20f6743d, 0xf89b7ff609b1c0a7, 0x4c7d9cc19e22c3e8, 31 | 0x9a97005024562a6f, 0x5dd41cf423e6ebef, 0xdf13609c0468e227, 32 | 0x6e0da4f64188155a, 0xb755ba4b50d7d4a1, 0x887a3484647479bd, 33 | 0xab8eebe9bf2139a0, 0x75542c5d4cd2a6ff, 34 | } 35 | 36 | key := Lanes{0x0706050403020100, 0x0F0E0D0C0B0A0908, 0x1716151413121110, 0x1F1E1D1C1B1A1918} 37 | 38 | for i := range input { 39 | input[i] = byte(i) 40 | 41 | if h := Hash(key, input[:i]); h != tests[i] { 42 | t.Errorf("Hash(..., input[:%d])=%016x, want %016x\n", i, h, tests[i]) 43 | } else { 44 | t.Logf("PASS: Hash(..., input[:%d])=%016x, want %016x\n", i, h, tests[i]) 45 | } 46 | } 47 | } 48 | 49 | func TestCompare(t *testing.T) { 50 | 51 | input := make([]byte, 64) 52 | 53 | key := Lanes{0x0706050403020100, 0x0F0E0D0C0B0A0908, 0x1716151413121110, 0x1F1E1D1C1B1A1918} 54 | 55 | for i := range input { 56 | input[i] = byte(i) 57 | 58 | want := Hash(key, input[:i]) 59 | got := hashSSE(&key, &init0, &init1, input[:i]) 60 | 61 | if got != want { 62 | t.Errorf("hashSSE(..., input[:%d])=%016x, want %016x\n", i, got, want) 63 | } else { 64 | t.Logf("PASS: hashSSE(..., input[:%d])=%016x\n", i, got) 65 | } 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /highway.go: -------------------------------------------------------------------------------- 1 | // Package highway implements Google's HighwayHash 2 | /* 3 | https://github.com/google/highwayhash 4 | */ 5 | package highway 6 | 7 | 
import ( 8 | "encoding/binary" 9 | 10 | "github.com/intel-go/cpuid" 11 | ) 12 | 13 | const ( 14 | NumLanes = 4 15 | packetSize = 8 * NumLanes 16 | ) 17 | 18 | type Lanes [NumLanes]uint64 19 | 20 | var ( 21 | init0 = Lanes{0xdbe6d5d5fe4cce2f, 0xa4093822299f31d0, 0x13198a2e03707344, 0x243f6a8885a308d3} 22 | init1 = Lanes{0x3bd39e10cb0ef593, 0xc0acf169b5f18a8c, 0xbe5466cf34e90c6c, 0x452821e638d01377} 23 | ) 24 | 25 | var useSSE = cpuid.HasFeature(cpuid.SSE4_1) 26 | 27 | type state struct { 28 | v0, v1 Lanes 29 | mul0, mul1 Lanes 30 | } 31 | 32 | func newstate(s *state, keys Lanes) { 33 | var permutedKeys Lanes 34 | rotate64by32(&keys, &permutedKeys) 35 | for lane := range keys { 36 | s.v0[lane] = init0[lane] ^ keys[lane] 37 | s.v1[lane] = init1[lane] ^ permutedKeys[lane] 38 | s.mul0[lane] = init0[lane] 39 | s.mul1[lane] = init1[lane] 40 | } 41 | } 42 | 43 | func (s *state) Update(packet []byte) { 44 | for lane := 0; lane < NumLanes; lane++ { 45 | s.v1[lane] += binary.LittleEndian.Uint64(packet[8*lane:]) 46 | s.v1[lane] += s.mul0[lane] 47 | const mask32 = 0xFFFFFFFF 48 | v1_32 := s.v1[lane] & mask32 49 | s.mul0[lane] ^= v1_32 * (s.v0[lane] >> 32) 50 | s.v0[lane] += s.mul1[lane] 51 | v0_32 := s.v0[lane] & mask32 52 | s.mul1[lane] ^= v0_32 * (s.v1[lane] >> 32) 53 | } 54 | 55 | zipperMergeAndAdd(s.v1[1], s.v1[0], &s.v0[1], &s.v0[0]) 56 | zipperMergeAndAdd(s.v1[3], s.v1[2], &s.v0[3], &s.v0[2]) 57 | zipperMergeAndAdd(s.v0[1], s.v0[0], &s.v1[1], &s.v1[0]) 58 | zipperMergeAndAdd(s.v0[3], s.v0[2], &s.v1[3], &s.v1[2]) 59 | } 60 | 61 | func (s *state) Finalize() uint64 { 62 | 63 | s.PermuteAndUpdate() 64 | s.PermuteAndUpdate() 65 | s.PermuteAndUpdate() 66 | s.PermuteAndUpdate() 67 | 68 | return s.v0[0] + s.v1[0] + s.mul0[0] + s.mul1[0] 69 | } 70 | 71 | func zipperMergeAndAdd(v1, v0 uint64, add1, add0 *uint64) { 72 | *add0 += (((v0 & 0xff000000) | (v1 & 0xff00000000)) >> 24) | 73 | (((v0 & 0xff0000000000) | (v1 & 0xff000000000000)) >> 16) | 74 | (v0 & 0xff0000) | ((v0 & 0xff00) 
<< 32) | 75 | ((v1 & 0xff00000000000000) >> 8) | (v0 << 56) 76 | *add1 += (((v1 & 0xff000000) | (v0 & 0xff00000000)) >> 24) | 77 | (v1 & 0xff0000) | ((v1 & 0xff0000000000) >> 16) | 78 | ((v1 & 0xff00) << 24) | ((v0 & 0xff000000000000) >> 8) | 79 | ((v1 & 0xff) << 48) | (v0 & 0xff00000000000000) 80 | } 81 | 82 | func rot32(x uint64) uint64 { 83 | return (x >> 32) | (x << 32) 84 | } 85 | 86 | func rotate32By(count uint, lanes *Lanes) { 87 | for i := 0; i < 4; i++ { 88 | half0 := uint32(lanes[i] & 0xffffffff) 89 | half1 := uint32(lanes[i] >> 32) 90 | lanes[i] = uint64(half0<<count | half0>>(32-count)) 91 | lanes[i] |= uint64((half1<<count | half1>>(32-count))) << 32 92 | } 93 | } 94 | 95 | func rotate64by32(v, permuted *Lanes) { 96 | permuted[0] = rot32(v[0]) 97 | permuted[1] = rot32(v[1]) 98 | permuted[2] = rot32(v[2]) 99 | permuted[3] = rot32(v[3]) 100 | } 101 | 102 | func permute(v, permuted *Lanes) { 103 | permuted[0] = rot32(v[2]) 104 | permuted[1] = rot32(v[3]) 105 | permuted[2] = rot32(v[0]) 106 | permuted[3] = rot32(v[1]) 107 | } 108 | 109 | func (s *state) PermuteAndUpdate() { 110 | var permuted Lanes 111 | 112 | permute(&s.v0, &permuted) 113 | 114 | var bytes [32]byte 115 | 116 | binary.LittleEndian.PutUint64(bytes[0:], permuted[0]) 117 | binary.LittleEndian.PutUint64(bytes[8:], permuted[1]) 118 | binary.LittleEndian.PutUint64(bytes[16:], permuted[2]) 119 | binary.LittleEndian.PutUint64(bytes[24:], permuted[3]) 120 | 121 | s.Update(bytes[:]) 122 | } 123 | 124 | func Hash(key Lanes, bytes []byte) uint64 { 125 | 126 | if useSSE { 127 | return hashSSE(&key, &init0, &init1, bytes) 128 | } 129 | 130 | var s state 131 | 132 | size := len(bytes) 133 | sizeMod32 := size & (packetSize - 1) 134 | 135 | newstate(&s, key) 136 | // Hash entire 32-byte packets. 137 | truncatedSize := size - sizeMod32 138 | for i := 0; i < truncatedSize/8; i += NumLanes { 139 | s.Update(bytes) 140 | bytes = bytes[32:] 141 | } 142 | 143 | if sizeMod32 != 0 { 144 | // Update with final 32-byte packet. 
145 | for i := 0; i < NumLanes; i++ { 146 | s.v0[i] += uint64(sizeMod32)<<32 + uint64(sizeMod32) 147 | } 148 | rotate32By(uint(sizeMod32), &s.v1) 149 | 150 | sizeMod4 := sizeMod32 & 3 151 | var finalPacket [packetSize]byte 152 | copy(finalPacket[:], bytes[:len(bytes)-sizeMod4]) 153 | remainder := bytes[len(bytes)-sizeMod4:] 154 | 155 | if sizeMod32&16 != 0 { 156 | copy(finalPacket[28:], bytes[len(bytes)-4:]) 157 | } else { 158 | if sizeMod4 != 0 { 159 | finalPacket[16+0] = remainder[0] 160 | finalPacket[16+1] = remainder[sizeMod4>>1] 161 | finalPacket[16+2] = remainder[sizeMod4-1] 162 | } 163 | } 164 | 165 | s.Update(finalPacket[:]) 166 | } 167 | 168 | return s.Finalize() 169 | } 170 | -------------------------------------------------------------------------------- /highway_amd64.s: -------------------------------------------------------------------------------- 1 | // +build !noasm 2 | // Generated by PeachPy 0.2.0 from sum.py 3 | 4 | 5 | // func hashSSE(keys uintptr, init0 uintptr, init1 uintptr, p_base uintptr, p_len int64, p_cap int64) uint64 6 | TEXT ·hashSSE(SB),4,$0-56 7 | MOVQ keys+0(FP), AX 8 | MOVQ init0+8(FP), BX 9 | MOVQ init1+16(FP), CX 10 | MOVOU 0(AX), X0 11 | MOVOU 16(AX), X1 12 | MOVOU 0(BX), X2 13 | MOVOU 16(BX), X3 14 | MOVOU 0(CX), X4 15 | MOVOU 16(CX), X5 16 | PSHUFL $177, X1, X6 17 | PSHUFL $177, X0, X7 18 | PXOR X2, X0 19 | PXOR X3, X1 20 | PXOR X4, X7 21 | PXOR X5, X6 22 | MOVQ p_base+24(FP), AX 23 | MOVQ p_len+32(FP), BX 24 | CMPQ BX, $32 25 | JLT loop1_end 26 | loop1_begin: 27 | MOVOU 0(AX), X8 28 | MOVOU 16(AX), X9 29 | PADDQ X8, X7 30 | PADDQ X9, X6 31 | PADDQ X2, X7 32 | PADDQ X3, X6 33 | MOVO X0, X8 34 | MOVO X1, X9 35 | MOVO X7, X10 36 | MOVO X6, X11 37 | PSRLQ $32, X8 38 | PSRLQ $32, X9 39 | PMULULQ X8, X10 40 | PMULULQ X9, X11 41 | PXOR X10, X2 42 | PXOR X11, X3 43 | PADDQ X4, X0 44 | PADDQ X5, X1 45 | MOVO X7, X8 46 | MOVO X6, X9 47 | MOVO X0, X10 48 | MOVO X1, X11 49 | PSRLQ $32, X8 50 | PSRLQ $32, X9 51 | PMULULQ X8, X10 52 | 
PMULULQ X9, X11 53 | PXOR X10, X4 54 | PXOR X11, X5 55 | MOVQ $4223284375849987, CX 56 | MOVQ CX, X8 57 | MOVQ $506661594022413323, CX 58 | MOVQ CX, X9 59 | MOVLHPS X9, X8 60 | MOVO X7, X10 61 | PSHUFB X8, X10 62 | MOVO X6, X11 63 | PSHUFB X8, X11 64 | PADDQ X10, X0 65 | PADDQ X11, X1 66 | MOVO X0, X10 67 | PSHUFB X8, X10 68 | MOVO X1, X11 69 | PSHUFB X8, X11 70 | PADDQ X10, X7 71 | PADDQ X11, X6 72 | ADDQ $32, AX 73 | SUBQ $32, BX 74 | CMPQ BX, $32 75 | JGE loop1_begin 76 | loop1_end: 77 | CMPQ BX, $0 78 | JEQ finalize 79 | MOVQ BX, CX 80 | SHLQ $32, CX 81 | ADDQ BX, CX 82 | MOVQ CX, X8 83 | PINSRQ $1, CX, X8 84 | PADDQ X8, X0 85 | PADDQ X8, X1 86 | MOVQ BX, CX 87 | MOVO X7, X8 88 | MOVQ CX, X9 89 | PSLLL X9, X8 90 | SUBQ $32, CX 91 | NEGQ CX 92 | MOVQ CX, X9 93 | PSRLL X9, X7 94 | POR X8, X7 95 | MOVQ BX, CX 96 | MOVO X6, X8 97 | MOVQ CX, X9 98 | PSLLL X9, X8 99 | SUBQ $32, CX 100 | NEGQ CX 101 | MOVQ CX, X9 102 | PSRLL X9, X6 103 | POR X8, X6 104 | MOVQ BX, CX 105 | ANDQ $3, CX 106 | NEGQ CX 107 | ADDQ BX, CX 108 | MOVQ AX, DX 109 | PXOR X8, X8 110 | PXOR X9, X9 111 | CMPQ CX, $0 112 | JEQ memcpy32_fin 113 | CMPQ CX, $16 114 | JLT memcpy32_skipLoad16 115 | MOVOU 0(AX), X8 116 | ADDQ $16, AX 117 | SUBQ $16, CX 118 | XORQ DI, DI 119 | CMPQ CX, $8 120 | JLT skip81 121 | MOVQ 0(AX), SI 122 | MOVQ SI, X9 123 | SUBQ $8, CX 124 | ADDQ $8, AX 125 | MOVQ $1, DI 126 | skip81: 127 | XORQ SI, SI 128 | CMPQ CX, $0 129 | JEQ __local3 130 | CMPQ CX, $1 131 | JEQ __local5 132 | CMPQ CX, $2 133 | JEQ __local13 134 | CMPQ CX, $3 135 | JEQ __local11 136 | CMPQ CX, $4 137 | JEQ __local4 138 | CMPQ CX, $5 139 | JEQ __local1 140 | CMPQ CX, $6 141 | JEQ __local15 142 | MOVBQZX 6(AX), CX 143 | SHLQ $48, CX 144 | ORQ CX, SI 145 | __local15: 146 | MOVBQZX 5(AX), CX 147 | SHLQ $40, CX 148 | ORQ CX, SI 149 | __local1: 150 | MOVBQZX 4(AX), CX 151 | SHLQ $32, CX 152 | ORQ CX, SI 153 | __local4: 154 | MOVBQZX 3(AX), CX 155 | SHLQ $24, CX 156 | ORQ CX, SI 157 | __local11: 158 | MOVBQZX 2(AX), 
CX 159 | SHLQ $16, CX 160 | ORQ CX, SI 161 | __local13: 162 | MOVBQZX 1(AX), CX 163 | SHLQ $8, CX 164 | ORQ CX, SI 165 | __local5: 166 | MOVBQZX 0(AX), CX 167 | SHLQ $0, CX 168 | ORQ CX, SI 169 | CMPQ DI, $1 170 | JEQ insert11 171 | PINSRQ $0, SI, X9 172 | JMP fin161 173 | insert11: 174 | PINSRQ $1, SI, X9 175 | fin161: 176 | __local3: 177 | JMP memcpy32_fin 178 | memcpy32_skipLoad16: 179 | XORQ DI, DI 180 | CMPQ CX, $8 181 | JLT skip80 182 | MOVQ 0(AX), SI 183 | MOVQ SI, X8 184 | SUBQ $8, CX 185 | ADDQ $8, AX 186 | MOVQ $1, DI 187 | skip80: 188 | XORQ SI, SI 189 | CMPQ CX, $0 190 | JEQ __local2 191 | CMPQ CX, $1 192 | JEQ __local12 193 | CMPQ CX, $2 194 | JEQ __local10 195 | CMPQ CX, $3 196 | JEQ __local9 197 | CMPQ CX, $4 198 | JEQ __local8 199 | CMPQ CX, $5 200 | JEQ __local7 201 | CMPQ CX, $6 202 | JEQ __local6 203 | MOVBQZX 6(AX), CX 204 | SHLQ $48, CX 205 | ORQ CX, SI 206 | __local6: 207 | MOVBQZX 5(AX), CX 208 | SHLQ $40, CX 209 | ORQ CX, SI 210 | __local7: 211 | MOVBQZX 4(AX), CX 212 | SHLQ $32, CX 213 | ORQ CX, SI 214 | __local8: 215 | MOVBQZX 3(AX), CX 216 | SHLQ $24, CX 217 | ORQ CX, SI 218 | __local9: 219 | MOVBQZX 2(AX), CX 220 | SHLQ $16, CX 221 | ORQ CX, SI 222 | __local10: 223 | MOVBQZX 1(AX), CX 224 | SHLQ $8, CX 225 | ORQ CX, SI 226 | __local12: 227 | MOVBQZX 0(AX), CX 228 | SHLQ $0, CX 229 | ORQ CX, SI 230 | CMPQ DI, $1 231 | JEQ insert10 232 | PINSRQ $0, SI, X8 233 | JMP fin160 234 | insert10: 235 | PINSRQ $1, SI, X8 236 | fin160: 237 | __local2: 238 | memcpy32_fin: 239 | CMPQ BX, $16 240 | JLT mod4check 241 | ADDQ BX, DX 242 | SUBQ $4, DX 243 | MOVL 0(DX), AX 244 | PINSRD $3, AX, X9 245 | JMP afterMod4 246 | mod4check: 247 | MOVQ BX, CX 248 | ANDQ $3, CX 249 | NEGQ CX 250 | ADDQ BX, CX 251 | ADDQ CX, DX 252 | MOVQ BX, AX 253 | ANDQ $3, AX 254 | JEQ afterMod4 255 | XORQ BX, BX 256 | MOVBQZX 0(DX), BX 257 | MOVQ AX, CX 258 | SHRQ $1, CX 259 | MOVQ DX, DI 260 | ADDQ CX, DI 261 | MOVBQZX 0(DI), CX 262 | SHLQ $8, CX 263 | ORQ CX, BX 264 | MOVQ AX, 
CX 265 | SUBQ $1, CX 266 | MOVQ DX, DI 267 | ADDQ CX, DI 268 | MOVBQZX 0(DI), CX 269 | SHLQ $16, CX 270 | ORQ CX, BX 271 | PINSRQ $0, BX, X9 272 | afterMod4: 273 | PADDQ X8, X7 274 | PADDQ X9, X6 275 | PADDQ X2, X7 276 | PADDQ X3, X6 277 | MOVO X0, X8 278 | MOVO X1, X9 279 | MOVO X7, X10 280 | MOVO X6, X11 281 | PSRLQ $32, X8 282 | PSRLQ $32, X9 283 | PMULULQ X8, X10 284 | PMULULQ X9, X11 285 | PXOR X10, X2 286 | PXOR X11, X3 287 | PADDQ X4, X0 288 | PADDQ X5, X1 289 | MOVO X7, X8 290 | MOVO X6, X9 291 | MOVO X0, X10 292 | MOVO X1, X11 293 | PSRLQ $32, X8 294 | PSRLQ $32, X9 295 | PMULULQ X8, X10 296 | PMULULQ X9, X11 297 | PXOR X10, X4 298 | PXOR X11, X5 299 | MOVQ $4223284375849987, AX 300 | MOVQ AX, X8 301 | MOVQ $506661594022413323, AX 302 | MOVQ AX, X9 303 | MOVLHPS X9, X8 304 | MOVO X7, X10 305 | PSHUFB X8, X10 306 | MOVO X6, X11 307 | PSHUFB X8, X11 308 | PADDQ X10, X0 309 | PADDQ X11, X1 310 | MOVO X0, X10 311 | PSHUFB X8, X10 312 | MOVO X1, X11 313 | PSHUFB X8, X11 314 | PADDQ X10, X7 315 | PADDQ X11, X6 316 | finalize: 317 | MOVQ $4, AX 318 | loop0_begin: 319 | PSHUFL $177, X1, X8 320 | PSHUFL $177, X0, X9 321 | PADDQ X8, X7 322 | PADDQ X9, X6 323 | PADDQ X2, X7 324 | PADDQ X3, X6 325 | MOVO X0, X8 326 | MOVO X1, X9 327 | MOVO X7, X10 328 | MOVO X6, X11 329 | PSRLQ $32, X8 330 | PSRLQ $32, X9 331 | PMULULQ X8, X10 332 | PMULULQ X9, X11 333 | PXOR X10, X2 334 | PXOR X11, X3 335 | PADDQ X4, X0 336 | PADDQ X5, X1 337 | MOVO X7, X8 338 | MOVO X6, X9 339 | MOVO X0, X10 340 | MOVO X1, X11 341 | PSRLQ $32, X8 342 | PSRLQ $32, X9 343 | PMULULQ X8, X10 344 | PMULULQ X9, X11 345 | PXOR X10, X4 346 | PXOR X11, X5 347 | MOVQ $4223284375849987, BX 348 | MOVQ BX, X8 349 | MOVQ $506661594022413323, BX 350 | MOVQ BX, X9 351 | MOVLHPS X9, X8 352 | MOVO X7, X10 353 | PSHUFB X8, X10 354 | MOVO X6, X11 355 | PSHUFB X8, X11 356 | PADDQ X10, X0 357 | PADDQ X11, X1 358 | MOVO X0, X10 359 | PSHUFB X8, X10 360 | MOVO X1, X11 361 | PSHUFB X8, X11 362 | PADDQ X10, X7 363 | PADDQ 
X11, X6 364 | DECQ AX 365 | JNE loop0_begin 366 | PADDQ X7, X0 367 | PADDQ X4, X2 368 | PADDQ X2, X0 369 | MOVQ X0, AX 370 | MOVQ AX, ret+48(FP) 371 | RET 372 | -------------------------------------------------------------------------------- /sum.py: -------------------------------------------------------------------------------- 1 | import peachpy.x86_64 2 | 3 | 4 | class State: 5 | def __init__(self): 6 | self.v0lo = XMMRegister() 7 | self.v0hi = XMMRegister() 8 | self.v1lo = XMMRegister() 9 | self.v1hi = XMMRegister() 10 | self.mul0lo = XMMRegister() 11 | self.mul0hi = XMMRegister() 12 | self.mul1lo = XMMRegister() 13 | self.mul1hi = XMMRegister() 14 | 15 | def load(self, ptr): 16 | # load state into xmm registers 17 | for i, r in enumerate([ 18 | self.v0lo, self.v0hi, self.v1lo, self.v1hi, self.mul0lo, 19 | self.mul0hi, self.mul1lo, self.mul1hi 20 | ]): 21 | MOVDQU(r, [ptr + i * r.size]) 22 | 23 | def store(self, ptr): 24 | # store state from xmm registers to memory 25 | for i, r in enumerate([ 26 | self.v0lo, self.v0hi, self.v1lo, self.v1hi, self.mul0lo, 27 | self.mul0hi, self.mul1lo, self.mul1hi 28 | ]): 29 | MOVDQU([ptr + i * r.size], r) 30 | 31 | 32 | def mm_shufmask(a, b, c, d): 33 | return (a << 6) | (b << 4) | (c << 2) | d 34 | 35 | 36 | def permute(dstlo, dsthi, srclo, srchi): 37 | PSHUFD(dstlo, srchi, mm_shufmask(2, 3, 0, 1)) 38 | PSHUFD(dsthi, srclo, mm_shufmask(2, 3, 0, 1)) 39 | 40 | def rotate32By(dst, count): 41 | t = XMMRegister() 42 | c = XMMRegister() 43 | MOVDQA(t, dst) 44 | MOVQ(c, count) 45 | PSLLD(t, c) 46 | SUB(count, 32) 47 | NEG(count) 48 | MOVQ(c, count) 49 | PSRLD(dst, c) 50 | POR(dst, t) 51 | 52 | def zippermask(): 53 | x = GeneralPurposeRegister64() 54 | mask = XMMRegister() 55 | tmpmask = XMMRegister() 56 | 57 | MOV(x, 0x000F010E05020C03) 58 | MOVQ(mask, x) 59 | MOV(x, 0x070806090D0A040B) 60 | MOVQ(tmpmask, x) 61 | MOVLHPS(mask, tmpmask) 62 | 63 | return mask 64 | 65 | 66 | def zipper(mask, mlo, mhi, vlo, vhi): 67 | MOVDQA(vlo, mlo) 68 | 
PSHUFB(vlo, mask) 69 | MOVDQA(vhi, mhi) 70 | PSHUFB(vhi, mask) 71 | 72 | 73 | def update(plo, phi, state): 74 | PADDQ(state.v1lo, plo) 75 | PADDQ(state.v1hi, phi) 76 | PADDQ(state.v1lo, state.mul0lo) 77 | PADDQ(state.v1hi, state.mul0hi) 78 | 79 | dstlo = XMMRegister() 80 | dsthi = XMMRegister() 81 | srclo = XMMRegister() 82 | srchi = XMMRegister() 83 | 84 | MOVDQA(srclo, state.v0lo) 85 | MOVDQA(srchi, state.v0hi) 86 | MOVDQA(dstlo, state.v1lo) 87 | MOVDQA(dsthi, state.v1hi) 88 | PSRLQ(srclo, 32) 89 | PSRLQ(srchi, 32) 90 | 91 | PMULUDQ(dstlo, srclo) 92 | PMULUDQ(dsthi, srchi) 93 | PXOR(state.mul0lo, dstlo) 94 | PXOR(state.mul0hi, dsthi) 95 | 96 | ### 97 | 98 | PADDQ(state.v0lo, state.mul1lo) 99 | PADDQ(state.v0hi, state.mul1hi) 100 | 101 | ### 102 | 103 | MOVDQA(srclo, state.v1lo) 104 | MOVDQA(srchi, state.v1hi) 105 | MOVDQA(dstlo, state.v0lo) 106 | MOVDQA(dsthi, state.v0hi) 107 | PSRLQ(srclo, 32) 108 | PSRLQ(srchi, 32) 109 | 110 | PMULUDQ(dstlo, srclo) 111 | PMULUDQ(dsthi, srchi) 112 | PXOR(state.mul1lo, dstlo) 113 | PXOR(state.mul1hi, dsthi) 114 | 115 | ###### 116 | 117 | mask = zippermask() 118 | zipper(mask, state.v1lo, state.v1hi, dstlo, dsthi) 119 | PADDQ(state.v0lo, dstlo) 120 | PADDQ(state.v0hi, dsthi) 121 | 122 | zipper(mask, state.v0lo, state.v0hi, dstlo, dsthi) 123 | PADDQ(state.v1lo, dstlo) 124 | PADDQ(state.v1hi, dsthi) 125 | 126 | 127 | def permuteAndUpdate(state): 128 | plo, phi = XMMRegister(), XMMRegister() 129 | 130 | permute(plo, phi, state.v0lo, state.v0hi) 131 | update(plo, phi, state) 132 | 133 | 134 | def finalize(state): 135 | c = GeneralPurposeRegister64() 136 | MOV(c, 4) 137 | with Loop() as loop: 138 | permuteAndUpdate(state) 139 | DEC(c) 140 | JNZ(loop.begin) 141 | 142 | PADDQ(state.v0lo, state.v1lo) 143 | PADDQ(state.mul0lo, state.mul1lo) 144 | 145 | PADDQ(state.v0lo, state.mul0lo) 146 | 147 | ret = GeneralPurposeRegister64() 148 | 149 | MOVQ(ret, state.v0lo) 150 | 151 | return ret 152 | 153 | 154 | def newstate(reg_keys, reg_init0, 
reg_init1): 155 | state = State() 156 | 157 | MOVDQU(state.v0lo, [reg_keys]) 158 | MOVDQU(state.v0hi, [reg_keys + 16]) 159 | MOVDQU(state.mul0lo, [reg_init0]) 160 | MOVDQU(state.mul0hi, [reg_init0 + 16]) 161 | MOVDQU(state.mul1lo, [reg_init1]) 162 | MOVDQU(state.mul1hi, [reg_init1 + 16]) 163 | 164 | permute(state.v1hi, state.v1lo, state.v0lo, state.v0hi) 165 | 166 | PXOR(state.v0lo, state.mul0lo) 167 | PXOR(state.v0hi, state.mul0hi) 168 | PXOR(state.v1lo, state.mul1lo) 169 | PXOR(state.v1hi, state.mul1hi) 170 | 171 | return state 172 | 173 | 174 | def memcpy32(x0, x1, p, l): 175 | 176 | fin = Label("memcpy32_fin") 177 | CMP(l, 0) 178 | JE(fin) 179 | 180 | skipLoad16 = Label("memcpy32_skipLoad16") 181 | CMP(l, 16) 182 | JL(skipLoad16) 183 | MOVDQU(x0, [p]) 184 | ADD(p, 16) 185 | SUB(l, 16) 186 | memcpy16(x1, p, l) 187 | JMP(fin) 188 | LABEL(skipLoad16) 189 | memcpy16(x0, p, l) 190 | 191 | LABEL(fin) 192 | 193 | 194 | def memcpy16(xmm0, p, l): 195 | 196 | b = GeneralPurposeRegister64() 197 | offs = GeneralPurposeRegister64() 198 | XOR(offs, offs) 199 | 200 | skip8 = Label() 201 | CMP(l, 8) 202 | JL(skip8) 203 | MOV(b, [p]) 204 | MOVQ(xmm0, b) 205 | SUB(l, 8) 206 | ADD(p, 8) 207 | MOV(offs, 1) 208 | LABEL(skip8) 209 | 210 | XOR(b, b) 211 | # no support for jump tables 212 | labels = [Label() for i in range(0, 8)] 213 | for i in range(0, 7): 214 | CMP(l, i) 215 | JE(labels[i]) 216 | char = GeneralPurposeRegister64() 217 | for i in range(7, 0, -1): 218 | LABEL(labels[i]) 219 | MOVZX(char, byte[p + i - 1]) 220 | SHL(char, (i - 1) * 8) 221 | OR(b, char) 222 | 223 | fin16 = Label() 224 | insert1 = Label() 225 | CMP(offs, 1) 226 | JZ(insert1) 227 | PINSRQ(xmm0, b, 0) 228 | JMP(fin16) 229 | LABEL(insert1) 230 | PINSRQ(xmm0, b, 1) 231 | LABEL(fin16) 232 | LABEL(labels[0]) 233 | 234 | 235 | def MakeHash(): 236 | 237 | keys = Argument(ptr()) 238 | init0 = Argument(ptr()) 239 | init1 = Argument(ptr()) 240 | p_base = Argument(ptr()) 241 | p_len = Argument(int64_t) 242 | p_cap = 
Argument(int64_t) 243 | 244 | with Function( 245 | "hashSSE", (keys, init0, init1, p_base, p_len, p_cap), 246 | uint64_t, 247 | target=uarch.default + isa.sse4_1) as function: 248 | 249 | reg_keys = GeneralPurposeRegister64() 250 | reg_init0 = GeneralPurposeRegister64() 251 | reg_init1 = GeneralPurposeRegister64() 252 | 253 | LOAD.ARGUMENT(reg_keys, keys) 254 | LOAD.ARGUMENT(reg_init0, init0) 255 | LOAD.ARGUMENT(reg_init1, init1) 256 | state = newstate(reg_keys, reg_init0, reg_init1) 257 | 258 | reg_p = GeneralPurposeRegister64() 259 | reg_p_len = GeneralPurposeRegister64() 260 | LOAD.ARGUMENT(reg_p, p_base) 261 | LOAD.ARGUMENT(reg_p_len, p_len) 262 | 263 | reg_plo = XMMRegister() 264 | reg_phi = XMMRegister() 265 | 266 | loop = Loop() 267 | CMP(reg_p_len, 32) 268 | JL(loop.end) 269 | with loop: 270 | MOVDQU(reg_plo, [reg_p]) 271 | MOVDQU(reg_phi, [reg_p + 16]) 272 | 273 | update(reg_plo, reg_phi, state) 274 | 275 | ADD(reg_p, 32) 276 | SUB(reg_p_len, 32) 277 | CMP(reg_p_len, 32) 278 | JGE(loop.begin) 279 | 280 | ### 281 | 282 | # reg_p_len is now remainder mod 32 283 | lfinalize = Label("finalize") 284 | CMP(reg_p_len, 0) 285 | JZ(lfinalize) 286 | 287 | 288 | # TODO(dgryski): remove this variable; reuse reg_p_len 289 | reg_remMod32 = GeneralPurposeRegister64() 290 | MOV(reg_remMod32, reg_p_len) 291 | SHL(reg_remMod32, 32) 292 | ADD(reg_remMod32, reg_p_len) 293 | 294 | reg_xmm0 = XMMRegister() 295 | MOVQ(reg_xmm0, reg_remMod32) 296 | PINSRQ(reg_xmm0, reg_remMod32, 1) 297 | PADDQ(state.v0lo, reg_xmm0) 298 | PADDQ(state.v0hi, reg_xmm0) 299 | 300 | reg_copylen = GeneralPurposeRegister64() 301 | MOV(reg_copylen, reg_p_len) 302 | rotate32By(state.v1lo, reg_copylen) 303 | MOV(reg_copylen, reg_p_len) 304 | rotate32By(state.v1hi, reg_copylen) 305 | 306 | # copy(finalPacket[:], bytes[:len(bytes)-remainderMod4]) 307 | MOV(reg_copylen, reg_p_len) 308 | AND(reg_copylen, 3) 309 | NEG(reg_copylen) 310 | ADD(reg_copylen, reg_p_len) 311 | 312 | reg_remainder = 
GeneralPurposeRegister64() 313 | MOV(reg_remainder, reg_p) 314 | 315 | PXOR(reg_plo, reg_plo) 316 | PXOR(reg_phi, reg_phi) 317 | # reg_p is destroyed 318 | memcpy32(reg_plo, reg_phi, reg_p, reg_copylen) 319 | 320 | mod4check = Label("mod4check") 321 | afterMod4 = Label("afterMod4") 322 | CMP(reg_p_len, 16) 323 | JL(mod4check) 324 | # TODO(dgryski): copy(finalPacket[28:], bytes[len(bytes)-4:]) 325 | final = GeneralPurposeRegister32() 326 | ADD(reg_remainder, reg_p_len) 327 | SUB(reg_remainder, 4) 328 | MOV(final, [reg_remainder]) 329 | # load last 4 bytes from remainder 330 | # shove them into last 4 bytes of final packet 331 | # == high 4 bytes of 332 | PINSRD(reg_phi, final, 3) 333 | JMP(afterMod4) 334 | LABEL(mod4check) 335 | MOV(reg_copylen, reg_p_len) 336 | AND(reg_copylen, 3) 337 | NEG(reg_copylen) 338 | ADD(reg_copylen, reg_p_len) 339 | ADD(reg_remainder, reg_copylen) 340 | reg_remMod4 = GeneralPurposeRegister64() 341 | MOV(reg_remMod4, reg_p_len) 342 | AND(reg_remMod4, 3) 343 | JZ(afterMod4) 344 | final = GeneralPurposeRegister64() 345 | XOR(final, final) 346 | MOVZX(final, byte[reg_remainder]) 347 | tmp = GeneralPurposeRegister64() 348 | MOV(tmp, reg_remMod4) 349 | SHR(tmp, 1) 350 | offs = GeneralPurposeRegister64() 351 | MOV(offs, reg_remainder) 352 | ADD(offs, tmp) 353 | MOVZX(tmp, byte[offs]) 354 | SHL(tmp, 8) 355 | OR(final, tmp) 356 | MOV(tmp, reg_remMod4) 357 | SUB(tmp, 1) 358 | MOV(offs, reg_remainder) 359 | ADD(offs, tmp) 360 | MOVZX(tmp, byte[offs]) 361 | SHL(tmp, 16) 362 | OR(final, tmp) 363 | PINSRQ(reg_phi, final, 0) 364 | LABEL(afterMod4) 365 | 366 | update(reg_plo, reg_phi, state) 367 | 368 | LABEL(lfinalize) 369 | ret = finalize(state) 370 | RETURN(ret) 371 | 372 | 373 | MakeHash() 374 | --------------------------------------------------------------------------------