├── CMakeLists.txt ├── LICENSE ├── README.md └── src ├── examples.cc ├── hash_benchmark.cc ├── parallel-murmur3.h └── parallel-xxhash.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 4 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) 5 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) 6 | 7 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 -O3 -g3 -Wall -Werror -fno-strict-aliasing -Wno-sign-compare -march=native -I src/third-party/") 8 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -O3 -g3 -Wall -Werror -fno-strict-aliasing -Wno-sign-compare -march=native -I src/third-party/") 9 | 10 | set(CMAKE_EXE_LINKER_FLAGS "") 11 | 12 | add_executable(examples 13 | src/examples.cc) 14 | 15 | 16 | enable_testing() 17 | # Usual workaround for the broken test build dependency handling in CMake. 18 | # Every test must depend on this dummy target. 
19 | add_test(build_test_code "${CMAKE_COMMAND}" --build ${CMAKE_BINARY_DIR} --target all) 20 | 21 | macro(define_test name) 22 | add_executable(${name}.testbin 23 | src/test/${name}.cc) 24 | add_test(${name} bin/${name}.testbin) 25 | set_tests_properties(${name} PROPERTIES DEPENDS 26 | build_test_code) 27 | endmacro() 28 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Juho Snellman 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | parallel-xxhash 2 | --------------- 3 | 4 | Two implementations of the [xxHash](http://cyan4973.github.io/xxHash/) 5 | hash function (specifically, the 32 bit version). 6 | 7 | - `parallel`: Computing the hash values of 8 keys in parallel, using 8 | AVX2 intrinsics. 9 | - `scalar`: Computing the hash value of a single key. 10 | 11 | These are very special purpose implementations, and will not be 12 | of any interest in most programs. Use these functions if: 13 | 14 | - Your keys have a constant size, and are a multiple of 4 bytes 15 | long. Variable size keys are not supported. Nor are non-word 16 | aligned keys. 17 | - You have an application that can receive and process inputs 18 | in batches such that you usually have 8 keys to process at once. 19 | (Doesn't need to be full batches of 8, but the break-even point 20 | vs. a fast scalar hash is probably around batches of 3-4 keys). 21 | - You can arrange for your hash keys to be in a column-major 22 | order without too much pain. 23 | - You can compile the program with -mavx2. (While there is a 24 | non-avx2 fallback, there's not a lot of point to it). 25 | - A single 32 bit hash value per key is sufficient. 26 | 27 | The `scalar` implementation is mainly included as a fallback, for 28 | programs that generally use the parallel code, but have some 29 | exceptional cases that need identical hash codes for individual 30 | keys. Except for very small keys (at most 20 bytes), you are probably 31 | better off with the reference implementation of some modern 32 | hash function. 33 | 34 | DATA LAYOUT 35 | ----------- 36 | 37 | For the parallel version the keys should be laid out adjacent to 38 | each other, in column-major order. That is, the first word in 39 | `keys` should be the first word of the first key. 
The second word 40 | of `keys` should be the first word of the second key. And so on: 41 | 42 | ``` 43 | key1[0] key2[0] ... key8[0] 44 | key1[1] key2[1] ... key8[1] 45 | ... 46 | key1[SizeWords-1] key2[SizeWords-1] ... key8[SizeWords-1] 47 | ``` 48 | 49 | EXAMPLES 50 | -------- 51 | 52 | Assume the following definitions: 53 | 54 | ```c++ 55 | static const uint32_t KEY_LENGTH = 3; 56 | 57 | static uint32_t rows[][KEY_LENGTH] = { 58 | {1, 2, 3}, 59 | {4, 5, 6}, 60 | {7, 8, 9}, 61 | {10, 11, 12}, 62 | {13, 14, 15}, 63 | {16, 17, 18}, 64 | {19, 20, 21}, 65 | {22, 23, 24}, 66 | }; 67 | 68 | static uint32_t cols[][8] = { 69 | {1, 4, 7, 10, 13, 16, 19, 22}, 70 | {2, 5, 8, 11, 14, 17, 20, 23}, 71 | {3, 6, 9, 12, 15, 18, 21, 24}, 72 | }; 73 | 74 | static uint32_t seeds[] = { 0x3afc8e77, 0x924f408d }; 75 | static const uint32_t seed_count = sizeof(seeds) / sizeof(uint32_t); 76 | ``` 77 | 78 | The implementations could be used as follows to compute hash values 79 | for each of the key / seed combinations: 80 | 81 | ```c++ 82 | void example_scalar() { 83 | for (int s = 0; s < seed_count; ++s) { 84 | for (int i = 0; i < 8; ++i) { 85 | uint32_t* row = rows[i]; 86 | uint32_t res = xxhash32<3>::scalar(row, seeds[s]); 87 | printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 88 | seeds[s], row[0], row[1], row[2], res); 89 | } 90 | } 91 | } 92 | 93 | void example_parallel() { 94 | for (int s = 0; s < seed_count; ++s) { 95 | uint32_t res[8]; 96 | xxhash32<3>::parallel(cols[0], seeds[s], res); 97 | for (int i = 0; i < 8; ++i) { 98 | printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 99 | seeds[s], cols[0][i], cols[1][i], cols[2][i], 100 | res[i]); 101 | } 102 | } 103 | } 104 | ``` 105 | -------------------------------------------------------------------------------- /src/examples.cc: -------------------------------------------------------------------------------- 1 | // -*- mode: c++; c-basic-offset: 4 indent-tabs-mode: nil -*- */ 2 | // 3 | // Copyright 2017 Juho Snellman, 
released under a MIT license 4 | 5 | #include "parallel-murmur3.h" 6 | #include "parallel-xxhash.h" 7 | 8 | #ifdef KEY_LENGTH 9 | #undef KEY_LENGTH 10 | #endif 11 | 12 | static const int KEY_LENGTH = 3; 13 | 14 | static uint32_t rows[][KEY_LENGTH] = { 15 | {1, 2, 3}, 16 | {4, 5, 6}, 17 | {7, 8, 9}, 18 | {10, 11, 12}, 19 | {13, 14, 15}, 20 | {16, 17, 18}, 21 | {19, 20, 21}, 22 | {22, 23, 24}, 23 | }; 24 | 25 | static uint32_t cols[][8] = { 26 | {1, 4, 7, 10, 13, 16, 19, 22}, 27 | {2, 5, 8, 11, 14, 17, 20, 23}, 28 | {3, 6, 9, 12, 15, 18, 21, 24}, 29 | }; 30 | 31 | static uint32_t seeds[] = { 0x3afc8e77, 0x924f408d }; 32 | static const uint32_t seed_count = sizeof(seeds) / sizeof(uint32_t); 33 | 34 | void example_murmur3_scalar() { 35 | for (int s = 0; s < seed_count; ++s) { 36 | for (int i = 0; i < 8; ++i) { 37 | uint32_t* row = rows[i]; 38 | uint32_t res = murmur3<3>::scalar(row, seeds[s]); 39 | printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 40 | seeds[s], row[0], row[1], row[2], res); 41 | } 42 | } 43 | } 44 | 45 | void example_murmur3_parallel() { 46 | for (int s = 0; s < seed_count; ++s) { 47 | uint32_t res[8]; 48 | murmur3<3>::parallel(cols[0], seeds[s], res); 49 | for (int i = 0; i < 8; ++i) { 50 | printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 51 | seeds[s], cols[0][i], cols[1][i], cols[2][i], 52 | res[i]); 53 | } 54 | } 55 | } 56 | 57 | void example_murmur3_parallel_multiseed() { 58 | __m256i hash[seed_count]; 59 | murmur3<3>::parallel_multiseed(cols[0], seeds, hash); 60 | for (int s = 0; s < seed_count; ++s) { 61 | uint32_t res[8]; 62 | _mm256_storeu_si256((__m256i*) res, hash[s]); 63 | for (int i = 0; i < 8; ++i) { 64 | printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 65 | seeds[s], cols[0][i], cols[1][i], cols[2][i], 66 | res[i]); 67 | } 68 | } 69 | } 70 | 71 | void example_xxhash32_scalar() { 72 | for (int s = 0; s < seed_count; ++s) { 73 | for (int i = 0; i < 8; ++i) { 74 | uint32_t* row = rows[i]; 75 | uint32_t res = xxhash32<3>::scalar(row, seeds[s]); 76 
| printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 77 | seeds[s], row[0], row[1], row[2], res); 78 | } 79 | } 80 | } 81 | 82 | void example_xxhash32_parallel() { 83 | for (int s = 0; s < seed_count; ++s) { 84 | uint32_t res[8]; 85 | xxhash32<3>::parallel(cols[0], seeds[s], res); 86 | for (int i = 0; i < 8; ++i) { 87 | printf("seed=%08x, key={%u,%u,%u}, hash=%u\n", 88 | seeds[s], cols[0][i], cols[1][i], cols[2][i], 89 | res[i]); 90 | } 91 | } 92 | } 93 | 94 | int main (void) { 95 | example_murmur3_scalar(); 96 | example_murmur3_parallel(); 97 | example_murmur3_parallel_multiseed(); 98 | example_xxhash32_scalar(); 99 | example_xxhash32_parallel(); 100 | } 101 | -------------------------------------------------------------------------------- /src/hash_benchmark.cc: -------------------------------------------------------------------------------- 1 | #include "parallel-murmur3.h" 2 | #include "parallel-xxhash.h" 3 | 4 | #include 5 | #include 6 | extern "C" { 7 | #include "third-party/MurmurHash3.h" 8 | } 9 | #include "third-party/cityhash/city.h" 10 | #include "third-party/xxhash.h" 11 | #include "third-party/metrohash64.h" 12 | 13 | #if !defined(KEY_LENGTH) 14 | #error "Remember to pass in a -DKEY_LENGTH" 15 | #endif 16 | 17 | struct test_parallel { 18 | __attribute__((noinline)) 19 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 20 | auto h = hash(keys, seed[0]); 21 | _mm256_storeu_si256((__m256i*) res, h); 22 | } 23 | 24 | __attribute__((noinline)) 25 | __m256i hash(uint32_t* keys, uint32 seed) { 26 | __m256i res; 27 | murmur3::parallel(keys, seed, (uint32_t*) &res); 28 | return res; 29 | } 30 | }; 31 | 32 | template 33 | struct test_parallel_multiseed { 34 | __attribute__((noinline)) 35 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 36 | // No point in playing further noinline games here. 
37 | murmur3::parallel_multiseed( 38 | keys, seed, (__m256i*) res); 39 | } 40 | }; 41 | 42 | struct test_scalar { 43 | __attribute__((noinline)) 44 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 45 | for (int i = 0; i < 8; ++i) { 46 | res[i] = hash(&keys[KEY_LENGTH * i], seed[0]); 47 | } 48 | } 49 | 50 | __attribute__((noinline)) 51 | uint32_t hash(uint32_t* key, uint32 seed) { 52 | return murmur3::scalar(key, seed); 53 | } 54 | }; 55 | 56 | struct test_parallel_xxhash32 { 57 | __attribute__((noinline)) 58 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 59 | auto h = hash(keys, seed[0]); 60 | _mm256_storeu_si256((__m256i*) res, h); 61 | } 62 | 63 | __attribute__((noinline)) 64 | __m256i hash(uint32_t* keys, uint32 seed) { 65 | __m256i res; 66 | xxhash32::parallel(keys, seed, (uint32_t*) &res); 67 | return res; 68 | } 69 | }; 70 | 71 | struct test_scalar_xxhash32 { 72 | __attribute__((noinline)) 73 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 74 | for (int i = 0; i < 8; ++i) { 75 | res[i] = hash(&keys[KEY_LENGTH * i], seed[0]); 76 | } 77 | } 78 | 79 | __attribute__((noinline)) 80 | uint32_t hash(uint32_t* key, uint32 seed) { 81 | return xxhash32::scalar(key, seed); 82 | } 83 | }; 84 | 85 | 86 | struct test_original { 87 | __attribute__((noinline)) 88 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 89 | for (int i = 0; i < 8; ++i) { 90 | MurmurHash3_x86_32(&keys[KEY_LENGTH * i], 91 | 4 * KEY_LENGTH, 92 | seed[0], 93 | &res[i]); 94 | } 95 | } 96 | }; 97 | 98 | struct test_cityhash { 99 | __attribute__((noinline)) 100 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 101 | for (int i = 0; i < 8; ++i) { 102 | res[i] = CityHash64WithSeed((const char*) 103 | &keys[KEY_LENGTH * i], 104 | 4 * KEY_LENGTH, 105 | seed[0]); 106 | } 107 | } 108 | }; 109 | 110 | struct test_cityhash32 { 111 | __attribute__((noinline)) 112 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 113 | for (int i = 0; i < 8; ++i) { 
114 | res[i] = CityHash32((const char*) 115 | &keys[KEY_LENGTH * i], 116 | 4 * KEY_LENGTH); 117 | } 118 | } 119 | }; 120 | 121 | struct test_xxhash32 { 122 | __attribute__((noinline)) 123 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 124 | for (int i = 0; i < 8; ++i) { 125 | res[i] = XXH32((const char*) 126 | &keys[KEY_LENGTH * i], 127 | 4 * KEY_LENGTH, 128 | i); 129 | } 130 | } 131 | }; 132 | 133 | struct test_xxhash64 { 134 | __attribute__((noinline)) 135 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 136 | for (int i = 0; i < 8; ++i) { 137 | res[i] = XXH64((const char*) 138 | &keys[KEY_LENGTH * i], 139 | 4 * KEY_LENGTH, 140 | i); 141 | } 142 | } 143 | }; 144 | 145 | struct test_metrohash64 { 146 | __attribute__((noinline)) 147 | void run(uint32_t* keys, uint32_t* seed, uint32_t* res) { 148 | for (int i = 0; i < 8; ++i) { 149 | MetroHash64::Hash((const unsigned char*) 150 | &keys[KEY_LENGTH * i], 151 | 4 * KEY_LENGTH, 152 | (uint8_t*) &res[i * 2], 153 | seed[0]); 154 | } 155 | } 156 | }; 157 | 158 | template 159 | bool bench(const char* label, uint64_t n, uint32_t* keys) { 160 | Q tester; 161 | uint32_t seed[] = { 162 | 0x3afc8e77, 0x924f408d, 163 | 0x8c2a315e, 0x78884cdb, 164 | 0xd2ef9767, 0xee5e590c, 165 | 0x06201e43, 0xb2e4d8df, 166 | }; 167 | uint32_t res[8 * 8]; 168 | 169 | auto start = std::chrono::system_clock::now(); 170 | 171 | for (int i = 0; i < n; ++i) { 172 | tester.run(keys, seed, res); 173 | } 174 | 175 | auto end = std::chrono::system_clock::now(); 176 | std::chrono::duration t = end - start; 177 | 178 | uint64_t total_bytes = n * KEY_LENGTH * 4 * 8 * WorkFactor; 179 | printf("%s,%d,%lf,%ld,%lf,%lf\n", 180 | label, 181 | 4 * KEY_LENGTH, 182 | t.count(), 183 | total_bytes, 184 | t.count() * 1e9 / total_bytes, 185 | t.count() * 1e9 / (n * 8 * WorkFactor)); 186 | 187 | return true; 188 | } 189 | 190 | void init_keys(uint32_t* rows, uint32_t* cols) { 191 | for (int i = 0; i < 8; ++i) { 192 | for (int j = 0; j < KEY_LENGTH; 
++j) { 193 | uint32_t value = (i + 1) * ((j << 8) + 1); 194 | rows[i * KEY_LENGTH + j] = value; 195 | cols[j * 8 + i] = value; 196 | } 197 | } 198 | } 199 | 200 | int main(void) { 201 | uint32_t rows[KEY_LENGTH * 8 * sizeof(uint32_t)]; 202 | uint32_t cols[KEY_LENGTH * 8 * sizeof(uint32_t)]; 203 | 204 | int n = (1 << 27) / KEY_LENGTH; 205 | 206 | if (KEY_LENGTH == 1) { 207 | printf("impl,keysize,time,bytes,ns_per_byte,ns_per_key\n"); 208 | } 209 | 210 | init_keys(rows, cols); 211 | bench("parallel murmur3", n, cols); 212 | bench("parallel xxhash32", n, cols); 213 | // bench, 1>("parallel_multiseed<1>", n, 214 | // cols); 215 | // bench, 2>("parallel_multiseed<2>", n, 216 | // cols); 217 | // bench, 4>("parallel_multiseed<4>", n, 218 | // cols); 219 | // bench("scalar murmur3", n, rows); 220 | // bench("original murmur3", n, rows); 221 | bench("scalar xxhash32", n, cols); 222 | bench("cityhash", n, rows); 223 | // bench("cityhash32", n, rows); 224 | // bench("xxhash32", n, rows); 225 | bench("xxhash64", n, rows); 226 | bench("metrohash64", n, rows); 227 | } 228 | -------------------------------------------------------------------------------- /src/parallel-murmur3.h: -------------------------------------------------------------------------------- 1 | // -*- mode: c++; c-basic-offset: 4 indent-tabs-mode: nil -*- */ 2 | // 3 | // Copyright 2017 Juho Snellman, released under a MIT license: 4 | // 5 | // Permission is hereby granted, free of charge, to any person obtaining 6 | // a copy of this software and associated documentation files (the 7 | // "Software"), to deal in the Software without restriction, including 8 | // without limitation the rights to use, copy, modify, merge, publish, 9 | // distribute, sublicense, and/or sell copies of the Software, and to 10 | // permit persons to whom the Software is furnished to do so, subject to 11 | // the following conditions: 12 | // 13 | // The above copyright notice and this permission notice shall be 14 | // included in 
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
// EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
// MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
// NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
// LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
// OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
// WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
//
// README
// ------
//
// Three implementations of the 32 bit Murmur3 hash function (the two
// parallel ones use AVX2 and are only available when __AVX2__ is set).
//
// - parallel: Computing the hash values of 8 keys in parallel.
// - parallel_multiseed: Computing N hash values for each of 8 keys
//   in parallel. Each of the N hash values for a key will be computed
//   using a different seed.
// - scalar: Computing the hash value of a single key.
//
// You probably don't want to use any of these, look in
// parallel-xxhash.h instead. (It is missing parallel_multiseed, since
// my use case for that disappeared. But it'd be trivial to implement).

#ifndef PARALLEL_MURMUR3_H
#define PARALLEL_MURMUR3_H

#include <cstdint>
#include <cstdio>
#if __AVX2__
#include <immintrin.h>
#endif

// Murmur3 (x86_32 variant) specialised for keys of exactly SizeWords
// 32 bit words. All entry points produce the same hash value for the
// same key / seed pair.
template<int SizeWords>
struct murmur3 {
    static_assert(SizeWords >= 0, "SizeWords must not be negative");

#if __AVX2__
    // Compute a hash value for 8 keys of SizeWords*4 bytes each.
    //
    // keys: 8 * SizeWords words in column-major order, i.e. keys[i * 8 + j]
    //       is word i of key j. No alignment is required.
    // seed: the seed shared by all 8 keys.
    // res:  receives the 8 hash values, res[j] belonging to key j.
    static void parallel(const uint32_t* keys, uint32_t seed,
                         uint32_t res[8]) {
        __m256i h = mm256_splat(seed);

        for (int i = 0; i < SizeWords; ++i) {
            h = mm256_mix(h, mm256_scramble(mm256_load_column(keys, i)));
        }

        // Mixing in the length here is pretty silly, since it's always
        // constant. But there's probably some value in producing bitwise
        // identical results to the original murmur3 code.
        h = _mm256_xor_si256(h, mm256_splat(SizeWords * 4));

        _mm256_storeu_si256(reinterpret_cast<__m256i*>(res), mm256_fmix32(h));
    }

    // For each of 8 keys, compute N hash values each with a different
    // starting seed value. The hash values will be written to "res":
    // res[s] holds the 8 hashes computed with seeds[s].
    //
    // N cannot be deduced from the arguments and has to be given
    // explicitly, e.g. murmur3<3>::parallel_multiseed<2>(keys, seeds, res).
    template<int N>
    static void parallel_multiseed(const uint32_t* keys,
                                   const uint32_t seeds[N],
                                   __m256i res[N]) {
        __m256i h[N];
        for (int j = 0; j < N; ++j) {
            h[j] = mm256_splat(seeds[j]);
        }

        for (int i = 0; i < SizeWords; ++i) {
            // The key scrambling does not depend on the seed, so it is
            // done once per word and shared between all N hash states.
            const __m256i k = mm256_scramble(mm256_load_column(keys, i));
            for (int j = 0; j < N; ++j) {
                h[j] = mm256_mix(h[j], k);
            }
        }

        const __m256i length = mm256_splat(SizeWords * 4);
        for (int j = 0; j < N; ++j) {
            res[j] = mm256_fmix32(_mm256_xor_si256(h[j], length));
        }
    }
#endif  // __AVX2__

    // Compute a hash value for the key (SizeWords consecutive words).
    // Available regardless of AVX2 support.
    static uint32_t scalar(const uint32_t* key, uint32_t seed) {
        const uint32_t c1 = 0xcc9e2d51;
        const uint32_t c2 = 0x1b873593;
        uint32_t h = seed;

        for (int i = 0; i < SizeWords; ++i) {
            uint32_t k = key[i];
            k *= c1;
            k = rol32<15>(k);
            k *= c2;

            h ^= k;
            h = rol32<13>(h);
            h = h * 5 + 0xe6546b64;
        }

        // Length in bytes, as in the reference implementation.
        h ^= SizeWords * 4;

        return fmix32(h);
    }

private:
#if __AVX2__
    // Broadcast a 32 bit value to all 8 lanes.
    static __m256i mm256_splat(uint32_t value) {
        return _mm256_set1_epi32(static_cast<int>(value));
    }

    // Load word number "word" of all 8 keys (one row of the column-major
    // key matrix).
    static __m256i mm256_load_column(const uint32_t* keys, int word) {
        return _mm256_loadu_si256(
            reinterpret_cast<const __m256i*>(keys + word * 8));
    }

    // Seed-independent scrambling of one key word per lane.
    static __m256i mm256_scramble(__m256i k) {
        k = _mm256_mullo_epi32(k, mm256_splat(0xcc9e2d51));
        k = mm256_rol32<15>(k);
        return _mm256_mullo_epi32(k, mm256_splat(0x1b873593));
    }

    // Fold one scrambled key word into the hash state.
    static __m256i mm256_mix(__m256i h, __m256i k) {
        h = _mm256_xor_si256(h, k);
        h = mm256_rol32<13>(h);
        return _mm256_add_epi32(_mm256_mullo_epi32(h, mm256_splat(5)),
                                mm256_splat(0xe6546b64));
    }

    // Final avalanche, 8 lanes at a time.
    static __m256i mm256_fmix32(__m256i h) {
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));
        h = _mm256_mullo_epi32(h, mm256_splat(0x85ebca6b));
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 13));
        h = _mm256_mullo_epi32(h, mm256_splat(0xc2b2ae35));
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));

        return h;
    }

    // Rotate every 32 bit lane left by r bits (0 < r < 32).
    template<int r>
    static __m256i mm256_rol32(__m256i x) {
        return _mm256_or_si256(_mm256_slli_epi32(x, r),
                               _mm256_srli_epi32(x, 32 - r));
    }
#endif  // __AVX2__

    // Final avalanche of the reference implementation.
    static uint32_t fmix32(uint32_t h) {
        h ^= h >> 16;
        h *= 0x85ebca6b;
        h ^= h >> 13;
        h *= 0xc2b2ae35;
        h ^= h >> 16;

        return h;
    }

    // Rotate left by r bits (0 < r < 32).
    template<int r>
    static uint32_t rol32(uint32_t x) {
        return (x << r) | (x >> (32 - r));
    }
};

#endif // PARALLEL_MURMUR3_H

// --------------------------------------------------------------------------------
// /src/parallel-xxhash.h:
// --------------------------------------------------------------------------------
// -*- mode: c++; c-basic-offset: 4 indent-tabs-mode: nil -*- */
//
// Copyright 2017 Juho Snellman, released under a MIT license:
//
// Permission is hereby granted, free of charge, to any person obtaining
// a copy of this software and associated documentation files (the
// "Software"), to deal in the Software without restriction, including
// without limitation the rights to use, copy, modify, merge, publish,
| // distribute, sublicense, and/or sell copies of the Software, and to 10 | // permit persons to whom the Software is furnished to do so, subject to 11 | // the following conditions: 12 | // 13 | // The above copyright notice and this permission notice shall be 14 | // included in all copies or substantial portions of the Software. 15 | // 16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 17 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 19 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE 20 | // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 21 | // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION 22 | // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | // 24 | // README 25 | // ------ 26 | // 27 | // Two implementations of the 32 bit version of the XXHash 28 | // hash function. 29 | // 30 | // - parallel: Computing the hash values of 8 keys in parallel, using 31 | // AVX2 intrinsics. (There's also a version with identical semantics 32 | // but plain C++, which the compiler might or might not be able to 33 | // auto-vectorize.) 34 | // - scalar: Computing the hash value of a single key. 35 | // 36 | // These are very special purpose implementations, and will not be 37 | // of any interest in most programs. Use these functions if: 38 | // 39 | // - Your keys have a constant size, and are a multiple of 4 bytes 40 | // long. Variable size keys are not supported. Nor are non-word 41 | // aligned keys. 42 | // - You have an application that can receive and process inputs 43 | // in batches such that you usually have 8 keys to process at once. 44 | // (Doesn't need to be full batches of 8, but the break-even point 45 | // vs. a fast scalar hash is probably around batches of 3-4 keys). 
46 | // - You can arrange for your hash keys to be in a column-major 47 | // order without too much pain. 48 | // - You can compile the program with -mavx2. (While there is a 49 | // non-avx2 fallback, there's not a lot of point to it). 50 | // - A single 32 bit hash value per key is sufficient. 51 | // 52 | // The "scalar" implementation is mainly included as a fallback, for 53 | // programs that generally use the parallel code, but have some 54 | // exceptional cases that need identical hash codes for individual 55 | // keys. Except for very small keys (at most 20 bytes), you are probably 56 | // better off with the reference implementation of some modern 57 | // hash function. 58 | // 59 | // DATA LAYOUT 60 | // ----------- 61 | // 62 | // For the parallel version the keys should be laid out adjacent to 63 | // each other, in column-major order. That is, the first word in 64 | // "keys" should be the first word of the first key. The second word 65 | // of "keys" should be the first word of the second key. And so on: 66 | // 67 | // key1[0] key2[0] ... key7[0] 68 | // key1[1] key2[1] ... key7[1] 69 | // ... 70 | // key1[SizeWords-1] key2[SizeWords-1] ... 
// key7[SizeWords-1]
//
// EXAMPLES
// --------
//
// Assume the following definitions:
//
// static const uint32_t KEY_LENGTH = 3;
//
// static uint32_t rows[][KEY_LENGTH] = {
//     {1, 2, 3},
//     {4, 5, 6},
//     {7, 8, 9},
//     {10, 11, 12},
//     {13, 14, 15},
//     {16, 17, 18},
//     {19, 20, 21},
//     {22, 23, 24},
// };
//
// static uint32_t cols[][8] = {
//     {1, 4, 7, 10, 13, 16, 19, 22},
//     {2, 5, 8, 11, 14, 17, 20, 23},
//     {3, 6, 9, 12, 15, 18, 21, 24},
// };
//
// static uint32_t seeds[] = { 0x3afc8e77, 0x924f408d };
// static const uint32_t seed_count = sizeof(seeds) / sizeof(uint32_t);
//
// The implementations could be used as follows to compute hash values
// for each of the key / seed combinations:
//
// void example_scalar() {
//     for (int s = 0; s < seed_count; ++s) {
//         for (int i = 0; i < 8; ++i) {
//             uint32_t* row = rows[i];
//             uint32_t res = xxhash32<3>::scalar(row, seeds[s]);
//             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
//                    seeds[s], row[0], row[1], row[2], res);
//         }
//     }
// }
//
// void example_parallel() {
//     for (int s = 0; s < seed_count; ++s) {
//         uint32_t res[8];
//         xxhash32<3>::parallel(cols[0], seeds[s], res);
//         for (int i = 0; i < 8; ++i) {
//             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
//                    seeds[s], cols[0][i], cols[1][i], cols[2][i],
//                    res[i]);
//         }
//     }
// }


#ifndef PARALLEL_XXHASH_H
#define PARALLEL_XXHASH_H

#include <cstdint>
#include <cstdio>
#if __AVX2__
#include <immintrin.h>
#endif

// XXH32 specialised for keys of exactly SizeWords 32 bit words. Both
// entry points produce the same hash value for the same key / seed pair.
template<int SizeWords>
struct xxhash32 {
    static_assert(SizeWords >= 0, "SizeWords must not be negative");

#if __AVX2__
    // Compute a hash value for 8 keys of SizeWords*4 bytes each.
    //
    // keys: 8 * SizeWords words in column-major order, i.e. keys[i * 8 + j]
    //       is word i of key j. No alignment is required.
    // seed: the seed shared by all 8 keys.
    // res:  receives the 8 hash values, res[j] belonging to key j.
    static void parallel(const uint32_t* keys, uint32_t seed,
                         uint32_t res[8]) {
        // Number of words consumed by the 4-words-at-a-time main loop.
        const int bulk_words = SizeWords & ~3;
        __m256i h = mm256_splat(seed + PRIME32_5);

        if (SizeWords >= 4) {
            __m256i v1 = mm256_splat(seed + PRIME32_1 + PRIME32_2);
            __m256i v2 = mm256_splat(seed + PRIME32_2);
            __m256i v3 = mm256_splat(seed);
            __m256i v4 = mm256_splat(seed - PRIME32_1);
            for (int i = 0; i < bulk_words; i += 4) {
                v1 = mm256_round(v1, mm256_load_column(keys, i + 0));
                v2 = mm256_round(v2, mm256_load_column(keys, i + 1));
                v3 = mm256_round(v3, mm256_load_column(keys, i + 2));
                v4 = mm256_round(v4, mm256_load_column(keys, i + 3));
            }

            // This has to be an explicit 32 bit lane add. The built-in
            // "+" on __m256i is a 4 x 64 bit addition with GCC / clang,
            // which lets carries spill from the even lanes into the odd
            // ones and makes the results diverge from scalar().
            h = _mm256_add_epi32(
                _mm256_add_epi32(mm256_rol32<1>(v1), mm256_rol32<7>(v2)),
                _mm256_add_epi32(mm256_rol32<12>(v3), mm256_rol32<18>(v4)));
        }

        // Mixing in the length here is pretty silly, since it's always
        // constant. But there's probably some value in producing bitwise
        // identical results to the original xxhash code.
        h = _mm256_add_epi32(h, mm256_splat(SizeWords * 4));

        // The 0-3 words left over after the main loop.
        for (int i = bulk_words; i < SizeWords; ++i) {
            const __m256i v = mm256_load_column(keys, i);
            h = _mm256_add_epi32(h,
                                 _mm256_mullo_epi32(v,
                                                    mm256_splat(PRIME32_3)));
            h = _mm256_mullo_epi32(mm256_rol32<17>(h),
                                   mm256_splat(PRIME32_4));
        }

        _mm256_storeu_si256(reinterpret_cast<__m256i*>(res), mm256_fmix32(h));
    }

#else

#warning "No AVX2 support detected, using a fallback version instead."

    // Plain C++ version with the same semantics as the AVX2 one (same
    // column-major key layout, same results).
    //
    // This will get auto-vectorized perfectly on GCC 6 with
    // -mavx2. It gets auto-vectorized a little bit suboptimally on
    // GCC 4.92, and not at all on clang 3.8. So it's just a bit too
    // fragile to actually use as the main implementation.
    static void parallel(const uint32_t* key, uint32_t seed,
                         uint32_t res[8]) {
        // Number of words consumed by the 4-words-at-a-time main loop.
        const int bulk_words = SizeWords & ~3;
        uint32_t h[8];
        for (int j = 0; j < 8; ++j) {
            h[j] = seed + PRIME32_5;
        }

        if (SizeWords >= 4) {
            uint32_t v1[8];
            uint32_t v2[8];
            uint32_t v3[8];
            uint32_t v4[8];
            for (int j = 0; j < 8; ++j) {
                v1[j] = seed + PRIME32_1 + PRIME32_2;
                v2[j] = seed + PRIME32_2;
                v3[j] = seed + 0;
                v4[j] = seed - PRIME32_1;
            }
            for (int i = 0; i < bulk_words; i += 4) {
                for (int j = 0; j < 8; ++j) {
                    v1[j] = round(v1[j], key[(i + 0) * 8 + j]);
                    v2[j] = round(v2[j], key[(i + 1) * 8 + j]);
                    v3[j] = round(v3[j], key[(i + 2) * 8 + j]);
                    v4[j] = round(v4[j], key[(i + 3) * 8 + j]);
                }
            }

            for (int j = 0; j < 8; ++j) {
                h[j] = rol32<1>(v1[j]) + rol32<7>(v2[j]) +
                       rol32<12>(v3[j]) + rol32<18>(v4[j]);
            }
        }

        for (int j = 0; j < 8; ++j) {
            h[j] += 4 * SizeWords;
        }

        // The 0-3 words left over after the main loop. Word i of key j
        // lives at key[i * 8 + j]; the row index must be scaled as a
        // whole, otherwise the read lands outside the key matrix.
        for (int i = bulk_words; i < SizeWords; ++i) {
            for (int j = 0; j < 8; ++j) {
                h[j] += key[i * 8 + j] * PRIME32_3;
                h[j] = rol32<17>(h[j]) * PRIME32_4;
            }
        }

        for (int j = 0; j < 8; ++j) {
            res[j] = fmix32(h[j]);
        }
    }

#endif // __AVX2__

    // Compute a 32 bit hash value for the key (SizeWords consecutive
    // words). Bitwise identical to the reference XXH32 of the same bytes.
    static uint32_t scalar(const uint32_t* key, uint32_t seed) {
        // Number of words consumed by the 4-words-at-a-time main loop.
        const int bulk_words = SizeWords & ~3;
        uint32_t h = seed + PRIME32_5;

        if (SizeWords >= 4) {
            uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
            uint32_t v2 = seed + PRIME32_2;
            uint32_t v3 = seed + 0;
            uint32_t v4 = seed - PRIME32_1;
            for (int i = 0; i < bulk_words; i += 4) {
                v1 = round(v1, key[i]);
                v2 = round(v2, key[i + 1]);
                v3 = round(v3, key[i + 2]);
                v4 = round(v4, key[i + 3]);
            }

            h = rol32<1>(v1) + rol32<7>(v2) + rol32<12>(v3) + rol32<18>(v4);
        }

        h += 4 * SizeWords;

        // The 0-3 words left over after the main loop.
        for (int i = bulk_words; i < SizeWords; ++i) {
            h += key[i] * PRIME32_3;
            h = rol32<17>(h) * PRIME32_4;
        }

        return fmix32(h);
    }

private:
#if __AVX2__
    // Broadcast a 32 bit value to all 8 lanes.
    static __m256i mm256_splat(uint32_t value) {
        return _mm256_set1_epi32(static_cast<int>(value));
    }

    // Load word number "word" of all 8 keys (one row of the column-major
    // key matrix).
    static __m256i mm256_load_column(const uint32_t* keys, int word) {
        return _mm256_loadu_si256(
            reinterpret_cast<const __m256i*>(keys + word * 8));
    }

    // One accumulator round, 8 lanes at a time.
    static __m256i mm256_round(__m256i seed, __m256i input) {
        seed = _mm256_add_epi32(seed,
                                _mm256_mullo_epi32(input,
                                                   mm256_splat(PRIME32_2)));
        seed = mm256_rol32<13>(seed);
        seed = _mm256_mullo_epi32(seed, mm256_splat(PRIME32_1));
        return seed;
    }

    // Final avalanche, 8 lanes at a time.
    static __m256i mm256_fmix32(__m256i h) {
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 15));
        h = _mm256_mullo_epi32(h, mm256_splat(PRIME32_2));
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 13));
        h = _mm256_mullo_epi32(h, mm256_splat(PRIME32_3));
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));

        return h;
    }

    // Rotate every 32 bit lane left by r bits (0 < r < 32).
    template<int r>
    static __m256i mm256_rol32(__m256i x) {
        return _mm256_or_si256(_mm256_slli_epi32(x, r),
                               _mm256_srli_epi32(x, 32 - r));
    }
#endif // __AVX2__

    // One accumulator round of the main loop.
    static uint32_t round(uint32_t seed, uint32_t input) {
        seed += input * PRIME32_2;
        seed = rol32<13>(seed);
        seed *= PRIME32_1;
        return seed;
    }

    // Final avalanche.
    static uint32_t fmix32(uint32_t h) {
        h ^= h >> 15;
        h *= PRIME32_2;
        h ^= h >> 13;
        h *= PRIME32_3;
        h ^= h >> 16;

        return h;
    }

    // Rotate left by r bits (0 < r < 32).
    template<int r>
    static uint32_t rol32(uint32_t x) {
        return (x << r) | (x >> (32 - r));
    }

    // The XXH32 primes.
    static const uint32_t PRIME32_1 = 2654435761U;
    static const uint32_t PRIME32_2 = 2246822519U;
    static const uint32_t PRIME32_3 = 3266489917U;
    static const uint32_t PRIME32_4 = 668265263U;
    static const uint32_t PRIME32_5 = 374761393U;
};

#endif // PARALLEL_XXHASH_H

// --------------------------------------------------------------------------------