├── CMakeLists.txt
├── LICENSE
├── README.md
└── src
    ├── examples.cc
    ├── hash_benchmark.cc
    ├── parallel-murmur3.h
    └── parallel-xxhash.h


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # The version range keeps old CMake releases working while opting in to
 2 | # current policies; releases that dropped pre-3.5 compatibility accept it.
 3 | cmake_minimum_required(VERSION 2.8...3.28)
 4 | project(parallel_xxhash C CXX)
 5 | 
 6 | set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 7 | set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 8 | set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 9 | 
10 | # Flags shared by the C and C++ compilers; only the language standard differs.
11 | set(common_flags "-O3 -g3 -Wall -Werror -fno-strict-aliasing -Wno-sign-compare -march=native")
12 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -std=c99 ${common_flags}")
13 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 ${common_flags}")
14 | 
15 | # Use an absolute include path. A bare "-I src/third-party/" is resolved
16 | # relative to the compiler's working directory (the build tree), so it only
17 | # worked for in-source builds.
18 | include_directories("${CMAKE_CURRENT_SOURCE_DIR}/src/third-party")
19 | 
20 | # CMAKE_EXE_LINKER_FLAGS is deliberately left alone so that linker flags
21 | # supplied by the user or a toolchain file are not discarded.
22 | 
23 | add_executable(examples
24 |   src/examples.cc)
25 | 
26 | 
27 | enable_testing()
28 | # Usual workaround for the broken test build dependency handling in CMake.
29 | # Every test must depend on this dummy target.
30 | add_test(NAME build_test_code
31 |   COMMAND "${CMAKE_COMMAND}" --build "${CMAKE_BINARY_DIR}" --target all)
32 | 
33 | # define_test(<name>)
34 | #
35 | # Builds src/test/<name>.cc into the executable <name>.testbin and registers
36 | # it as the CTest test <name>. The test is made to depend on build_test_code
37 | # so the binaries are (re)built before any test runs.
38 | function(define_test name)
39 |   add_executable(${name}.testbin
40 |     src/test/${name}.cc)
41 |   # Referring to the target lets CMake substitute the real binary location
42 |   # instead of relying on a path relative to the test working directory.
43 |   add_test(NAME ${name} COMMAND ${name}.testbin)
44 |   set_tests_properties(${name} PROPERTIES DEPENDS
45 |     build_test_code)
46 | endfunction()
28 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Juho Snellman
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | parallel-xxhash
  2 | ---------------
  3 | 
  4 | Two implementations of the [xxHash](http://cyan4973.github.io/xxHash/)
  5 | hash function (specifically, the 32 bit version).
  6 | 
  7 | - `parallel`: Computing the hash values of 8 keys in parallel, using
  8 |   AVX2 intrinsics.
  9 | - `scalar`: Computing the hash value of a single key.
 10 | 
 11 | These are very special purpose implementations, and will not be
 12 | of any interest in most programs. Use these functions if:
 13 | 
 14 | - Your keys have a constant size, and are a multiple of 4 bytes
 15 |   long. Variable size keys are not supported. Nor are non-word
 16 |   aligned keys.
 17 | - You have an application that can receive and process inputs
 18 |   in batches such that you usually have 8 keys to process at once.
 19 |   (Doesn't need to be full batches of 8, but the break-even point
 20 |   vs. a fast scalar hash is probably around batches of 3-4 keys).
 21 | - You can arrange for your hash keys to be in a column-major
 22 |   order without too much pain.
 23 | - You can compile the program with -mavx2. (While there is a
 24 |   non-avx2 fallback, there's not a lot of point to it).
 25 | - A single 32 bit hash value per key is sufficient.
 26 | 
 27 | The `scalar` implementation is mainly included as a fallback, for
 28 | programs that generally use the parallel code, but have some
 29 | exceptional cases that need identical hash codes for individual
 30 | keys. Except for very small keys (at most 20 bytes), you are probably
 31 | better off with the reference implementation of some modern
 32 | hash function.
 33 | 
 34 | DATA LAYOUT
 35 | -----------
 36 | 
 37 | For the parallel version the keys should be laid out adjacent to
 38 | each other, in column-major order. That is, the first word in
 39 | `keys` should be the first word of the first key. The second word
 40 | of `keys` should be the first word of the second key. And so on:
 41 | 
 42 | ```
 43 |   key1[0] key2[0] ... key8[0]
 44 |   key1[1] key2[1] ... key8[1]
 45 |   ...
 46 |   key1[SizeWords-1] key2[SizeWords-1] ... key8[SizeWords-1]
 47 | ```
 48 | 
 49 | EXAMPLES
 50 | --------
 51 | 
 52 | Assume the following definitions:
 53 | 
 54 | ```c++
 55 |    static const uint32_t KEY_LENGTH = 3;
 56 | 
 57 |    static uint32_t rows[][KEY_LENGTH] = {
 58 |       {1, 2, 3},
 59 |       {4, 5, 6},
 60 |       {7, 8, 9},
 61 |       {10, 11, 12},
 62 |       {13, 14, 15},
 63 |       {16, 17, 18},
 64 |       {19, 20, 21},
 65 |       {22, 23, 24},
 66 |   };
 67 | 
 68 |   static uint32_t cols[][8] = {
 69 |       {1, 4, 7, 10, 13, 16, 19, 22},
 70 |       {2, 5, 8, 11, 14, 17, 20, 23},
 71 |       {3, 6, 9, 12, 15, 18, 21, 24},
 72 |   };
 73 | 
 74 |   static uint32_t seeds[] = { 0x3afc8e77, 0x924f408d };
 75 |   static const uint32_t seed_count = sizeof(seeds) / sizeof(uint32_t);
 76 | ```
 77 | 
 78 | The implementations could be used as follows to compute hash values
 79 | for each of the key / seed combinations:
 80 | 
 81 | ```c++
 82 |   void example_scalar() {
 83 |       for (int s = 0; s < seed_count; ++s) {
 84 |           for (int i = 0; i < 8; ++i) {
 85 |               uint32_t* row = rows[i];
 86 |               uint32_t res = xxhash32<3>::scalar(row, seeds[s]);
 87 |               printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 88 |                      seeds[s], row[0], row[1], row[2], res);
 89 |           }
 90 |       }
 91 |   }
 92 | 
 93 |   void example_parallel() {
 94 |       for (int s = 0; s < seed_count; ++s) {
 95 |           uint32_t res[8];
 96 |           xxhash32<3>::parallel(cols[0], seeds[s], res);
 97 |           for (int i = 0; i < 8; ++i) {
 98 |               printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 99 |                      seeds[s], cols[0][i], cols[1][i], cols[2][i],
100 |                      res[i]);
101 |           }
102 |       }
103 |   }
104 | ```
105 | 


--------------------------------------------------------------------------------
/src/examples.cc:
--------------------------------------------------------------------------------
  1 | // -*- mode: c++; c-basic-offset: 4 indent-tabs-mode: nil -*- */
  2 | //
  3 | // Copyright 2017 Juho Snellman, released under a MIT license
  4 | 
  5 | #include "parallel-murmur3.h"
  6 | #include "parallel-xxhash.h"
  7 | 
  8 | #ifdef KEY_LENGTH
  9 | #undef KEY_LENGTH
 10 | #endif
 11 | 
 12 | static const int KEY_LENGTH = 3;  // number of 32-bit words per key
 13 | // The 8 example keys in row-major order: rows[k] is key k.
 14 | static uint32_t rows[][KEY_LENGTH] = {
 15 |     {1, 2, 3},
 16 |     {4, 5, 6},
 17 |     {7, 8, 9},
 18 |     {10, 11, 12},
 19 |     {13, 14, 15},
 20 |     {16, 17, 18},
 21 |     {19, 20, 21},
 22 |     {22, 23, 24},
 23 | };
 24 | // The same 8 keys in column-major order: cols[w][k] == rows[k][w].
 25 | static uint32_t cols[][8] = {
 26 |     {1, 4, 7, 10, 13, 16, 19, 22},
 27 |     {2, 5, 8, 11, 14, 17, 20, 23},
 28 |     {3, 6, 9, 12, 15, 18, 21, 24},
 29 | };
 30 | // Seeds that every example hashes each key with, and how many there are.
 31 | static uint32_t seeds[] = { 0x3afc8e77, 0x924f408d };
 32 | static const uint32_t seed_count = sizeof(seeds) / sizeof(uint32_t);
 33 | 
 34 | // Hashes every row-major key with every seed using the scalar murmur3
 35 | // code and prints one line per (seed, key) combination.
 36 | void example_murmur3_scalar() {
 37 |     for (uint32_t seed : seeds) {
 38 |         for (auto& key : rows) {
 39 |             const uint32_t hash = murmur3<3>::scalar(key, seed);
 40 |             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 41 |                    seed, key[0], key[1], key[2], hash);
 42 |         }
 43 |     }
 44 | }
 44 | 
 45 | // Hashes all 8 column-major keys at once, once per seed, using the
 46 | // parallel murmur3 code, and prints one line per (seed, key) combination.
 47 | void example_murmur3_parallel() {
 48 |     for (uint32_t seed : seeds) {
 49 |         uint32_t hashes[8];
 50 |         murmur3<3>::parallel(cols[0], seed, hashes);
 51 |         for (int lane = 0; lane < 8; ++lane) {
 52 |             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 53 |                    seed, cols[0][lane], cols[1][lane], cols[2][lane],
 54 |                    hashes[lane]);
 55 |         }
 56 |     }
 57 | }
 56 | 
 57 | // Computes the hashes for all seeds in a single parallel_multiseed call,
 58 | // then unpacks each result vector and prints one line per (seed, key).
 59 | void example_murmur3_parallel_multiseed() {
 60 |     __m256i vectors[seed_count];
 61 |     murmur3<3>::parallel_multiseed<seed_count>(cols[0], seeds, vectors);
 62 |     for (uint32_t which = 0; which < seed_count; ++which) {
 63 |         // Copy the 8 lanes out to plain integers for printing.
 64 |         uint32_t lanes[8];
 65 |         _mm256_storeu_si256((__m256i*) lanes, vectors[which]);
 66 |         const uint32_t seed = seeds[which];
 67 |         for (int lane = 0; lane < 8; ++lane) {
 68 |             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 69 |                    seed, cols[0][lane], cols[1][lane], cols[2][lane],
 70 |                    lanes[lane]);
 71 |         }
 72 |     }
 73 | }
 70 | 
 71 | // Hashes every row-major key with every seed using the scalar xxhash32
 72 | // code and prints one line per (seed, key) combination.
 73 | void example_xxhash32_scalar() {
 74 |     for (uint32_t seed : seeds) {
 75 |         for (auto& key : rows) {
 76 |             const uint32_t hash = xxhash32<3>::scalar(key, seed);
 77 |             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 78 |                    seed, key[0], key[1], key[2], hash);
 79 |         }
 80 |     }
 81 | }
 81 | 
 82 | // Hashes all 8 column-major keys at once, once per seed, using the
 83 | // parallel xxhash32 code, and prints one line per (seed, key) combination.
 84 | void example_xxhash32_parallel() {
 85 |     for (uint32_t seed : seeds) {
 86 |         uint32_t hashes[8];
 87 |         xxhash32<3>::parallel(cols[0], seed, hashes);
 88 |         for (int lane = 0; lane < 8; ++lane) {
 89 |             printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
 90 |                    seed, cols[0][lane], cols[1][lane], cols[2][lane],
 91 |                    hashes[lane]);
 92 |         }
 93 |     }
 94 | }
 93 | 
 94 | // Runs every example in turn: the murmur3 variants first, then xxhash32.
 95 | int main (void) {
 96 |     example_murmur3_scalar();
 97 |     example_murmur3_parallel();
 98 |     example_murmur3_parallel_multiseed();
 99 | 
100 |     example_xxhash32_scalar();
101 |     example_xxhash32_parallel();
102 | 
103 |     return 0;
104 | }
101 | 


--------------------------------------------------------------------------------
/src/hash_benchmark.cc:
--------------------------------------------------------------------------------
  1 | #include "parallel-murmur3.h"
  2 | #include "parallel-xxhash.h"
  3 | 
  4 | #include <chrono>
  5 | #include <cstdio>
  6 | extern "C" {
  7 | #include "third-party/MurmurHash3.h"
  8 | }
  9 | #include "third-party/cityhash/city.h"
 10 | #include "third-party/xxhash.h"
 11 | #include "third-party/metrohash64.h"
 12 | 
 13 | #if !defined(KEY_LENGTH)
 14 | #error "Remember to pass in a -DKEY_LENGTH"
 15 | #endif
 16 | 
 17 | // Benchmark adapter for murmur3<KEY_LENGTH>::parallel.
 18 | struct test_parallel {
 19 |     // Hashes the 8 keys in "keys" with seed[0] and stores the 8 results to
 20 |     // "res". Only the first seed is used.
 21 |     __attribute__((noinline))
 22 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
 23 |         const __m256i hashes = hash(keys, seed[0]);
 24 |         _mm256_storeu_si256((__m256i*) res, hashes);
 25 |     }
 26 | 
 27 |     // Returns the 8 hashes as one vector. The seed is declared as uint32_t
 28 |     // (from <cstdint>) rather than the non-standard "uint32", so this no
 29 |     // longer relies on a typedef supplied from outside this file.
 30 |     __attribute__((noinline))
 31 |     __m256i hash(uint32_t* keys, uint32_t seed) {
 32 |         __m256i out;
 33 |         murmur3<KEY_LENGTH>::parallel(keys, seed, (uint32_t*) &out);
 34 |         return out;
 35 |     }
 36 | };
 31 | 
 32 | // Benchmark adapter for murmur3<KEY_LENGTH>::parallel_multiseed using the
 33 | // first SeedCount entries of the seed array.
 34 | template<int SeedCount>
 35 | struct test_parallel_multiseed {
 36 |     // Writes SeedCount * 8 hash values to "res".
 37 |     __attribute__((noinline))
 38 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
 39 |         // parallel_multiseed stores through plain __m256i assignments, which
 40 |         // assume a 32-byte aligned destination. "res" is only guaranteed
 41 |         // uint32_t alignment, so collect the vectors in a properly aligned
 42 |         // local array and copy them out with unaligned stores.
 43 |         __m256i hashes[SeedCount];
 44 |         murmur3<KEY_LENGTH>::parallel_multiseed<SeedCount>(
 45 |             keys, seed, hashes);
 46 |         for (int j = 0; j < SeedCount; ++j) {
 47 |             _mm256_storeu_si256(((__m256i*) res) + j, hashes[j]);
 48 |         }
 49 |     }
 50 | };
 41 | 
 42 | // Benchmark adapter for murmur3<KEY_LENGTH>::scalar.
 43 | struct test_scalar {
 44 |     // Hashes 8 consecutive (row-major) keys of KEY_LENGTH words each with
 45 |     // seed[0], writing one result per key to "res".
 46 |     __attribute__((noinline))
 47 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
 48 |         for (int i = 0; i < 8; ++i) {
 49 |             res[i] = hash(&keys[KEY_LENGTH * i], seed[0]);
 50 |         }
 51 |     }
 52 | 
 53 |     // Hashes a single key. The seed is declared as uint32_t (from
 54 |     // <cstdint>) rather than the non-standard "uint32", so this no longer
 55 |     // relies on a typedef supplied from outside this file.
 56 |     __attribute__((noinline))
 57 |     uint32_t hash(uint32_t* key, uint32_t seed) {
 58 |         return murmur3<KEY_LENGTH>::scalar(key, seed);
 59 |     }
 60 | };
 55 | 
 56 | // Benchmark adapter for xxhash32<KEY_LENGTH>::parallel.
 57 | struct test_parallel_xxhash32 {
 58 |     // Hashes the 8 keys in "keys" with seed[0] and stores the 8 results to
 59 |     // "res". Only the first seed is used.
 60 |     __attribute__((noinline))
 61 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
 62 |         const __m256i hashes = hash(keys, seed[0]);
 63 |         _mm256_storeu_si256((__m256i*) res, hashes);
 64 |     }
 65 | 
 66 |     // Returns the 8 hashes as one vector. The seed is declared as uint32_t
 67 |     // (from <cstdint>) rather than the non-standard "uint32", so this no
 68 |     // longer relies on a typedef supplied from outside this file.
 69 |     __attribute__((noinline))
 70 |     __m256i hash(uint32_t* keys, uint32_t seed) {
 71 |         __m256i out;
 72 |         xxhash32<KEY_LENGTH>::parallel(keys, seed, (uint32_t*) &out);
 73 |         return out;
 74 |     }
 75 | };
 70 | 
 71 | // Benchmark adapter for xxhash32<KEY_LENGTH>::scalar.
 72 | struct test_scalar_xxhash32 {
 73 |     // Hashes 8 consecutive (row-major) keys of KEY_LENGTH words each with
 74 |     // seed[0], writing one result per key to "res".
 75 |     __attribute__((noinline))
 76 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
 77 |         for (int i = 0; i < 8; ++i) {
 78 |             res[i] = hash(&keys[KEY_LENGTH * i], seed[0]);
 79 |         }
 80 |     }
 81 | 
 82 |     // Hashes a single key. The seed is declared as uint32_t (from
 83 |     // <cstdint>) rather than the non-standard "uint32", so this no longer
 84 |     // relies on a typedef supplied from outside this file.
 85 |     __attribute__((noinline))
 86 |     uint32_t hash(uint32_t* key, uint32_t seed) {
 87 |         return xxhash32<KEY_LENGTH>::scalar(key, seed);
 88 |     }
 89 | };
 84 | 
 85 | 
 86 | // Benchmark adapter that calls MurmurHash3_x86_32 once per key.
 87 | struct test_original {
 88 |     // Hashes 8 consecutive keys of 4 * KEY_LENGTH bytes each with seed[0];
 89 |     // the hash of key i is written to res[i].
 90 |     __attribute__((noinline))
 91 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
 92 |         uint32_t* key = keys;
 93 |         for (int i = 0; i < 8; ++i, key += KEY_LENGTH) {
 94 |             MurmurHash3_x86_32(key, 4 * KEY_LENGTH, seed[0], &res[i]);
 95 |         }
 96 |     }
 97 | };
 97 | 
 98 | // Benchmark adapter that calls CityHash64WithSeed once per key.
 99 | struct test_cityhash {
100 |     // Hashes 8 consecutive keys of 4 * KEY_LENGTH bytes each with seed[0].
101 |     // The result is assigned to a uint32_t slot, res[i].
102 |     __attribute__((noinline))
103 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
104 |         uint32_t* key = keys;
105 |         for (int i = 0; i < 8; ++i, key += KEY_LENGTH) {
106 |             const char* bytes = (const char*) key;
107 |             res[i] = CityHash64WithSeed(bytes, 4 * KEY_LENGTH, seed[0]);
108 |         }
109 |     }
110 | };
109 | 
110 | // Benchmark adapter that calls CityHash32 once per key.
111 | struct test_cityhash32 {
112 |     // Hashes 8 consecutive keys of 4 * KEY_LENGTH bytes each. The "seed"
113 |     // argument is not used by this adapter.
114 |     __attribute__((noinline))
115 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
116 |         uint32_t* key = keys;
117 |         for (int i = 0; i < 8; ++i, key += KEY_LENGTH) {
118 |             const char* bytes = (const char*) key;
119 |             res[i] = CityHash32(bytes, 4 * KEY_LENGTH);
120 |         }
121 |     }
122 | };
120 | 
121 | // Benchmark adapter that calls XXH32 once per key.
122 | struct test_xxhash32 {
123 |     // Hashes 8 consecutive keys of 4 * KEY_LENGTH bytes each, writing the
124 |     // hash of key i to res[i].
125 |     __attribute__((noinline))
126 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
127 |         for (int i = 0; i < 8; ++i) {
128 |             // Seed with seed[0] like the other seeded adapters, instead of
129 |             // the loop index, so all implementations are measured with the
130 |             // same seed.
131 |             res[i] = XXH32((const char*)
132 |                            &keys[KEY_LENGTH * i],
133 |                            4 * KEY_LENGTH,
134 |                            seed[0]);
135 |         }
136 |     }
137 | };
132 | 
133 | // Benchmark adapter that calls XXH64 once per key.
134 | struct test_xxhash64 {
135 |     // Hashes 8 consecutive keys of 4 * KEY_LENGTH bytes each. The result is
136 |     // assigned to a uint32_t slot, res[i].
137 |     __attribute__((noinline))
138 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
139 |         for (int i = 0; i < 8; ++i) {
140 |             // Seed with seed[0] like the other seeded adapters, instead of
141 |             // the loop index, so all implementations are measured with the
142 |             // same seed.
143 |             res[i] = XXH64((const char*)
144 |                            &keys[KEY_LENGTH * i],
145 |                            4 * KEY_LENGTH,
146 |                            seed[0]);
147 |         }
148 |     }
149 | };
144 | 
145 | // Benchmark adapter that calls MetroHash64::Hash once per key.
146 | struct test_metrohash64 {
147 |     // Hashes 8 consecutive keys of 4 * KEY_LENGTH bytes each with seed[0].
148 |     // The output for key i is written starting at res[i * 2].
149 |     __attribute__((noinline))
150 |     void run(uint32_t* keys, uint32_t* seed, uint32_t* res) {
151 |         for (int i = 0; i < 8; ++i) {
152 |             const unsigned char* key =
153 |                 (const unsigned char*) &keys[KEY_LENGTH * i];
154 |             uint8_t* out = (uint8_t*) &res[i * 2];
155 |             MetroHash64::Hash(key, 4 * KEY_LENGTH, out, seed[0]);
156 |         }
157 |     }
158 | };
157 | 
158 | // Times n calls of Q::run on "keys" and prints one CSV record:
159 | //   label, key size in bytes, elapsed seconds, bytes hashed,
160 | //   ns per byte, ns per key.
161 | //
162 | // Q must provide run(uint32_t* keys, uint32_t* seed, uint32_t* res).
163 | // WorkFactor scales the byte and key counts for adapters that compute
164 | // several hashes per key in a single run() call. Always returns true.
165 | template<typename Q, int WorkFactor=1>
166 | bool bench(const char* label, uint64_t n, uint32_t* keys) {
167 |     Q tester;
168 |     uint32_t seed[] = {
169 |         0x3afc8e77, 0x924f408d,
170 |         0x8c2a315e, 0x78884cdb,
171 |         0xd2ef9767, 0xee5e590c,
172 |         0x06201e43, 0xb2e4d8df,
173 |     };
174 |     uint32_t res[8 * 8];
175 | 
176 |     // steady_clock is monotonic, so the measurement cannot be distorted by
177 |     // wall-clock adjustments while the loop is running.
178 |     const auto start = std::chrono::steady_clock::now();
179 | 
180 |     // The counter has the same type as n so the loop terminates correctly
181 |     // for any iteration count, not only those that fit in an int.
182 |     for (uint64_t i = 0; i < n; ++i) {
183 |         tester.run(keys, seed, res);
184 |     }
185 | 
186 |     const auto end = std::chrono::steady_clock::now();
187 |     const std::chrono::duration<double> t = end - start;
188 | 
189 |     const uint64_t total_bytes = n * KEY_LENGTH * 4 * 8 * WorkFactor;
190 |     // total_bytes is unsigned 64-bit; print it through an explicit
191 |     // unsigned long long so the format matches on every platform.
192 |     printf("%s,%d,%lf,%llu,%lf,%lf\n",
193 |            label,
194 |            4 * KEY_LENGTH,
195 |            t.count(),
196 |            (unsigned long long) total_bytes,
197 |            t.count() * 1e9 / total_bytes,
198 |            t.count() * 1e9 / (n * 8 * WorkFactor));
199 | 
200 |     return true;
201 | }
189 | 
190 | // Fills both key buffers with the same 8 keys of KEY_LENGTH words each:
191 | // "rows" in row-major order (key after key) and "cols" in column-major
192 | // order (word w of all 8 keys at cols[w * 8 .. w * 8 + 7]).
193 | void init_keys(uint32_t* rows, uint32_t* cols) {
194 |     for (int key = 0; key < 8; ++key) {
195 |         uint32_t* row = rows + key * KEY_LENGTH;
196 |         for (int word = 0; word < KEY_LENGTH; ++word) {
197 |             const uint32_t value = (key + 1) * ((word << 8) + 1);
198 |             row[word] = value;
199 |             cols[word * 8 + key] = value;
200 |         }
201 |     }
202 | }
199 | 
200 | // Runs the enabled benchmarks for the compile-time KEY_LENGTH and prints
201 | // one CSV record per implementation.
202 | int main(void) {
203 |     // NOTE(review): the sizes are element counts, so the sizeof factor
204 |     // makes both buffers 4x larger than init_keys fills. Harmless; kept.
205 |     uint32_t rows[KEY_LENGTH * 8 * sizeof(uint32_t)];
206 |     uint32_t cols[KEY_LENGTH * 8 * sizeof(uint32_t)];
207 | 
208 |     int n = (1 << 27) / KEY_LENGTH;
209 | 
210 |     // The CSV header is printed only by the KEY_LENGTH == 1 build.
211 |     if (KEY_LENGTH == 1) {
212 |         printf("impl,keysize,time,bytes,ns_per_byte,ns_per_key\n");
213 |     }
214 | 
215 |     init_keys(rows, cols);
216 |     // Parallel implementations read the column-major buffer; everything
217 |     // that hashes one key at a time reads the row-major buffer.
218 |     bench<test_parallel>("parallel murmur3", n, cols);
219 |     bench<test_parallel_xxhash32>("parallel xxhash32", n, cols);
220 |     // bench<test_parallel_multiseed<1>, 1>("parallel_multiseed<1>", n,
221 |     //                                      cols);
222 |     // bench<test_parallel_multiseed<2>, 2>("parallel_multiseed<2>", n,
223 |     //                                      cols);
224 |     // bench<test_parallel_multiseed<4>, 4>("parallel_multiseed<4>", n,
225 |     //                                      cols);
226 |     // bench<test_scalar>("scalar murmur3", n, rows);
227 |     // bench<test_original>("original murmur3", n, rows);
228 |     // The scalar adapter indexes keys row-major, so it must be given
229 |     // "rows"; with "cols" it hashed words taken from different keys.
230 |     bench<test_scalar_xxhash32>("scalar xxhash32", n, rows);
231 |     bench<test_cityhash>("cityhash", n, rows);
232 |     // bench<test_cityhash32>("cityhash32", n, rows);
233 |     // bench<test_xxhash32>("xxhash32", n, rows);
234 |     bench<test_xxhash64>("xxhash64", n, rows);
235 |     bench<test_metrohash64>("metrohash64", n, rows);
236 | }
228 | 


--------------------------------------------------------------------------------
/src/parallel-murmur3.h:
--------------------------------------------------------------------------------
  1 | // -*- mode: c++; c-basic-offset: 4 indent-tabs-mode: nil -*- */
  2 | //
  3 | // Copyright 2017 Juho Snellman, released under a MIT license:
  4 | //
  5 | // Permission is hereby granted, free of charge, to any person obtaining
  6 | // a copy of this software and associated documentation files (the
  7 | // "Software"), to deal in the Software without restriction, including
  8 | // without limitation the rights to use, copy, modify, merge, publish,
  9 | // distribute, sublicense, and/or sell copies of the Software, and to
 10 | // permit persons to whom the Software is furnished to do so, subject to
 11 | // the following conditions:
 12 | //
 13 | // The above copyright notice and this permission notice shall be
 14 | // included in all copies or substantial portions of the Software.
 15 | //
 16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 20 | // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 21 | // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 22 | // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 23 | //
 24 | // README
 25 | // ------
 26 | //
 27 | // Three AVX2 implementations of the Murmur3 hash functions.
 28 | //
 29 | // - parallel: Computing the hash values of 8 keys in parallel.
 30 | // - parallel_multiseed: Computing N hash values for each of 8 keys
 31 | //   in parallel. Each of the N hash values for a key will be computed
 32 | //   using a different seed.
 33 | // - scalar: Computing the hash value of a single key.
 34 | //
 35 | // You probably don't want to use any of these, look in
 36 | // parallel-xxhash.h instead. (It is missing parallel_multiseed, since
 37 | // my use case for that disappeared. But it'd be trivial to implement).
 38 | 
 39 | #ifndef PARALLEL_MURMUR3_H
 40 | #define PARALLEL_MURMUR3_H
 41 | 
 42 | #include <cstdint>
 43 | #include <cstdio>
 44 | #include <immintrin.h>
 45 | 
 46 | // 32-bit murmur3 for fixed-size keys of SizeWords 32-bit words.
 47 | //
 48 | // All entry points read their inputs through pointers to const, so keys
 49 | // and seeds held in read-only storage can be hashed without casts.
 50 | template<int SizeWords>
 51 | struct murmur3 {
 52 | 
 53 |     // Compute a hash value for 8 keys of SizeWords*4 bytes each.
 54 |     //
 55 |     // "keys" holds the keys in column-major order: word i of all 8 keys is
 56 |     // at keys[i*8 .. i*8+7]. The 8 hash values are stored to "res"; neither
 57 |     // pointer needs any particular alignment (unaligned load/store).
 58 |     static void parallel(const uint32_t* keys, uint32_t seed,
 59 |                          uint32_t res[8]) {
 60 |         __m256i h = _mm256_set1_epi32(seed);
 61 | 
 62 |         for (int i = 0; i < SizeWords; ++i) {
 63 |             h = mm256_mix_hash(h, mm256_mix_key(mm256_load_word(keys, i)));
 64 |         }
 65 | 
 66 |         // Mixing in the length here is pretty silly, since it's always
 67 |         // constant. But there's probably some value in producing bitwise
 68 |         // identical results to the original murmur3 code.
 69 |         h = _mm256_xor_si256(h, _mm256_set1_epi32(SizeWords * 4));
 70 | 
 71 |         _mm256_storeu_si256((__m256i*) res, mm256_fmix32(h));
 72 |     }
 73 | 
 74 |     // For each of 8 keys, compute N hash values each with a different
 75 |     // starting seed value. The hash values will be written to "res":
 76 |     // res[j] holds the 8 hashes computed with seeds[j].
 77 |     //
 78 |     // "res" is written with plain vector assignments, so it must be
 79 |     // suitably aligned for __m256i.
 80 |     template<int N>
 81 |     static void parallel_multiseed(const uint32_t* keys,
 82 |                                    const uint32_t seeds[N],
 83 |                                    __m256i res[N]) {
 84 |         __m256i h[N];
 85 |         for (int j = 0; j < N; ++j) {
 86 |             h[j] = _mm256_set1_epi32(seeds[j]);
 87 |         }
 88 | 
 89 |         for (int i = 0; i < SizeWords; ++i) {
 90 |             // The key scrambling does not depend on the seed, so it is
 91 |             // done once per word and shared by all N hash states.
 92 |             const __m256i k = mm256_mix_key(mm256_load_word(keys, i));
 93 | 
 94 |             for (int j = 0; j < N; ++j) {
 95 |                 h[j] = mm256_mix_hash(h[j], k);
 96 |             }
 97 |         }
 98 | 
 99 |         for (int j = 0; j < N; ++j) {
100 |             h[j] = _mm256_xor_si256(h[j], _mm256_set1_epi32(SizeWords * 4));
101 |             res[j] = mm256_fmix32(h[j]);
102 |         }
103 |     }
104 | 
105 |     // Compute a hash value for the key, which is SizeWords words long.
106 |     static uint32_t scalar(const uint32_t* key, uint32_t seed) {
107 |         const uint32_t c1s = 0xcc9e2d51;
108 |         const uint32_t c2s = 0x1b873593;
109 |         uint32_t h = seed;
110 | 
111 |         for (int i = 0; i < SizeWords; ++i) {
112 |             uint32_t k = key[i];
113 |             k *= c1s;
114 |             k = rol32<15>(k);
115 |             k *= c2s;
116 | 
117 |             h ^= k;
118 |             h = rol32<13>(h);
119 |             h = h*5 + 0xe6546b64;
120 |         }
121 | 
122 |         // Same length mixing as in the vector code.
123 |         h ^= SizeWords * 4;
124 | 
125 |         return fmix32(h);
126 |     }
127 | 
128 | private:
129 |     // Load word i of all 8 column-major keys (unaligned load).
130 |     static __m256i mm256_load_word(const uint32_t* keys, int i) {
131 |         return _mm256_loadu_si256((const __m256i*) (keys + i * 8));
132 |     }
133 | 
134 |     // Scramble 8 key words: multiply, rotate left by 15, multiply.
135 |     static __m256i mm256_mix_key(__m256i k) {
136 |         k = _mm256_mullo_epi32(k, _mm256_set1_epi32(0xcc9e2d51));
137 |         k = mm256_rol32<15>(k);
138 |         return _mm256_mullo_epi32(k, _mm256_set1_epi32(0x1b873593));
139 |     }
140 | 
141 |     // Fold 8 scrambled key words into 8 hash states:
142 |     // h = rol(h ^ k, 13) * 5 + 0xe6546b64.
143 |     static __m256i mm256_mix_hash(__m256i h, __m256i k) {
144 |         h = _mm256_xor_si256(h, k);
145 |         h = mm256_rol32<13>(h);
146 |         return _mm256_add_epi32(_mm256_mullo_epi32(h,
147 |                                                    _mm256_set1_epi32(5)),
148 |                                 _mm256_set1_epi32(0xe6546b64));
149 |     }
150 | 
151 |     // Final mixing step applied to 8 hash states at once.
152 |     static __m256i mm256_fmix32(__m256i h) {
153 |         h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));
154 |         h = _mm256_mullo_epi32(h, _mm256_set1_epi32(0x85ebca6b));
155 |         h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 13));
156 |         h = _mm256_mullo_epi32(h, _mm256_set1_epi32(0xc2b2ae35));
157 |         h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));
158 | 
159 |         return h;
160 |     }
161 | 
162 |     // Rotate each 32-bit lane left by r bits.
163 |     template<int r>
164 |     static __m256i mm256_rol32(__m256i x) {
165 |         return _mm256_or_si256(_mm256_slli_epi32(x, r),
166 |                                _mm256_srli_epi32(x, 32 - r));
167 |     }
168 | 
169 |     // Final mixing step for a single hash state.
170 |     static uint32_t fmix32(uint32_t h) {
171 |         h ^= h >> 16;
172 |         h *= 0x85ebca6b;
173 |         h ^= h >> 13;
174 |         h *= 0xc2b2ae35;
175 |         h ^= h >> 16;
176 | 
177 |         return h;
178 |     }
179 | 
180 |     // Rotate a 32-bit value left by r bits.
181 |     template<int r>
182 |     static uint32_t rol32(uint32_t x) {
183 |         return (x << r) | (x >> (32 - r));
184 |     }
185 | };
164 | 
165 | #endif // PARALLEL_MURMUR3_H
166 | 


--------------------------------------------------------------------------------
/src/parallel-xxhash.h:
--------------------------------------------------------------------------------
  1 | // -*- mode: c++; c-basic-offset: 4 indent-tabs-mode: nil -*- */
  2 | //
  3 | // Copyright 2017 Juho Snellman, released under a MIT license:
  4 | //
  5 | // Permission is hereby granted, free of charge, to any person obtaining
  6 | // a copy of this software and associated documentation files (the
  7 | // "Software"), to deal in the Software without restriction, including
  8 | // without limitation the rights to use, copy, modify, merge, publish,
  9 | // distribute, sublicense, and/or sell copies of the Software, and to
 10 | // permit persons to whom the Software is furnished to do so, subject to
 11 | // the following conditions:
 12 | //
 13 | // The above copyright notice and this permission notice shall be
 14 | // included in all copies or substantial portions of the Software.
 15 | //
 16 | // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 17 | // EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 18 | // MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 19 | // NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
 20 | // LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 21 | // OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 22 | // WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 23 | //
 24 | // README
 25 | // ------
 26 | //
 27 | // Two implementations of the 32 bit version of the XXHash
 28 | // hash function.
 29 | //
 30 | // - parallel: Computing the hash values of 8 keys in parallel, using
 31 | //   AVX2 intrinsics. (There's also a version with identical semantics
 32 | //   but plain C++, which the compiler might or might not be able to
 33 | //   auto-vectorize.)
 34 | // - scalar: Computing the hash value of a single key.
 35 | //
 36 | // These are very special purpose implementations, and will not be
 37 | // of any interest in most programs. Use these functions if:
 38 | //
 39 | // - Your keys have a constant size, and are a multiple of 4 bytes
 40 | //   long. Variable size keys are not supported. Nor are non-word
 41 | //   aligned keys.
 42 | // - You have an application that can receive and process inputs
 43 | //   in batches such that you usually have 8 keys to process at once.
 44 | //   (Doesn't need to be full batches of 8, but the break-even point
 45 | //   vs. a fast scalar hash is probably around batches of 3-4 keys).
 46 | // - You can arrange for your hash keys to be in a column-major
 47 | //   order without too much pain.
 48 | // - You can compile the program with -mavx2. (While there is a
 49 | //   non-avx2 fallback, there's not a lot of point to it).
 50 | // - A single 32 bit hash value per key is sufficient.
 51 | //
 52 | // The "scalar" implementation is mainly included as a fallback, for
 53 | // programs that generally use the parallel code, but have some
 54 | // exceptional cases that need identical hash codes for individual
 55 | // keys. Except for very small keys (at most 20 bytes), you are probably
 56 | // better off with the reference implementation of some modern
 57 | // hash function.
 58 | //
 59 | // DATA LAYOUT
 60 | // -----------
 61 | //
 62 | // For the parallel version the keys should be laid out adjacent to
 63 | // each other, in column-major order. That is, the first word in
 64 | // "keys" should be the first word of the first key. The second word
 65 | // of "keys" should be the first word of the second key. And so on:
 66 | //
 67 | //   key1[0] key2[0] ... key8[0]
 68 | //   key1[1] key2[1] ... key8[1]
 69 | //   ...
 70 | //   key1[SizeWords-1] key2[SizeWords-1] ... key8[SizeWords-1]
 71 | //
 72 | // EXAMPLES
 73 | // --------
 74 | //
 75 | // Assume the following definitions:
 76 | //
 77 | //   static const uint32_t KEY_LENGTH = 3;
 78 | //
 79 | //    static uint32_t rows[][KEY_LENGTH] = {
 80 | //       {1, 2, 3},
 81 | //       {4, 5, 6},
 82 | //       {7, 8, 9},
 83 | //       {10, 11, 12},
 84 | //       {13, 14, 15},
 85 | //       {16, 17, 18},
 86 | //       {19, 20, 21},
 87 | //       {22, 23, 24},
 88 | //   };
 89 | //
 90 | //   static uint32_t cols[][8] = {
 91 | //       {1, 4, 7, 10, 13, 16, 19, 22},
 92 | //       {2, 5, 8, 11, 14, 17, 20, 23},
 93 | //       {3, 6, 9, 12, 15, 18, 21, 24},
 94 | //   };
 95 | //
 96 | //   static uint32_t seeds[] = { 0x3afc8e77, 0x924f408d };
 97 | //   static const uint32_t seed_count = sizeof(seeds) / sizeof(uint32_t);
 98 | //
 99 | // The implementations could be used as follows to compute hash values
100 | // for each of the key / seed combinations:
101 | //
102 | //   void example_scalar() {
103 | //       for (int s = 0; s < seed_count; ++s) {
104 | //           for (int i = 0; i < 8; ++i) {
105 | //               uint32_t* row = rows[i];
106 | //               uint32_t res = xxhash32<3>::scalar(row, seeds[s]);
107 | //               printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
108 | //                      seeds[s], row[0], row[1], row[2], res);
109 | //           }
110 | //       }
111 | //   }
112 | //
113 | //   void example_parallel() {
114 | //       for (int s = 0; s < seed_count; ++s) {
115 | //           uint32_t res[8];
116 | //           xxhash32<3>::parallel(cols[0], seeds[s], res);
117 | //           for (int i = 0; i < 8; ++i) {
118 | //               printf("seed=%08x, key={%u,%u,%u}, hash=%u\n",
119 | //                      seeds[s], cols[0][i], cols[1][i], cols[2][i],
120 | //                      res[i]);
121 | //           }
122 | //       }
123 | //   }
124 | 
125 | 
126 | #ifndef PARALLEL_XXHASH_H
127 | #define PARALLEL_XXHASH_H
128 | 
129 | #include <cstdint>
130 | #include <cstdio>
131 | #if __AVX2__
132 | #include <immintrin.h>
133 | #endif
134 | 
// xxHash32-style hash for fixed-size keys of SizeWords 32-bit words
// (i.e. SizeWords * 4 bytes per key).
//
// Two entry points are provided:
//   - scalar():   hashes a single key stored as SizeWords consecutive words.
//   - parallel(): hashes 8 keys at once. The keys are stored column-major:
//                 word w of key k lives at keys[w * 8 + k].
//
// For the same key and seed, parallel() and scalar() produce the same
// 32-bit value.
template<int SizeWords>
struct xxhash32 {
    static_assert(SizeWords >= 0, "SizeWords must be non-negative");

#if __AVX2__
    // Compute a hash value for 8 keys of SizeWords*4 bytes each.
    //
    // keys: SizeWords * 8 words in column-major order (see the struct
    //       comment). Unaligned loads are used, so no particular
    //       alignment is required.
    // seed: hash seed, shared by all 8 keys.
    // res:  receives the 8 hash values; res[k] is the hash of key k.
    //       Written with an unaligned store.
    static void parallel(const uint32_t* keys, uint32_t seed,
                         uint32_t res[8]) {
        // Initial state for keys shorter than one 4-word stripe.
        __m256i h = _mm256_set1_epi32(seed + PRIME32_5);

        if (SizeWords >= 4) {
            // Four accumulator lanes per key, each consuming every
            // fourth word of the key.
            __m256i v1 = _mm256_set1_epi32(seed + PRIME32_1 + PRIME32_2);
            __m256i v2 = _mm256_set1_epi32(seed + PRIME32_2);
            __m256i v3 = _mm256_set1_epi32(seed);
            __m256i v4 = _mm256_set1_epi32(seed - PRIME32_1);
            for (int i = 0; i < (SizeWords & ~3); i += 4) {
                __m256i k1 = _mm256_loadu_si256((__m256i*) (keys + (i + 0) * 8));
                __m256i k2 = _mm256_loadu_si256((__m256i*) (keys + (i + 1) * 8));
                __m256i k3 = _mm256_loadu_si256((__m256i*) (keys + (i + 2) * 8));
                __m256i k4 = _mm256_loadu_si256((__m256i*) (keys + (i + 3) * 8));
                v1 = mm256_round(v1, k1);
                v2 = mm256_round(v2, k2);
                v3 = mm256_round(v3, k3);
                v4 = mm256_round(v4, k4);
            }

            // Merge the four accumulators into a single state per key.
            h = mm256_rol32<1>(v1) + mm256_rol32<7>(v2) + mm256_rol32<12>(v3) + mm256_rol32<18>(v4);
        }

        // Mixing in the length here is pretty silly, since it's always
        // constant. But there's probably some value in producing bitwise
        // identical results to the original xxhash code.
        h = _mm256_add_epi32(h, _mm256_set1_epi32(SizeWords * 4));

        // Remaining 0-3 words that did not fill a whole 4-word stripe.
        // i runs over negative offsets from the end of the key.
        for (int i = -(SizeWords & 3); i < 0; ++i) {
            __m256i v = _mm256_loadu_si256((__m256i*) (keys + (SizeWords + i) * 8));
            h = _mm256_add_epi32(h,
                                 _mm256_mullo_epi32(v,
                                                    _mm256_set1_epi32(PRIME32_3)));
            h = _mm256_mullo_epi32(mm256_rol32<17>(h),
                                   _mm256_set1_epi32(PRIME32_4));
        }

        _mm256_storeu_si256((__m256i*) res, mm256_fmix32(h));
    }

#else

    // Portable fallback with the same contract as the AVX2 version:
    // hashes 8 keys stored column-major (word w of key k at
    // key[w * 8 + k]) and writes the 8 results to res.
    //
    // This will get auto-vectorized perfectly on GCC 6 with
    // -mavx2. It gets auto-vectorized a little bit suboptimally on
    // GCC 4.9.2, and not at all on clang 3.8. So it's just a bit too
    // fragile to actually use as the main implementation.
    static void parallel(const uint32_t* key, uint32_t seed,
                         uint32_t res[8]) {
#warning "No AVX2 support detected, using a fallback version instead."
        uint32_t h[8];
        for (int i = 0; i < 8; ++i) {
            h[i] = seed + PRIME32_5;
        }
        if (SizeWords >= 4) {
            uint32_t v1[8];
            uint32_t v2[8];
            uint32_t v3[8];
            uint32_t v4[8];
            for (int i = 0; i < 8; ++i) {
                v1[i] = seed + PRIME32_1 + PRIME32_2;
                v2[i] = seed + PRIME32_2;
                v3[i] = seed + 0;
                v4[i] = seed - PRIME32_1;
            }
            for (int i = 0; i < (SizeWords & ~3); i += 4) {
                for (int j = 0; j < 8; ++j) {
                    v1[j] = round(v1[j], key[(i + 0) * 8 + j]);
                    v2[j] = round(v2[j], key[(i + 1) * 8 + j]);
                    v3[j] = round(v3[j], key[(i + 2) * 8 + j]);
                    v4[j] = round(v4[j], key[(i + 3) * 8 + j]);
                }
            }

            for (int i = 0; i < 8; ++i) {
                h[i] = rol32<1>(v1[i]) + rol32<7>(v2[i]) + rol32<12>(v3[i]) + rol32<18>(v4[i]);
            }
        }

        for (int i = 0; i < 8; ++i) {
            h[i] += 4 * SizeWords;
        }

        // Remaining 0-3 words. i is a negative offset from the end of
        // the key, so the word index (SizeWords + i) must be computed
        // before scaling by the 8-key stride; otherwise the index goes
        // negative and reads out of bounds.
        for (int i = -(SizeWords & 3); i < 0; ++i) {
            for (int j = 0; j < 8; ++j) {
                h[j] += key[(SizeWords + i) * 8 + j] * PRIME32_3;
                h[j] = rol32<17>(h[j]) * PRIME32_4;
            }
        }

        for (int i = 0; i < 8; ++i) {
            res[i] = fmix32(h[i]);
        }
    }

#endif // __AVX2__

    // Compute a 32 bit hash value for the key.
    //
    // key:  SizeWords consecutive 32-bit words (not read when
    //       SizeWords is 0).
    // seed: hash seed.
    // Returns the 32-bit hash of the key.
    static uint32_t scalar(uint32_t* key, uint32_t seed) {
        uint32_t h = seed + PRIME32_5;

        if (SizeWords >= 4) {
            uint32_t v1 = seed + PRIME32_1 + PRIME32_2;
            uint32_t v2 = seed + PRIME32_2;
            uint32_t v3 = seed + 0;
            uint32_t v4 = seed - PRIME32_1;
            for (int i = 0; i < (SizeWords & ~3); i += 4) {
                v1 = round(v1, key[i]);
                v2 = round(v2, key[i + 1]);
                v3 = round(v3, key[i + 2]);
                v4 = round(v4, key[i + 3]);
            }

            h = rol32<1>(v1) + rol32<7>(v2) + rol32<12>(v3) + rol32<18>(v4);
        }

        // Key length in bytes.
        h += 4 * SizeWords;

        // Remaining 0-3 words, addressed by negative offset from the end.
        for (int i = -(SizeWords & 3); i < 0; ++i) {
            h += key[SizeWords + i] * PRIME32_3;
            h = rol32<17>(h) * PRIME32_4;
        }

        return fmix32(h);
    }

private:
#if __AVX2__
    // 8-lane version of round(): one accumulator step per lane.
    static __m256i mm256_round(__m256i seed, __m256i input) {
        seed = _mm256_add_epi32(seed,
                                _mm256_mullo_epi32(input,
                                                   _mm256_set1_epi32(PRIME32_2)));
        seed = mm256_rol32<13>(seed);
        seed = _mm256_mullo_epi32(seed,
                                  _mm256_set1_epi32(PRIME32_1));
        return seed;
    }

    // 8-lane version of fmix32(): final mixing of each lane.
    static __m256i mm256_fmix32(__m256i h) {
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 15));
        h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_2));
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 13));
        h = _mm256_mullo_epi32(h, _mm256_set1_epi32(PRIME32_3));
        h = _mm256_xor_si256(h, _mm256_srli_epi32(h, 16));

        return h;
    }

    // Rotate each 32-bit lane of x left by r bits.
    template<int r>
    static __m256i mm256_rol32(__m256i x) {
        static_assert(r > 0 && r < 32, "rotation count must be in [1, 31]");
        return _mm256_or_si256(_mm256_slli_epi32(x, r),
                               _mm256_srli_epi32(x, 32 - r));
    }
#endif // __AVX2__

    // One accumulator step: fold a key word into the running state.
    static uint32_t round(uint32_t seed, uint32_t input) {
        seed += input * PRIME32_2;
        seed = rol32<13>(seed);
        seed *= PRIME32_1;
        return seed;
    }

    // Final mixing (xor-shift / multiply) applied to the state before
    // it is returned as the hash.
    static uint32_t fmix32(uint32_t h) {
        h ^= h >> 15;
        h *= PRIME32_2;
        h ^= h >> 13;
        h *= PRIME32_3;
        h ^= h >> 16;

        return h;
    }

    // Rotate x left by r bits. r == 0 is rejected because the
    // complementary shift by 32 would be undefined behaviour.
    template<int r>
    static uint32_t rol32(uint32_t x) {
        static_assert(r > 0 && r < 32, "rotation count must be in [1, 31]");
        return (x << r) | (x >> (32 - r));
    }

    // Multiplicative constants used by the rounds and the final mix.
    static const uint32_t PRIME32_1 = 2654435761U;
    static const uint32_t PRIME32_2 = 2246822519U;
    static const uint32_t PRIME32_3 = 3266489917U;
    static const uint32_t PRIME32_4 =  668265263U;
    static const uint32_t PRIME32_5 =  374761393U;
};
322 | 
323 | #endif // PARALLEL_XXHASH_H
324 | 


--------------------------------------------------------------------------------