├── .gitattributes
├── khashv32-seed-6bb75f13.png
├── khashv64-seed-1dcedff1a8b17e89.png
├── .gitignore
├── LICENSE
├── k-hashv-old
│   ├── README_v1.md
│   ├── test_speed_v1.c
│   └── khashv_v1.h
├── test_speed.c
├── README.md
└── khashv.h
/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /khashv32-seed-6bb75f13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Keith-Cancel/k-hashv/HEAD/khashv32-seed-6bb75f13.png -------------------------------------------------------------------------------- /khashv64-seed-1dcedff1a8b17e89.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Keith-Cancel/k-hashv/HEAD/khashv64-seed-1dcedff1a8b17e89.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Keith-Cancel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /k-hashv-old/README_v1.md: -------------------------------------------------------------------------------- 1 | # K-HASHV 2 | A single header hash function with both vectorized and scalar versions.
The function is quite fast when vectorized, achieving an average of **~9.6 GB/s** on a 7-year-old Xeon E3-1230 v5. 3 | 4 | Additionally, it passes all the SMHasher hash function quality tests: https://github.com/rurban/smhasher 5 | 6 | Moreover, it is quite easy to choose a new function at runtime by just using a new seed, as shown below: 7 | ```C 8 | #include "khashv.h" 9 | 10 | void foo() { 11 | /* 12 | code .... 13 | */ 14 | khashvSeed seed; 15 | khashv_prep_seed64(&seed, a_64_bit_value); 16 | uint64_t hash = khashv64(&seed, your_data, data_len); 17 | /* 18 | code .... 19 | */ 20 | } 21 | ``` 22 | ### Note 23 | This is **not a cryptographic hash function**, and it should not be used for such applications. 24 | 25 | ## Performance 26 | When testing on 1.25 GB and 512 KB of random data I get the following averages: 27 |
| Processor | 1.25 GB Time | 1.25 GB Speed | 512 KB Time | 512 KB Speed | OS | Compiler | Type |
|---|---|---|---|---|---|---|---|
| Xeon E3-1230 v5 | 0.1298 s | 9.6285 GB/s | 052.5107 us | 9.2987 GB/s | Linux | GCC 12.1.0 | Vectorized |
| Xeon E3-1230 v5 | 1.1911 s | 1.0495 GB/s | 494.1932 us | 0.9880 GB/s | Linux | GCC 12.1.0 | Scalar |
| Xeon E3-1230 v5 | 0.1418 s | 8.8142 GB/s | 055.9333 us | 8.7297 GB/s | Linux | Clang 14.0.6 | Vectorized |
| Ryzen 9 7900 | 0.1227 s | 10.1881 GB/s | 046.0273 us | 10.6085 GB/s | Linux | GCC 12.2.1 | Vectorized |
| Ryzen 9 7900 | 0.8693 s | 1.4379 GB/s | 375.0820 us | 1.3018 GB/s | Linux | GCC 12.2.1 | Scalar |
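*(Sanity check: the speed columns are simply block size divided by time, e.g. the first row gives 1.25 GB / 0.1298 s ≈ 9.63 GB/s; the small difference from the printed 9.6285 GB/s comes from rounding of the displayed time.)*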
37 | 38 | The scalar version is slower at a tad over ~1 GB/s on my system when compiling test_speed.c with gcc using `-O3`. 39 | On Windows, Microsoft's compiler does not seem to generate as performant code from the intrinsics, but the GCC mingw64 compiler generates pretty comparable numbers for me at least. 40 | 41 | I definitely want to add other machines to this table, but if you are curious how it performs on your machine, compile test_speed.c with `-O3 -march=native` and `-O3 -march=native -D KHASHV_SCALAR`. 42 | 43 | ## Functions 44 | ```C 45 | // Prepares a seed from a 32-bit value 46 | void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) 47 | ``` 48 | 49 | ```C 50 | // Prepares a seed from a 64-bit value 51 | void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) 52 | ``` 53 | 54 | ```C 55 | // Sets 128 bits to be the seed 56 | void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) 57 | ``` 58 | 59 | ```C 60 | // Produces a 32-bit hash from the input data 61 | uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) 62 | ``` 63 | 64 | ```C 65 | // Produces a 64-bit hash from the input data 66 | uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) 67 | ``` 68 | 69 | ## K-HASHV 64 Output 70 | Here is the output of the 64-bit hash of the integers \[0, 259199\] using 0x1dcedff1a8b17e89 as the seed. 71 | 72 | ![khashv64 output](khashv64-seed-1dcedff1a8b17e89.png) 73 | 74 | ## K-HASHV 32 Output 75 | 76 | Here is the output of the 32-bit hash of the integers \[0, 518399\] using 0x6bb75f13 as the seed. 77 | 78 | ![khashv32 output](khashv32-seed-6bb75f13.png) 79 | 80 | The above images were generated by basically doing the following for each hash: 81 | 82 | ```C 83 | for(int i = 0; i < sizeof(hash_bytes); i++) { 84 | pixel[img_offset + i].r = hash_bytes[i]; 85 | pixel[img_offset + i].g = hash_bytes[i]; 86 | pixel[img_offset + i].b = hash_bytes[i]; 87 | pixel[img_offset + i].a = 255; 88 | } 89 | ``` 90 | 91 | ## Things TODO 92 | When thinking about ways to improve the code and hash function, these are the first few things that come to mind for me. 93 | 94 | 1. The main thing would be to try to get both Clang and MSVC to output code that runs as fast as GCC's, or as close as possible. Looking at the generated assembly, they both seem to do some silly things compared to GCC and lose some performance. Microsoft's compiler is the worst, and probably the fastest fix for me to implement would be to write some assembly code. However, it then would no longer be a single header file hash function, since MSVC does not support inline assembly for 64-bit builds and thus would require a separate file. 95 | 96 | 2. Then probably consider using intrinsics for some other systems like ARM NEON, but for now there is scalar code and code written using GCC's vector built-ins that will generate vectorized code for other architectures that GCC supports. 97 | 98 | 3. Probably the next thing I could think of is to choose better values for S1 and S2, which are used to basically substitute bytes. The current values were found by randomly checking a small set of criteria. Mainly focusing on each bit of S1 and S2 as columns, then XOR-ing them, effectively creating a boolean function of an 8-bit input, and making sure the entire thing maps each input to a unique value. There likely are better values that could be chosen, and criteria that look at all bits at once. However, the search space is huge, effectively 2^(2\*8\*16) possible permutations for S1 and S2.
However, the current values do seem to work well from my testing. 99 | 100 | ### Suggestions 101 | I am open to any other suggestions or improvements. -------------------------------------------------------------------------------- /k-hashv-old/test_speed_v1.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <float.h> 5 | #include "khashv.h" 6 | 7 | #if defined(__MINGW32__) || defined(_WIN32) 8 | #include <windows.h> 9 | 10 | #define get_timer(x) QueryPerformanceCounter(&x) 11 | 12 | typedef LARGE_INTEGER timer; 13 | 14 | uint64_t time_ns(timer* start, timer* stop) { 15 | LARGE_INTEGER freq; 16 | if(!QueryPerformanceFrequency(&freq)) { 17 | return UINT64_MAX; 18 | } 19 | double ns = stop->QuadPart - start->QuadPart; 20 | double ratio = 1000000000.0; // 1 billion ns = 1 second 21 | ratio /= (double)freq.QuadPart; 22 | ns *= ratio; 23 | return (uint64_t)ns; 24 | } 25 | 26 | #else 27 | #include <time.h> 28 | #define get_timer(x) clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &x) 29 | typedef struct timespec timer; 30 | 31 | uint64_t time_ns(timer* start, timer* stop) { 32 | int secs = stop->tv_sec - start->tv_sec; 33 | if(secs > 0) { 34 | uint64_t t0_ns = start->tv_sec * 1000000000; 35 | uint64_t t1_ns = stop->tv_sec * 1000000000; 36 | t0_ns += start->tv_nsec; 37 | t1_ns += stop->tv_nsec; 38 | return t1_ns - t0_ns; 39 | } 40 | return stop->tv_nsec - start->tv_nsec; 41 | } 42 | #endif 43 | 44 | #define MB_TO_BYTES(x) (1024ULL * 1024ULL * (x)) 45 | 46 | double get_gbs(double t_ns, double gigs) { 47 | t_ns /= 1000000000; 48 | return gigs / t_ns; 49 | } 50 | 51 | double get_secs(double t_ns) { 52 | return t_ns / 1000000000.0; 53 | } 54 | 55 | double get_usecs(double t_ns) { 56 | return t_ns / 1000.0; 57 | } 58 | 59 | void populate_memory(const khashvSeed* seed, uint8_t* bytes, size_t size) { 60 | printf("Populating Memory: "); 61 | // Use the hash to populate memory with pseudo random bytes 62 | uint64_t state[2] = { 0x4d9ef2f9a304588a, 0x58ca10a39947b63b }; 63 | for(size_t i = 0; i < size; i += sizeof(uint64_t)) { 64 | if(i != 0 && (i & 0x1ffffff) == 0) { 65 | printf("."); 66 | fflush(stdout); 67 | } 68 | state[0] = khashv64(seed, (uint8_t*)state, sizeof(uint64_t) * 2); 69 | memcpy(bytes + i, state, sizeof(uint64_t)); 70 | } 71 | puts(" Populated!"); 72 | } 73 | 74 | int gig_tests(khashvSeed seed) { 75 | size_t size = MB_TO_BYTES(1280); 76 | uint8_t* bytes = malloc(size); 77 | if(bytes == NULL) { 78 | fprintf(stderr, "Cannot allocate memory for test!\n"); 79 | return 1; 80 | } 81 | populate_memory(&seed, bytes, size); 82 | 83 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 84 | double sum = 0; 85 | double fastest = DBL_MAX; 86 | 87 | const uint32_t hashes[12] = { 88 | 0xa9ca46b1, 0x8c9f5264, 0x2094ffd9, 0x93946e70, 0x9b71dd71, 89 | 0x2abeec74, 0x6bca7368, 0x151fff30, 0xc4228495, 0xfad35669, 90 | 0x9f151590, 0x20a4045b 91 | }; 92 | 93 | printf("Tests on %.3lf GB block: ", gigs); 94 | fflush(stdout); 95 | for(unsigned i = 0; i < 12; i++) { 96 | timer t0; 97 | timer t1; 98 | get_timer(t0); 99 | uint32_t h = khashv32(&seed, bytes, size); 100 | get_timer(t1); 101 | 102 | if(h != hashes[i]) { 103 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 104 | } 105 | 106 | double t = time_ns(&t0, &t1); 107 | if(t < fastest) { 108 | fastest = t; 109 | } 110 | sum += t; 111 | bytes[i] += 1; 112 | } 113 | 114 | double avg = sum / 12; 115 | printf( 116 | "Avg: %lf GB/s, Avg Time: %lf s, Fastest: %lf GB/s\n", 117 | get_gbs (avg, gigs), 118 | get_secs(avg), 119 | get_gbs (fastest, gigs) 120 | ); 121 | fflush(stdout); 122 | free(bytes); 123 | return 0; 124 | }
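/* Both benchmark functions here follow the same pattern: each timed pass hashes the entire buffer once, the result is compared against a precomputed expected hash so a broken or miscompiled build is caught immediately, and bytes[i] is then incremented so every pass hashes slightly different data. */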
125 | 126 | int half_mb_tests(khashvSeed seed) { 127 | size_t size = 1024 * 512; 128 | uint8_t* bytes = malloc(size); 129 | if(bytes == NULL) { 130 | fprintf(stderr, "Cannot allocate memory for test!\n"); 131 | return 1; 132 | } 133 | populate_memory(&seed, bytes, size); 134 | 135 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 136 | double sum = 0; 137 | double fastest = DBL_MAX; 138 | unsigned count = 96; 139 | const uint32_t hashes[96] = { 140 | 0x249b844b, 0x852e481c, 0xf7ce4779, 0x5b1e79c0, 0xc6280b69, 141 | 0x18aaed1f, 0x360a7b70, 0x6691373f, 0x62b0e7d2, 0x503f2a13, 142 | 0x55784198, 0x0449e145, 0xc1fec259, 0xfdde4bcc, 0x3d040585, 143 | 0x2d54b62c, 0x70f06c7e, 0xcc7a642f, 0xe784348b, 0xe360bb8a, 144 | 0xd4653bab, 0x129aac4f, 0xdf09ac90, 0xc770d23f, 0x1865b60c, 145 | 0x366d8ca9, 0x80b13f6f, 0x7317d810, 0x7816b809, 0x919adedb, 146 | 0x92713259, 0xb15e9216, 0x4cca4cd2, 0xb0bda9b9, 0xa3eb6a63, 147 | 0x1801f592, 0x7f6ebdfe, 0xcfd5f33c, 0x000c7082, 0x17265e0b, 148 | 0x6ba10359, 0x8c74f4eb, 0x803f3c08, 0x4ba6860d, 0x0716f9fb, 149 | 0x6e3c84ae, 0xe77a48f4, 0xc2374c75, 0x97f403ee, 0x3010b84b, 150 | 0x560ba778, 0x83103235, 0xfd4adabf, 0xa436bcf0, 0xaa8f96dc, 151 | 0x29922bec, 0xd5468b54, 0x4b1921b8, 0x2a8ce2d5, 0x86e336f4, 152 | 0x5fab2354, 0x0e07c225, 0xb181782a, 0xe799459f, 0xcf9541fd, 153 | 0xcd510976, 0xe70010ea, 0x6202cb22, 0x7d253b79, 0x4d047b53, 154 | 0xbd26b2ba, 0xc1df8a17, 0x48a6ed87, 0xa980b22c, 0x16b27278, 155 | 0xb5736e7c, 0x368bd0b9, 0xeee76414, 0xfe58e49d, 0xf3500e6d, 156 | 0xb57df9f5, 0xb52a7ed6, 0xaca79612, 0xccc9f98a, 0xa7140bd0, 157 | 0x7e45d2f9, 0xb91ddced, 0x9444f706, 0xa477bfb2, 0xcf7e1d5b, 158 | 0xd95eab3c, 0x737fa6e5, 0x5f548e79, 0x46539426, 0xef41aa94, 159 | 0xc0357213 160 | }; 161 | 162 | printf("Tests on 512 KB block: "); 163 | fflush(stdout); 164 | for(unsigned i = 0; i < count; i++) { 165 | timer t0; 166 | timer t1; 167 | get_timer(t0); 168 | uint32_t h = khashv32(&seed, bytes, size); 169 | get_timer(t1); 170 | 171 | if(h != hashes[i]) { 172 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 173 | } 174 | 175 | double t = time_ns(&t0, &t1); 176 | if(t < fastest) { 177 | fastest = t; 178 | } 179 | sum += t; 180 | bytes[i] += 1; 181 | } 182 | 183 | double avg = sum / count; 184 | printf( 185 | "Avg: %lf GB/s, Avg Time: %lf us, Fastest: %lf GB/s\n", 186 | get_gbs (avg, gigs), 187 | get_usecs(avg), 188 | get_gbs (fastest, gigs) 189 | ); 190 | fflush(stdout); 191 | free(bytes); 192 | return 0; 193 | } 194 | 195 | int main(int argc, char** argv) { 196 | khashvSeed seed; 197 | khashv_prep_seed64(&seed, 0xa9c163c960d480fb); 198 | 199 | if(gig_tests(seed)) { 200 | return 1; 201 | } 202 | if(half_mb_tests(seed)) { 203 | return 1; 204 | } 205 | return 0; 206 | } -------------------------------------------------------------------------------- /test_speed.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <float.h> 5 | #include <math.h> 6 | #include "khashv.h" 7 | 8 | #if defined(__MINGW32__) || defined(_WIN32) 9 | #include <windows.h> 10 | 11 | #define get_timer(x) QueryPerformanceCounter(&x) 12 | 13 | typedef LARGE_INTEGER timer; 14 | 15 | uint64_t time_ns(timer* start, timer* stop) { 16 | LARGE_INTEGER freq; 17 | if(!QueryPerformanceFrequency(&freq)) { 18 | return UINT64_MAX; 19 | } 20 | double ns = stop->QuadPart - start->QuadPart; 21 | double ratio = 1000000000.0;
// 1 billion ns = 1 second 22 | ratio /= (double)freq.QuadPart; 23 | ns *= ratio; 24 | return (uint64_t)ns; 25 | } 26 | 27 | #else 28 | #include <time.h> 29 | #define get_timer(x) clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &x) 30 | typedef struct timespec timer; 31 | 32 | uint64_t time_ns(timer* start, timer* stop) { 33 | int secs = stop->tv_sec - start->tv_sec; 34 | if(secs > 0) { 35 | uint64_t t0_ns = start->tv_sec * 1000000000; 36 | uint64_t t1_ns = stop->tv_sec * 1000000000; 37 | t0_ns += start->tv_nsec; 38 | t1_ns += stop->tv_nsec; 39 | return t1_ns - t0_ns; 40 | } 41 | return stop->tv_nsec - start->tv_nsec; 42 | } 43 | #endif 44 | 45 | #define MB_TO_BYTES(x) (1024ULL * 1024ULL * (x)) 46 | 47 | double get_gbs(double t_ns, double gigs) { 48 | t_ns /= 1000000000; 49 | return gigs / t_ns; 50 | } 51 | 52 | double get_secs(double t_ns) { 53 | return t_ns / 1000000000.0; 54 | } 55 | 56 | double get_usecs(double t_ns) { 57 | return t_ns / 1000.0; 58 | } 59 | 60 | void populate_memory(const khashvSeed* seed, uint8_t* bytes, size_t size) { 61 | printf("Populating Memory: "); 62 | // Use the hash to populate memory with pseudo random bytes 63 | uint64_t state[2] = { 0x4d9ef2f9a304588a, 0x58ca10a39947b63b }; 64 | for(size_t i = 0; i < size; i += sizeof(uint64_t)) { 65 | if(i != 0 && (i & 0x1ffffff) == 0) { 66 | printf("."); 67 | fflush(stdout); 68 | } 69 | state[0] = khashv64(seed, (uint8_t*)state, sizeof(uint64_t) * 2); 70 | memcpy(bytes + i, state, sizeof(uint64_t)); 71 | } 72 | puts(" Populated!"); 73 | } 74 | 75 | int gig_tests(khashvSeed seed) { 76 | size_t size = MB_TO_BYTES(1280); 77 | uint8_t* bytes = malloc(size); 78 | if(bytes == NULL) { 79 | fprintf(stderr, "Cannot allocate memory for test!\n"); 80 | return 1; 81 | } 82 | populate_memory(&seed, bytes, size); 83 | 84 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 85 | double sum = 0; 86 | double fastest = DBL_MAX; 87 | 88 | const uint32_t hashes[12] = { 89 | 0x8b4c1a33, 0x485105dc, 0xaf1deb0e, 0x2d4a890c, 0x8349b700, 90 | 0x29a3b3b9, 0xf1ed93ef, 0x8559b73f, 0x11452eff, 0xefa5fe1f, 91 | 0x5834c363, 0xeb7224a5 92 | }; 93 | 94 | printf("Tests on %.3lf GB block: ", gigs); 95 | fflush(stdout); 96 | for(unsigned i = 0; i < 12; i++) { 97 | timer t0; 98 | timer t1; 99 | get_timer(t0); 100 | uint32_t h = khashv32(&seed, bytes, size); 101 | get_timer(t1); 102 | 103 | if(h != hashes[i]) { 104 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 105 | } 106 | 107 | double t = time_ns(&t0, &t1); 108 | if(t < fastest) { 109 | fastest = t; 110 | } 111 | sum += t; 112 | bytes[i] += 1; 113 | } 114 | 115 | double avg = sum / 12; 116 | double gbs = get_gbs (avg, gigs); 117 | double sec = get_secs(avg); 118 | double fgbs = get_gbs(fastest, gigs); 119 | gbs = round(gbs * 10000.0) / 10000.0; 120 | sec = round(sec * 10000.0) / 10000.0; 121 | fgbs = round(fgbs * 10000.0) / 10000.0; 122 | printf( 123 | "Avg: %.4lf GB/s, Avg Time: %.4lf s, Fastest: %.4lf GB/s\n", 124 | gbs, 125 | sec, 126 | fgbs 127 | ); 128 | fflush(stdout); 129 | free(bytes); 130 | return 0; 131 | } 132 | 133 | int half_mb_tests(khashvSeed seed) { 134 | size_t size = 1024 * 512; 135 | uint8_t* bytes = malloc(size); 136 | if(bytes == NULL) { 137 | fprintf(stderr, "Cannot allocate memory for test!\n"); 138 | return 1; 139 | } 140 | populate_memory(&seed, bytes, size); 141 | 142 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 143 | double sum = 0; 144 | double fastest = DBL_MAX; 145 | unsigned count = 96; 146 | const uint32_t hashes[96] = { 147 | 0x3b181e13, 0x6df3efe4,
0xa1472e2f, 0xe7fe7261, 0x85db611b, 148 | 0x95b68b46, 0xa4738539, 0xc67cd2b3, 0x4630444d, 0xb357f7a3, 149 | 0x60ba4613, 0x20d50be8, 0x5908392d, 0xd5c1411e, 0xa315f311, 150 | 0xe92b8d4a, 0x3504718c, 0x78d5d987, 0xac324986, 0xa9c146a3, 151 | 0xea4120ac, 0x1ab20115, 0xb4cf0fc0, 0x3726e7c6, 0x781b19b4, 152 | 0x897a635f, 0x49c879a6, 0x414f698e, 0xef3c3c66, 0x668de11e, 153 | 0xf6f2af8d, 0x6db89e5f, 0xa2621047, 0x26736838, 0xca8539cf, 154 | 0xe1e92796, 0xbd178553, 0x31aedc2d, 0x41f4377f, 0x0683f7a2, 155 | 0xff1d7f6f, 0x4a788c33, 0xb4823086, 0xf3b45106, 0xf2e12a97, 156 | 0x1505b0e8, 0x32d16f9d, 0xa4ccbd11, 0x61f6aa54, 0x8dc4eb8d, 157 | 0xe7ac77ca, 0xb00dd338, 0x9330ce85, 0xae721ca9, 0x236eb8a2, 158 | 0xcd7aba61, 0x2fbd751e, 0x978edc2c, 0x09ef6175, 0x78d12480, 159 | 0x08b21322, 0x02826493, 0x36244a76, 0xb7e1489c, 0x365c631f, 160 | 0x08188ea8, 0x92bd6910, 0xa7cf34d0, 0x9b91a005, 0x8c7cfc38, 161 | 0xf732ae18, 0x87f2f485, 0xa42d236d, 0x967880e3, 0xf04cb79d, 162 | 0xfd9d613f, 0xfa7ae694, 0xfb680e60, 0x2de7c7c9, 0xa5979af7, 163 | 0x6b24f6a3, 0xfebb25de, 0x3163a706, 0x7d8d0a35, 0xb5cacfcf, 164 | 0xdf774e72, 0xd06db96e, 0x16d7e8db, 0xf1e368e7, 0x21efe8d5, 165 | 0x59d6f29f, 0xb0ee28bc, 0x849b575e, 0x96887453, 0x2eabdd1f, 166 | 0x3cdc8fa8 167 | }; 168 | 169 | printf("Tests on 512 KB block: "); 170 | fflush(stdout); 171 | for(unsigned i = 0; i < count; i++) { 172 | timer t0; 173 | timer t1; 174 | get_timer(t0); 175 | uint32_t h = khashv32(&seed, bytes, size); 176 | get_timer(t1); 177 | 178 | if(h != hashes[i]) { 179 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 180 | } 181 | 182 | double t = time_ns(&t0, &t1); 183 | if(t < fastest) { 184 | fastest = t; 185 | } 186 | sum += t; 187 | bytes[i] += 1; 188 | } 189 | 190 | double avg = sum / count; 191 | double gbs = get_gbs (avg, gigs); 192 | double usec = get_usecs(avg); 193 | double fgbs = get_gbs (fastest, gigs); 194 | gbs = round(gbs * 10000.0) / 10000.0; 195 | usec = round(usec * 10000.0) / 10000.0; 196 | fgbs = round(fgbs * 10000.0) / 10000.0; 197 | printf( 198 | "Avg: %.4lf GB/s, Avg Time: %.4lf us, Fastest: %.4lf GB/s\n", 199 | gbs, 200 | usec, 201 | fgbs 202 | ); 203 | fflush(stdout); 204 | free(bytes); 205 | return 0; 206 | } 207 | 208 | int main(int argc, char** argv) { 209 | khashvSeed seed; 210 | khashv_prep_seed64(&seed, 0xa9c163c960d480fb); 211 | 212 | if(gig_tests(seed)) { 213 | return 1; 214 | } 215 | if(half_mb_tests(seed)) { 216 | return 1; 217 | } 218 | return 0; 219 | } 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # K-HASHV 🔨 2 | A single header hash function with both vectorized and scalar versions. The function is quite fast when vectorized, achieving an average of **~10.2 GB/s** on a 9-year-old (as of 2024) Xeon E3-1230 v5. The header contains explicit intrinsics for x86_64, has a version that uses GCC's portable vector built-ins, and falls back to a scalar version for portability. The results of the function should be the same regardless of endianness. 3 | 4 | Additionally, it passes all the SMHasher hash function quality tests: https://github.com/rurban/smhasher. It also passes [SMHasher3](https://gitlab.com/fwojcik/smhasher3/-/blob/c56f2bddc1b3e114570d5cbe383ad207673f6c99/results/README.md), a fork of SMHasher with some more stringent tests; some hashes that pass SMHasher fail in SMHasher3.
5 | 6 | Moreover, it is quite easy to choose a new function at runtime by just using a new seed, as shown below: 7 | ```C 8 | #include "khashv.h" 9 | 10 | void foo() { 11 | /* 12 | code .... 13 | */ 14 | khashvSeed seed; 15 | khashv_prep_seed64(&seed, a_64_bit_value); 16 | uint64_t hash = khashv64(&seed, your_data, data_len); 17 | /* 18 | code .... 19 | */ 20 | } 21 | ``` 22 | 23 | Issues, PRs and suggestions are welcome 😃 24 | 25 | ### Note 26 | This is **not a cryptographic hash function**, and it should not be used for such applications. 27 | 28 | # Table of Contents 29 | * [Performance](#performance) 30 | * [API](#api) 31 | * [khashv_prep_seed32](#khashv_prep_seed32) 32 | * [khashv_prep_seed64](#khashv_prep_seed64) 33 | * [khashv_prep_seed128](#khashv_prep_seed128) 34 | * [khashv32](#khashv32) 35 | * [khashv64](#khashv64) 36 | * [Output](#output) 37 | * [64-bit Output](#khashv64-output) 38 | * [32-bit Output](#khashv32-output) 39 | * [Note](#note) 40 | * [TODO](#todo) 41 | * [Copyright and License](#copyright-and-license) 42 | 43 | # Performance 44 | When testing on 1.25 GB and 512 KB of random data I get the following averages: 45 |
| Processor | 1.25 GB Time | 1.25 GB Speed | 512 KB Time | 512 KB Speed | OS | Compiler | Type |
|---|---|---|---|---|---|---|---|
| Xeon E3-1230 v5 | 0.1226 s | 10.1987 GB/s | 045.3515 us | 10.7666 GB/s | Linux | GCC 12.2.1 | Vectorized |
| Xeon E3-1230 v5 | 1.1803 s | 1.0495 GB/s | 462.9862 us | 1.0546 GB/s | Linux | GCC 12.2.1 | Scalar |
| Xeon E3-1230 v5 | 0.1388 s | 9.0061 GB/s | 052.8114 us | 9.2457 GB/s | Linux | Clang 15.0.7 | Vectorized |
| Ryzen 9 7900 | 0.1182 s | 10.5742 GB/s | 044.4734 us | 10.9792 GB/s | Linux | GCC 12.2.1 | Vectorized |
| Ryzen 9 7900 | 0.7890 s | 1.5843 GB/s | 307.4712 us | 1.5881 GB/s | Linux | GCC 12.2.1 | Scalar |
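If you want a quick sanity check on your machine before benchmarking, a minimal sketch along these lines can help (this file is hypothetical and not part of the repo; the message and seed value are arbitrary). Build it once normally and once with `-D KHASHV_SCALAR`; both builds should print identical hashes, since the vectorized and scalar paths compute the same function:

```C
// smoke.c -- hypothetical smoke test, not part of this repository
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "khashv.h"

int main(void) {
    const char* msg = "hello khashv";
    khashvSeed seed;
    khashv_prep_seed64(&seed, 0x1dcedff1a8b17e89);
    // These values must match between a vectorized build and a
    // build compiled with -D KHASHV_SCALAR.
    printf("khashv64: 0x%016llx\n",
        (unsigned long long)khashv64(&seed, (const uint8_t*)msg, strlen(msg)));
    printf("khashv32: 0x%08x\n",
        khashv32(&seed, (const uint8_t*)msg, strlen(msg)));
    return 0;
}
```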
55 | 56 | The scalar version is slower at a tad over ~1 GB/s on my system when compiling test_speed.c with gcc using `-O3`. 57 | On Windows, Microsoft's compiler does not seem to generate as performant code from the intrinsics, but the GCC mingw64 compiler generates pretty comparable numbers for me at least. 58 | 59 | I definitely want to add other machines to this table, but if you are curious how it performs on your machine, compile `test_speed.c` with `-O3 -lm -march=native` and `-O3 -lm -march=native -D KHASHV_SCALAR`. 60 | 61 | # API 62 | 63 | ## khashv_prep_seed32 64 | ```C 65 | // Prepares a seed from a 32-bit value 66 | void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) 67 | ``` 68 | 69 | ## khashv_prep_seed64 70 | ```C 71 | // Prepares a seed from a 64-bit value 72 | void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) 73 | ``` 74 | 75 | ## khashv_prep_seed128 76 | ```C 77 | // Sets 128 bits to be the seed 78 | void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) 79 | ``` 80 | 81 | ## khashv32 82 | ```C 83 | // Produces a 32-bit hash from the input data 84 | uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) 85 | ``` 86 | 87 | ## khashv64 88 | ```C 89 | // Produces a 64-bit hash from the input data 90 | uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) 91 | ``` 92 | 93 | # Output 94 | Here is the output of the hash function as images. 95 | 96 | ## khashv64 Output 97 | Here is the output of the 64-bit hash of the integers \[0, 259199\] using 0x1dcedff1a8b17e89 as the seed. 98 | 99 | ![khashv64 output](khashv64-seed-1dcedff1a8b17e89.png) 100 | 101 | ## khashv32 Output 102 | 103 | Here is the output of the 32-bit hash of the integers \[0, 518399\] using 0x6bb75f13 as the seed. 104 | 105 | ![khashv32 output](khashv32-seed-6bb75f13.png) 106 | 107 | The above images were generated by basically doing the following for each hash: 108 | 109 | ```C 110 | for(int i = 0; i < sizeof(hash_bytes); i++) { 111 | pixel[img_offset + i].r = hash_bytes[i]; 112 | pixel[img_offset + i].g = hash_bytes[i]; 113 | pixel[img_offset + i].b = hash_bytes[i]; 114 | pixel[img_offset + i].a = 255; 115 | } 116 | ``` 117 | 118 | # TODO 119 | When thinking about ways to improve the code and hash function, these are the first few things that come to mind for me. 120 | 1. I think a faster mixing function (e.g. `khashv_mix_words_`) is probably the next thing that could be improved. If it could be made shorter/faster it would reduce latency for smaller inputs. Any ideas or feedback for this would be appreciated. 121 | 122 | 2. The next thing would be to try to get both Clang and MSVC to output code that runs as fast as GCC's, or as close as possible. Looking at the generated assembly, they both seem to do some silly things compared to GCC and lose some performance. Microsoft's compiler is the worst, and probably the fastest fix for me to implement would be to write some assembly code. However, it then would no longer be a single header file hash function, since MSVC does not support inline assembly for 64-bit builds and thus would require a separate file. 123 | 124 | 3. Then probably consider using intrinsics for some other systems like ARM NEON, but for now there is scalar code and code written using GCC's vector built-ins that will generate vectorized code for other architectures that GCC supports. 125 | 126 | 4. Probably the next thing I could think of is to choose better values for S1 and S2, which are used to basically substitute bytes. The current values were found by randomly checking a small set of criteria. Mainly focusing on each bit of S1 and S2 as columns, then XOR-ing them, effectively creating a boolean function of an 8-bit input, and making sure the entire thing maps each input to a unique value (a quick check of this property is sketched right after this list). There likely are better values that could be chosen, and criteria that look at all bits at once. However, the search space is huge, effectively 2^(2\*8\*16) possible permutations for S1 and S2. However, the current values do seem to work well from my testing. Another constant that could be looked at as well is the new shuffle constant I have in v2 that randomly permutes the bytes; it's quite likely there exists a better constant for this as well. 127 | 128 | 5. Maybe write some assembly versions to get around some of the compiler differences. Also maybe a Rust version.
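Regarding item 4, the uniqueness property is easy to check mechanically. Below is a minimal sketch (not part of the library) that rebuilds the combined substitution from the `khashv_s1` and `khashv_s2` tables in `khashv.h`, the same way the precomputed `khashv_xored` table is derived, and verifies that it maps every byte to a unique value:

```C
// check_sbox.c -- hypothetical verification sketch, not part of this repository
#include <stdio.h>
#include <stdint.h>
#include "khashv.h"

int main(void) {
    uint8_t seen[256] = { 0 };
    for (int b = 0; b < 256; b++) {
        // Low nibble through S1, high nibble through S2, then XOR:
        // this is the byte substitution the hash applies to its input.
        uint8_t out = khashv_s1[b & 0x0f] ^ khashv_s2[b >> 4];
        if (seen[out]) {
            printf("collision at input 0x%02x\n", b);
            return 1;
        }
        seen[out] = 1;
    }
    puts("S1/S2 substitution maps all 256 bytes to unique values");
    return 0;
}
```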
129 | 130 | # Copyright and License 131 | 132 | Copyright (C) 2023, by Keith Cancel [admin@keith.pro](mailto:admin@keith.pro). 133 | 134 | Under the MIT License 135 | -------------------------------------------------------------------------------- /khashv.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | Copyright (c) 2022 Keith-Cancel 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the “Software”), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 | SOFTWARE.
19 | */ 20 | 21 | #ifndef K_HASH_V_H 22 | #define K_HASH_V_H 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #define restrict 26 | #endif 27 | 28 | #include <stdint.h> 29 | #include <stddef.h> 30 | #include <string.h> 31 | #include <limits.h> 32 | 33 | // For MSVC compiler, no __SSE3__ macro 34 | #if !defined(__SSE3__) && (defined(__AVX__) || defined(__AVX2__)) 35 | #define __SSE3__ 36 | #endif 37 | // Same deal 38 | #if !defined(__SSE4_1__) && (defined(__AVX__) || defined(__AVX2__)) 39 | #define __SSE4_1__ 40 | #endif 41 | 42 | #if defined(__SSE3__) 43 | #include <immintrin.h> 44 | #if defined(__MINGW32__) || defined(_WIN32) 45 | #include <intrin.h> 46 | #endif 47 | #endif 48 | 49 | #if defined(__GNUC__) && !defined(__clang__) 50 | #define KHASH_GCC_LEAST__(maj, min) (__GNUC__ > maj || __GNUC__ == maj && __GNUC_MINOR__ >= min) 51 | #else 52 | #define KHASH_GCC_LEAST__(maj, min) 0 53 | #endif 54 | 55 | #if defined(__BYTE_ORDER__) && !defined(__BYTE_ORDER) 56 | #define __BYTE_ORDER __BYTE_ORDER__ 57 | #endif 58 | 59 | #if defined(__ORDER_LITTLE_ENDIAN__) && !defined(__LITTLE_ENDIAN) 60 | #define __LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ 61 | #endif 62 | 63 | #if defined(__ORDER_BIG_ENDIAN__) && !defined(__BIG_ENDIAN) 64 | #define __BIG_ENDIAN __ORDER_BIG_ENDIAN__ 65 | #endif 66 | 67 | 68 | #if defined(__clang__) && defined(__has_attribute) 69 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 70 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(5, 0) 71 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 72 | #else 73 | #define KHASH_CHK_ATTRIBUTE__(attr) 0 74 | #endif 75 | 76 | #if defined(__clang__) && defined(__has_builtin) 77 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 78 | #elif defined(__has_builtin) && KHASH_GCC_LEAST__(10, 1) 79 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 80 | #else 81 | #define KHASH_CHK_BUILTIN__(built) 0 82 | #endif 83 | 84 | #if defined(_MSC_VER) && !defined(__clang__) 85 | #define KHASH_FINLINE __forceinline 86 | #define KHASH_BSWAP32(val) _byteswap_ulong(val) 87 | #endif 88 | 89 | #if !defined(KHASH_FINLINE) && (KHASH_CHK_ATTRIBUTE__(always_inline) || KHASH_GCC_LEAST__(3, 1)) 90 | #define KHASH_FINLINE __attribute__((always_inline)) inline 91 | #endif 92 | 93 | #if !defined(KHASH_BSWAP32) && (KHASH_CHK_BUILTIN__(__builtin_bswap32) || KHASH_GCC_LEAST__(4, 5)) 94 | #define KHASH_BSWAP32(val) __builtin_bswap32(val) 95 | #endif 96 | 97 | #if !defined(KHASH_OPT_SZ) && (KHASH_CHK_ATTRIBUTE__(optimize) || KHASH_GCC_LEAST__(4, 8)) 98 | #define KHASH_OPT_SZ __attribute__((optimize("Os"))) 99 | #endif 100 | 101 | #if !defined(KHASH_FINLINE) 102 | #define KHASH_FINLINE inline 103 | #endif 104 | 105 | #if !defined(KHASH_OPT_SZ) 106 | #define KHASH_OPT_SZ 107 | #endif 108 | 109 | #if !defined(KHASH_BSWAP32) 110 | #define KHASH_BSWAP32(val) (((val) >> 24) | (((val) >> 8) & 0xff00) | (((val) << 8) & 0xff0000) | ((val) << 24)) 111 | #endif 112 | 113 | static KHASH_FINLINE int khashv_is_little_endian() { 114 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN 115 | return 1; 116 | #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN 117 | return 0; 118 | #elif defined(__BYTE_ORDER) 119 | #error "Mixed/Middle endian machine, you will need to write a custom byteswap routine" 120 | #else 121 | // Otherwise hope the compiler's optimizer figures this is constant. 122 | // Also since the byte order macro does not exist there are 123 | // middle-endian/mixed-endian machines out there, but they are quite 124 | // rare/old. So I am not gonna worry about it since there are 24 or 125 | // 4!
(four factorial) total endiannesses. So if the compiler does 126 | // not define __BYTE_ORDER, the hash output will be different on 127 | // such machines, but the hash should still work fine. 128 | unsigned int x = 1; 129 | return *((char*)(&x)) == 1; 130 | #endif 131 | } 132 | 133 | #define KHASH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 134 | 135 | struct khashv_block_s { 136 | union { 137 | uint8_t bytes[16]; 138 | uint32_t words[4]; 139 | #if defined(__SSE3__) 140 | __m128i vec; 141 | #endif 142 | }; 143 | }; 144 | 145 | typedef struct khashv_block_s khashvBlock; 146 | typedef struct khashv_block_s khashvSeed; 147 | 148 | static const khashvBlock khash_v_init = { 149 | .words = { 150 | // Really this could basically be almost anything 151 | // So just using some bytes of the SHA-256 hashes 152 | // of 1, 2, 3, and 4 153 | 0x7785459a, // SHA256 of the byte 0x01, using the last 4 bytes 154 | 0x6457d986, // SHA256 of the byte 0x02, using the last 4 bytes 155 | 0xadff29c5, // SHA256 of the byte 0x03, using the last 4 bytes 156 | 0x81c89e71, // SHA256 of the byte 0x04, using the last 4 bytes 157 | }}; 158 | 159 | static const uint8_t khashv_s1[16] = { 160 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 161 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 162 | }; 163 | 164 | static const uint8_t khashv_s2[16] = { 165 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 166 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 167 | }; 168 | 169 | static const uint8_t khashv_xored[256] = { 170 | 0xf3, 0xb2, 0x17, 0x0c, 0x2e, 0x73, 0x35, 0x58, 171 | 0x8c, 0x7e, 0xb6, 0x5c, 0xc4, 0x4a, 0x01, 0xfd, 172 | 0xd2, 0x93, 0x36, 0x2d, 0x0f, 0x52, 0x14, 0x79, 173 | 0xad, 0x5f, 0x97, 0x7d, 0xe5, 0x6b, 0x20, 0xdc, 174 | 0x7a, 0x3b, 0x9e, 0x85, 0xa7, 0xfa, 0xbc, 0xd1, 175 | 0x05, 0xf7, 0x3f, 0xd5, 0x4d, 0xc3, 0x88, 0x74, 176 | 0xef, 0xae, 0x0b, 0x10, 0x32, 0x6f, 0x29, 0x44, 177 | 0x90, 0x62, 0xaa, 0x40, 0xd8, 0x56, 0x1d, 0xe1, 178 | 0xea, 0xab, 0x0e, 0x15, 0x37, 0x6a, 0x2c, 0x41, 179 | 0x95, 0x67, 0xaf, 0x45, 0xdd, 0x53, 0x18, 0xe4, 180 | 0x3d, 0x7c, 0xd9, 0xc2, 0xe0, 0xbd, 0xfb, 0x96, 181 | 0x42, 0xb0, 0x78, 0x92, 0x0a, 0x84, 0xcf, 0x33, 182 | 0x5e, 0x1f, 0xba, 0xa1, 0x83, 0xde, 0x98, 0xf5, 183 | 0x21, 0xd3, 0x1b, 0xf1, 0x69, 0xe7, 0xac, 0x50, 184 | 0xb9, 0xf8, 0x5d, 0x46, 0x64, 0x39, 0x7f, 0x12, 185 | 0xc6, 0x34, 0xfc, 0x16, 0x8e, 0x00, 0x4b, 0xb7, 186 | 0x0d, 0x4c, 0xe9, 0xf2, 0xd0, 0x8d, 0xcb, 0xa6, 187 | 0x72, 0x80, 0x48, 0xa2, 0x3a, 0xb4, 0xff, 0x03, 188 | 0xb1, 0xf0, 0x55, 0x4e, 0x6c, 0x31, 0x77, 0x1a, 189 | 0xce, 0x3c, 0xf4, 0x1e, 0x86, 0x08, 0x43, 0xbf, 190 | 0x47, 0x06, 0xa3, 0xb8, 0x9a, 0xc7, 0x81, 0xec, 191 | 0x38, 0xca, 0x02, 0xe8, 0x70, 0xfe, 0xb5, 0x49, 192 | 0xda, 0x9b, 0x3e, 0x25, 0x07, 0x5a, 0x1c, 0x71, 193 | 0xa5, 0x57, 0x9f, 0x75, 0xed, 0x63, 0x28, 0xd4, 194 | 0x6e, 0x2f, 0x8a, 0x91, 0xb3, 0xee, 0xa8, 0xc5, 195 | 0x11, 0xe3, 0x2b, 0xc1, 0x59, 0xd7, 0x9c, 0x60, 196 | 0x24, 0x65, 0xc0, 0xdb, 0xf9, 0xa4, 0xe2, 0x8f, 197 | 0x5b, 0xa9, 0x61, 0x8b, 0x13, 0x9d, 0xd6, 0x2a, 198 | 0x89, 0xc8, 0x6d, 0x76, 0x54, 0x09, 0x4f, 0x22, 199 | 0xf6, 0x04, 0xcc, 0x26, 0xbe, 0x30, 0x7b, 0x87, 200 | 0x66, 0x27, 0x82, 0x99, 0xbb, 0xe6, 0xa0, 0xcd, 201 | 0x19, 0xeb, 0x23, 0xc9, 0x51, 0xdf, 0x94, 0x68, 202 | }; 203 | 204 | /* Scalar Code */ 205 | 206 | static KHASH_FINLINE void khashv_bswap_be_block_scalar(khashvBlock* in) { 207 | // Byte swapping is only needed if we are not on a little endian system 208 | if (khashv_is_little_endian()) { 209 | return; 210 | } 211 | for(int i = 0; i < 4; i++) { 212 |
in->words[i] = KHASH_BSWAP32(in->words[i]); 213 | } 214 | } 215 | 216 | static KHASH_FINLINE void khashv_rotr_5_bytes_scalar(khashvBlock* in) { 217 | khashv_bswap_be_block_scalar(in); 218 | khashvBlock tmp1; 219 | khashvBlock tmp2; 220 | // Avoid aliasing issues by using memcpy between these union values. 221 | memcpy(tmp1.bytes, in->words, 16); 222 | for(int i = 0; i < 16; i++) { 223 | tmp2.bytes[i] = tmp1.bytes[(i + 5) & 0xf]; 224 | } 225 | memcpy(in->words, tmp2.bytes, 16); 226 | khashv_bswap_be_block_scalar(in); 227 | } 228 | 229 | static KHASH_FINLINE void khashv_shuffle_bytes_scalar(khashvBlock* in) { 230 | static const uint8_t shuffle[16] = { 231 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8, 232 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1 233 | }; 234 | khashv_bswap_be_block_scalar(in); 235 | khashvBlock tmp1; 236 | khashvBlock tmp2; 237 | // Avoid aliasing issues by using memcpy between these union values. 238 | memcpy(tmp1.bytes, in->words, 16); 239 | for(int i = 0; i < 16; i++) { 240 | tmp2.bytes[i] = tmp1.bytes[shuffle[i]]; 241 | } 242 | memcpy(in->words, tmp2.bytes, 16); 243 | khashv_bswap_be_block_scalar(in); 244 | } 245 | 246 | static KHASH_FINLINE void khashv_shl_13_block_scalar(khashvBlock* in) { 247 | for(int i = 0; i < 4; i++) { 248 | in->words[i] <<= 13; 249 | } 250 | } 251 | 252 | static KHASH_FINLINE void khashv_shr_3_block_scalar(khashvBlock* in) { 253 | for(int i = 0; i < 4; i++) { 254 | in->words[i] >>= 3; 255 | } 256 | } 257 | 258 | static KHASH_FINLINE void khashv_add_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 259 | for(int i = 0; i < 4; i++) { 260 | a->words[i] += b->words[i]; 261 | } 262 | } 263 | 264 | static KHASH_FINLINE void khashv_xor_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 265 | for(int i = 0; i < 4; i++) { 266 | a->words[i] ^= b->words[i]; 267 | } 268 | } 269 | 270 | // GCC and Clang were vectorizing this quite poorly with -O3. 271 | // They could not detect that only a PSHUFB was needed and instead 272 | // were generating tons of inserts and extracts from the vector 273 | // registers. Thus it was running slower than code that was not being 274 | // vectorized on my machine. So I specify the optimization level directly. 275 | // Tried a few other things to get GCC and Clang to generate more sane 276 | // code or code using PSHUFB, but this seemed the cleanest. 277 | // Example of what I mean: https://godbolt.org/z/PMnzsThPc 278 | // Compared to this: https://godbolt.org/z/dWfjr7GWP 279 | /*static KHASH_OPT_SZ void khashv_sub16(khashvBlock* tmp, const uint8_t sub[16]) { 280 | #if defined(__clang__) 281 | // Stop clang from being annoying!!!
282 | // The auto-vectorized code was worse at the time of writing this 283 | #pragma nounroll 284 | #pragma clang loop vectorize(disable) 285 | #pragma clang loop interleave(disable) 286 | #endif 287 | for (int i = 0; i < 16; i++) { 288 | tmp->bytes[i] = sub[tmp->bytes[i]]; 289 | } 290 | } 291 | 292 | static KHASH_FINLINE void khashv_replace_scalar(khashvBlock* replace) { 293 | khashvBlock tmp; 294 | for (int i = 0; i < 16; i++) { 295 | tmp.bytes[i] = (replace->bytes[i] >> 4); 296 | replace->bytes[i] &= 0x0f; 297 | } 298 | khashv_sub16(replace, khashv_s1); 299 | khashv_sub16(&tmp, khashv_s2); 300 | for (int i = 0; i < 16; i++) { 301 | replace->bytes[i] ^= tmp.bytes[i]; 302 | } 303 | }*/ 304 | // Similar issue as the commented out code so stop the optimizers 305 | // from getting crazy 306 | static KHASH_OPT_SZ void khashv_replace_scalar(khashvBlock* replace) { 307 | khashvBlock tmp; 308 | memcpy(tmp.bytes, replace->words, 16); 309 | #if defined(__clang__) 310 | // Stop clang from being annoying!!! 311 | // The auto-vectorized code was worse at the time of writing this 312 | #pragma nounroll 313 | #pragma clang loop vectorize(disable) 314 | #pragma clang loop interleave(disable) 315 | #endif 316 | for(int i = 0; i < 16; i++) { 317 | tmp.bytes[i] = khashv_xored[tmp.bytes[i]]; 318 | } 319 | memcpy(replace->words, tmp.bytes, 16); 320 | } 321 | 322 | static KHASH_FINLINE void khashv_mix_words_scalar(khashvBlock* in) { 323 | unsigned rots[4] = { 5, 7, 11, 17 }; 324 | khashvBlock tmp = { 0 }; 325 | 326 | tmp = *in; 327 | khashv_shr_3_block_scalar(&tmp); 328 | khashv_xor_block_scalar(in, &tmp); 329 | 330 | for (int i = 0; i < 4; i++) { 331 | unsigned rot = rots[i]; 332 | tmp = *in; 333 | khashv_rotr_5_bytes_scalar(&tmp); 334 | khashv_add_block_scalar(&tmp, in); 335 | for (int j = 0; j < 4; j++) { 336 | tmp.words[j] = KHASH_ROTR32(tmp.words[j], rot); 337 | } 338 | khashv_xor_block_scalar(in, &tmp); 339 | } 340 | } 341 | 342 | static void khashv_hash_scalar(khashvBlock* hash, const uint8_t* data, size_t data_len) { 343 | hash->words[0] ^= data_len; 344 | // size_t is bigger than 32 bits 345 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 346 | hash->words[1] ^= data_len >> 32; 347 | #endif 348 | 349 | khashvBlock tmp_1; 350 | khashvBlock tmp_2; 351 | khashvBlock tmp_h = *hash; 352 | 353 | const uint8_t* end = data + (data_len & ~((size_t)15)); 354 | 355 | while (data < end) { 356 | memcpy(&tmp_2, data, 16); 357 | khashv_replace_scalar(&tmp_2); 358 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 359 | 360 | khashv_bswap_be_block_scalar(&tmp_1); 361 | 362 | tmp_2 = tmp_1; 363 | //khashv_shl_13_block_scalar(&tmp_2); 364 | //khashv_add_block_scalar(&tmp_2, &tmp_1); 365 | for(int i = 0; i < 4; i++) { 366 | tmp_2.words[i] *= 8193; 367 | } 368 | khashv_xor_block_scalar(&tmp_h, &tmp_2); 369 | khashv_rotr_5_bytes_scalar(&tmp_h); 370 | khashv_add_block_scalar(&tmp_h, &tmp_1); 371 | 372 | tmp_1 = tmp_h; 373 | khashv_shuffle_bytes_scalar(&tmp_1); 374 | khashv_add_block_scalar(&tmp_h, &tmp_1); 375 | 376 | data += 16; 377 | } 378 | 379 | unsigned trailing = data_len & 0xf; 380 | if(trailing) { 381 | memset(&tmp_2, 0, 16); 382 | 383 | memcpy(&tmp_2.bytes, data, trailing); 384 | khashv_replace_scalar(&tmp_2); 385 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 386 | 387 | khashv_bswap_be_block_scalar(&tmp_1); 388 | 389 | tmp_2 = tmp_1; 390 | //khashv_shl_13_block_scalar(&tmp_2); 391 | //khashv_add_block_scalar(&tmp_2, &tmp_1); 392 | for(int i = 0; i < 4; i++) { 393 | tmp_2.words[i] *= 8193; 394 | } 395 | 
khashv_xor_block_scalar(&tmp_h, &tmp_2); 396 | khashv_rotr_5_bytes_scalar(&tmp_h); 397 | khashv_add_block_scalar(&tmp_h, &tmp_1); 398 | 399 | tmp_1 = tmp_h; 400 | khashv_shuffle_bytes_scalar(&tmp_1); 401 | khashv_add_block_scalar(&tmp_h, &tmp_1); 402 | 403 | } 404 | khashv_mix_words_scalar(&tmp_h); 405 | *hash = tmp_h; 406 | } 407 | 408 | static inline void khashv_prep_seed32_scalar(khashvSeed* seed_prepped, uint32_t seed) { 409 | *seed_prepped = khash_v_init; 410 | seed_prepped->words[0] ^= seed; 411 | khashv_mix_words_scalar(seed_prepped); 412 | } 413 | 414 | static inline void khashv_prep_seed64_scalar(khashvSeed* seed_prepped, uint64_t seed) { 415 | *seed_prepped = khash_v_init; 416 | seed_prepped->words[0] ^= seed; 417 | khashv_mix_words_scalar(seed_prepped); 418 | // Do it again with the other part to make it different from the 32-bit seed. 419 | seed_prepped->words[1] ^= seed >> 32; 420 | khashv_mix_words_scalar(seed_prepped); 421 | } 422 | 423 | static inline void khashv_prep_seed128_scalar(khashvSeed* seed_prepped, const uint32_t seed[4]) { 424 | for(int i = 0; i < 4; i++) { 425 | seed_prepped->words[i] = seed[i]; 426 | } 427 | } 428 | 429 | static inline uint32_t khashv32_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 430 | khashvBlock h = *seed; 431 | khashv_hash_scalar(&h, data, data_len); 432 | return h.words[3]; 433 | } 434 | 435 | static inline uint64_t khashv64_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 436 | khashvBlock h = *seed; 437 | khashv_hash_scalar(&h, data, data_len); 438 | uint64_t r = h.words[1]; 439 | r <<= 32; 440 | r |= h.words[0]; 441 | return r; 442 | } 443 | 444 | /* Vectorization for Intel/AMD */ 445 | 446 | #if defined(__SSE3__) 447 | 448 | #define KHASH_VECTOR 1 449 | 450 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(11, 0)) 451 | static KHASH_FINLINE __m128i _mm_loadu_si32(const void* data) { 452 | uint32_t val; 453 | memcpy(&val, data, sizeof(uint32_t)); 454 | return _mm_cvtsi32_si128(val); 455 | } 456 | static KHASH_FINLINE __m128i _mm_loadu_si16(const void* data) { 457 | uint32_t val = 0; 458 | memcpy(&val, data, sizeof(uint16_t)); 459 | return _mm_cvtsi32_si128(val); 460 | } 461 | #endif 462 | 463 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(9, 1)) 464 | static KHASH_FINLINE __m128i _mm_loadu_si64(const void* data) { 465 | uint64_t val = 0; 466 | memcpy(&val, data, sizeof(uint64_t)); 467 | return _mm_cvtsi64_si128(val); 468 | } 469 | #endif 470 | 471 | static KHASH_FINLINE __m128i khashv_mix_words_vector(__m128i val) { 472 | __m128i tmp1; 473 | __m128i tmp2; 474 | 475 | tmp1 = _mm_srli_epi32(val, 3); 476 | val = _mm_xor_si128(tmp1, val); 477 | 478 | tmp1 = _mm_alignr_epi8(val, val, 5); 479 | tmp1 = _mm_add_epi32(val, tmp1); 480 | #if defined(__AVX512VL__) 481 | tmp1 = _mm_ror_epi32(tmp1, 5); 482 | val = _mm_xor_si128(val, tmp1); 483 | #else 484 | tmp2 = _mm_srli_epi32(tmp1, 5); 485 | tmp1 = _mm_slli_epi32(tmp1, 27); 486 | tmp1 = _mm_or_si128(tmp1, tmp2); 487 | val = _mm_xor_si128(val, tmp1); 488 | #endif 489 | 490 | tmp1 = _mm_alignr_epi8(val, val, 5); 491 | tmp1 = _mm_add_epi32(val, tmp1); 492 | #if defined(__AVX512VL__) 493 | tmp1 = _mm_ror_epi32(tmp1, 7); 494 | val = _mm_xor_si128(val, tmp1); 495 | #else 496 | tmp2 = _mm_srli_epi32(tmp1, 7); 497 | tmp1 = _mm_slli_epi32(tmp1, 25); 498 | val = _mm_xor_si128(val, tmp2); 499 | val = _mm_xor_si128(val, tmp1); 500 | #endif 501 | 502 | tmp1 = _mm_alignr_epi8(val, val, 5); 503 | tmp1 =
_mm_add_epi32(tmp1, val); 504 | #if defined(__AVX512VL__) 505 | tmp1 = _mm_ror_epi32(tmp1, 11); 506 | val = _mm_xor_si128(val, tmp1); 507 | #else 508 | tmp2 = _mm_srli_epi32(tmp1, 11); 509 | tmp1 = _mm_slli_epi32(tmp1, 21); 510 | val = _mm_xor_si128(val, tmp2); 511 | val = _mm_xor_si128(val, tmp1); 512 | #endif 513 | 514 | tmp1 = _mm_alignr_epi8(val, val, 5); 515 | tmp1 = _mm_add_epi32(tmp1, val); 516 | #if defined(__AVX512VL__) 517 | tmp1 = _mm_ror_epi32(tmp1, 17); 518 | val = _mm_xor_si128(val, tmp1); 519 | #else 520 | tmp2 = _mm_srli_epi32(tmp1, 17); 521 | tmp1 = _mm_slli_epi32(tmp1, 15); 522 | val = _mm_xor_si128(val, tmp2); 523 | val = _mm_xor_si128(val, tmp1); 524 | #endif 525 | 526 | return val; 527 | } 528 | 529 | static KHASH_FINLINE __m128i khashv_part_load_vector(const uint8_t* data, size_t len) { 530 | __m128i tmp = { 0 }; 531 | __m128i tmp2 = { 0 }; 532 | switch(len) { 533 | case 1: 534 | #if defined(__SSE4_1__) 535 | tmp = _mm_insert_epi8(tmp, data[0], 0); 536 | #else 537 | tmp = _mm_cvtsi32_si128(data[0]); 538 | #endif 539 | break; 540 | case 2: 541 | tmp = _mm_loadu_si16(data); 542 | break; 543 | case 3: 544 | tmp = _mm_loadu_si16(data); 545 | #if defined(__SSE4_1__) 546 | tmp = _mm_insert_epi8(tmp, data[2], 2); 547 | #else 548 | tmp = _mm_insert_epi16(tmp, data[2], 1); 549 | #endif 550 | break; 551 | case 4: 552 | tmp = _mm_loadu_si32(data); 553 | break; 554 | case 5: 555 | tmp = _mm_loadu_si32(data); 556 | #if defined(__SSE4_1__) 557 | tmp = _mm_insert_epi8(tmp, data[4], 4); 558 | #else 559 | tmp = _mm_insert_epi16(tmp, data[4], 2); 560 | #endif 561 | break; 562 | case 6: 563 | tmp = _mm_loadu_si32(data); 564 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 565 | break; 566 | case 7: 567 | tmp = _mm_loadu_si32(data); 568 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 569 | #if defined(__SSE4_1__) 570 | tmp = _mm_insert_epi8(tmp, data[6], 6); 571 | #else 572 | tmp = _mm_insert_epi16(tmp, data[6], 3); 573 | #endif 574 | break; 575 | case 8: 576 | tmp = _mm_loadu_si64(data); 577 | break; 578 | case 9: 579 | tmp = _mm_loadu_si64(data); 580 | #if defined(__SSE4_1__) 581 | tmp = _mm_insert_epi8(tmp, data[8], 8); 582 | #else 583 | tmp = _mm_insert_epi16(tmp, data[8], 4); 584 | #endif 585 | break; 586 | case 10: 587 | tmp = _mm_loadu_si64(data); 588 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 589 | break; 590 | case 11: 591 | tmp = _mm_loadu_si64(data); 592 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 593 | #if defined(__SSE4_1__) 594 | tmp = _mm_insert_epi8(tmp, data[10], 10); 595 | #else 596 | tmp = _mm_insert_epi16(tmp, data[10], 5); 597 | #endif 598 | break; 599 | case 12: 600 | tmp = _mm_loadu_si64(data); 601 | #if defined(__SSE4_1__) 602 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 603 | #else 604 | tmp2 = _mm_loadu_si32(data + 8); 605 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 606 | tmp = _mm_or_si128(tmp, tmp2); 607 | #endif 608 | break; 609 | case 13: 610 | tmp = _mm_loadu_si64(data); 611 | #if defined(__SSE4_1__) 612 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 613 | tmp = _mm_insert_epi8(tmp, data[12], 12); 614 | #else 615 | tmp2 = _mm_loadu_si32(data + 8); 616 | tmp2 = _mm_insert_epi16(tmp2, data[12], 2); 617 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 618 | tmp = _mm_or_si128(tmp, tmp2); 619 | #endif 620 | break; 621 | case 14: 622 | tmp = _mm_loadu_si64(data); 623 | #if defined(__SSE4_1__) 624 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 625 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data 
+ 12), 6); 626 | #else 627 | tmp2 = _mm_loadu_si32(data + 8); 628 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 629 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 630 | tmp = _mm_or_si128(tmp, tmp2); 631 | #endif 632 | break; 633 | case 15: 634 | tmp = _mm_loadu_si64(data); 635 | #if defined(__SSE4_1__) 636 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 637 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6); 638 | tmp = _mm_insert_epi8(tmp, data[14], 14); 639 | #else 640 | tmp2 = _mm_loadu_si32(data + 8); 641 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 642 | tmp2 = _mm_insert_epi16(tmp2, data[14], 7); 643 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 644 | tmp = _mm_or_si128(tmp, tmp2); 645 | #endif 646 | break; 647 | case 16: 648 | tmp = _mm_loadu_si128((__m128i*)data); 649 | break; 650 | } 651 | return tmp; 652 | } 653 | 654 | static const uint8_t khashv_shuff[16] = { 655 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8, 656 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1 657 | }; 658 | 659 | static __m128i khashv_hash_vector(__m128i hash, const uint8_t* data, size_t data_len) { 660 | const __m128i s1 = _mm_loadu_si128((const __m128i*)khashv_s1); 661 | const __m128i s2 = _mm_loadu_si128((const __m128i*)khashv_s2); 662 | const __m128i shuff = _mm_loadu_si128((const __m128i*)khashv_shuff); 663 | const __m128i mask = _mm_set1_epi32(0x0f0f0f0f); 664 | 665 | __m128i tmp_1; 666 | __m128i tmp_2; 667 | 668 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 669 | tmp_1 = _mm_cvtsi64_si128(data_len); 670 | #else 671 | tmp_1 = _mm_cvtsi32_si128(data_len); 672 | #endif 673 | hash = _mm_xor_si128(tmp_1, hash); 674 | 675 | const uint8_t* end = data + (data_len & ~((size_t)15)); 676 | const uint8_t* end2 = data + data_len; 677 | while(data_len > 16 && data < end) { 678 | tmp_1 = _mm_lddqu_si128((const __m128i*)data); 679 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 680 | 681 | tmp_1 = _mm_and_si128 (tmp_1, mask); 682 | tmp_2 = _mm_and_si128 (tmp_2, mask); 683 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 684 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 685 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 686 | 687 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 688 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 689 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 690 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 691 | hash = _mm_add_epi32 (tmp_2, tmp_1); 692 | 693 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 694 | hash = _mm_add_epi32(hash, tmp_1); 695 | 696 | data += 16; 697 | } 698 | uintptr_t trailing = end2 - data; 699 | if(trailing) { 700 | tmp_1 = khashv_part_load_vector(data, trailing); 701 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 702 | 703 | tmp_1 = _mm_and_si128 (tmp_1, mask); 704 | tmp_2 = _mm_and_si128 (tmp_2, mask); 705 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 706 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 707 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 708 | 709 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 710 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 711 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 712 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 713 | hash = _mm_add_epi32 (tmp_2, tmp_1); 714 | 715 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 716 | hash = _mm_add_epi32(hash, tmp_1); 717 | } 718 | hash = khashv_mix_words_vector(hash); 719 | return hash; 720 | } 721 | 722 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 723 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 724 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 725 | seed_prepped->vec = khashv_mix_words_vector(s); 726 | } 727 | 728 | static void 
khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 729 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 730 | __m128i t = _mm_cvtsi32_si128(seed >> 32); 731 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 732 | s = khashv_mix_words_vector(s); 733 | s = _mm_xor_si128(s, _mm_shuffle_epi32(t, 0xf3)); 734 | seed_prepped->vec = khashv_mix_words_vector(s); 735 | } 736 | 737 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 738 | seed_prepped->vec = _mm_loadu_si128((const __m128i*)seed); 739 | } 740 | 741 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 742 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 743 | // using word[3] to avoid any overlap with the 744 | // 64-bit hash which uses words [0] and [1], this ensures 745 | // the two outputs should behave differently when used. 746 | #if defined(__SSE4_1__) 747 | return _mm_extract_epi32(h, 3); 748 | #else 749 | h = _mm_shuffle_epi32(h, 0xff); 750 | return _mm_cvtsi128_si32(h); 751 | #endif 752 | } 753 | 754 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 755 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 756 | return _mm_cvtsi128_si64(h); 757 | } 758 | 759 | #endif 760 | 761 | /* Vectorization via GCC's vector builtins */ 762 | // Handy since it allows vectorization without explicit intrinsics 763 | // for a particular CPU. 764 | 765 | #if !defined(KHASH_VECTOR) && KHASH_GCC_LEAST__(6, 1) 766 | 767 | #define KHASH_VECTOR 1 768 | 769 | typedef uint8_t kv16ui __attribute__((vector_size(16))); 770 | typedef uint32_t kv4ui __attribute__((vector_size(16))); 771 | 772 | static KHASH_FINLINE kv16ui khashv_sub_s1_gcc(kv16ui in) { 773 | const kv16ui mask = { 774 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 775 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf 776 | }; 777 | const kv16ui sub = { 778 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 779 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 780 | }; 781 | in &= mask; 782 | return __builtin_shuffle(sub, in); 783 | } 784 | 785 | static KHASH_FINLINE kv16ui khashv_sub_s2_gcc(kv16ui in) { 786 | const kv16ui sub = { 787 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 788 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 789 | }; 790 | in >>= 4; 791 | return __builtin_shuffle(sub, in); 792 | } 793 | 794 | static KHASH_FINLINE kv4ui khashv_rotr_5_bytes_gcc(kv4ui input) { 795 | const kv16ui rotrLE = { 796 | 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 797 | 0xd, 0xe, 0xf, 0x0, 0x1, 0x2, 0x3, 0x4 798 | }; 799 | const kv16ui rotrBE = { 800 | 0xb, 0x4, 0x5, 0x6, 0xf, 0x8, 0x9, 0xa, 801 | 0x3, 0xc, 0xd, 0xe, 0x7, 0x0, 0x1, 0x2 802 | }; 803 | kv16ui tmp; 804 | memcpy(&tmp, &input, 16); 805 | if (khashv_is_little_endian()) { 806 | tmp = __builtin_shuffle(tmp, rotrLE); 807 | } else { 808 | tmp = __builtin_shuffle(tmp, rotrBE); 809 | } 810 | memcpy(&input, &tmp, 16); 811 | return input; 812 | } 813 | 814 | static KHASH_FINLINE kv4ui khashv_shuffle_bytes_gcc(kv4ui input) { 815 | const kv16ui shuffLE = { 816 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8, 817 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1 818 | }; 819 | const kv16ui shuffBE = { 820 | 0x3, 0xa, 0xd, 0x4, 0xb, 0xe, 0xc, 0xf, 821 | 0x0, 0x5, 0x8, 0x6, 0x2, 0x9, 0x1, 0x7, 822 | }; 823 | kv16ui tmp; 824 | memcpy(&tmp, &input, 16); 825 | if (khashv_is_little_endian()) { 826 | tmp = __builtin_shuffle(tmp, shuffLE); 827 | } else { 828 | tmp = __builtin_shuffle(tmp,
shuffBE); 829 | } 830 | memcpy(&input, &tmp, 16); 831 | return input; 832 | } 833 | 834 | static KHASH_FINLINE kv4ui khash_byteswap_vec32_gcc( kv4ui input ) { 835 | const kv16ui bswap32 = { 836 | 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 837 | 0xb, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc, 838 | }; 839 | kv16ui b; 840 | 841 | memcpy(&b, &input, 16); 842 | b = __builtin_shuffle(b, bswap32); 843 | memcpy(&input, &b, 16); 844 | return input; 845 | } 846 | 847 | static KHASH_FINLINE kv4ui khashv_replace_gcc(kv4ui input) { 848 | kv16ui s1; 849 | kv16ui s2; 850 | memcpy(&s1, &input, 16); 851 | s2 = khashv_sub_s2_gcc(s1); 852 | s1 = khashv_sub_s1_gcc(s1); 853 | s1 ^= s2; 854 | memcpy(&input, &s1, 16); 855 | return input; 856 | } 857 | 858 | static KHASH_FINLINE kv4ui khashv_mix_words_gcc(kv4ui val) { 859 | const unsigned rots[4] = { 5, 7, 11, 17 }; 860 | kv4ui tmp = val >> 3; 861 | val ^= tmp; 862 | for (int i = 0; i < 4; i++) { 863 | unsigned rot = rots[i]; 864 | kv4ui tmp = val; 865 | tmp = khashv_rotr_5_bytes_gcc(tmp); 866 | tmp += val; 867 | tmp = (tmp >> rot) | (tmp << (32 - rot)); 868 | val ^= tmp; 869 | } 870 | return val; 871 | } 872 | 873 | static KHASH_FINLINE kv4ui khashv_hash_block_gcc(kv4ui hash, kv4ui input) { 874 | kv4ui tmp_1 = khashv_replace_gcc(input); 875 | if (!khashv_is_little_endian()) { 876 | tmp_1 = khash_byteswap_vec32_gcc(tmp_1); 877 | } 878 | kv4ui tmp_2 = tmp_1 * 8193; 879 | tmp_2 ^= hash; 880 | tmp_2 = khashv_rotr_5_bytes_gcc(tmp_2); 881 | hash = tmp_1 + tmp_2; 882 | 883 | tmp_1 = khashv_shuffle_bytes_gcc(hash); 884 | hash = hash + tmp_1; 885 | return hash; 886 | } 887 | 888 | static KHASH_FINLINE kv4ui khashv_hash_gcc(kv4ui hash, const uint8_t* data, size_t data_len) { 889 | hash[0] ^= data_len; 890 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 891 | hash[1] ^= data_len >> 32; 892 | #endif 893 | 894 | kv4ui data_v; 895 | const uint8_t* end = data + (data_len & ~((size_t)15)); 896 | while (data < end) { 897 | memcpy(&data_v, data, 16); 898 | hash = khashv_hash_block_gcc(hash, data_v); 899 | data += 16; 900 | } 901 | 902 | unsigned trailing = data_len & 0xf; 903 | if(trailing) { 904 | memset(&data_v, 0, 16); 905 | memcpy(&data_v, data, trailing); 906 | hash = khashv_hash_block_gcc(hash, data_v); 907 | } 908 | return khashv_mix_words_gcc(hash); 909 | } 910 | 911 | 912 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 913 | kv4ui s; 914 | memcpy(&s, &khash_v_init, 16); 915 | s[0] ^= seed; 916 | s = khashv_mix_words_gcc(s); 917 | memcpy(seed_prepped, &s, 16); 918 | } 919 | 920 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 921 | kv4ui s; 922 | memcpy(&s, &khash_v_init, 16); 923 | s[0] ^= seed; 924 | s = khashv_mix_words_gcc(s); 925 | s[1] ^= seed >> 32; 926 | s = khashv_mix_words_gcc(s); 927 | memcpy(seed_prepped, &s, 16); 928 | } 929 | 930 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 931 | memcpy(seed_prepped->words, seed, 16); 932 | } 933 | 934 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 935 | kv4ui h; 936 | memcpy(&h, seed, 16); 937 | h = khashv_hash_gcc(h, data, data_len); 938 | return h[3]; 939 | } 940 | 941 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 942 | kv4ui h; 943 | memcpy(&h, seed, 16); 944 | h = khashv_hash_gcc(h, data, data_len); 945 | uint64_t ret; 946 | if (khashv_is_little_endian()) { 947 | memcpy(&ret, &h, 8); 948 | } else { 949 | ret = 
h[1]; 950 | ret = (ret << 32) | h[0]; 951 | } 952 | return ret; 953 | } 954 | 955 | #endif 956 | 957 | #if defined(KHASH_VECTOR) && !defined(KHASHV_SCALAR) 958 | 959 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 960 | khashv_prep_seed32_vector(seed_prepped, seed); 961 | } 962 | 963 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 964 | khashv_prep_seed64_vector(seed_prepped, seed); 965 | } 966 | 967 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 968 | khashv_prep_seed128_vector(seed_prepped, seed); 969 | } 970 | 971 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 972 | return khashv32_vector(seed, data, data_len); 973 | } 974 | 975 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 976 | return khashv64_vector(seed, data, data_len); 977 | } 978 | 979 | #else 980 | 981 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 982 | khashv_prep_seed32_scalar(seed_prepped, seed); 983 | } 984 | 985 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 986 | khashv_prep_seed64_scalar(seed_prepped, seed); 987 | } 988 | 989 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 990 | khashv_prep_seed128_scalar(seed_prepped, seed); 991 | } 992 | 993 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 994 | return khashv32_scalar(seed, data, data_len); 995 | } 996 | 997 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 998 | return khashv64_scalar(seed, data, data_len); 999 | } 1000 | 1001 | #endif 1002 | 1003 | 1004 | #ifdef __cplusplus 1005 | } 1006 | #endif 1007 | #endif 1008 | -------------------------------------------------------------------------------- /k-hashv-old/khashv_v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | Copyright (c) 2022 Keith-Cancel 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the “Software”), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 | SOFTWARE. 
19 | */ 20 | 21 | #ifndef K_HASH_V_H 22 | #define K_HASH_V_H 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #define restrict 26 | #endif 27 | 28 | #include <stdint.h> 29 | #include <stddef.h> 30 | #include <string.h> 31 | #include <limits.h> 32 | 33 | // For MSVC compiler, no __SSE3__ macro 34 | #if !defined(__SSE3__) && (defined(__AVX__) || defined(__AVX2__)) 35 | #define __SSE3__ 36 | #endif 37 | // Same deal 38 | #if !defined(__SSE4_1__) && (defined(__AVX__) || defined(__AVX2__)) 39 | #define __SSE4_1__ 40 | #endif 41 | 42 | #if defined(__SSE3__) 43 | #include <immintrin.h> 44 | #if defined(__MINGW32__) || defined(_WIN32) 45 | #include <intrin.h> 46 | #endif 47 | #endif 48 | 49 | #if defined(__GNUC__) && !defined(__clang__) 50 | #define KHASH_GCC_LEAST__(maj, min) (__GNUC__ > maj || __GNUC__ == maj && __GNUC_MINOR__ >= min) 51 | #else 52 | #define KHASH_GCC_LEAST__(maj, min) 0 53 | #endif 54 | 55 | #if defined(__BYTE_ORDER__) && !defined(__BYTE_ORDER) 56 | #define __BYTE_ORDER __BYTE_ORDER__ 57 | #endif 58 | 59 | #if defined(__ORDER_LITTLE_ENDIAN__) && !defined(__LITTLE_ENDIAN) 60 | #define __LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ 61 | #endif 62 | 63 | #if defined(__ORDER_BIG_ENDIAN__) && !defined(__BIG_ENDIAN) 64 | #define __BIG_ENDIAN __ORDER_BIG_ENDIAN__ 65 | #endif 66 | 67 | 68 | #if defined(__clang__) && defined(__has_attribute) 69 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 70 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(5, 0) 71 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 72 | #else 73 | #define KHASH_CHK_ATTRIBUTE__(attr) 0 74 | #endif 75 | 76 | #if defined(__clang__) && defined(__has_builtin) 77 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 78 | #elif defined(__has_builtin) && KHASH_GCC_LEAST__(10, 1) 79 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 80 | #else 81 | #define KHASH_CHK_BUILTIN__(built) 0 82 | #endif 83 | 84 | #if defined(_MSC_VER) && !defined(__clang__) 85 | #define KHASH_FINLINE __forceinline 86 | #define KHASH_BSWAP32(val) _byteswap_ulong(val) 87 | #endif 88 | 89 | #if !defined(KHASH_FINLINE) && (KHASH_CHK_ATTRIBUTE__(always_inline) || KHASH_GCC_LEAST__(3, 1)) 90 | #define KHASH_FINLINE __attribute__((always_inline)) inline 91 | #endif 92 | 93 | #if !defined(KHASH_BSWAP32) && (KHASH_CHK_BUILTIN__(__builtin_bswap32) || KHASH_GCC_LEAST__(4, 5)) 94 | #define KHASH_BSWAP32(val) __builtin_bswap32(val) 95 | #endif 96 | 97 | #if !defined(KHASH_OPT_SZ) && (KHASH_CHK_ATTRIBUTE__(optimize) || KHASH_GCC_LEAST__(4, 8)) 98 | #define KHASH_OPT_SZ __attribute__((optimize("Os"))) 99 | #endif 100 | 101 | #if !defined(KHASH_FINLINE) 102 | #define KHASH_FINLINE inline 103 | #endif 104 | 105 | #if !defined(KHASH_OPT_SZ) 106 | #define KHASH_OPT_SZ 107 | #endif 108 | 109 | #if !defined(KHASH_BSWAP32) 110 | #define KHASH_BSWAP32(val) ((val >> 24) | ((val >> 8) & 0xff00) | ((val << 8) & 0xff0000) | (val << 24)) 111 | #endif 112 | 113 | static KHASH_FINLINE int khashv_is_little_endian() { 114 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN 115 | return 1; 116 | #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN 117 | return 0; 118 | #elif defined(__BYTE_ORDER) 119 | #error "Mixed/Middle endian machine, you will need to write a custom byteswap routine" 120 | #else 121 | // Otherwise hope the compiler's optimizer figures out this is constant. 122 | // When the byte order macro does not exist, middle-endian/mixed-endian 123 | // machines cannot be ruled out, but they are quite rare/old, so they 124 | // are not handled specially here. (There are 4! = 24 possible byte 125 | // orderings of a 32-bit word in total.) If the compiler does not define 126 | // __BYTE_ORDER, the hash output will differ on such machines, but the 127 | // hash should still work fine. 128 | unsigned int x = 1; 129 | return *((char*)(&x)) == 1; 130 | #endif 131 | } 132 | 133 | #define KHASH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 134 | 135 | struct khashv_block_s { 136 | union { 137 | uint8_t bytes[16]; 138 | uint32_t words[4]; 139 | #if defined(__SSE3__) 140 | __m128i vec; 141 | #endif 142 | }; 143 | }; 144 | 145 | typedef struct khashv_block_s khashvBlock; 146 | typedef struct khashv_block_s khashvSeed; 147 | 148 | static const khashvBlock khash_v_init = { 149 | .words = { 150 | // Really this could basically be almost anything 151 | // So just using some bytes of the SHA-256 hashes 152 | // of 1, 2, 3, and 4 153 | 0x7785459a, // SHA256 of the byte 0x01, using the last 4 bytes 154 | 0x6457d986, // SHA256 of the byte 0x02, using the last 4 bytes 155 | 0xadff29c5, // SHA256 of the byte 0x03, using the last 4 bytes 156 | 0x81c89e71, // SHA256 of the byte 0x04, using the last 4 bytes 157 | }}; 158 | 159 | static const uint8_t khashv_s1[16] = { 160 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 161 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 162 | }; 163 | 164 | static const uint8_t khashv_s2[16] = { 165 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 166 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 167 | }; 168 | 169 | static const uint8_t khashv_xored[256] = { 170 | 0xf3, 0xb2, 0x17, 0x0c, 0x2e, 0x73, 0x35, 0x58, 171 | 0x8c, 0x7e, 0xb6, 0x5c, 0xc4, 0x4a, 0x01, 0xfd, 172 | 0xd2, 0x93, 0x36, 0x2d, 0x0f, 0x52, 0x14, 0x79, 173 | 0xad, 0x5f, 0x97, 0x7d, 0xe5, 0x6b, 0x20, 0xdc, 174 | 0x7a, 0x3b, 0x9e, 0x85, 0xa7, 0xfa, 0xbc, 0xd1, 175 | 0x05, 0xf7, 0x3f, 0xd5, 0x4d, 0xc3, 0x88, 0x74, 176 | 0xef, 0xae, 0x0b, 0x10, 0x32, 0x6f, 0x29, 0x44, 177 | 0x90, 0x62, 0xaa, 0x40, 0xd8, 0x56, 0x1d, 0xe1, 178 | 0xea, 0xab, 0x0e, 0x15, 0x37, 0x6a, 0x2c, 0x41, 179 | 0x95, 0x67, 0xaf, 0x45, 0xdd, 0x53, 0x18, 0xe4, 180 | 0x3d, 0x7c, 0xd9, 0xc2, 0xe0, 0xbd, 0xfb, 0x96, 181 | 0x42, 0xb0, 0x78, 0x92, 0x0a, 0x84, 0xcf, 0x33, 182 | 0x5e, 0x1f, 0xba, 0xa1, 0x83, 0xde, 0x98, 0xf5, 183 | 0x21, 0xd3, 0x1b, 0xf1, 0x69, 0xe7, 0xac, 0x50, 184 | 0xb9, 0xf8, 0x5d, 0x46, 0x64, 0x39, 0x7f, 0x12, 185 | 0xc6, 0x34, 0xfc, 0x16, 0x8e, 0x00, 0x4b, 0xb7, 186 | 0x0d, 0x4c, 0xe9, 0xf2, 0xd0, 0x8d, 0xcb, 0xa6, 187 | 0x72, 0x80, 0x48, 0xa2, 0x3a, 0xb4, 0xff, 0x03, 188 | 0xb1, 0xf0, 0x55, 0x4e, 0x6c, 0x31, 0x77, 0x1a, 189 | 0xce, 0x3c, 0xf4, 0x1e, 0x86, 0x08, 0x43, 0xbf, 190 | 0x47, 0x06, 0xa3, 0xb8, 0x9a, 0xc7, 0x81, 0xec, 191 | 0x38, 0xca, 0x02, 0xe8, 0x70, 0xfe, 0xb5, 0x49, 192 | 0xda, 0x9b, 0x3e, 0x25, 0x07, 0x5a, 0x1c, 0x71, 193 | 0xa5, 0x57, 0x9f, 0x75, 0xed, 0x63, 0x28, 0xd4, 194 | 0x6e, 0x2f, 0x8a, 0x91, 0xb3, 0xee, 0xa8, 0xc5, 195 | 0x11, 0xe3, 0x2b, 0xc1, 0x59, 0xd7, 0x9c, 0x60, 196 | 0x24, 0x65, 0xc0, 0xdb, 0xf9, 0xa4, 0xe2, 0x8f, 197 | 0x5b, 0xa9, 0x61, 0x8b, 0x13, 0x9d, 0xd6, 0x2a, 198 | 0x89, 0xc8, 0x6d, 0x76, 0x54, 0x09, 0x4f, 0x22, 199 | 0xf6, 0x04, 0xcc, 0x26, 0xbe, 0x30, 0x7b, 0x87, 200 | 0x66, 0x27, 0x82, 0x99, 0xbb, 0xe6, 0xa0, 0xcd, 201 | 0x19, 0xeb, 0x23, 0xc9, 0x51, 0xdf, 0x94, 0x68, 202 | }; 203 | 204 | /* Scalar Code */ 205 | 206 | static KHASH_FINLINE void khashv_bswap_be_block_scalar(khashvBlock* in) { 207 | // Byte swapping is only needed if we are not on a little endian system 208 | if (khashv_is_little_endian()) { 209 | return; 210 | } 211 | for(int i = 0; i < 4; i++) { 212 |
in->words[i] = KHASH_BSWAP32(in->words[i]); 213 | } 214 | } 215 | 216 | static KHASH_FINLINE void khashv_rotr_5_bytes_scalar(khashvBlock* in) { 217 | khashv_bswap_be_block_scalar(in); 218 | khashvBlock tmp1; 219 | khashvBlock tmp2; 220 | // Avoid aliasing issues by using memcpy between these union values. 221 | memcpy(tmp1.bytes, in->words, 16); 222 | for(int i = 0; i < 16; i++) { 223 | tmp2.bytes[i] = tmp1.bytes[(i + 5) & 0xf]; 224 | } 225 | memcpy(in->words, tmp2.bytes, 16); 226 | khashv_bswap_be_block_scalar(in); 227 | } 228 | 229 | static KHASH_FINLINE void khashv_rotr_9_bytes_scalar(khashvBlock* in) { 230 | khashv_bswap_be_block_scalar(in); 231 | khashvBlock tmp1; 232 | khashvBlock tmp2; 233 | // Avoid aliasing issues by using memcpy between these union values. 234 | memcpy(tmp1.bytes, in->words, 16); 235 | for(int i = 0; i < 16; i++) { 236 | tmp2.bytes[i] = tmp1.bytes[(i + 9) & 0xf]; 237 | } 238 | memcpy(in->words, tmp2.bytes, 16); 239 | khashv_bswap_be_block_scalar(in); 240 | } 241 | 242 | static KHASH_FINLINE void khashv_shl_13_block_scalar(khashvBlock* in) { 243 | for(int i = 0; i < 4; i++) { 244 | in->words[i] <<= 13; 245 | } 246 | } 247 | 248 | static KHASH_FINLINE void khashv_shr_3_block_scalar(khashvBlock* in) { 249 | for(int i = 0; i < 4; i++) { 250 | in->words[i] >>= 3; 251 | } 252 | } 253 | 254 | static KHASH_FINLINE void khashv_add_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 255 | for(int i = 0; i < 4; i++) { 256 | a->words[i] += b->words[i]; 257 | } 258 | } 259 | 260 | static KHASH_FINLINE void khashv_xor_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 261 | for(int i = 0; i < 4; i++) { 262 | a->words[i] ^= b->words[i]; 263 | } 264 | } 265 | 266 | // GCC and Clang were vectorizing this quite poorly with -O3 267 | // They could not detect that only a PSHUFB was needed and instead 268 | // were generating tons of inserts and extracts from the vector 269 | // registers. Thus it was running slower than code that was not being 270 | // vectorized on my machine. So I specify the optimization level directly. 271 | // Tried a few other things to get GCC and Clang to generate more sane 272 | // code or code using PSHUFB, but this seemed the cleanest. 273 | // Example of what I mean: https://godbolt.org/z/PMnzsThPc 274 | // Compared to this: https://godbolt.org/z/dWfjr7GWP 275 | /*static KHASH_OPT_SZ void khashv_sub16(khashvBlock* tmp, const uint8_t sub[16]) { 276 | #if defined(__clang__) 277 | // Stop clang from being annoying!!!
278 | // The auto-vectorized code was worse at the time of writing this 279 | #pragma nounroll 280 | #pragma clang loop vectorize(disable) 281 | #pragma clang loop interleave(disable) 282 | #endif 283 | for (int i = 0; i < 16; i++) { 284 | tmp->bytes[i] = sub[tmp->bytes[i]]; 285 | } 286 | } 287 | 288 | static KHASH_FINLINE void khashv_replace_scalar(khashvBlock* replace) { 289 | khashvBlock tmp; 290 | for (int i = 0; i < 16; i++) { 291 | tmp.bytes[i] = (replace->bytes[i] >> 4); 292 | replace->bytes[i] &= 0x0f; 293 | } 294 | khashv_sub16(replace, khashv_s1); 295 | khashv_sub16(&tmp, khashv_s2); 296 | for (int i = 0; i < 16; i++) { 297 | replace->bytes[i] ^= tmp.bytes[i]; 298 | } 299 | }*/ 300 | // Similar issue to the commented-out code above, so keep the optimizers 301 | // from going crazy here as well 302 | static KHASH_OPT_SZ void khashv_replace_scalar(khashvBlock* replace) { 303 | khashvBlock tmp; 304 | memcpy(tmp.bytes, replace->words, 16); 305 | #if defined(__clang__) 306 | // Stop clang from being annoying!!! 307 | // The auto-vectorized code was worse at the time of writing this 308 | #pragma nounroll 309 | #pragma clang loop vectorize(disable) 310 | #pragma clang loop interleave(disable) 311 | #endif 312 | for(int i = 0; i < 16; i++) { 313 | tmp.bytes[i] = khashv_xored[tmp.bytes[i]]; 314 | } 315 | memcpy(replace->words, tmp.bytes, 16); 316 | } 317 | 318 | static KHASH_FINLINE void khashv_mix_words_scalar(khashvBlock* in) { 319 | unsigned rots[4] = { 5, 7, 11, 17 }; 320 | khashvBlock tmp = { 0 }; 321 | for (int i = 0; i < 4; i++) { 322 | unsigned rot = rots[i]; 323 | tmp = *in; 324 | khashv_rotr_5_bytes_scalar(&tmp); 325 | khashv_add_block_scalar(&tmp, in); 326 | for (int j = 0; j < 4; j++) { 327 | tmp.words[j] = KHASH_ROTR32(tmp.words[j], rot); 328 | } 329 | khashv_xor_block_scalar(in, &tmp); 330 | } 331 | } 332 | 333 | static void khashv_hash_scalar(khashvBlock* hash, const uint8_t* data, size_t data_len) { 334 | hash->words[0] ^= data_len; 335 | // When size_t is bigger than 32 bits 336 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 337 | hash->words[1] ^= data_len >> 32; 338 | #endif 339 | 340 | khashvBlock tmp_1; 341 | khashvBlock tmp_2; 342 | khashvBlock tmp_h = *hash; 343 | 344 | const uint8_t* end = data + (data_len & ~((size_t)15)); 345 | 346 | while (data < end) { 347 | memcpy(&tmp_2, data, 16); 348 | khashv_replace_scalar(&tmp_2); 349 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 350 | 351 | khashv_bswap_be_block_scalar(&tmp_1); 352 | 353 | tmp_2 = tmp_1; 354 | //khashv_shl_13_block_scalar(&tmp_2); 355 | //khashv_add_block_scalar(&tmp_2, &tmp_1); // x * 8193 == x + (x << 13), so one multiply replaces the shift + add pair 356 | for(int i = 0; i < 4; i++) { 357 | tmp_2.words[i] *= 8193; 358 | } 359 | khashv_xor_block_scalar(&tmp_h, &tmp_2); 360 | khashv_rotr_5_bytes_scalar(&tmp_h); 361 | khashv_add_block_scalar(&tmp_h, &tmp_1); 362 | 363 | tmp_2 = tmp_h; 364 | khashv_shr_3_block_scalar(&tmp_2); 365 | khashv_rotr_9_bytes_scalar(&tmp_h); 366 | khashv_add_block_scalar(&tmp_h, &tmp_2); 367 | 368 | data += 16; 369 | } 370 | 371 | unsigned trailing = data_len & 0xf; 372 | if(trailing) { 373 | memset(&tmp_2, 0, 16); 374 | 375 | memcpy(&tmp_2.bytes, data, trailing); 376 | khashv_replace_scalar(&tmp_2); 377 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 378 | 379 | khashv_bswap_be_block_scalar(&tmp_1); 380 | 381 | tmp_2 = tmp_1; 382 | //khashv_shl_13_block_scalar(&tmp_2); 383 | //khashv_add_block_scalar(&tmp_2, &tmp_1); 384 | for(int i = 0; i < 4; i++) { 385 | tmp_2.words[i] *= 8193; 386 | } 387 | khashv_xor_block_scalar(&tmp_h, &tmp_2); 388 | khashv_rotr_5_bytes_scalar(&tmp_h); 389 |
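// Fold the substituted input block back into the rotated state, just as in the 16-byte loop above.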
khashv_add_block_scalar(&tmp_h, &tmp_1); 390 | 391 | tmp_2 = tmp_h; 392 | khashv_shr_3_block_scalar(&tmp_2); 393 | khashv_rotr_9_bytes_scalar(&tmp_h); 394 | khashv_add_block_scalar(&tmp_h, &tmp_2); 395 | 396 | } 397 | khashv_mix_words_scalar(&tmp_h); 398 | *hash = tmp_h; 399 | } 400 | 401 | static inline void khashv_prep_seed32_scalar(khashvSeed* seed_prepped, uint32_t seed) { 402 | *seed_prepped = khash_v_init; 403 | seed_prepped->words[0] ^= seed; 404 | khashv_mix_words_scalar(seed_prepped); 405 | } 406 | 407 | static inline void khashv_prep_seed64_scalar(khashvSeed* seed_prepped, uint64_t seed) { 408 | *seed_prepped = khash_v_init; 409 | seed_prepped->words[0] ^= seed; 410 | khashv_mix_words_scalar(seed_prepped); 411 | // Do it again with the other part to make it different from the 32 bit seed. 412 | seed_prepped->words[1] ^= seed >> 32; 413 | khashv_mix_words_scalar(seed_prepped); 414 | } 415 | 416 | static inline void khashv_prep_seed128_scalar(khashvSeed* seed_prepped, const uint32_t seed[4]) { 417 | for(int i = 0; i < 4; i++) { 418 | seed_prepped->words[i] = seed[i]; 419 | } 420 | } 421 | 422 | static inline uint32_t khashv32_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 423 | khashvBlock h = *seed; 424 | khashv_hash_scalar(&h, data, data_len); 425 | return h.words[3]; 426 | } 427 | 428 | static inline uint64_t khashv64_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 429 | khashvBlock h = *seed; 430 | khashv_hash_scalar(&h, data, data_len); 431 | uint64_t r = h.words[1]; 432 | r <<= 32; 433 | r |= h.words[0]; 434 | return r; 435 | } 436 | 437 | /* Vectorization for Intel/AMD */ 438 | 439 | #if defined(__SSE3__) 440 | 441 | #define KHASH_VECTOR 1 442 | 443 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(11, 0)) 444 | static KHASH_FINLINE __m128i _mm_loadu_si32(const void* data) { 445 | uint32_t val; 446 | memcpy(&val, data, sizeof(uint32_t)); 447 | return _mm_cvtsi32_si128(val); 448 | } 449 | static KHASH_FINLINE __m128i _mm_loadu_si16(const void* data) { 450 | uint32_t val = 0; 451 | memcpy(&val, data, sizeof(uint16_t)); 452 | return _mm_cvtsi32_si128(val); 453 | } 454 | #endif 455 | 456 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(9, 1)) 457 | static KHASH_FINLINE __m128i _mm_loadu_si64(const void* data) { 458 | uint64_t val = 0; 459 | memcpy(&val, data, sizeof(uint64_t)); 460 | return _mm_cvtsi64_si128(val); 461 | } 462 | #endif 463 | 464 | static KHASH_FINLINE __m128i khashv_mix_words_vector(__m128i val) { 465 | __m128i tmp1; 466 | __m128i tmp2; 467 | 468 | tmp1 = _mm_alignr_epi8(val, val, 5); 469 | tmp1 = _mm_add_epi32(val, tmp1); 470 | #if defined(__AVX512VL__) 471 | tmp1 = _mm_ror_epi32(tmp1, 5); 472 | val = _mm_xor_si128(val, tmp1); 473 | #else 474 | tmp2 = _mm_srli_epi32(tmp1, 5); 475 | tmp1 = _mm_slli_epi32(tmp1, 27); 476 | val = _mm_xor_si128(val, tmp2); 477 | val = _mm_xor_si128(val, tmp1); 478 | #endif 479 | 480 | tmp1 = _mm_alignr_epi8(val, val, 5); 481 | tmp1 = _mm_add_epi32(val, tmp1); 482 | #if defined(__AVX512VL__) 483 | tmp1 = _mm_ror_epi32(tmp1, 7); 484 | val = _mm_xor_si128(val, tmp1); 485 | #else 486 | tmp2 = _mm_srli_epi32(tmp1, 7); 487 | tmp1 = _mm_slli_epi32(tmp1, 25); 488 | val = _mm_xor_si128(val, tmp2); 489 | val = _mm_xor_si128(val, tmp1); 490 | #endif 491 | 492 | tmp1 = _mm_alignr_epi8(val, val, 5); 493 | tmp1 = _mm_add_epi32(tmp1, val); 494 | #if defined(__AVX512VL__) 495 | tmp1 = _mm_ror_epi32(tmp1, 11); 496 | val = _mm_xor_si128(val, tmp1); 497 |
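// Without AVX512VL's _mm_ror_epi32, the #else paths emulate each 32-bit rotate with a shift pair.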
#else 498 | tmp2 = _mm_srli_epi32(tmp1, 11); 499 | tmp1 = _mm_slli_epi32(tmp1, 21); 500 | val = _mm_xor_si128(val, tmp2); 501 | val = _mm_xor_si128(val, tmp1); 502 | #endif 503 | 504 | tmp1 = _mm_alignr_epi8(val, val, 5); 505 | tmp1 = _mm_add_epi32(tmp1, val); 506 | #if defined(__AVX512VL__) 507 | tmp1 = _mm_ror_epi32(tmp1, 17); 508 | val = _mm_xor_si128(val, tmp1); 509 | #else 510 | tmp2 = _mm_srli_epi32(tmp1, 17); 511 | tmp1 = _mm_slli_epi32(tmp1, 15); 512 | val = _mm_xor_si128(val, tmp2); 513 | val = _mm_xor_si128(val, tmp1); 514 | #endif 515 | 516 | return val; 517 | } 518 | 519 | static KHASH_FINLINE __m128i khashv_part_load_vector(const uint8_t* data, size_t len) { 520 | __m128i tmp = { 0 }; 521 | __m128i tmp2 = { 0 }; 522 | switch(len) { 523 | case 1: 524 | #if defined(__SSE4_1__) 525 | tmp = _mm_insert_epi8(tmp, data[0], 0); 526 | #else 527 | tmp = _mm_cvtsi32_si128(data[0]); 528 | #endif 529 | break; 530 | case 2: 531 | tmp = _mm_loadu_si16(data); 532 | break; 533 | case 3: 534 | tmp = _mm_loadu_si16(data); 535 | #if defined(__SSE4_1__) 536 | tmp = _mm_insert_epi8(tmp, data[2], 2); 537 | #else 538 | tmp = _mm_insert_epi16(tmp, data[2], 1); 539 | #endif 540 | break; 541 | case 4: 542 | tmp = _mm_loadu_si32(data); 543 | break; 544 | case 5: 545 | tmp = _mm_loadu_si32(data); 546 | #if defined(__SSE4_1__) 547 | tmp = _mm_insert_epi8(tmp, data[4], 4); 548 | #else 549 | tmp = _mm_insert_epi16(tmp, data[4], 2); 550 | #endif 551 | break; 552 | case 6: 553 | tmp = _mm_loadu_si32(data); 554 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 555 | break; 556 | case 7: 557 | tmp = _mm_loadu_si32(data); 558 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 559 | #if defined(__SSE4_1__) 560 | tmp = _mm_insert_epi8(tmp, data[6], 6); 561 | #else 562 | tmp = _mm_insert_epi16(tmp, data[6], 3); 563 | #endif 564 | break; 565 | case 8: 566 | tmp = _mm_loadu_si64(data); 567 | break; 568 | case 9: 569 | tmp = _mm_loadu_si64(data); 570 | #if defined(__SSE4_1__) 571 | tmp = _mm_insert_epi8(tmp, data[8], 8); 572 | #else 573 | tmp = _mm_insert_epi16(tmp, data[8], 4); 574 | #endif 575 | break; 576 | case 10: 577 | tmp = _mm_loadu_si64(data); 578 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 579 | break; 580 | case 11: 581 | tmp = _mm_loadu_si64(data); 582 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 583 | #if defined(__SSE4_1__) 584 | tmp = _mm_insert_epi8(tmp, data[10], 10); 585 | #else 586 | tmp = _mm_insert_epi16(tmp, data[10], 5); 587 | #endif 588 | break; 589 | case 12: 590 | tmp = _mm_loadu_si64(data); 591 | #if defined(__SSE4_1__) 592 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 593 | #else 594 | tmp2 = _mm_loadu_si32(data + 8); 595 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 596 | tmp = _mm_or_si128(tmp, tmp2); 597 | #endif 598 | break; 599 | case 13: 600 | tmp = _mm_loadu_si64(data); 601 | #if defined(__SSE4_1__) 602 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 603 | tmp = _mm_insert_epi8(tmp, data[12], 12); 604 | #else 605 | tmp2 = _mm_loadu_si32(data + 8); 606 | tmp2 = _mm_insert_epi16(tmp2, data[12], 2); 607 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 608 | tmp = _mm_or_si128(tmp, tmp2); 609 | #endif 610 | break; 611 | case 14: 612 | tmp = _mm_loadu_si64(data); 613 | #if defined(__SSE4_1__) 614 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 615 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6); 616 | #else 617 | tmp2 = _mm_loadu_si32(data + 8); 618 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 619 | tmp2 = 
_mm_shuffle_epi32(tmp2, 0x4f); 620 | tmp = _mm_or_si128(tmp, tmp2); 621 | #endif 622 | break; 623 | case 15: 624 | tmp = _mm_loadu_si64(data); 625 | #if defined(__SSE4_1__) 626 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 627 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6); 628 | tmp = _mm_insert_epi8(tmp, data[14], 14); 629 | #else 630 | tmp2 = _mm_loadu_si32(data + 8); 631 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 632 | tmp2 = _mm_insert_epi16(tmp2, data[14], 7); 633 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 634 | tmp = _mm_or_si128(tmp, tmp2); 635 | #endif 636 | break; 637 | case 16: 638 | tmp = _mm_loadu_si64(data); 639 | #if defined(__SSE4_1__) 640 | tmp = _mm_insert_epi64(tmp, *(uint64_t*)(data + 8), 1); 641 | #else 642 | tmp2 = _mm_loadu_si64(data + 8); 643 | tmp = _mm_unpacklo_epi64(tmp, tmp2); 644 | #endif 645 | break; 646 | } 647 | return tmp; 648 | } 649 | 650 | static const uint8_t khashv_shuff[16] = { 651 | 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 652 | 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 653 | }; 654 | 655 | static __m128i khashv_hash_vector(__m128i hash, const uint8_t* data, size_t data_len) { 656 | const __m128i s1 = _mm_loadu_si128((const __m128i*)khashv_s1); 657 | const __m128i s2 = _mm_loadu_si128((const __m128i*)khashv_s2); 658 | const __m128i shuff = _mm_loadu_si128((const __m128i*)khashv_shuff); 659 | const __m128i mask = _mm_set1_epi32(0x0f0f0f0f); 660 | 661 | __m128i tmp_1; 662 | __m128i tmp_2; 663 | 664 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 665 | tmp_1 = _mm_cvtsi64_si128(data_len); 666 | #else 667 | tmp_1 = _mm_cvtsi32_si128(data_len); 668 | #endif 669 | hash = _mm_xor_si128(tmp_1, hash); 670 | 671 | const uint8_t* end = data + (data_len & ~((size_t)15)); 672 | const uint8_t* end2 = data + data_len; 673 | while(data_len > 16 && data < end) { 674 | tmp_1 = _mm_lddqu_si128((const __m128i*)data); 675 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 676 | 677 | tmp_1 = _mm_and_si128 (tmp_1, mask); 678 | tmp_2 = _mm_and_si128 (tmp_2, mask); 679 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 680 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 681 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 682 | 683 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 684 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 685 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 686 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 687 | hash = _mm_add_epi32 (tmp_2, tmp_1); 688 | 689 | tmp_2 = _mm_srli_epi32(hash, 3); 690 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 691 | hash = _mm_add_epi32 (tmp_2, tmp_1); 692 | 693 | data += 16; 694 | } 695 | uintptr_t trailing = end2 - data; 696 | if(trailing) { 697 | tmp_1 = khashv_part_load_vector(data, trailing); 698 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 699 | 700 | tmp_1 = _mm_and_si128 (tmp_1, mask); 701 | tmp_2 = _mm_and_si128 (tmp_2, mask); 702 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 703 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 704 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 705 | 706 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 707 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 708 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 709 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 710 | hash = _mm_add_epi32 (tmp_2, tmp_1); 711 | 712 | tmp_2 = _mm_srli_epi32(hash, 3); 713 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 714 | hash = _mm_add_epi32 (tmp_2, tmp_1); 715 | } 716 | hash = khashv_mix_words_vector(hash); 717 | return hash; 718 | } 719 | 720 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 721 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 722 
| s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 723 | seed_prepped->vec = khashv_mix_words_vector(s); 724 | } 725 | 726 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 727 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 728 | __m128i t = _mm_cvtsi32_si128(seed >> 32); 729 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 730 | s = khashv_mix_words_vector(s); 731 | s = _mm_xor_si128(s, _mm_shuffle_epi32(t, 0xf3)); 732 | seed_prepped->vec = khashv_mix_words_vector(s); 733 | } 734 | 735 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 736 | seed_prepped->vec = _mm_loadu_si128((const __m128i*)seed); 737 | } 738 | 739 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 740 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 741 | // using word[3] to avoid any overlap with the 742 | // 64 bit hash which uses words [0] and [1], this ensures 743 | // the two outputs behave differently when used. 744 | #if defined(__SSE4_1__) 745 | return _mm_extract_epi32(h, 3); 746 | #else 747 | h = _mm_shuffle_epi32(h, 0xff); 748 | return _mm_cvtsi128_si32(h); 749 | #endif 750 | } 751 | 752 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 753 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 754 | return _mm_cvtsi128_si64(h); 755 | } 756 | 757 | #endif 758 | 759 | /* Vectorization via GCC's vectorization builtins */ 760 | // Handy since it allows vectorization without explicit intrinsics 761 | // for a particular CPU. 762 | 763 | #if !defined(KHASH_VECTOR) && KHASH_GCC_LEAST__(6, 1) 764 | 765 | #define KHASH_VECTOR 1 766 | 767 | typedef uint8_t kv16ui __attribute__((vector_size(16))); 768 | typedef uint32_t kv4ui __attribute__((vector_size(16))); 769 | 770 | static KHASH_FINLINE kv16ui khashv_sub_s1_gcc(kv16ui in) { 771 | const kv16ui mask = { 772 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 773 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf 774 | }; 775 | const kv16ui sub = { 776 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 777 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 778 | }; 779 | in &= mask; 780 | return __builtin_shuffle(sub, in); 781 | } 782 | 783 | static KHASH_FINLINE kv16ui khashv_sub_s2_gcc(kv16ui in) { 784 | const kv16ui sub = { 785 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 786 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 787 | }; 788 | in >>= 4; 789 | return __builtin_shuffle(sub, in); 790 | } 791 | 792 | static KHASH_FINLINE kv4ui khashv_rotr_5_bytes_gcc(kv4ui input) { 793 | const kv16ui rotrLE = { 794 | 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 795 | 0xd, 0xe, 0xf, 0x0, 0x1, 0x2, 0x3, 0x4 796 | }; 797 | const kv16ui rotrBE = { 798 | 0xb, 0x4, 0x5, 0x6, 0xf, 0x8, 0x9, 0xa, 799 | 0x3, 0xc, 0xd, 0xe, 0x7, 0x0, 0x1, 0x2 800 | }; 801 | kv16ui tmp; 802 | memcpy(&tmp, &input, 16); 803 | if (khashv_is_little_endian()) { 804 | tmp = __builtin_shuffle(tmp, rotrLE); 805 | } else { 806 | tmp = __builtin_shuffle(tmp, rotrBE); 807 | } 808 | memcpy(&input, &tmp, 16); 809 | return input; 810 | } 811 | 812 | static KHASH_FINLINE kv4ui khashv_rotr_9_bytes_gcc(kv4ui input) { 813 | const kv16ui rotrLE = { 814 | 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 815 | 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 816 | }; 817 | const kv16ui rotrBE = { 818 | 0xf, 0x8, 0x9, 0xa, 0x3, 0xc, 0xd, 0xe, 819 | 0x7, 0x0, 0x1, 0x2, 0xb, 0x4, 0x5, 0x6, 820 | }; 821 | kv16ui tmp; 822 | memcpy(&tmp, &input, 16); 823
| if (khashv_is_little_endian()) { 824 | tmp = __builtin_shuffle(tmp, rotrLE); 825 | } else { 826 | tmp = __builtin_shuffle(tmp, rotrBE); 827 | } 828 | memcpy(&input, &tmp, 16); 829 | return input; 830 | } 831 | 832 | static KHASH_FINLINE kv4ui khash_byteswap_vec32_gcc( kv4ui input ) { 833 | const kv16ui bswap32 = { 834 | 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 835 | 0xb, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc, 836 | }; 837 | kv16ui b; 838 | 839 | memcpy(&b, &input, 16); 840 | b = __builtin_shuffle(b, bswap32); 841 | memcpy(&input, &b, 16); 842 | return input; 843 | } 844 | 845 | static KHASH_FINLINE kv4ui khashv_replace_gcc(kv4ui input) { 846 | kv16ui s1; 847 | kv16ui s2; 848 | memcpy(&s1, &input, 16); 849 | s2 = khashv_sub_s2_gcc(s1); 850 | s1 = khashv_sub_s1_gcc(s1); 851 | s1 ^= s2; 852 | memcpy(&input, &s1, 16); 853 | return input; 854 | } 855 | 856 | static KHASH_FINLINE kv4ui khashv_mix_words_gcc(kv4ui val) { 857 | const unsigned rots[4] = { 5, 7, 11, 17 }; 858 | for (int i = 0; i < 4; i++) { 859 | unsigned rot = rots[i]; 860 | kv4ui tmp = val; 861 | tmp = khashv_rotr_5_bytes_gcc(tmp); 862 | tmp += val; 863 | tmp = (tmp >> rot) | (tmp << (32 - rot)); 864 | val ^= tmp; 865 | } 866 | return val; 867 | } 868 | 869 | static KHASH_FINLINE kv4ui khashv_hash_block_gcc(kv4ui hash, kv4ui input) { 870 | kv4ui tmp_1 = khashv_replace_gcc(input); 871 | if (!khashv_is_little_endian()) { 872 | tmp_1 = khash_byteswap_vec32_gcc(tmp_1); 873 | } 874 | kv4ui tmp_2 = tmp_1 * 8193; 875 | tmp_2 ^= hash; 876 | tmp_2 = khashv_rotr_5_bytes_gcc(tmp_2); 877 | hash = tmp_1 + tmp_2; 878 | 879 | tmp_2 = hash >> 3; 880 | tmp_1 = khashv_rotr_9_bytes_gcc(hash); 881 | hash = tmp_1 + tmp_2; 882 | return hash; 883 | } 884 | 885 | static KHASH_FINLINE kv4ui khashv_hash_gcc(kv4ui hash, const uint8_t* data, size_t data_len) { 886 | hash[0] ^= data_len; 887 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 888 | hash[1] ^= data_len >> 32; 889 | #endif 890 | 891 | kv4ui data_v; 892 | const uint8_t* end = data + (data_len & ~((size_t)15)); 893 | while (data < end) { 894 | memcpy(&data_v, data, 16); 895 | hash = khashv_hash_block_gcc(hash, data_v); 896 | data += 16; 897 | } 898 | 899 | unsigned trailing = data_len & 0xf; 900 | if(trailing) { 901 | memset(&data_v, 0, 16); 902 | memcpy(&data_v, data, trailing); 903 | hash = khashv_hash_block_gcc(hash, data_v); 904 | } 905 | return khashv_mix_words_gcc(hash); 906 | } 907 | 908 | 909 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 910 | kv4ui s; 911 | memcpy(&s, &khash_v_init, 16); 912 | s[0] ^= seed; 913 | s = khashv_mix_words_gcc(s); 914 | memcpy(seed_prepped, &s, 16); 915 | } 916 | 917 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 918 | kv4ui s; 919 | memcpy(&s, &khash_v_init, 16); 920 | s[0] ^= seed; 921 | s = khashv_mix_words_gcc(s); 922 | s[1] ^= seed >> 32; 923 | s = khashv_mix_words_gcc(s); 924 | memcpy(seed_prepped, &s, 16); 925 | } 926 | 927 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 928 | memcpy(seed_prepped->words, seed, 16); 929 | } 930 | 931 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 932 | kv4ui h; 933 | memcpy(&h, seed, 16); 934 | h = khashv_hash_gcc(h, data, data_len); 935 | return h[3]; 936 | } 937 | 938 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 939 | kv4ui h; 940 | memcpy(&h, seed, 16); 941 | h = khashv_hash_gcc(h, data, data_len); 942 | 
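// Words [0] (low) and [1] (high) form the 64-bit result; a plain 8-byte copy is only correct on little-endian targets, so it is assembled explicitly otherwise.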
uint64_t ret; 943 | if (khashv_is_little_endian()) { 944 | memcpy(&ret, &h, 8); 945 | } else { 946 | ret = h[1]; 947 | ret = (ret << 32) | h[0]; 948 | } 949 | return ret; 950 | } 951 | 952 | #endif 953 | 954 | #if defined(KHASH_VECTOR) && !defined(KHASHV_SCALAR) 955 | 956 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 957 | khashv_prep_seed32_vector(seed_prepped, seed); 958 | } 959 | 960 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 961 | khashv_prep_seed64_vector(seed_prepped, seed); 962 | } 963 | 964 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 965 | khashv_prep_seed128_vector(seed_prepped, seed); 966 | } 967 | 968 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 969 | return khashv32_vector(seed, data, data_len); 970 | } 971 | 972 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 973 | return khashv64_vector(seed, data, data_len); 974 | } 975 | 976 | #else 977 | 978 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 979 | khashv_prep_seed32_scalar(seed_prepped, seed); 980 | } 981 | 982 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 983 | khashv_prep_seed64_scalar(seed_prepped, seed); 984 | } 985 | 986 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 987 | khashv_prep_seed128_scalar(seed_prepped, seed); 988 | } 989 | 990 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 991 | return khashv32_scalar(seed, data, data_len); 992 | } 993 | 994 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 995 | return khashv64_scalar(seed, data, data_len); 996 | } 997 | 998 | #endif 999 | 1000 | 1001 | #ifdef __cplusplus 1002 | } 1003 | #endif 1004 | #endif 1005 | --------------------------------------------------------------------------------
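A minimal usage sketch of the public wrappers defined at the end of both headers (`khashv_prep_seed128`, `khashv32`, `khashv64`); the seed words and message below are arbitrary placeholders, not values from the library:

```C
#include <stdio.h>
#include "khashv.h"

int main(void) {
    // Arbitrary example values; any 128-bit seed and byte buffer work.
    const uint32_t seed_words[4] = { 0x01, 0x02, 0x03, 0x04 };
    const uint8_t  msg[]         = "example message";
    size_t         msg_len       = sizeof(msg) - 1; // exclude the terminating NUL

    khashvSeed seed;
    khashv_prep_seed128(&seed, seed_words);

    // word [3] feeds the 32-bit output and words [0] and [1] the 64-bit one,
    // so the two results do not overlap.
    uint32_t h32 = khashv32(&seed, msg, msg_len);
    uint64_t h64 = khashv64(&seed, msg, msg_len);
    printf("khashv32: %08x\nkhashv64: %016llx\n", h32, (unsigned long long)h64);
    return 0;
}
```

Note that the wrappers dispatch to the vector implementations whenever `KHASH_VECTOR` is set; defining `KHASHV_SCALAR` before including the header forces the scalar fallbacks even when a vector path is available.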