├── .gitattributes
├── khashv32-seed-6bb75f13.png
├── khashv64-seed-1dcedff1a8b17e89.png
├── .gitignore
├── LICENSE
├── k-hashv-old
│   ├── README_v1.md
│   ├── test_speed_v1.c
│   └── khashv_v1.h
├── test_speed.c
├── README.md
└── khashv.h
/.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /khashv32-seed-6bb75f13.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Keith-Cancel/k-hashv/HEAD/khashv32-seed-6bb75f13.png -------------------------------------------------------------------------------- /khashv64-seed-1dcedff1a8b17e89.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Keith-Cancel/k-hashv/HEAD/khashv64-seed-1dcedff1a8b17e89.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Prerequisites 2 | *.d 3 | 4 | # Object files 5 | *.o 6 | *.ko 7 | *.obj 8 | *.elf 9 | 10 | # Linker output 11 | *.ilk 12 | *.map 13 | *.exp 14 | 15 | # Precompiled Headers 16 | *.gch 17 | *.pch 18 | 19 | # Libraries 20 | *.lib 21 | *.a 22 | *.la 23 | *.lo 24 | 25 | # Shared objects (inc. Windows DLLs) 26 | *.dll 27 | *.so 28 | *.so.* 29 | *.dylib 30 | 31 | # Executables 32 | *.exe 33 | *.out 34 | *.app 35 | *.i*86 36 | *.x86_64 37 | *.hex 38 | 39 | # Debug files 40 | *.dSYM/ 41 | *.su 42 | *.idb 43 | *.pdb 44 | 45 | # Kernel Module Compile Results 46 | *.mod* 47 | *.cmd 48 | .tmp_versions/ 49 | modules.order 50 | Module.symvers 51 | Mkfile.old 52 | dkms.conf 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Keith-Cancel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /k-hashv-old/README_v1.md: -------------------------------------------------------------------------------- 1 | # K-HASHV 2 | A single header hash function with both vectorized and scalar versions.
The function is quite fast when vectorized, achieving an average of **~9.6 GB/s** on a 7-year-old Xeon E3-1230 v5. 3 | 4 | Additionally, it passes all the SMHasher hash function quality tests: https://github.com/rurban/smhasher 5 | 6 | Moreover, it is quite easy to choose a new function at runtime by just using a new seed, as shown below: 7 | ```C 8 | #include "khashv.h" 9 | 10 | void foo() { 11 | /* 12 | code .... 13 | */ 14 | khashvSeed seed; 15 | khashv_prep_seed64(&seed, a_64_bit_value); 16 | uint64_t hash = khashv64(&seed, your_data, data_len); 17 | /* 18 | code .... 19 | */ 20 | } 21 | ``` 22 | ### Note 23 | This is **not a cryptographic hash function**, and it should not be used for such applications. 24 | 25 | ## Performance 26 | When testing on 1.25 GB and 512 KB of random data I get the following averages: 27 |
| Processor | 1.25 GB Time | 1.25 GB Speed | 512 KB Time | 512 KB Speed | OS | Compiler | Type |
|---|---|---|---|---|---|---|---|
| Xeon E3-1230 v5 | 0.1298 s | 9.6285 GB/s | 052.5107 us | 9.2987 GB/s | Linux | GCC 12.1.0 | Vectorized |
| Xeon E3-1230 v5 | 1.1911 s | 1.0495 GB/s | 494.1932 us | 0.9880 GB/s | Linux | GCC 12.1.0 | Scalar |
| Xeon E3-1230 v5 | 0.1418 s | 8.8142 GB/s | 055.9333 us | 8.7297 GB/s | Linux | Clang 14.0.6 | Vectorized |
| Ryzen 9 7900 | 0.1227 s | 10.1881 GB/s | 046.0273 us | 10.6085 GB/s | Linux | GCC 12.2.1 | Vectorized |
| Ryzen 9 7900 | 0.8693 s | 1.4379 GB/s | 375.0820 us | 1.3018 GB/s | Linux | GCC 12.2.1 | Scalar |
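*(Sanity check: the speed columns are simply block size divided by time, e.g. the first row gives 1.25 GB / 0.1298 s ≈ 9.63 GB/s; the small difference from the printed 9.6285 GB/s comes from rounding of the displayed time.)*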
37 | 38 | The scalar version is slower at a tad over ~1 GB/s on my system when compiling test_speed.c with gcc using `-O3`. 39 | On Windows, Microsoft's compiler does not seem to generate as performant code from the intrinsics, but the GCC mingw64 compiler generates pretty comparable numbers for me at least. 40 | 41 | I definitely want to add other machines to this table, but if you are curious how it performs on your machine, compile test_speed.c with `-O3 -march=native` and `-O3 -march=native -D KHASHV_SCALAR`. 42 | 43 | ## Functions 44 | ```C 45 | // Prepares a seed from a 32-bit value 46 | void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) 47 | ``` 48 | 49 | ```C 50 | // Prepares a seed from a 64-bit value 51 | void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) 52 | ``` 53 | 54 | ```C 55 | // Sets 128 bits to be the seed 56 | void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) 57 | ``` 58 | 59 | ```C 60 | // Produces a 32-bit hash from the input data 61 | uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) 62 | ``` 63 | 64 | ```C 65 | // Produces a 64-bit hash from the input data 66 | uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) 67 | ``` 68 | 69 | ## K-HASHV 64 Output 70 | Here is the output of the 64-bit hash of the integers \[0, 259199\] using 0x1dcedff1a8b17e89 as the seed. 71 | 72 | ![khashv64 output](khashv64-seed-1dcedff1a8b17e89.png) 73 | 74 | ## K-HASHV 32 Output 75 | 76 | Here is the output of the 32-bit hash of the integers \[0, 518399\] using 0x6bb75f13 as the seed. 77 | 78 | ![khashv32 output](khashv32-seed-6bb75f13.png) 79 | 80 | The above images were generated by basically doing the following for each hash: 81 | 82 | ```C 83 | for(int i = 0; i < sizeof(hash_bytes); i++) { 84 | pixel[img_offset + i].r = hash_bytes[i]; 85 | pixel[img_offset + i].g = hash_bytes[i]; 86 | pixel[img_offset + i].b = hash_bytes[i]; 87 | pixel[img_offset + i].a = 255; 88 | } 89 | ``` 90 | 91 | ## Things TODO 92 | When thinking about ways to improve the code and hash function, these are the first few things that come to mind for me. 93 | 94 | 1. The main thing would be to try to get both Clang and MSVC to output code that runs as fast as GCC's, or as close as possible. Looking at the generated assembly, they both seem to do some silly things compared to GCC and lose some performance. Microsoft's compiler is the worst, and probably the fastest fix for me to implement would be to write some assembly code. However, it then would no longer be a single header file hash function, since MSVC does not support inline assembly for 64-bit builds and thus would require a separate file. 95 | 96 | 2. Then probably consider using intrinsics for some other systems like ARM NEON, but for now there is scalar code and code written using GCC's vector built-ins that will generate vectorized code for other architectures that GCC supports. 97 | 98 | 3. Probably the next thing I could think of is to choose better values for S1 and S2, which are used to basically substitute bytes. The current values were found by randomly checking a small set of criteria. Mainly focusing on each bit of S1 and S2 as columns, then XOR-ing them, effectively creating a boolean function of an 8-bit input, and making sure the entire thing maps each input to a unique value. There likely are better values that could be chosen, and criteria that look at all bits at once. However, the search space is huge, effectively 2^(2\*8\*16) possible permutations for S1 and S2.
However, the current values do seem to work well from my testing. 99 | 100 | ### Suggestions 101 | I am open to any other suggestions or improvements. -------------------------------------------------------------------------------- /k-hashv-old/test_speed_v1.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <float.h> 5 | #include "khashv.h" 6 | 7 | #if defined(__MINGW32__) || defined(_WIN32) 8 | #include <windows.h> 9 | 10 | #define get_timer(x) QueryPerformanceCounter(&x) 11 | 12 | typedef LARGE_INTEGER timer; 13 | 14 | uint64_t time_ns(timer* start, timer* stop) { 15 | LARGE_INTEGER freq; 16 | if(!QueryPerformanceFrequency(&freq)) { 17 | return UINT64_MAX; 18 | } 19 | double ns = stop->QuadPart - start->QuadPart; 20 | double ratio = 1000000000.0; // 1 billion ns = 1 second 21 | ratio /= (double)freq.QuadPart; 22 | ns *= ratio; 23 | return (uint64_t)ns; 24 | } 25 | 26 | #else 27 | #include <time.h> 28 | #define get_timer(x) clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &x) 29 | typedef struct timespec timer; 30 | 31 | uint64_t time_ns(timer* start, timer* stop) { 32 | int secs = stop->tv_sec - start->tv_sec; 33 | if(secs > 0) { 34 | uint64_t t0_ns = start->tv_sec * 1000000000; 35 | uint64_t t1_ns = stop->tv_sec * 1000000000; 36 | t0_ns += start->tv_nsec; 37 | t1_ns += stop->tv_nsec; 38 | return t1_ns - t0_ns; 39 | } 40 | return stop->tv_nsec - start->tv_nsec; 41 | } 42 | #endif 43 | 44 | #define MB_TO_BYTES(x) (1024ULL * 1024ULL * (x)) 45 | 46 | double get_gbs(double t_ns, double gigs) { 47 | t_ns /= 1000000000; 48 | return gigs / t_ns; 49 | } 50 | 51 | double get_secs(double t_ns) { 52 | return t_ns / 1000000000.0; 53 | } 54 | 55 | double get_usecs(double t_ns) { 56 | return t_ns / 1000.0; 57 | } 58 | 59 | void populate_memory(const khashvSeed* seed, uint8_t* bytes, size_t size) { 60 | printf("Populating Memory: "); 61 | // Use the hash to populate memory with pseudo random bytes 62 | uint64_t state[2] = { 0x4d9ef2f9a304588a, 0x58ca10a39947b63b }; 63 | for(size_t i = 0; i < size; i += sizeof(uint64_t)) { 64 | if(i != 0 && (i & 0x1ffffff) == 0) { 65 | printf("."); 66 | fflush(stdout); 67 | } 68 | state[0] = khashv64(seed, (uint8_t*)state, sizeof(uint64_t) * 2); 69 | memcpy(bytes + i, state, sizeof(uint64_t)); 70 | } 71 | puts(" Populated!"); 72 | } 73 | 74 | int gig_tests(khashvSeed seed) { 75 | size_t size = MB_TO_BYTES(1280); 76 | uint8_t* bytes = malloc(size); 77 | if(bytes == NULL) { 78 | fprintf(stderr, "Cannot allocate memory for test!\n"); 79 | return 1; 80 | } 81 | populate_memory(&seed, bytes, size); 82 | 83 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 84 | double sum = 0; 85 | double fastest = DBL_MAX; 86 | 87 | const uint32_t hashes[12] = { 88 | 0xa9ca46b1, 0x8c9f5264, 0x2094ffd9, 0x93946e70, 0x9b71dd71, 89 | 0x2abeec74, 0x6bca7368, 0x151fff30, 0xc4228495, 0xfad35669, 90 | 0x9f151590, 0x20a4045b 91 | }; 92 | 93 | printf("Tests on %.3lf GB block: ", gigs); 94 | fflush(stdout); 95 | for(unsigned i = 0; i < 12; i++) { 96 | timer t0; 97 | timer t1; 98 | get_timer(t0); 99 | uint32_t h = khashv32(&seed, bytes, size); 100 | get_timer(t1); 101 | 102 | if(h != hashes[i]) { 103 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 104 | } 105 | 106 | double t = time_ns(&t0, &t1); 107 | if(t < fastest) { 108 | fastest = t; 109 | } 110 | sum += t; 111 | bytes[i] += 1; 112 | } 113 | 114 | double avg = sum / 12; 115 | printf( 116 | "Avg: %lf GB/s, Avg Time: %lf s, Fastest: %lf GB/s\n", 117 | get_gbs (avg, gigs), 118 | get_secs(avg), 119 | get_gbs (fastest, gigs) 120 | ); 121 | fflush(stdout); 122 | free(bytes); 123 | return 0; 124 | }
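/* Both benchmark functions here follow the same pattern: each timed pass hashes the entire buffer once, the result is compared against a precomputed expected hash so a broken or miscompiled build is caught immediately, and bytes[i] is then incremented so every pass hashes slightly different data. */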
125 | 126 | int half_mb_tests(khashvSeed seed) { 127 | size_t size = 1024 * 512; 128 | uint8_t* bytes = malloc(size); 129 | if(bytes == NULL) { 130 | fprintf(stderr, "Cannot allocate memory for test!\n"); 131 | return 1; 132 | } 133 | populate_memory(&seed, bytes, size); 134 | 135 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 136 | double sum = 0; 137 | double fastest = DBL_MAX; 138 | unsigned count = 96; 139 | const uint32_t hashes[96] = { 140 | 0x249b844b, 0x852e481c, 0xf7ce4779, 0x5b1e79c0, 0xc6280b69, 141 | 0x18aaed1f, 0x360a7b70, 0x6691373f, 0x62b0e7d2, 0x503f2a13, 142 | 0x55784198, 0x0449e145, 0xc1fec259, 0xfdde4bcc, 0x3d040585, 143 | 0x2d54b62c, 0x70f06c7e, 0xcc7a642f, 0xe784348b, 0xe360bb8a, 144 | 0xd4653bab, 0x129aac4f, 0xdf09ac90, 0xc770d23f, 0x1865b60c, 145 | 0x366d8ca9, 0x80b13f6f, 0x7317d810, 0x7816b809, 0x919adedb, 146 | 0x92713259, 0xb15e9216, 0x4cca4cd2, 0xb0bda9b9, 0xa3eb6a63, 147 | 0x1801f592, 0x7f6ebdfe, 0xcfd5f33c, 0x000c7082, 0x17265e0b, 148 | 0x6ba10359, 0x8c74f4eb, 0x803f3c08, 0x4ba6860d, 0x0716f9fb, 149 | 0x6e3c84ae, 0xe77a48f4, 0xc2374c75, 0x97f403ee, 0x3010b84b, 150 | 0x560ba778, 0x83103235, 0xfd4adabf, 0xa436bcf0, 0xaa8f96dc, 151 | 0x29922bec, 0xd5468b54, 0x4b1921b8, 0x2a8ce2d5, 0x86e336f4, 152 | 0x5fab2354, 0x0e07c225, 0xb181782a, 0xe799459f, 0xcf9541fd, 153 | 0xcd510976, 0xe70010ea, 0x6202cb22, 0x7d253b79, 0x4d047b53, 154 | 0xbd26b2ba, 0xc1df8a17, 0x48a6ed87, 0xa980b22c, 0x16b27278, 155 | 0xb5736e7c, 0x368bd0b9, 0xeee76414, 0xfe58e49d, 0xf3500e6d, 156 | 0xb57df9f5, 0xb52a7ed6, 0xaca79612, 0xccc9f98a, 0xa7140bd0, 157 | 0x7e45d2f9, 0xb91ddced, 0x9444f706, 0xa477bfb2, 0xcf7e1d5b, 158 | 0xd95eab3c, 0x737fa6e5, 0x5f548e79, 0x46539426, 0xef41aa94, 159 | 0xc0357213 160 | }; 161 | 162 | printf("Tests on 512 KB block: "); 163 | fflush(stdout); 164 | for(unsigned i = 0; i < count; i++) { 165 | timer t0; 166 | timer t1; 167 | get_timer(t0); 168 | uint32_t h = khashv32(&seed, bytes, size); 169 | get_timer(t1); 170 | 171 | if(h != hashes[i]) { 172 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 173 | } 174 | 175 | double t = time_ns(&t0, &t1); 176 | if(t < fastest) { 177 | fastest = t; 178 | } 179 | sum += t; 180 | bytes[i] += 1; 181 | } 182 | 183 | double avg = sum / count; 184 | printf( 185 | "Avg: %lf GB/s, Avg Time: %lf us, Fastest: %lf GB/s\n", 186 | get_gbs (avg, gigs), 187 | get_usecs(avg), 188 | get_gbs (fastest, gigs) 189 | ); 190 | fflush(stdout); 191 | free(bytes); 192 | return 0; 193 | } 194 | 195 | int main(int argc, char** argv) { 196 | khashvSeed seed; 197 | khashv_prep_seed64(&seed, 0xa9c163c960d480fb); 198 | 199 | if(gig_tests(seed)) { 200 | return 1; 201 | } 202 | if(half_mb_tests(seed)) { 203 | return 1; 204 | } 205 | return 0; 206 | } -------------------------------------------------------------------------------- /test_speed.c: -------------------------------------------------------------------------------- 1 | #include <stdio.h> 2 | #include <stdlib.h> 3 | #include <string.h> 4 | #include <float.h> 5 | #include <math.h> 6 | #include "khashv.h" 7 | 8 | #if defined(__MINGW32__) || defined(_WIN32) 9 | #include <windows.h> 10 | 11 | #define get_timer(x) QueryPerformanceCounter(&x) 12 | 13 | typedef LARGE_INTEGER timer; 14 | 15 | uint64_t time_ns(timer* start, timer* stop) { 16 | LARGE_INTEGER freq; 17 | if(!QueryPerformanceFrequency(&freq)) { 18 | return UINT64_MAX; 19 | } 20 | double ns = stop->QuadPart - start->QuadPart; 21 | double ratio = 1000000000.0;
// 1 billion ns = 1 second 22 | ratio /= (double)freq.QuadPart; 23 | ns *= ratio; 24 | return (uint64_t)ns; 25 | } 26 | 27 | #else 28 | #include <time.h> 29 | #define get_timer(x) clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &x) 30 | typedef struct timespec timer; 31 | 32 | uint64_t time_ns(timer* start, timer* stop) { 33 | int secs = stop->tv_sec - start->tv_sec; 34 | if(secs > 0) { 35 | uint64_t t0_ns = start->tv_sec * 1000000000; 36 | uint64_t t1_ns = stop->tv_sec * 1000000000; 37 | t0_ns += start->tv_nsec; 38 | t1_ns += stop->tv_nsec; 39 | return t1_ns - t0_ns; 40 | } 41 | return stop->tv_nsec - start->tv_nsec; 42 | } 43 | #endif 44 | 45 | #define MB_TO_BYTES(x) (1024ULL * 1024ULL * (x)) 46 | 47 | double get_gbs(double t_ns, double gigs) { 48 | t_ns /= 1000000000; 49 | return gigs / t_ns; 50 | } 51 | 52 | double get_secs(double t_ns) { 53 | return t_ns / 1000000000.0; 54 | } 55 | 56 | double get_usecs(double t_ns) { 57 | return t_ns / 1000.0; 58 | } 59 | 60 | void populate_memory(const khashvSeed* seed, uint8_t* bytes, size_t size) { 61 | printf("Populating Memory: "); 62 | // Use the hash to populate memory with pseudo random bytes 63 | uint64_t state[2] = { 0x4d9ef2f9a304588a, 0x58ca10a39947b63b }; 64 | for(size_t i = 0; i < size; i += sizeof(uint64_t)) { 65 | if(i != 0 && (i & 0x1ffffff) == 0) { 66 | printf("."); 67 | fflush(stdout); 68 | } 69 | state[0] = khashv64(seed, (uint8_t*)state, sizeof(uint64_t) * 2); 70 | memcpy(bytes + i, state, sizeof(uint64_t)); 71 | } 72 | puts(" Populated!"); 73 | } 74 | 75 | int gig_tests(khashvSeed seed) { 76 | size_t size = MB_TO_BYTES(1280); 77 | uint8_t* bytes = malloc(size); 78 | if(bytes == NULL) { 79 | fprintf(stderr, "Cannot allocate memory for test!\n"); 80 | return 1; 81 | } 82 | populate_memory(&seed, bytes, size); 83 | 84 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 85 | double sum = 0; 86 | double fastest = DBL_MAX; 87 | 88 | const uint32_t hashes[12] = { 89 | 0x8b4c1a33, 0x485105dc, 0xaf1deb0e, 0x2d4a890c, 0x8349b700, 90 | 0x29a3b3b9, 0xf1ed93ef, 0x8559b73f, 0x11452eff, 0xefa5fe1f, 91 | 0x5834c363, 0xeb7224a5 92 | }; 93 | 94 | printf("Tests on %.3lf GB block: ", gigs); 95 | fflush(stdout); 96 | for(unsigned i = 0; i < 12; i++) { 97 | timer t0; 98 | timer t1; 99 | get_timer(t0); 100 | uint32_t h = khashv32(&seed, bytes, size); 101 | get_timer(t1); 102 | 103 | if(h != hashes[i]) { 104 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 105 | } 106 | 107 | double t = time_ns(&t0, &t1); 108 | if(t < fastest) { 109 | fastest = t; 110 | } 111 | sum += t; 112 | bytes[i] += 1; 113 | } 114 | 115 | double avg = sum / 12; 116 | double gbs = get_gbs (avg, gigs); 117 | double sec = get_secs(avg); 118 | double fgbs = get_gbs(fastest, gigs); 119 | gbs = round(gbs * 10000.0) / 10000.0; 120 | sec = round(sec * 10000.0) / 10000.0; 121 | fgbs = round(fgbs * 10000.0) / 10000.0; 122 | printf( 123 | "Avg: %.4lf GB/s, Avg Time: %.4lf s, Fastest: %.4lf GB/s\n", 124 | gbs, 125 | sec, 126 | fgbs 127 | ); 128 | fflush(stdout); 129 | free(bytes); 130 | return 0; 131 | } 132 | 133 | int half_mb_tests(khashvSeed seed) { 134 | size_t size = 1024 * 512; 135 | uint8_t* bytes = malloc(size); 136 | if(bytes == NULL) { 137 | fprintf(stderr, "Cannot allocate memory for test!\n"); 138 | return 1; 139 | } 140 | populate_memory(&seed, bytes, size); 141 | 142 | double gigs = (double)size / (double)MB_TO_BYTES(1024); 143 | double sum = 0; 144 | double fastest = DBL_MAX; 145 | unsigned count = 96; 146 | const uint32_t hashes[96] = { 147 | 0x3b181e13, 0x6df3efe4,
0xa1472e2f, 0xe7fe7261, 0x85db611b, 148 | 0x95b68b46, 0xa4738539, 0xc67cd2b3, 0x4630444d, 0xb357f7a3, 149 | 0x60ba4613, 0x20d50be8, 0x5908392d, 0xd5c1411e, 0xa315f311, 150 | 0xe92b8d4a, 0x3504718c, 0x78d5d987, 0xac324986, 0xa9c146a3, 151 | 0xea4120ac, 0x1ab20115, 0xb4cf0fc0, 0x3726e7c6, 0x781b19b4, 152 | 0x897a635f, 0x49c879a6, 0x414f698e, 0xef3c3c66, 0x668de11e, 153 | 0xf6f2af8d, 0x6db89e5f, 0xa2621047, 0x26736838, 0xca8539cf, 154 | 0xe1e92796, 0xbd178553, 0x31aedc2d, 0x41f4377f, 0x0683f7a2, 155 | 0xff1d7f6f, 0x4a788c33, 0xb4823086, 0xf3b45106, 0xf2e12a97, 156 | 0x1505b0e8, 0x32d16f9d, 0xa4ccbd11, 0x61f6aa54, 0x8dc4eb8d, 157 | 0xe7ac77ca, 0xb00dd338, 0x9330ce85, 0xae721ca9, 0x236eb8a2, 158 | 0xcd7aba61, 0x2fbd751e, 0x978edc2c, 0x09ef6175, 0x78d12480, 159 | 0x08b21322, 0x02826493, 0x36244a76, 0xb7e1489c, 0x365c631f, 160 | 0x08188ea8, 0x92bd6910, 0xa7cf34d0, 0x9b91a005, 0x8c7cfc38, 161 | 0xf732ae18, 0x87f2f485, 0xa42d236d, 0x967880e3, 0xf04cb79d, 162 | 0xfd9d613f, 0xfa7ae694, 0xfb680e60, 0x2de7c7c9, 0xa5979af7, 163 | 0x6b24f6a3, 0xfebb25de, 0x3163a706, 0x7d8d0a35, 0xb5cacfcf, 164 | 0xdf774e72, 0xd06db96e, 0x16d7e8db, 0xf1e368e7, 0x21efe8d5, 165 | 0x59d6f29f, 0xb0ee28bc, 0x849b575e, 0x96887453, 0x2eabdd1f, 166 | 0x3cdc8fa8 167 | }; 168 | 169 | printf("Tests on 512 KB block: "); 170 | fflush(stdout); 171 | for(unsigned i = 0; i < count; i++) { 172 | timer t0; 173 | timer t1; 174 | get_timer(t0); 175 | uint32_t h = khashv32(&seed, bytes, size); 176 | get_timer(t1); 177 | 178 | if(h != hashes[i]) { 179 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]); 180 | } 181 | 182 | double t = time_ns(&t0, &t1); 183 | if(t < fastest) { 184 | fastest = t; 185 | } 186 | sum += t; 187 | bytes[i] += 1; 188 | } 189 | 190 | double avg = sum / count; 191 | double gbs = get_gbs (avg, gigs); 192 | double usec = get_usecs(avg); 193 | double fgbs = get_gbs (fastest, gigs); 194 | gbs = round(gbs * 10000.0) / 10000.0; 195 | usec = round(usec * 10000.0) / 10000.0; 196 | fgbs = round(fgbs * 10000.0) / 10000.0; 197 | printf( 198 | "Avg: %.4lf GB/s, Avg Time: %.4lf us, Fastest: %.4lf GB/s\n", 199 | gbs, 200 | usec, 201 | fgbs 202 | ); 203 | fflush(stdout); 204 | free(bytes); 205 | return 0; 206 | } 207 | 208 | int main(int argc, char** argv) { 209 | khashvSeed seed; 210 | khashv_prep_seed64(&seed, 0xa9c163c960d480fb); 211 | 212 | if(gig_tests(seed)) { 213 | return 1; 214 | } 215 | if(half_mb_tests(seed)) { 216 | return 1; 217 | } 218 | return 0; 219 | } 220 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # K-HASHV 🔨 2 | A single header hash function with both vectorized and scalar versions. The function is quite fast when vectorized, achieving an average of **~10.2 GB/s** on a 9-year-old (as of 2024) Xeon E3-1230 v5. The header contains explicit intrinsics for x86_64, has a version that uses GCC's portable vector built-ins, and falls back to a scalar version for portability. The results of the function should be the same regardless of endianness. 3 | 4 | Additionally, it passes all the SMHasher hash function quality tests: https://github.com/rurban/smhasher. It also passes [SMHasher3](https://gitlab.com/fwojcik/smhasher3/-/blob/c56f2bddc1b3e114570d5cbe383ad207673f6c99/results/README.md), a fork of SMHasher with some more stringent tests; some hashes that pass SMHasher fail in SMHasher3.
5 | 6 | Moreover, it is quite easy to choose a new function at runtime by just using a new seed, as shown below: 7 | ```C 8 | #include "khashv.h" 9 | 10 | void foo() { 11 | /* 12 | code .... 13 | */ 14 | khashvSeed seed; 15 | khashv_prep_seed64(&seed, a_64_bit_value); 16 | uint64_t hash = khashv64(&seed, your_data, data_len); 17 | /* 18 | code .... 19 | */ 20 | } 21 | ``` 22 | 23 | Issues, PRs and suggestions are welcome 😃 24 | 25 | ### Note 26 | This is **not a cryptographic hash function**, and it should not be used for such applications. 27 | 28 | # Table of Contents 29 | * [Performance](#performance) 30 | * [API](#api) 31 | * [khashv_prep_seed32](#khashv_prep_seed32) 32 | * [khashv_prep_seed64](#khashv_prep_seed64) 33 | * [khashv_prep_seed128](#khashv_prep_seed128) 34 | * [khashv32](#khashv32) 35 | * [khashv64](#khashv64) 36 | * [Output](#output) 37 | * [64-bit Output](#khashv64-output) 38 | * [32-bit Output](#khashv32-output) 39 | * [Note](#note) 40 | * [TODO](#todo) 41 | * [Copyright and License](#copyright-and-license) 42 | 43 | # Performance 44 | When testing on 1.25 GB and 512 KB of random data I get the following averages: 45 |
| Processor | 1.25 GB Time | 1.25 GB Speed | 512 KB Time | 512 KB Speed | OS | Compiler | Type |
|---|---|---|---|---|---|---|---|
| Xeon E3-1230 v5 | 0.1226 s | 10.1987 GB/s | 045.3515 us | 10.7666 GB/s | Linux | GCC 12.2.1 | Vectorized |
| Xeon E3-1230 v5 | 1.1803 s | 1.0495 GB/s | 462.9862 us | 1.0546 GB/s | Linux | GCC 12.2.1 | Scalar |
| Xeon E3-1230 v5 | 0.1388 s | 9.0061 GB/s | 052.8114 us | 9.2457 GB/s | Linux | Clang 15.0.7 | Vectorized |
| Ryzen 9 7900 | 0.1182 s | 10.5742 GB/s | 044.4734 us | 10.9792 GB/s | Linux | GCC 12.2.1 | Vectorized |
| Ryzen 9 7900 | 0.7890 s | 1.5843 GB/s | 307.4712 us | 1.5881 GB/s | Linux | GCC 12.2.1 | Scalar |
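If you want a quick sanity check on your machine before benchmarking, a minimal sketch along these lines can help (this file is hypothetical and not part of the repo; the message and seed value are arbitrary). Build it once normally and once with `-D KHASHV_SCALAR`; both builds should print identical hashes, since the vectorized and scalar paths compute the same function:

```C
// smoke.c -- hypothetical smoke test, not part of this repository
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include "khashv.h"

int main(void) {
    const char* msg = "hello khashv";
    khashvSeed seed;
    khashv_prep_seed64(&seed, 0x1dcedff1a8b17e89);
    // These values must match between a vectorized build and a
    // build compiled with -D KHASHV_SCALAR.
    printf("khashv64: 0x%016llx\n",
        (unsigned long long)khashv64(&seed, (const uint8_t*)msg, strlen(msg)));
    printf("khashv32: 0x%08x\n",
        khashv32(&seed, (const uint8_t*)msg, strlen(msg)));
    return 0;
}
```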
55 | 56 | The scalar version is slower at a tad over ~1 GB/s on my system when compiling test_speed.c with gcc using `-O3`. 57 | On Windows, Microsoft's compiler does not seem to generate as performant code from the intrinsics, but the GCC mingw64 compiler generates pretty comparable numbers for me at least. 58 | 59 | I definitely want to add other machines to this table, but if you are curious how it performs on your machine, compile `test_speed.c` with `-O3 -lm -march=native` and `-O3 -lm -march=native -D KHASHV_SCALAR`. 60 | 61 | # API 62 | 63 | ## khashv_prep_seed32 64 | ```C 65 | // Prepares a seed from a 32-bit value 66 | void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) 67 | ``` 68 | 69 | ## khashv_prep_seed64 70 | ```C 71 | // Prepares a seed from a 64-bit value 72 | void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) 73 | ``` 74 | 75 | ## khashv_prep_seed128 76 | ```C 77 | // Sets 128 bits to be the seed 78 | void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) 79 | ``` 80 | 81 | ## khashv32 82 | ```C 83 | // Produces a 32-bit hash from the input data 84 | uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) 85 | ``` 86 | 87 | ## khashv64 88 | ```C 89 | // Produces a 64-bit hash from the input data 90 | uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) 91 | ``` 92 | 93 | # Output 94 | Here is the output of the hash function as images. 95 | 96 | ## khashv64 Output 97 | Here is the output of the 64-bit hash of the integers \[0, 259199\] using 0x1dcedff1a8b17e89 as the seed. 98 | 99 | ![khashv64 output](khashv64-seed-1dcedff1a8b17e89.png) 100 | 101 | ## khashv32 Output 102 | 103 | Here is the output of the 32-bit hash of the integers \[0, 518399\] using 0x6bb75f13 as the seed. 104 | 105 | ![khashv32 output](khashv32-seed-6bb75f13.png) 106 | 107 | The above images were generated by basically doing the following for each hash: 108 | 109 | ```C 110 | for(int i = 0; i < sizeof(hash_bytes); i++) { 111 | pixel[img_offset + i].r = hash_bytes[i]; 112 | pixel[img_offset + i].g = hash_bytes[i]; 113 | pixel[img_offset + i].b = hash_bytes[i]; 114 | pixel[img_offset + i].a = 255; 115 | } 116 | ``` 117 | 118 | # TODO 119 | When thinking about ways to improve the code and hash function, these are the first few things that come to mind for me. 120 | 1. I think a faster mixing function (e.g. `khashv_mix_words_`) is probably the next thing that could be improved. If it could be made shorter/faster it would reduce latency for smaller inputs. Any ideas or feedback for this would be appreciated. 121 | 122 | 2. The next thing would be to try to get both Clang and MSVC to output code that runs as fast as GCC's, or as close as possible. Looking at the generated assembly, they both seem to do some silly things compared to GCC and lose some performance. Microsoft's compiler is the worst, and probably the fastest fix for me to implement would be to write some assembly code. However, it then would no longer be a single header file hash function, since MSVC does not support inline assembly for 64-bit builds and thus would require a separate file. 123 | 124 | 3. Then probably consider using intrinsics for some other systems like ARM NEON, but for now there is scalar code and code written using GCC's vector built-ins that will generate vectorized code for other architectures that GCC supports. 125 | 126 | 4. Probably the next thing I could think of is to choose better values for S1 and S2, which are used to basically substitute bytes. The current values were found by randomly checking a small set of criteria. Mainly focusing on each bit of S1 and S2 as columns, then XOR-ing them, effectively creating a boolean function of an 8-bit input, and making sure the entire thing maps each input to a unique value (a quick check of this property is sketched right after this list). There likely are better values that could be chosen, and criteria that look at all bits at once. However, the search space is huge, effectively 2^(2\*8\*16) possible permutations for S1 and S2. However, the current values do seem to work well from my testing. Another constant that could be looked at as well is the new shuffle constant I have in v2 that randomly permutes the bytes; it's quite likely there exists a better constant for this as well. 127 | 128 | 5. Maybe write some assembly versions to get around some of the compiler differences. Also maybe a Rust version.
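Regarding item 4, the uniqueness property is easy to check mechanically. Below is a minimal sketch (not part of the library) that rebuilds the combined substitution from the `khashv_s1` and `khashv_s2` tables in `khashv.h`, the same way the precomputed `khashv_xored` table is derived, and verifies that it maps every byte to a unique value:

```C
// check_sbox.c -- hypothetical verification sketch, not part of this repository
#include <stdio.h>
#include <stdint.h>
#include "khashv.h"

int main(void) {
    uint8_t seen[256] = { 0 };
    for (int b = 0; b < 256; b++) {
        // Low nibble through S1, high nibble through S2, then XOR:
        // this is the byte substitution the hash applies to its input.
        uint8_t out = khashv_s1[b & 0x0f] ^ khashv_s2[b >> 4];
        if (seen[out]) {
            printf("collision at input 0x%02x\n", b);
            return 1;
        }
        seen[out] = 1;
    }
    puts("S1/S2 substitution maps all 256 bytes to unique values");
    return 0;
}
```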
129 | 130 | # Copyright and License 131 | 132 | Copyright (C) 2023, by Keith Cancel [admin@keith.pro](mailto:admin@keith.pro). 133 | 134 | Under the MIT License 135 | -------------------------------------------------------------------------------- /khashv.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | Copyright (c) 2022 Keith-Cancel 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the “Software”), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 | SOFTWARE.
19 | */ 20 | 21 | #ifndef K_HASH_V_H 22 | #define K_HASH_V_H 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #define restrict 26 | #endif 27 | 28 | #include <stdint.h> 29 | #include <stddef.h> 30 | #include <string.h> 31 | #include <limits.h> 32 | 33 | // For MSVC compiler, no __SSE3__ macro 34 | #if !defined(__SSE3__) && (defined(__AVX__) || defined(__AVX2__)) 35 | #define __SSE3__ 36 | #endif 37 | // Same deal 38 | #if !defined(__SSE4_1__) && (defined(__AVX__) || defined(__AVX2__)) 39 | #define __SSE4_1__ 40 | #endif 41 | 42 | #if defined(__SSE3__) 43 | #include <immintrin.h> 44 | #if defined(__MINGW32__) || defined(_WIN32) 45 | #include <intrin.h> 46 | #endif 47 | #endif 48 | 49 | #if defined(__GNUC__) && !defined(__clang__) 50 | #define KHASH_GCC_LEAST__(maj, min) (__GNUC__ > maj || __GNUC__ == maj && __GNUC_MINOR__ >= min) 51 | #else 52 | #define KHASH_GCC_LEAST__(maj, min) 0 53 | #endif 54 | 55 | #if defined(__BYTE_ORDER__) && !defined(__BYTE_ORDER) 56 | #define __BYTE_ORDER __BYTE_ORDER__ 57 | #endif 58 | 59 | #if defined(__ORDER_LITTLE_ENDIAN__) && !defined(__LITTLE_ENDIAN) 60 | #define __LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ 61 | #endif 62 | 63 | #if defined(__ORDER_BIG_ENDIAN__) && !defined(__BIG_ENDIAN) 64 | #define __BIG_ENDIAN __ORDER_BIG_ENDIAN__ 65 | #endif 66 | 67 | 68 | #if defined(__clang__) && defined(__has_attribute) 69 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 70 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(5, 0) 71 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 72 | #else 73 | #define KHASH_CHK_ATTRIBUTE__(attr) 0 74 | #endif 75 | 76 | #if defined(__clang__) && defined(__has_builtin) 77 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 78 | #elif defined(__has_builtin) && KHASH_GCC_LEAST__(10, 1) 79 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 80 | #else 81 | #define KHASH_CHK_BUILTIN__(built) 0 82 | #endif 83 | 84 | #if defined(_MSC_VER) && !defined(__clang__) 85 | #define KHASH_FINLINE __forceinline 86 | #define KHASH_BSWAP32(val) _byteswap_ulong(val) 87 | #endif 88 | 89 | #if !defined(KHASH_FINLINE) && (KHASH_CHK_ATTRIBUTE__(always_inline) || KHASH_GCC_LEAST__(3, 1)) 90 | #define KHASH_FINLINE __attribute__((always_inline)) inline 91 | #endif 92 | 93 | #if !defined(KHASH_BSWAP32) && (KHASH_CHK_BUILTIN__(__builtin_bswap32) || KHASH_GCC_LEAST__(4, 5)) 94 | #define KHASH_BSWAP32(val) __builtin_bswap32(val) 95 | #endif 96 | 97 | #if !defined(KHASH_OPT_SZ) && (KHASH_CHK_ATTRIBUTE__(optimize) || KHASH_GCC_LEAST__(4, 8)) 98 | #define KHASH_OPT_SZ __attribute__((optimize("Os"))) 99 | #endif 100 | 101 | #if !defined(KHASH_FINLINE) 102 | #define KHASH_FINLINE inline 103 | #endif 104 | 105 | #if !defined(KHASH_OPT_SZ) 106 | #define KHASH_OPT_SZ 107 | #endif 108 | 109 | #if !defined(KHASH_BSWAP32) 110 | #define KHASH_BSWAP32(val) (((val) >> 24) | (((val) >> 8) & 0xff00) | (((val) << 8) & 0xff0000) | ((val) << 24)) 111 | #endif 112 | 113 | static KHASH_FINLINE int khashv_is_little_endian() { 114 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN 115 | return 1; 116 | #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN 117 | return 0; 118 | #elif defined(__BYTE_ORDER) 119 | #error "Mixed/Middle endian machine, you will need to write a custom byteswap routine" 120 | #else 121 | // Otherwise hope the compiler's optimizer figures this is constant. 122 | // Also since the byte order macro does not exist there are 123 | // middle-endian/mixed-endian machines out there, but they are quite 124 | // rare/old. So I am not gonna worry about it since there are 24 or 125 | // 4!
(four factorial) total endiannesses. So if the compiler does 126 | // not define __BYTE_ORDER, the hash output will be different on 127 | // such machines, but the hash should still work fine. 128 | unsigned int x = 1; 129 | return *((char*)(&x)) == 1; 130 | #endif 131 | } 132 | 133 | #define KHASH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 134 | 135 | struct khashv_block_s { 136 | union { 137 | uint8_t bytes[16]; 138 | uint32_t words[4]; 139 | #if defined(__SSE3__) 140 | __m128i vec; 141 | #endif 142 | }; 143 | }; 144 | 145 | typedef struct khashv_block_s khashvBlock; 146 | typedef struct khashv_block_s khashvSeed; 147 | 148 | static const khashvBlock khash_v_init = { 149 | .words = { 150 | // Really this could basically be almost anything 151 | // So just using some bytes of the SHA-256 hashes 152 | // of 1, 2, 3, and 4 153 | 0x7785459a, // SHA256 of the byte 0x01, using the last 4 bytes 154 | 0x6457d986, // SHA256 of the byte 0x02, using the last 4 bytes 155 | 0xadff29c5, // SHA256 of the byte 0x03, using the last 4 bytes 156 | 0x81c89e71, // SHA256 of the byte 0x04, using the last 4 bytes 157 | }}; 158 | 159 | static const uint8_t khashv_s1[16] = { 160 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 161 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 162 | }; 163 | 164 | static const uint8_t khashv_s2[16] = { 165 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 166 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 167 | }; 168 | 169 | static const uint8_t khashv_xored[256] = { 170 | 0xf3, 0xb2, 0x17, 0x0c, 0x2e, 0x73, 0x35, 0x58, 171 | 0x8c, 0x7e, 0xb6, 0x5c, 0xc4, 0x4a, 0x01, 0xfd, 172 | 0xd2, 0x93, 0x36, 0x2d, 0x0f, 0x52, 0x14, 0x79, 173 | 0xad, 0x5f, 0x97, 0x7d, 0xe5, 0x6b, 0x20, 0xdc, 174 | 0x7a, 0x3b, 0x9e, 0x85, 0xa7, 0xfa, 0xbc, 0xd1, 175 | 0x05, 0xf7, 0x3f, 0xd5, 0x4d, 0xc3, 0x88, 0x74, 176 | 0xef, 0xae, 0x0b, 0x10, 0x32, 0x6f, 0x29, 0x44, 177 | 0x90, 0x62, 0xaa, 0x40, 0xd8, 0x56, 0x1d, 0xe1, 178 | 0xea, 0xab, 0x0e, 0x15, 0x37, 0x6a, 0x2c, 0x41, 179 | 0x95, 0x67, 0xaf, 0x45, 0xdd, 0x53, 0x18, 0xe4, 180 | 0x3d, 0x7c, 0xd9, 0xc2, 0xe0, 0xbd, 0xfb, 0x96, 181 | 0x42, 0xb0, 0x78, 0x92, 0x0a, 0x84, 0xcf, 0x33, 182 | 0x5e, 0x1f, 0xba, 0xa1, 0x83, 0xde, 0x98, 0xf5, 183 | 0x21, 0xd3, 0x1b, 0xf1, 0x69, 0xe7, 0xac, 0x50, 184 | 0xb9, 0xf8, 0x5d, 0x46, 0x64, 0x39, 0x7f, 0x12, 185 | 0xc6, 0x34, 0xfc, 0x16, 0x8e, 0x00, 0x4b, 0xb7, 186 | 0x0d, 0x4c, 0xe9, 0xf2, 0xd0, 0x8d, 0xcb, 0xa6, 187 | 0x72, 0x80, 0x48, 0xa2, 0x3a, 0xb4, 0xff, 0x03, 188 | 0xb1, 0xf0, 0x55, 0x4e, 0x6c, 0x31, 0x77, 0x1a, 189 | 0xce, 0x3c, 0xf4, 0x1e, 0x86, 0x08, 0x43, 0xbf, 190 | 0x47, 0x06, 0xa3, 0xb8, 0x9a, 0xc7, 0x81, 0xec, 191 | 0x38, 0xca, 0x02, 0xe8, 0x70, 0xfe, 0xb5, 0x49, 192 | 0xda, 0x9b, 0x3e, 0x25, 0x07, 0x5a, 0x1c, 0x71, 193 | 0xa5, 0x57, 0x9f, 0x75, 0xed, 0x63, 0x28, 0xd4, 194 | 0x6e, 0x2f, 0x8a, 0x91, 0xb3, 0xee, 0xa8, 0xc5, 195 | 0x11, 0xe3, 0x2b, 0xc1, 0x59, 0xd7, 0x9c, 0x60, 196 | 0x24, 0x65, 0xc0, 0xdb, 0xf9, 0xa4, 0xe2, 0x8f, 197 | 0x5b, 0xa9, 0x61, 0x8b, 0x13, 0x9d, 0xd6, 0x2a, 198 | 0x89, 0xc8, 0x6d, 0x76, 0x54, 0x09, 0x4f, 0x22, 199 | 0xf6, 0x04, 0xcc, 0x26, 0xbe, 0x30, 0x7b, 0x87, 200 | 0x66, 0x27, 0x82, 0x99, 0xbb, 0xe6, 0xa0, 0xcd, 201 | 0x19, 0xeb, 0x23, 0xc9, 0x51, 0xdf, 0x94, 0x68, 202 | }; 203 | 204 | /* Scalar Code */ 205 | 206 | static KHASH_FINLINE void khashv_bswap_be_block_scalar(khashvBlock* in) { 207 | // Byte swapping is only needed if we are not on a little endian system 208 | if (khashv_is_little_endian()) { 209 | return; 210 | } 211 | for(int i = 0; i < 4; i++) { 212 |
in->words[i] = KHASH_BSWAP32(in->words[i]); 213 | } 214 | } 215 | 216 | static KHASH_FINLINE void khashv_rotr_5_bytes_scalar(khashvBlock* in) { 217 | khashv_bswap_be_block_scalar(in); 218 | khashvBlock tmp1; 219 | khashvBlock tmp2; 220 | // Avoid aliasing issues by using memcpy between these union values. 221 | memcpy(tmp1.bytes, in->words, 16); 222 | for(int i = 0; i < 16; i++) { 223 | tmp2.bytes[i] = tmp1.bytes[(i + 5) & 0xf]; 224 | } 225 | memcpy(in->words, tmp2.bytes, 16); 226 | khashv_bswap_be_block_scalar(in); 227 | } 228 | 229 | static KHASH_FINLINE void khashv_shuffle_bytes_scalar(khashvBlock* in) { 230 | static const uint8_t shuffle[16] = { 231 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8, 232 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1 233 | }; 234 | khashv_bswap_be_block_scalar(in); 235 | khashvBlock tmp1; 236 | khashvBlock tmp2; 237 | // Avoid aliasing issues by using memcpy between these union values. 238 | memcpy(tmp1.bytes, in->words, 16); 239 | for(int i = 0; i < 16; i++) { 240 | tmp2.bytes[i] = tmp1.bytes[shuffle[i]]; 241 | } 242 | memcpy(in->words, tmp2.bytes, 16); 243 | khashv_bswap_be_block_scalar(in); 244 | } 245 | 246 | static KHASH_FINLINE void khashv_shl_13_block_scalar(khashvBlock* in) { 247 | for(int i = 0; i < 4; i++) { 248 | in->words[i] <<= 13; 249 | } 250 | } 251 | 252 | static KHASH_FINLINE void khashv_shr_3_block_scalar(khashvBlock* in) { 253 | for(int i = 0; i < 4; i++) { 254 | in->words[i] >>= 3; 255 | } 256 | } 257 | 258 | static KHASH_FINLINE void khashv_add_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 259 | for(int i = 0; i < 4; i++) { 260 | a->words[i] += b->words[i]; 261 | } 262 | } 263 | 264 | static KHASH_FINLINE void khashv_xor_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 265 | for(int i = 0; i < 4; i++) { 266 | a->words[i] ^= b->words[i]; 267 | } 268 | } 269 | 270 | // GCC and Clang were vectorizing this quite poorly with -O3. 271 | // They could not detect that only a PSHUFB was needed and instead 272 | // were generating tons of inserts and extracts from the vector 273 | // registers. Thus it was running slower than code that was not being 274 | // vectorized on my machine. So I specify the optimization level directly. 275 | // Tried a few other things to get GCC and Clang to generate more sane 276 | // code or code using PSHUFB, but this seemed the cleanest. 277 | // Example of what I mean: https://godbolt.org/z/PMnzsThPc 278 | // Compared to this: https://godbolt.org/z/dWfjr7GWP 279 | /*static KHASH_OPT_SZ void khashv_sub16(khashvBlock* tmp, const uint8_t sub[16]) { 280 | #if defined(__clang__) 281 | // Stop clang from being annoying!!!
282 | // The auto-vectorized code was worse at the time of writing this 283 | #pragma nounroll 284 | #pragma clang loop vectorize(disable) 285 | #pragma clang loop interleave(disable) 286 | #endif 287 | for (int i = 0; i < 16; i++) { 288 | tmp->bytes[i] = sub[tmp->bytes[i]]; 289 | } 290 | } 291 | 292 | static KHASH_FINLINE void khashv_replace_scalar(khashvBlock* replace) { 293 | khashvBlock tmp; 294 | for (int i = 0; i < 16; i++) { 295 | tmp.bytes[i] = (replace->bytes[i] >> 4); 296 | replace->bytes[i] &= 0x0f; 297 | } 298 | khashv_sub16(replace, khashv_s1); 299 | khashv_sub16(&tmp, khashv_s2); 300 | for (int i = 0; i < 16; i++) { 301 | replace->bytes[i] ^= tmp.bytes[i]; 302 | } 303 | }*/ 304 | // Similar issue as the commented out code so stop the optimizers 305 | // from getting crazy 306 | static KHASH_OPT_SZ void khashv_replace_scalar(khashvBlock* replace) { 307 | khashvBlock tmp; 308 | memcpy(tmp.bytes, replace->words, 16); 309 | #if defined(__clang__) 310 | // Stop clang from being annoying!!! 311 | // The auto-vectorized code was worse at the time of writing this 312 | #pragma nounroll 313 | #pragma clang loop vectorize(disable) 314 | #pragma clang loop interleave(disable) 315 | #endif 316 | for(int i = 0; i < 16; i++) { 317 | tmp.bytes[i] = khashv_xored[tmp.bytes[i]]; 318 | } 319 | memcpy(replace->words, tmp.bytes, 16); 320 | } 321 | 322 | static KHASH_FINLINE void khashv_mix_words_scalar(khashvBlock* in) { 323 | unsigned rots[4] = { 5, 7, 11, 17 }; 324 | khashvBlock tmp = { 0 }; 325 | 326 | tmp = *in; 327 | khashv_shr_3_block_scalar(&tmp); 328 | khashv_xor_block_scalar(in, &tmp); 329 | 330 | for (int i = 0; i < 4; i++) { 331 | unsigned rot = rots[i]; 332 | tmp = *in; 333 | khashv_rotr_5_bytes_scalar(&tmp); 334 | khashv_add_block_scalar(&tmp, in); 335 | for (int j = 0; j < 4; j++) { 336 | tmp.words[j] = KHASH_ROTR32(tmp.words[j], rot); 337 | } 338 | khashv_xor_block_scalar(in, &tmp); 339 | } 340 | } 341 | 342 | static void khashv_hash_scalar(khashvBlock* hash, const uint8_t* data, size_t data_len) { 343 | hash->words[0] ^= data_len; 344 | // size_t is bigger than 32 bits 345 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 346 | hash->words[1] ^= data_len >> 32; 347 | #endif 348 | 349 | khashvBlock tmp_1; 350 | khashvBlock tmp_2; 351 | khashvBlock tmp_h = *hash; 352 | 353 | const uint8_t* end = data + (data_len & ~((size_t)15)); 354 | 355 | while (data < end) { 356 | memcpy(&tmp_2, data, 16); 357 | khashv_replace_scalar(&tmp_2); 358 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 359 | 360 | khashv_bswap_be_block_scalar(&tmp_1); 361 | 362 | tmp_2 = tmp_1; 363 | //khashv_shl_13_block_scalar(&tmp_2); 364 | //khashv_add_block_scalar(&tmp_2, &tmp_1); 365 | for(int i = 0; i < 4; i++) { 366 | tmp_2.words[i] *= 8193; 367 | } 368 | khashv_xor_block_scalar(&tmp_h, &tmp_2); 369 | khashv_rotr_5_bytes_scalar(&tmp_h); 370 | khashv_add_block_scalar(&tmp_h, &tmp_1); 371 | 372 | tmp_1 = tmp_h; 373 | khashv_shuffle_bytes_scalar(&tmp_1); 374 | khashv_add_block_scalar(&tmp_h, &tmp_1); 375 | 376 | data += 16; 377 | } 378 | 379 | unsigned trailing = data_len & 0xf; 380 | if(trailing) { 381 | memset(&tmp_2, 0, 16); 382 | 383 | memcpy(&tmp_2.bytes, data, trailing); 384 | khashv_replace_scalar(&tmp_2); 385 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 386 | 387 | khashv_bswap_be_block_scalar(&tmp_1); 388 | 389 | tmp_2 = tmp_1; 390 | //khashv_shl_13_block_scalar(&tmp_2); 391 | //khashv_add_block_scalar(&tmp_2, &tmp_1); 392 | for(int i = 0; i < 4; i++) { 393 | tmp_2.words[i] *= 8193; 394 | } 395 | 
khashv_xor_block_scalar(&tmp_h, &tmp_2); 396 | khashv_rotr_5_bytes_scalar(&tmp_h); 397 | khashv_add_block_scalar(&tmp_h, &tmp_1); 398 | 399 | tmp_1 = tmp_h; 400 | khashv_shuffle_bytes_scalar(&tmp_1); 401 | khashv_add_block_scalar(&tmp_h, &tmp_1); 402 | 403 | } 404 | khashv_mix_words_scalar(&tmp_h); 405 | *hash = tmp_h; 406 | } 407 | 408 | static inline void khashv_prep_seed32_scalar(khashvSeed* seed_prepped, uint32_t seed) { 409 | *seed_prepped = khash_v_init; 410 | seed_prepped->words[0] ^= seed; 411 | khashv_mix_words_scalar(seed_prepped); 412 | } 413 | 414 | static inline void khashv_prep_seed64_scalar(khashvSeed* seed_prepped, uint64_t seed) { 415 | *seed_prepped = khash_v_init; 416 | seed_prepped->words[0] ^= seed; 417 | khashv_mix_words_scalar(seed_prepped); 418 | // Do it again with the other part to make it different from the 32-bit seed. 419 | seed_prepped->words[1] ^= seed >> 32; 420 | khashv_mix_words_scalar(seed_prepped); 421 | } 422 | 423 | static inline void khashv_prep_seed128_scalar(khashvSeed* seed_prepped, const uint32_t seed[4]) { 424 | for(int i = 0; i < 4; i++) { 425 | seed_prepped->words[i] = seed[i]; 426 | } 427 | } 428 | 429 | static inline uint32_t khashv32_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 430 | khashvBlock h = *seed; 431 | khashv_hash_scalar(&h, data, data_len); 432 | return h.words[3]; 433 | } 434 | 435 | static inline uint64_t khashv64_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 436 | khashvBlock h = *seed; 437 | khashv_hash_scalar(&h, data, data_len); 438 | uint64_t r = h.words[1]; 439 | r <<= 32; 440 | r |= h.words[0]; 441 | return r; 442 | } 443 | 444 | /* Vectorization for Intel/AMD */ 445 | 446 | #if defined(__SSE3__) 447 | 448 | #define KHASH_VECTOR 1 449 | 450 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(11, 0)) 451 | static KHASH_FINLINE __m128i _mm_loadu_si32(const void* data) { 452 | uint32_t val; 453 | memcpy(&val, data, sizeof(uint32_t)); 454 | return _mm_cvtsi32_si128(val); 455 | } 456 | static KHASH_FINLINE __m128i _mm_loadu_si16(const void* data) { 457 | uint32_t val = 0; 458 | memcpy(&val, data, sizeof(uint16_t)); 459 | return _mm_cvtsi32_si128(val); 460 | } 461 | #endif 462 | 463 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(9, 1)) 464 | static KHASH_FINLINE __m128i _mm_loadu_si64(const void* data) { 465 | uint64_t val = 0; 466 | memcpy(&val, data, sizeof(uint64_t)); 467 | return _mm_cvtsi64_si128(val); 468 | } 469 | #endif 470 | 471 | static KHASH_FINLINE __m128i khashv_mix_words_vector(__m128i val) { 472 | __m128i tmp1; 473 | __m128i tmp2; 474 | 475 | tmp1 = _mm_srli_epi32(val, 3); 476 | val = _mm_xor_si128(tmp1, val); 477 | 478 | tmp1 = _mm_alignr_epi8(val, val, 5); 479 | tmp1 = _mm_add_epi32(val, tmp1); 480 | #if defined(__AVX512VL__) 481 | tmp1 = _mm_ror_epi32(tmp1, 5); 482 | val = _mm_xor_si128(val, tmp1); 483 | #else 484 | tmp2 = _mm_srli_epi32(tmp1, 5); 485 | tmp1 = _mm_slli_epi32(tmp1, 27); 486 | tmp1 = _mm_or_si128(tmp1, tmp2); 487 | val = _mm_xor_si128(val, tmp1); 488 | #endif 489 | 490 | tmp1 = _mm_alignr_epi8(val, val, 5); 491 | tmp1 = _mm_add_epi32(val, tmp1); 492 | #if defined(__AVX512VL__) 493 | tmp1 = _mm_ror_epi32(tmp1, 7); 494 | val = _mm_xor_si128(val, tmp1); 495 | #else 496 | tmp2 = _mm_srli_epi32(tmp1, 7); 497 | tmp1 = _mm_slli_epi32(tmp1, 25); 498 | val = _mm_xor_si128(val, tmp2); 499 | val = _mm_xor_si128(val, tmp1); 500 | #endif 501 | 502 | tmp1 = _mm_alignr_epi8(val, val, 5); 503 | tmp1 =
_mm_add_epi32(tmp1, val); 504 | #if defined(__AVX512VL__) 505 | tmp1 = _mm_ror_epi32(tmp1, 11); 506 | val = _mm_xor_si128(val, tmp1); 507 | #else 508 | tmp2 = _mm_srli_epi32(tmp1, 11); 509 | tmp1 = _mm_slli_epi32(tmp1, 21); 510 | val = _mm_xor_si128(val, tmp2); 511 | val = _mm_xor_si128(val, tmp1); 512 | #endif 513 | 514 | tmp1 = _mm_alignr_epi8(val, val, 5); 515 | tmp1 = _mm_add_epi32(tmp1, val); 516 | #if defined(__AVX512VL__) 517 | tmp1 = _mm_ror_epi32(tmp1, 17); 518 | val = _mm_xor_si128(val, tmp1); 519 | #else 520 | tmp2 = _mm_srli_epi32(tmp1, 17); 521 | tmp1 = _mm_slli_epi32(tmp1, 15); 522 | val = _mm_xor_si128(val, tmp2); 523 | val = _mm_xor_si128(val, tmp1); 524 | #endif 525 | 526 | return val; 527 | } 528 | 529 | static KHASH_FINLINE __m128i khashv_part_load_vector(const uint8_t* data, size_t len) { 530 | __m128i tmp = { 0 }; 531 | __m128i tmp2 = { 0 }; 532 | switch(len) { 533 | case 1: 534 | #if defined(__SSE4_1__) 535 | tmp = _mm_insert_epi8(tmp, data[0], 0); 536 | #else 537 | tmp = _mm_cvtsi32_si128(data[0]); 538 | #endif 539 | break; 540 | case 2: 541 | tmp = _mm_loadu_si16(data); 542 | break; 543 | case 3: 544 | tmp = _mm_loadu_si16(data); 545 | #if defined(__SSE4_1__) 546 | tmp = _mm_insert_epi8(tmp, data[2], 2); 547 | #else 548 | tmp = _mm_insert_epi16(tmp, data[2], 1); 549 | #endif 550 | break; 551 | case 4: 552 | tmp = _mm_loadu_si32(data); 553 | break; 554 | case 5: 555 | tmp = _mm_loadu_si32(data); 556 | #if defined(__SSE4_1__) 557 | tmp = _mm_insert_epi8(tmp, data[4], 4); 558 | #else 559 | tmp = _mm_insert_epi16(tmp, data[4], 2); 560 | #endif 561 | break; 562 | case 6: 563 | tmp = _mm_loadu_si32(data); 564 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 565 | break; 566 | case 7: 567 | tmp = _mm_loadu_si32(data); 568 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 569 | #if defined(__SSE4_1__) 570 | tmp = _mm_insert_epi8(tmp, data[6], 6); 571 | #else 572 | tmp = _mm_insert_epi16(tmp, data[6], 3); 573 | #endif 574 | break; 575 | case 8: 576 | tmp = _mm_loadu_si64(data); 577 | break; 578 | case 9: 579 | tmp = _mm_loadu_si64(data); 580 | #if defined(__SSE4_1__) 581 | tmp = _mm_insert_epi8(tmp, data[8], 8); 582 | #else 583 | tmp = _mm_insert_epi16(tmp, data[8], 4); 584 | #endif 585 | break; 586 | case 10: 587 | tmp = _mm_loadu_si64(data); 588 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 589 | break; 590 | case 11: 591 | tmp = _mm_loadu_si64(data); 592 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 593 | #if defined(__SSE4_1__) 594 | tmp = _mm_insert_epi8(tmp, data[10], 10); 595 | #else 596 | tmp = _mm_insert_epi16(tmp, data[10], 5); 597 | #endif 598 | break; 599 | case 12: 600 | tmp = _mm_loadu_si64(data); 601 | #if defined(__SSE4_1__) 602 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 603 | #else 604 | tmp2 = _mm_loadu_si32(data + 8); 605 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 606 | tmp = _mm_or_si128(tmp, tmp2); 607 | #endif 608 | break; 609 | case 13: 610 | tmp = _mm_loadu_si64(data); 611 | #if defined(__SSE4_1__) 612 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 613 | tmp = _mm_insert_epi8(tmp, data[12], 12); 614 | #else 615 | tmp2 = _mm_loadu_si32(data + 8); 616 | tmp2 = _mm_insert_epi16(tmp2, data[12], 2); 617 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 618 | tmp = _mm_or_si128(tmp, tmp2); 619 | #endif 620 | break; 621 | case 14: 622 | tmp = _mm_loadu_si64(data); 623 | #if defined(__SSE4_1__) 624 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 625 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data 
+ 12), 6); 626 | #else 627 | tmp2 = _mm_loadu_si32(data + 8); 628 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 629 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 630 | tmp = _mm_or_si128(tmp, tmp2); 631 | #endif 632 | break; 633 | case 15: 634 | tmp = _mm_loadu_si64(data); 635 | #if defined(__SSE4_1__) 636 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 637 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6); 638 | tmp = _mm_insert_epi8(tmp, data[14], 14); 639 | #else 640 | tmp2 = _mm_loadu_si32(data + 8); 641 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 642 | tmp2 = _mm_insert_epi16(tmp2, data[14], 7); 643 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 644 | tmp = _mm_or_si128(tmp, tmp2); 645 | #endif 646 | break; 647 | case 16: 648 | tmp = _mm_loadu_si128((__m128i*)data); 649 | break; 650 | } 651 | return tmp; 652 | } 653 | 654 | static const uint8_t khashv_shuff[16] = { 655 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8, 656 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1 657 | }; 658 | 659 | static __m128i khashv_hash_vector(__m128i hash, const uint8_t* data, size_t data_len) { 660 | const __m128i s1 = _mm_loadu_si128((const __m128i*)khashv_s1); 661 | const __m128i s2 = _mm_loadu_si128((const __m128i*)khashv_s2); 662 | const __m128i shuff = _mm_loadu_si128((const __m128i*)khashv_shuff); 663 | const __m128i mask = _mm_set1_epi32(0x0f0f0f0f); 664 | 665 | __m128i tmp_1; 666 | __m128i tmp_2; 667 | 668 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 669 | tmp_1 = _mm_cvtsi64_si128(data_len); 670 | #else 671 | tmp_1 = _mm_cvtsi32_si128(data_len); 672 | #endif 673 | hash = _mm_xor_si128(tmp_1, hash); 674 | 675 | const uint8_t* end = data + (data_len & ~((size_t)15)); 676 | const uint8_t* end2 = data + data_len; 677 | while(data_len > 16 && data < end) { 678 | tmp_1 = _mm_lddqu_si128((const __m128i*)data); 679 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 680 | 681 | tmp_1 = _mm_and_si128 (tmp_1, mask); 682 | tmp_2 = _mm_and_si128 (tmp_2, mask); 683 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 684 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 685 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 686 | 687 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 688 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 689 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 690 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 691 | hash = _mm_add_epi32 (tmp_2, tmp_1); 692 | 693 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 694 | hash = _mm_add_epi32(hash, tmp_1); 695 | 696 | data += 16; 697 | } 698 | uintptr_t trailing = end2 - data; 699 | if(trailing) { 700 | tmp_1 = khashv_part_load_vector(data, trailing); 701 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 702 | 703 | tmp_1 = _mm_and_si128 (tmp_1, mask); 704 | tmp_2 = _mm_and_si128 (tmp_2, mask); 705 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 706 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 707 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 708 | 709 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 710 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 711 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 712 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 713 | hash = _mm_add_epi32 (tmp_2, tmp_1); 714 | 715 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 716 | hash = _mm_add_epi32(hash, tmp_1); 717 | } 718 | hash = khashv_mix_words_vector(hash); 719 | return hash; 720 | } 721 | 722 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 723 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 724 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 725 | seed_prepped->vec = khashv_mix_words_vector(s); 726 | } 727 | 728 | static void 
khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 729 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 730 | __m128i t = _mm_cvtsi32_si128(seed >> 32); 731 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 732 | s = khashv_mix_words_vector(s); 733 | s = _mm_xor_si128(s, _mm_shuffle_epi32(t, 0xf3)); 734 | seed_prepped->vec = khashv_mix_words_vector(s); 735 | } 736 | 737 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 738 | seed_prepped->vec = _mm_loadu_si128((const __m128i*)seed); 739 | } 740 | 741 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 742 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 743 | // using word[3] to avoid any overlap with the 744 | // 64-bit hash which uses words [0] and [1], this ensures 745 | // the two outputs should behave differently when used. 746 | #if defined(__SSE4_1__) 747 | return _mm_extract_epi32(h, 3); 748 | #else 749 | h = _mm_shuffle_epi32(h, 0xff); 750 | return _mm_cvtsi128_si32(h); 751 | #endif 752 | } 753 | 754 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 755 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 756 | return _mm_cvtsi128_si64(h); 757 | } 758 | 759 | #endif 760 | 761 | /* Vectorization via GCC's vector builtins */ 762 | // Handy since it allows vectorization without explicit intrinsics 763 | // for a particular CPU. 764 | 765 | #if !defined(KHASH_VECTOR) && KHASH_GCC_LEAST__(6, 1) 766 | 767 | #define KHASH_VECTOR 1 768 | 769 | typedef uint8_t kv16ui __attribute__((vector_size(16))); 770 | typedef uint32_t kv4ui __attribute__((vector_size(16))); 771 | 772 | static KHASH_FINLINE kv16ui khashv_sub_s1_gcc(kv16ui in) { 773 | const kv16ui mask = { 774 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 775 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf 776 | }; 777 | const kv16ui sub = { 778 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 779 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 780 | }; 781 | in &= mask; 782 | return __builtin_shuffle(sub, in); 783 | } 784 | 785 | static KHASH_FINLINE kv16ui khashv_sub_s2_gcc(kv16ui in) { 786 | const kv16ui sub = { 787 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 788 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 789 | }; 790 | in >>= 4; 791 | return __builtin_shuffle(sub, in); 792 | } 793 | 794 | static KHASH_FINLINE kv4ui khashv_rotr_5_bytes_gcc(kv4ui input) { 795 | const kv16ui rotrLE = { 796 | 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 797 | 0xd, 0xe, 0xf, 0x0, 0x1, 0x2, 0x3, 0x4 798 | }; 799 | const kv16ui rotrBE = { 800 | 0xb, 0x4, 0x5, 0x6, 0xf, 0x8, 0x9, 0xa, 801 | 0x3, 0xc, 0xd, 0xe, 0x7, 0x0, 0x1, 0x2 802 | }; 803 | kv16ui tmp; 804 | memcpy(&tmp, &input, 16); 805 | if (khashv_is_little_endian()) { 806 | tmp = __builtin_shuffle(tmp, rotrLE); 807 | } else { 808 | tmp = __builtin_shuffle(tmp, rotrBE); 809 | } 810 | memcpy(&input, &tmp, 16); 811 | return input; 812 | } 813 | 814 | static KHASH_FINLINE kv4ui khashv_shuffle_bytes_gcc(kv4ui input) { 815 | const kv16ui shuffLE = { 816 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8, 817 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1 818 | }; 819 | const kv16ui shuffBE = { 820 | 0x3, 0xa, 0xd, 0x4, 0xb, 0xe, 0xc, 0xf, 821 | 0x0, 0x5, 0x8, 0x6, 0x2, 0x9, 0x1, 0x7, 822 | }; 823 | kv16ui tmp; 824 | memcpy(&tmp, &input, 16); 825 | if (khashv_is_little_endian()) { 826 | tmp = __builtin_shuffle(tmp, shuffLE); 827 | } else { 828 | tmp = __builtin_shuffle(tmp,
shuffBE); 829 | } 830 | memcpy(&input, &tmp, 16); 831 | return input; 832 | } 833 | 834 | static KHASH_FINLINE kv4ui khash_byteswap_vec32_gcc( kv4ui input ) { 835 | const kv16ui bswap32 = { 836 | 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 837 | 0xb, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc, 838 | }; 839 | kv16ui b; 840 | 841 | memcpy(&b, &input, 16); 842 | b = __builtin_shuffle(b, bswap32); 843 | memcpy(&input, &b, 16); 844 | return input; 845 | } 846 | 847 | static KHASH_FINLINE kv4ui khashv_replace_gcc(kv4ui input) { 848 | kv16ui s1; 849 | kv16ui s2; 850 | memcpy(&s1, &input, 16); 851 | s2 = khashv_sub_s2_gcc(s1); 852 | s1 = khashv_sub_s1_gcc(s1); 853 | s1 ^= s2; 854 | memcpy(&input, &s1, 16); 855 | return input; 856 | } 857 | 858 | static KHASH_FINLINE kv4ui khashv_mix_words_gcc(kv4ui val) { 859 | const unsigned rots[4] = { 5, 7, 11, 17 }; 860 | kv4ui tmp = val >> 3; 861 | val ^= tmp; 862 | for (int i = 0; i < 4; i++) { 863 | unsigned rot = rots[i]; 864 | kv4ui tmp = val; 865 | tmp = khashv_rotr_5_bytes_gcc(tmp); 866 | tmp += val; 867 | tmp = (tmp >> rot) | (tmp << (32 - rot)); 868 | val ^= tmp; 869 | } 870 | return val; 871 | } 872 | 873 | static KHASH_FINLINE kv4ui khashv_hash_block_gcc(kv4ui hash, kv4ui input) { 874 | kv4ui tmp_1 = khashv_replace_gcc(input); 875 | if (!khashv_is_little_endian()) { 876 | tmp_1 = khash_byteswap_vec32_gcc(tmp_1); 877 | } 878 | kv4ui tmp_2 = tmp_1 * 8193; 879 | tmp_2 ^= hash; 880 | tmp_2 = khashv_rotr_5_bytes_gcc(tmp_2); 881 | hash = tmp_1 + tmp_2; 882 | 883 | tmp_1 = khashv_shuffle_bytes_gcc(hash); 884 | hash = hash + tmp_1; 885 | return hash; 886 | } 887 | 888 | static KHASH_FINLINE kv4ui khashv_hash_gcc(kv4ui hash, const uint8_t* data, size_t data_len) { 889 | hash[0] ^= data_len; 890 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 891 | hash[1] ^= data_len >> 32; 892 | #endif 893 | 894 | kv4ui data_v; 895 | const uint8_t* end = data + (data_len & ~((size_t)15)); 896 | while (data < end) { 897 | memcpy(&data_v, data, 16); 898 | hash = khashv_hash_block_gcc(hash, data_v); 899 | data += 16; 900 | } 901 | 902 | unsigned trailing = data_len & 0xf; 903 | if(trailing) { 904 | memset(&data_v, 0, 16); 905 | memcpy(&data_v, data, trailing); 906 | hash = khashv_hash_block_gcc(hash, data_v); 907 | } 908 | return khashv_mix_words_gcc(hash); 909 | } 910 | 911 | 912 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 913 | kv4ui s; 914 | memcpy(&s, &khash_v_init, 16); 915 | s[0] ^= seed; 916 | s = khashv_mix_words_gcc(s); 917 | memcpy(seed_prepped, &s, 16); 918 | } 919 | 920 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 921 | kv4ui s; 922 | memcpy(&s, &khash_v_init, 16); 923 | s[0] ^= seed; 924 | s = khashv_mix_words_gcc(s); 925 | s[1] ^= seed >> 32; 926 | s = khashv_mix_words_gcc(s); 927 | memcpy(seed_prepped, &s, 16); 928 | } 929 | 930 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 931 | memcpy(seed_prepped->words, seed, 16); 932 | } 933 | 934 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 935 | kv4ui h; 936 | memcpy(&h, seed, 16); 937 | h = khashv_hash_gcc(h, data, data_len); 938 | return h[3]; 939 | } 940 | 941 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 942 | kv4ui h; 943 | memcpy(&h, seed, 16); 944 | h = khashv_hash_gcc(h, data, data_len); 945 | uint64_t ret; 946 | if (khashv_is_little_endian()) { 947 | memcpy(&ret, &h, 8); 948 | } else { 949 | ret = 
h[1]; 950 | ret = (ret << 32) | h[0]; 951 | } 952 | return ret; 953 | } 954 | 955 | #endif 956 | 957 | #if defined(KHASH_VECTOR) && !defined(KHASHV_SCALAR) 958 | 959 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 960 | khashv_prep_seed32_vector(seed_prepped, seed); 961 | } 962 | 963 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 964 | khashv_prep_seed64_vector(seed_prepped, seed); 965 | } 966 | 967 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 968 | khashv_prep_seed128_vector(seed_prepped, seed); 969 | } 970 | 971 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 972 | return khashv32_vector(seed, data, data_len); 973 | } 974 | 975 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 976 | return khashv64_vector(seed, data, data_len); 977 | } 978 | 979 | #else 980 | 981 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 982 | khashv_prep_seed32_scalar(seed_prepped, seed); 983 | } 984 | 985 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 986 | khashv_prep_seed64_scalar(seed_prepped, seed); 987 | } 988 | 989 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 990 | khashv_prep_seed128_scalar(seed_prepped, seed); 991 | } 992 | 993 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 994 | return khashv32_scalar(seed, data, data_len); 995 | } 996 | 997 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 998 | return khashv64_scalar(seed, data, data_len); 999 | } 1000 | 1001 | #endif 1002 | 1003 | 1004 | #ifdef __cplusplus 1005 | } 1006 | #endif 1007 | #endif 1008 | -------------------------------------------------------------------------------- /k-hashv-old/khashv_v1.h: -------------------------------------------------------------------------------- 1 | /* 2 | MIT License 3 | Copyright (c) 2022 Keith-Cancel 4 | Permission is hereby granted, free of charge, to any person obtaining a copy 5 | of this software and associated documentation files (the “Software”), to deal 6 | in the Software without restriction, including without limitation the rights 7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 8 | copies of the Software, and to permit persons to whom the Software is 9 | furnished to do so, subject to the following conditions: 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 18 | SOFTWARE. 
19 | */ 20 | 21 | #ifndef K_HASH_V_H 22 | #define K_HASH_V_H 23 | #ifdef __cplusplus 24 | extern "C" { 25 | #define restrict 26 | #endif 27 | 28 | #include <stdint.h> 29 | #include <stddef.h> 30 | #include <string.h> 31 | #include <limits.h> 32 | 33 | // For MSVC compiler, no __SSE3__ macro 34 | #if !defined(__SSE3__) && (defined(__AVX__) || defined(__AVX2__)) 35 | #define __SSE3__ 36 | #endif 37 | // Same deal 38 | #if !defined(__SSE4_1__) && (defined(__AVX__) || defined(__AVX2__)) 39 | #define __SSE4_1__ 40 | #endif 41 | 42 | #if defined(__SSE3__) 43 | #include <immintrin.h> 44 | #if defined(__MINGW32__) || defined(_WIN32) 45 | #include <intrin.h> 46 | #endif 47 | #endif 48 | 49 | #if defined(__GNUC__) && !defined(__clang__) 50 | #define KHASH_GCC_LEAST__(maj, min) (__GNUC__ > maj || __GNUC__ == maj && __GNUC_MINOR__ >= min) 51 | #else 52 | #define KHASH_GCC_LEAST__(maj, min) 0 53 | #endif 54 | 55 | #if defined(__BYTE_ORDER__) && !defined(__BYTE_ORDER) 56 | #define __BYTE_ORDER __BYTE_ORDER__ 57 | #endif 58 | 59 | #if defined(__ORDER_LITTLE_ENDIAN__) && !defined(__LITTLE_ENDIAN) 60 | #define __LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__ 61 | #endif 62 | 63 | #if defined(__ORDER_BIG_ENDIAN__) && !defined(__BIG_ENDIAN) 64 | #define __BIG_ENDIAN __ORDER_BIG_ENDIAN__ 65 | #endif 66 | 67 | 68 | #if defined(__clang__) && defined(__has_attribute) 69 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 70 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(5, 0) 71 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr) 72 | #else 73 | #define KHASH_CHK_ATTRIBUTE__(attr) 0 74 | #endif 75 | 76 | #if defined(__clang__) && defined(__has_builtin) 77 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 78 | #elif defined(__has_builtin) && KHASH_GCC_LEAST__(10, 1) 79 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built) 80 | #else 81 | #define KHASH_CHK_BUILTIN__(built) 0 82 | #endif 83 | 84 | #if defined(_MSC_VER) && !defined(__clang__) 85 | #define KHASH_FINLINE __forceinline 86 | #define KHASH_BSWAP32(val) _byteswap_ulong(val) 87 | #endif 88 | 89 | #if !defined(KHASH_FINLINE) && (KHASH_CHK_ATTRIBUTE__(always_inline) || KHASH_GCC_LEAST__(3, 1)) 90 | #define KHASH_FINLINE __attribute__((always_inline)) inline 91 | #endif 92 | 93 | #if !defined(KHASH_BSWAP32) && (KHASH_CHK_BUILTIN__(__builtin_bswap32) || KHASH_GCC_LEAST__(4, 5)) 94 | #define KHASH_BSWAP32(val) __builtin_bswap32(val) 95 | #endif 96 | 97 | #if !defined(KHASH_OPT_SZ) && (KHASH_CHK_ATTRIBUTE__(optimize) || KHASH_GCC_LEAST__(4, 8)) 98 | #define KHASH_OPT_SZ __attribute__((optimize("Os"))) 99 | #endif 100 | 101 | #if !defined(KHASH_FINLINE) 102 | #define KHASH_FINLINE inline 103 | #endif 104 | 105 | #if !defined(KHASH_OPT_SZ) 106 | #define KHASH_OPT_SZ 107 | #endif 108 | 109 | #if !defined(KHASH_BSWAP32) 110 | #define KHASH_BSWAP32(val) ((val >> 24) | ((val >> 8) & 0xff00) | ((val << 8) & 0xff0000) | (val << 24)) 111 | #endif 112 | 113 | static KHASH_FINLINE int khashv_is_little_endian() { 114 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN 115 | return 1; 116 | #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN 117 | return 0; 118 | #elif defined(__BYTE_ORDER) 119 | #error "Mixed/Middle endian machine, you will need to write a custom byteswap routine" 120 | #else 121 | // Otherwise hope the compiler's optimizer figures out this is constant. 122 | // When the byte order macro does not exist, middle-endian/mixed-endian 123 | // machines cannot be ruled out, but they are quite rare/old, so they 124 | // are not handled specially here. (There are 4! = 24 possible byte 125 | // orderings of a 32-bit word in total.) If the compiler does not define 126 | // __BYTE_ORDER, the hash output will differ on such machines, but the 127 | // hash should still work fine. 128 | unsigned int x = 1; 129 | return *((char*)(&x)) == 1; 130 | #endif 131 | } 132 | 133 | #define KHASH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) 134 | 135 | struct khashv_block_s { 136 | union { 137 | uint8_t bytes[16]; 138 | uint32_t words[4]; 139 | #if defined(__SSE3__) 140 | __m128i vec; 141 | #endif 142 | }; 143 | }; 144 | 145 | typedef struct khashv_block_s khashvBlock; 146 | typedef struct khashv_block_s khashvSeed; 147 | 148 | static const khashvBlock khash_v_init = { 149 | .words = { 150 | // Really this could basically be almost anything 151 | // So just using some bytes of the SHA-256 hashes 152 | // of 1, 2, 3, and 4 153 | 0x7785459a, // SHA256 of the byte 0x01, using the last 4 bytes 154 | 0x6457d986, // SHA256 of the byte 0x02, using the last 4 bytes 155 | 0xadff29c5, // SHA256 of the byte 0x03, using the last 4 bytes 156 | 0x81c89e71, // SHA256 of the byte 0x04, using the last 4 bytes 157 | }}; 158 | 159 | static const uint8_t khashv_s1[16] = { 160 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 161 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 162 | }; 163 | 164 | static const uint8_t khashv_s2[16] = { 165 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 166 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 167 | }; 168 | 169 | static const uint8_t khashv_xored[256] = { 170 | 0xf3, 0xb2, 0x17, 0x0c, 0x2e, 0x73, 0x35, 0x58, 171 | 0x8c, 0x7e, 0xb6, 0x5c, 0xc4, 0x4a, 0x01, 0xfd, 172 | 0xd2, 0x93, 0x36, 0x2d, 0x0f, 0x52, 0x14, 0x79, 173 | 0xad, 0x5f, 0x97, 0x7d, 0xe5, 0x6b, 0x20, 0xdc, 174 | 0x7a, 0x3b, 0x9e, 0x85, 0xa7, 0xfa, 0xbc, 0xd1, 175 | 0x05, 0xf7, 0x3f, 0xd5, 0x4d, 0xc3, 0x88, 0x74, 176 | 0xef, 0xae, 0x0b, 0x10, 0x32, 0x6f, 0x29, 0x44, 177 | 0x90, 0x62, 0xaa, 0x40, 0xd8, 0x56, 0x1d, 0xe1, 178 | 0xea, 0xab, 0x0e, 0x15, 0x37, 0x6a, 0x2c, 0x41, 179 | 0x95, 0x67, 0xaf, 0x45, 0xdd, 0x53, 0x18, 0xe4, 180 | 0x3d, 0x7c, 0xd9, 0xc2, 0xe0, 0xbd, 0xfb, 0x96, 181 | 0x42, 0xb0, 0x78, 0x92, 0x0a, 0x84, 0xcf, 0x33, 182 | 0x5e, 0x1f, 0xba, 0xa1, 0x83, 0xde, 0x98, 0xf5, 183 | 0x21, 0xd3, 0x1b, 0xf1, 0x69, 0xe7, 0xac, 0x50, 184 | 0xb9, 0xf8, 0x5d, 0x46, 0x64, 0x39, 0x7f, 0x12, 185 | 0xc6, 0x34, 0xfc, 0x16, 0x8e, 0x00, 0x4b, 0xb7, 186 | 0x0d, 0x4c, 0xe9, 0xf2, 0xd0, 0x8d, 0xcb, 0xa6, 187 | 0x72, 0x80, 0x48, 0xa2, 0x3a, 0xb4, 0xff, 0x03, 188 | 0xb1, 0xf0, 0x55, 0x4e, 0x6c, 0x31, 0x77, 0x1a, 189 | 0xce, 0x3c, 0xf4, 0x1e, 0x86, 0x08, 0x43, 0xbf, 190 | 0x47, 0x06, 0xa3, 0xb8, 0x9a, 0xc7, 0x81, 0xec, 191 | 0x38, 0xca, 0x02, 0xe8, 0x70, 0xfe, 0xb5, 0x49, 192 | 0xda, 0x9b, 0x3e, 0x25, 0x07, 0x5a, 0x1c, 0x71, 193 | 0xa5, 0x57, 0x9f, 0x75, 0xed, 0x63, 0x28, 0xd4, 194 | 0x6e, 0x2f, 0x8a, 0x91, 0xb3, 0xee, 0xa8, 0xc5, 195 | 0x11, 0xe3, 0x2b, 0xc1, 0x59, 0xd7, 0x9c, 0x60, 196 | 0x24, 0x65, 0xc0, 0xdb, 0xf9, 0xa4, 0xe2, 0x8f, 197 | 0x5b, 0xa9, 0x61, 0x8b, 0x13, 0x9d, 0xd6, 0x2a, 198 | 0x89, 0xc8, 0x6d, 0x76, 0x54, 0x09, 0x4f, 0x22, 199 | 0xf6, 0x04, 0xcc, 0x26, 0xbe, 0x30, 0x7b, 0x87, 200 | 0x66, 0x27, 0x82, 0x99, 0xbb, 0xe6, 0xa0, 0xcd, 201 | 0x19, 0xeb, 0x23, 0xc9, 0x51, 0xdf, 0x94, 0x68, 202 | }; 203 | 204 | /* Scalar Code */ 205 | 206 | static KHASH_FINLINE void khashv_bswap_be_block_scalar(khashvBlock* in) { 207 | // Byte swapping is only needed if we are not on a little endian system 208 | if (khashv_is_little_endian()) { 209 | return; 210 | } 211 | for(int i = 0; i < 4; i++) { 212 |
in->words[i] = KHASH_BSWAP32(in->words[i]); 213 | } 214 | } 215 | 216 | static KHASH_FINLINE void khashv_rotr_5_bytes_scalar(khashvBlock* in) { 217 | khashv_bswap_be_block_scalar(in); 218 | khashvBlock tmp1; 219 | khashvBlock tmp2; 220 | // Avoid aliasing issues by using memcpy between these union values. 221 | memcpy(tmp1.bytes, in->words, 16); 222 | for(int i = 0; i < 16; i++) { 223 | tmp2.bytes[i] = tmp1.bytes[(i + 5) & 0xf]; 224 | } 225 | memcpy(in->words, tmp2.bytes, 16); 226 | khashv_bswap_be_block_scalar(in); 227 | } 228 | 229 | static KHASH_FINLINE void khashv_rotr_9_bytes_scalar(khashvBlock* in) { 230 | khashv_bswap_be_block_scalar(in); 231 | khashvBlock tmp1; 232 | khashvBlock tmp2; 233 | // Avoid aliasing issues by using memcpy between these union values. 234 | memcpy(tmp1.bytes, in->words, 16); 235 | for(int i = 0; i < 16; i++) { 236 | tmp2.bytes[i] = tmp1.bytes[(i + 9) & 0xf]; 237 | } 238 | memcpy(in->words, tmp2.bytes, 16); 239 | khashv_bswap_be_block_scalar(in); 240 | } 241 | 242 | static KHASH_FINLINE void khashv_shl_13_block_scalar(khashvBlock* in) { 243 | for(int i = 0; i < 4; i++) { 244 | in->words[i] <<= 13; 245 | } 246 | } 247 | 248 | static KHASH_FINLINE void khashv_shr_3_block_scalar(khashvBlock* in) { 249 | for(int i = 0; i < 4; i++) { 250 | in->words[i] >>= 3; 251 | } 252 | } 253 | 254 | static KHASH_FINLINE void khashv_add_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 255 | for(int i = 0; i < 4; i++) { 256 | a->words[i] += b->words[i]; 257 | } 258 | } 259 | 260 | static KHASH_FINLINE void khashv_xor_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) { 261 | for(int i = 0; i < 4; i++) { 262 | a->words[i] ^= b->words[i]; 263 | } 264 | } 265 | 266 | // GCC and Clang were vectorizing this quite poorly with -O3 267 | // They could not detect that only a PSHUFB was needed and instead 268 | // were generating tons of inserts and extracts from the vector 269 | // registers. Thus it was running slower than code that was not being 270 | // vectorized on my machine. So I specify the optimization level directly. 271 | // Tried a few other things to get GCC and Clang to generate more sane 272 | // code or code using PSHUFB, but this seemed the cleanest. 273 | // Example of what I mean: https://godbolt.org/z/PMnzsThPc 274 | // Compared to this: https://godbolt.org/z/dWfjr7GWP 275 | /*static KHASH_OPT_SZ void khashv_sub16(khashvBlock* tmp, const uint8_t sub[16]) { 276 | #if defined(__clang__) 277 | // Stop clang from being annoying!!!
278 | // The auto-vectorized code was worse at the time of writing this 279 | #pragma nounroll 280 | #pragma clang loop vectorize(disable) 281 | #pragma clang loop interleave(disable) 282 | #endif 283 | for (int i = 0; i < 16; i++) { 284 | tmp->bytes[i] = sub[tmp->bytes[i]]; 285 | } 286 | } 287 | 288 | static KHASH_FINLINE void khashv_replace_scalar(khashvBlock* replace) { 289 | khashvBlock tmp; 290 | for (int i = 0; i < 16; i++) { 291 | tmp.bytes[i] = (replace->bytes[i] >> 4); 292 | replace->bytes[i] &= 0x0f; 293 | } 294 | khashv_sub16(replace, khashv_s1); 295 | khashv_sub16(&tmp, khashv_s2); 296 | for (int i = 0; i < 16; i++) { 297 | replace->bytes[i] ^= tmp.bytes[i]; 298 | } 299 | }*/ 300 | // Similar issue to the commented-out code above, so keep the optimizers 301 | // from going crazy here as well 302 | static KHASH_OPT_SZ void khashv_replace_scalar(khashvBlock* replace) { 303 | khashvBlock tmp; 304 | memcpy(tmp.bytes, replace->words, 16); 305 | #if defined(__clang__) 306 | // Stop clang from being annoying!!! 307 | // The auto-vectorized code was worse at the time of writing this 308 | #pragma nounroll 309 | #pragma clang loop vectorize(disable) 310 | #pragma clang loop interleave(disable) 311 | #endif 312 | for(int i = 0; i < 16; i++) { 313 | tmp.bytes[i] = khashv_xored[tmp.bytes[i]]; 314 | } 315 | memcpy(replace->words, tmp.bytes, 16); 316 | } 317 | 318 | static KHASH_FINLINE void khashv_mix_words_scalar(khashvBlock* in) { 319 | unsigned rots[4] = { 5, 7, 11, 17 }; 320 | khashvBlock tmp = { 0 }; 321 | for (int i = 0; i < 4; i++) { 322 | unsigned rot = rots[i]; 323 | tmp = *in; 324 | khashv_rotr_5_bytes_scalar(&tmp); 325 | khashv_add_block_scalar(&tmp, in); 326 | for (int j = 0; j < 4; j++) { 327 | tmp.words[j] = KHASH_ROTR32(tmp.words[j], rot); 328 | } 329 | khashv_xor_block_scalar(in, &tmp); 330 | } 331 | } 332 | 333 | static void khashv_hash_scalar(khashvBlock* hash, const uint8_t* data, size_t data_len) { 334 | hash->words[0] ^= data_len; 335 | // When size_t is bigger than 32 bits 336 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 337 | hash->words[1] ^= data_len >> 32; 338 | #endif 339 | 340 | khashvBlock tmp_1; 341 | khashvBlock tmp_2; 342 | khashvBlock tmp_h = *hash; 343 | 344 | const uint8_t* end = data + (data_len & ~((size_t)15)); 345 | 346 | while (data < end) { 347 | memcpy(&tmp_2, data, 16); 348 | khashv_replace_scalar(&tmp_2); 349 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 350 | 351 | khashv_bswap_be_block_scalar(&tmp_1); 352 | 353 | tmp_2 = tmp_1; 354 | //khashv_shl_13_block_scalar(&tmp_2); 355 | //khashv_add_block_scalar(&tmp_2, &tmp_1); // x * 8193 == x + (x << 13), so one multiply replaces the shift + add pair 356 | for(int i = 0; i < 4; i++) { 357 | tmp_2.words[i] *= 8193; 358 | } 359 | khashv_xor_block_scalar(&tmp_h, &tmp_2); 360 | khashv_rotr_5_bytes_scalar(&tmp_h); 361 | khashv_add_block_scalar(&tmp_h, &tmp_1); 362 | 363 | tmp_2 = tmp_h; 364 | khashv_shr_3_block_scalar(&tmp_2); 365 | khashv_rotr_9_bytes_scalar(&tmp_h); 366 | khashv_add_block_scalar(&tmp_h, &tmp_2); 367 | 368 | data += 16; 369 | } 370 | 371 | unsigned trailing = data_len & 0xf; 372 | if(trailing) { 373 | memset(&tmp_2, 0, 16); 374 | 375 | memcpy(&tmp_2.bytes, data, trailing); 376 | khashv_replace_scalar(&tmp_2); 377 | memcpy(&tmp_1.words, tmp_2.bytes, 16); 378 | 379 | khashv_bswap_be_block_scalar(&tmp_1); 380 | 381 | tmp_2 = tmp_1; 382 | //khashv_shl_13_block_scalar(&tmp_2); 383 | //khashv_add_block_scalar(&tmp_2, &tmp_1); 384 | for(int i = 0; i < 4; i++) { 385 | tmp_2.words[i] *= 8193; 386 | } 387 | khashv_xor_block_scalar(&tmp_h, &tmp_2); 388 | khashv_rotr_5_bytes_scalar(&tmp_h); 389 |
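// Fold the substituted input block back into the rotated state, just as in the 16-byte loop above.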
khashv_add_block_scalar(&tmp_h, &tmp_1); 390 | 391 | tmp_2 = tmp_h; 392 | khashv_shr_3_block_scalar(&tmp_2); 393 | khashv_rotr_9_bytes_scalar(&tmp_h); 394 | khashv_add_block_scalar(&tmp_h, &tmp_2); 395 | 396 | } 397 | khashv_mix_words_scalar(&tmp_h); 398 | *hash = tmp_h; 399 | } 400 | 401 | static inline void khashv_prep_seed32_scalar(khashvSeed* seed_prepped, uint32_t seed) { 402 | *seed_prepped = khash_v_init; 403 | seed_prepped->words[0] ^= seed; 404 | khashv_mix_words_scalar(seed_prepped); 405 | } 406 | 407 | static inline void khashv_prep_seed64_scalar(khashvSeed* seed_prepped, uint64_t seed) { 408 | *seed_prepped = khash_v_init; 409 | seed_prepped->words[0] ^= seed; 410 | khashv_mix_words_scalar(seed_prepped); 411 | // Do it again with the other part to make it different from the 32 bit seed. 412 | seed_prepped->words[1] ^= seed >> 32; 413 | khashv_mix_words_scalar(seed_prepped); 414 | } 415 | 416 | static inline void khashv_prep_seed128_scalar(khashvSeed* seed_prepped, const uint32_t seed[4]) { 417 | for(int i = 0; i < 4; i++) { 418 | seed_prepped->words[i] = seed[i]; 419 | } 420 | } 421 | 422 | static inline uint32_t khashv32_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 423 | khashvBlock h = *seed; 424 | khashv_hash_scalar(&h, data, data_len); 425 | return h.words[3]; 426 | } 427 | 428 | static inline uint64_t khashv64_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 429 | khashvBlock h = *seed; 430 | khashv_hash_scalar(&h, data, data_len); 431 | uint64_t r = h.words[1]; 432 | r <<= 32; 433 | r |= h.words[0]; 434 | return r; 435 | } 436 | 437 | /* Vectorization for Intel/AMD */ 438 | 439 | #if defined(__SSE3__) 440 | 441 | #define KHASH_VECTOR 1 442 | 443 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(11, 0)) 444 | static KHASH_FINLINE __m128i _mm_loadu_si32(const void* data) { 445 | uint32_t val; 446 | memcpy(&val, data, sizeof(uint32_t)); 447 | return _mm_cvtsi32_si128(val); 448 | } 449 | static KHASH_FINLINE __m128i _mm_loadu_si16(const void* data) { 450 | uint32_t val = 0; 451 | memcpy(&val, data, sizeof(uint16_t)); 452 | return _mm_cvtsi32_si128(val); 453 | } 454 | #endif 455 | 456 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(9, 1)) 457 | static KHASH_FINLINE __m128i _mm_loadu_si64(const void* data) { 458 | uint64_t val = 0; 459 | memcpy(&val, data, sizeof(uint64_t)); 460 | return _mm_cvtsi64_si128(val); 461 | } 462 | #endif 463 | 464 | static KHASH_FINLINE __m128i khashv_mix_words_vector(__m128i val) { 465 | __m128i tmp1; 466 | __m128i tmp2; 467 | 468 | tmp1 = _mm_alignr_epi8(val, val, 5); 469 | tmp1 = _mm_add_epi32(val, tmp1); 470 | #if defined(__AVX512VL__) 471 | tmp1 = _mm_ror_epi32(tmp1, 5); 472 | val = _mm_xor_si128(val, tmp1); 473 | #else 474 | tmp2 = _mm_srli_epi32(tmp1, 5); 475 | tmp1 = _mm_slli_epi32(tmp1, 27); 476 | val = _mm_xor_si128(val, tmp2); 477 | val = _mm_xor_si128(val, tmp1); 478 | #endif 479 | 480 | tmp1 = _mm_alignr_epi8(val, val, 5); 481 | tmp1 = _mm_add_epi32(val, tmp1); 482 | #if defined(__AVX512VL__) 483 | tmp1 = _mm_ror_epi32(tmp1, 7); 484 | val = _mm_xor_si128(val, tmp1); 485 | #else 486 | tmp2 = _mm_srli_epi32(tmp1, 7); 487 | tmp1 = _mm_slli_epi32(tmp1, 25); 488 | val = _mm_xor_si128(val, tmp2); 489 | val = _mm_xor_si128(val, tmp1); 490 | #endif 491 | 492 | tmp1 = _mm_alignr_epi8(val, val, 5); 493 | tmp1 = _mm_add_epi32(tmp1, val); 494 | #if defined(__AVX512VL__) 495 | tmp1 = _mm_ror_epi32(tmp1, 11); 496 | val = _mm_xor_si128(val, tmp1); 497 |
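// Without AVX512VL's _mm_ror_epi32, the #else paths emulate each 32-bit rotate with a shift pair.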
#else 498 | tmp2 = _mm_srli_epi32(tmp1, 11); 499 | tmp1 = _mm_slli_epi32(tmp1, 21); 500 | val = _mm_xor_si128(val, tmp2); 501 | val = _mm_xor_si128(val, tmp1); 502 | #endif 503 | 504 | tmp1 = _mm_alignr_epi8(val, val, 5); 505 | tmp1 = _mm_add_epi32(tmp1, val); 506 | #if defined(__AVX512VL__) 507 | tmp1 = _mm_ror_epi32(tmp1, 17); 508 | val = _mm_xor_si128(val, tmp1); 509 | #else 510 | tmp2 = _mm_srli_epi32(tmp1, 17); 511 | tmp1 = _mm_slli_epi32(tmp1, 15); 512 | val = _mm_xor_si128(val, tmp2); 513 | val = _mm_xor_si128(val, tmp1); 514 | #endif 515 | 516 | return val; 517 | } 518 | 519 | static KHASH_FINLINE __m128i khashv_part_load_vector(const uint8_t* data, size_t len) { 520 | __m128i tmp = { 0 }; 521 | __m128i tmp2 = { 0 }; 522 | switch(len) { 523 | case 1: 524 | #if defined(__SSE4_1__) 525 | tmp = _mm_insert_epi8(tmp, data[0], 0); 526 | #else 527 | tmp = _mm_cvtsi32_si128(data[0]); 528 | #endif 529 | break; 530 | case 2: 531 | tmp = _mm_loadu_si16(data); 532 | break; 533 | case 3: 534 | tmp = _mm_loadu_si16(data); 535 | #if defined(__SSE4_1__) 536 | tmp = _mm_insert_epi8(tmp, data[2], 2); 537 | #else 538 | tmp = _mm_insert_epi16(tmp, data[2], 1); 539 | #endif 540 | break; 541 | case 4: 542 | tmp = _mm_loadu_si32(data); 543 | break; 544 | case 5: 545 | tmp = _mm_loadu_si32(data); 546 | #if defined(__SSE4_1__) 547 | tmp = _mm_insert_epi8(tmp, data[4], 4); 548 | #else 549 | tmp = _mm_insert_epi16(tmp, data[4], 2); 550 | #endif 551 | break; 552 | case 6: 553 | tmp = _mm_loadu_si32(data); 554 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 555 | break; 556 | case 7: 557 | tmp = _mm_loadu_si32(data); 558 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2); 559 | #if defined(__SSE4_1__) 560 | tmp = _mm_insert_epi8(tmp, data[6], 6); 561 | #else 562 | tmp = _mm_insert_epi16(tmp, data[6], 3); 563 | #endif 564 | break; 565 | case 8: 566 | tmp = _mm_loadu_si64(data); 567 | break; 568 | case 9: 569 | tmp = _mm_loadu_si64(data); 570 | #if defined(__SSE4_1__) 571 | tmp = _mm_insert_epi8(tmp, data[8], 8); 572 | #else 573 | tmp = _mm_insert_epi16(tmp, data[8], 4); 574 | #endif 575 | break; 576 | case 10: 577 | tmp = _mm_loadu_si64(data); 578 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 579 | break; 580 | case 11: 581 | tmp = _mm_loadu_si64(data); 582 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4); 583 | #if defined(__SSE4_1__) 584 | tmp = _mm_insert_epi8(tmp, data[10], 10); 585 | #else 586 | tmp = _mm_insert_epi16(tmp, data[10], 5); 587 | #endif 588 | break; 589 | case 12: 590 | tmp = _mm_loadu_si64(data); 591 | #if defined(__SSE4_1__) 592 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 593 | #else 594 | tmp2 = _mm_loadu_si32(data + 8); 595 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 596 | tmp = _mm_or_si128(tmp, tmp2); 597 | #endif 598 | break; 599 | case 13: 600 | tmp = _mm_loadu_si64(data); 601 | #if defined(__SSE4_1__) 602 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 603 | tmp = _mm_insert_epi8(tmp, data[12], 12); 604 | #else 605 | tmp2 = _mm_loadu_si32(data + 8); 606 | tmp2 = _mm_insert_epi16(tmp2, data[12], 2); 607 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 608 | tmp = _mm_or_si128(tmp, tmp2); 609 | #endif 610 | break; 611 | case 14: 612 | tmp = _mm_loadu_si64(data); 613 | #if defined(__SSE4_1__) 614 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 615 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6); 616 | #else 617 | tmp2 = _mm_loadu_si32(data + 8); 618 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 619 | tmp2 = 
_mm_shuffle_epi32(tmp2, 0x4f); 620 | tmp = _mm_or_si128(tmp, tmp2); 621 | #endif 622 | break; 623 | case 15: 624 | tmp = _mm_loadu_si64(data); 625 | #if defined(__SSE4_1__) 626 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2); 627 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6); 628 | tmp = _mm_insert_epi8(tmp, data[14], 14); 629 | #else 630 | tmp2 = _mm_loadu_si32(data + 8); 631 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6); 632 | tmp2 = _mm_insert_epi16(tmp2, data[14], 7); 633 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f); 634 | tmp = _mm_or_si128(tmp, tmp2); 635 | #endif 636 | break; 637 | case 16: 638 | tmp = _mm_loadu_si64(data); 639 | #if defined(__SSE4_1__) 640 | tmp = _mm_insert_epi64(tmp, *(uint64_t*)(data + 8), 1); 641 | #else 642 | tmp2 = _mm_loadu_si64(data + 8); 643 | tmp = _mm_unpacklo_epi64(tmp, tmp2); 644 | #endif 645 | break; 646 | } 647 | return tmp; 648 | } 649 | 650 | static const uint8_t khashv_shuff[16] = { 651 | 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 652 | 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 653 | }; 654 | 655 | static __m128i khashv_hash_vector(__m128i hash, const uint8_t* data, size_t data_len) { 656 | const __m128i s1 = _mm_loadu_si128((const __m128i*)khashv_s1); 657 | const __m128i s2 = _mm_loadu_si128((const __m128i*)khashv_s2); 658 | const __m128i shuff = _mm_loadu_si128((const __m128i*)khashv_shuff); 659 | const __m128i mask = _mm_set1_epi32(0x0f0f0f0f); 660 | 661 | __m128i tmp_1; 662 | __m128i tmp_2; 663 | 664 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 665 | tmp_1 = _mm_cvtsi64_si128(data_len); 666 | #else 667 | tmp_1 = _mm_cvtsi32_si128(data_len); 668 | #endif 669 | hash = _mm_xor_si128(tmp_1, hash); 670 | 671 | const uint8_t* end = data + (data_len & ~((size_t)15)); 672 | const uint8_t* end2 = data + data_len; 673 | while(data_len > 16 && data < end) { 674 | tmp_1 = _mm_lddqu_si128((const __m128i*)data); 675 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 676 | 677 | tmp_1 = _mm_and_si128 (tmp_1, mask); 678 | tmp_2 = _mm_and_si128 (tmp_2, mask); 679 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 680 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 681 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 682 | 683 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 684 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 685 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 686 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 687 | hash = _mm_add_epi32 (tmp_2, tmp_1); 688 | 689 | tmp_2 = _mm_srli_epi32(hash, 3); 690 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 691 | hash = _mm_add_epi32 (tmp_2, tmp_1); 692 | 693 | data += 16; 694 | } 695 | uintptr_t trailing = end2 - data; 696 | if(trailing) { 697 | tmp_1 = khashv_part_load_vector(data, trailing); 698 | tmp_2 = _mm_srli_epi32 (tmp_1, 4); 699 | 700 | tmp_1 = _mm_and_si128 (tmp_1, mask); 701 | tmp_2 = _mm_and_si128 (tmp_2, mask); 702 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1); 703 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2); 704 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2); 705 | 706 | tmp_2 = _mm_slli_epi32 (tmp_1, 13); 707 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2); 708 | tmp_2 = _mm_xor_si128 (hash, tmp_2); 709 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5); 710 | hash = _mm_add_epi32 (tmp_2, tmp_1); 711 | 712 | tmp_2 = _mm_srli_epi32(hash, 3); 713 | tmp_1 = _mm_shuffle_epi8(hash, shuff); 714 | hash = _mm_add_epi32 (tmp_2, tmp_1); 715 | } 716 | hash = khashv_mix_words_vector(hash); 717 | return hash; 718 | } 719 | 720 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 721 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 722 
| s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 723 | seed_prepped->vec = khashv_mix_words_vector(s); 724 | } 725 | 726 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 727 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init); 728 | __m128i t = _mm_cvtsi32_si128(seed >> 32); 729 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed)); 730 | s = khashv_mix_words_vector(s); 731 | s = _mm_xor_si128(s, _mm_shuffle_epi32(t, 0xf3)); 732 | seed_prepped->vec = khashv_mix_words_vector(s); 733 | } 734 | 735 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 736 | seed_prepped->vec = _mm_loadu_si128((const __m128i*)seed); 737 | } 738 | 739 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 740 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 741 | // using word[3] to avoid any overlap with the 742 | // 64 bit hash which uses words [0] and [1], this ensures 743 | // the two outputs behave differently when used. 744 | #if defined(__SSE4_1__) 745 | return _mm_extract_epi32(h, 3); 746 | #else 747 | h = _mm_shuffle_epi32(h, 0xff); 748 | return _mm_cvtsi128_si32(h); 749 | #endif 750 | } 751 | 752 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 753 | __m128i h = khashv_hash_vector(seed->vec, data, data_len); 754 | return _mm_cvtsi128_si64(h); 755 | } 756 | 757 | #endif 758 | 759 | /* Vectorization via GCC's vectorization builtins */ 760 | // Handy since it allows vectorization without explicit intrinsics 761 | // for a particular CPU. 762 | 763 | #if !defined(KHASH_VECTOR) && KHASH_GCC_LEAST__(6, 1) 764 | 765 | #define KHASH_VECTOR 1 766 | 767 | typedef uint8_t kv16ui __attribute__((vector_size(16))); 768 | typedef uint32_t kv4ui __attribute__((vector_size(16))); 769 | 770 | static KHASH_FINLINE kv16ui khashv_sub_s1_gcc(kv16ui in) { 771 | const kv16ui mask = { 772 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 773 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf 774 | }; 775 | const kv16ui sub = { 776 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7, 777 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12, 778 | }; 779 | in &= mask; 780 | return __builtin_shuffle(sub, in); 781 | } 782 | 783 | static KHASH_FINLINE kv16ui khashv_sub_s2_gcc(kv16ui in) { 784 | const kv16ui sub = { 785 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5, 786 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a, 787 | }; 788 | in >>= 4; 789 | return __builtin_shuffle(sub, in); 790 | } 791 | 792 | static KHASH_FINLINE kv4ui khashv_rotr_5_bytes_gcc(kv4ui input) { 793 | const kv16ui rotrLE = { 794 | 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 795 | 0xd, 0xe, 0xf, 0x0, 0x1, 0x2, 0x3, 0x4 796 | }; 797 | const kv16ui rotrBE = { 798 | 0xb, 0x4, 0x5, 0x6, 0xf, 0x8, 0x9, 0xa, 799 | 0x3, 0xc, 0xd, 0xe, 0x7, 0x0, 0x1, 0x2 800 | }; 801 | kv16ui tmp; 802 | memcpy(&tmp, &input, 16); 803 | if (khashv_is_little_endian()) { 804 | tmp = __builtin_shuffle(tmp, rotrLE); 805 | } else { 806 | tmp = __builtin_shuffle(tmp, rotrBE); 807 | } 808 | memcpy(&input, &tmp, 16); 809 | return input; 810 | } 811 | 812 | static KHASH_FINLINE kv4ui khashv_rotr_9_bytes_gcc(kv4ui input) { 813 | const kv16ui rotrLE = { 814 | 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0, 815 | 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 816 | }; 817 | const kv16ui rotrBE = { 818 | 0xf, 0x8, 0x9, 0xa, 0x3, 0xc, 0xd, 0xe, 819 | 0x7, 0x0, 0x1, 0x2, 0xb, 0x4, 0x5, 0x6, 820 | }; 821 | kv16ui tmp; 822 | memcpy(&tmp, &input, 16); 823
| if (khashv_is_little_endian()) { 824 | tmp = __builtin_shuffle(tmp, rotrLE); 825 | } else { 826 | tmp = __builtin_shuffle(tmp, rotrBE); 827 | } 828 | memcpy(&input, &tmp, 16); 829 | return input; 830 | } 831 | 832 | static KHASH_FINLINE kv4ui khash_byteswap_vec32_gcc( kv4ui input ) { 833 | const kv16ui bswap32 = { 834 | 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4, 835 | 0xb, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc, 836 | }; 837 | kv16ui b; 838 | 839 | memcpy(&b, &input, 16); 840 | b = __builtin_shuffle(b, bswap32); 841 | memcpy(&input, &b, 16); 842 | return input; 843 | } 844 | 845 | static KHASH_FINLINE kv4ui khashv_replace_gcc(kv4ui input) { 846 | kv16ui s1; 847 | kv16ui s2; 848 | memcpy(&s1, &input, 16); 849 | s2 = khashv_sub_s2_gcc(s1); 850 | s1 = khashv_sub_s1_gcc(s1); 851 | s1 ^= s2; 852 | memcpy(&input, &s1, 16); 853 | return input; 854 | } 855 | 856 | static KHASH_FINLINE kv4ui khashv_mix_words_gcc(kv4ui val) { 857 | const unsigned rots[4] = { 5, 7, 11, 17 }; 858 | for (int i = 0; i < 4; i++) { 859 | unsigned rot = rots[i]; 860 | kv4ui tmp = val; 861 | tmp = khashv_rotr_5_bytes_gcc(tmp); 862 | tmp += val; 863 | tmp = (tmp >> rot) | (tmp << (32 - rot)); 864 | val ^= tmp; 865 | } 866 | return val; 867 | } 868 | 869 | static KHASH_FINLINE kv4ui khashv_hash_block_gcc(kv4ui hash, kv4ui input) { 870 | kv4ui tmp_1 = khashv_replace_gcc(input); 871 | if (!khashv_is_little_endian()) { 872 | tmp_1 = khash_byteswap_vec32_gcc(tmp_1); 873 | } 874 | kv4ui tmp_2 = tmp_1 * 8193; 875 | tmp_2 ^= hash; 876 | tmp_2 = khashv_rotr_5_bytes_gcc(tmp_2); 877 | hash = tmp_1 + tmp_2; 878 | 879 | tmp_2 = hash >> 3; 880 | tmp_1 = khashv_rotr_9_bytes_gcc(hash); 881 | hash = tmp_1 + tmp_2; 882 | return hash; 883 | } 884 | 885 | static KHASH_FINLINE kv4ui khashv_hash_gcc(kv4ui hash, const uint8_t* data, size_t data_len) { 886 | hash[0] ^= data_len; 887 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295 888 | hash[1] ^= data_len >> 32; 889 | #endif 890 | 891 | kv4ui data_v; 892 | const uint8_t* end = data + (data_len & ~((size_t)15)); 893 | while (data < end) { 894 | memcpy(&data_v, data, 16); 895 | hash = khashv_hash_block_gcc(hash, data_v); 896 | data += 16; 897 | } 898 | 899 | unsigned trailing = data_len & 0xf; 900 | if(trailing) { 901 | memset(&data_v, 0, 16); 902 | memcpy(&data_v, data, trailing); 903 | hash = khashv_hash_block_gcc(hash, data_v); 904 | } 905 | return khashv_mix_words_gcc(hash); 906 | } 907 | 908 | 909 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) { 910 | kv4ui s; 911 | memcpy(&s, &khash_v_init, 16); 912 | s[0] ^= seed; 913 | s = khashv_mix_words_gcc(s); 914 | memcpy(seed_prepped, &s, 16); 915 | } 916 | 917 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) { 918 | kv4ui s; 919 | memcpy(&s, &khash_v_init, 16); 920 | s[0] ^= seed; 921 | s = khashv_mix_words_gcc(s); 922 | s[1] ^= seed >> 32; 923 | s = khashv_mix_words_gcc(s); 924 | memcpy(seed_prepped, &s, 16); 925 | } 926 | 927 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) { 928 | memcpy(seed_prepped->words, seed, 16); 929 | } 930 | 931 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 932 | kv4ui h; 933 | memcpy(&h, seed, 16); 934 | h = khashv_hash_gcc(h, data, data_len); 935 | return h[3]; 936 | } 937 | 938 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 939 | kv4ui h; 940 | memcpy(&h, seed, 16); 941 | h = khashv_hash_gcc(h, data, data_len); 942 | 
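// Words [0] (low) and [1] (high) form the 64-bit result; a plain 8-byte copy is only correct on little-endian targets, so it is assembled explicitly otherwise.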
uint64_t ret; 943 | if (khashv_is_little_endian()) { 944 | memcpy(&ret, &h, 8); 945 | } else { 946 | ret = h[1]; 947 | ret = (ret << 32) | h[0]; 948 | } 949 | return ret; 950 | } 951 | 952 | #endif 953 | 954 | #if defined(KHASH_VECTOR) && !defined(KHASHV_SCALAR) 955 | 956 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 957 | khashv_prep_seed32_vector(seed_prepped, seed); 958 | } 959 | 960 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 961 | khashv_prep_seed64_vector(seed_prepped, seed); 962 | } 963 | 964 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 965 | khashv_prep_seed128_vector(seed_prepped, seed); 966 | } 967 | 968 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 969 | return khashv32_vector(seed, data, data_len); 970 | } 971 | 972 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 973 | return khashv64_vector(seed, data, data_len); 974 | } 975 | 976 | #else 977 | 978 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) { 979 | khashv_prep_seed32_scalar(seed_prepped, seed); 980 | } 981 | 982 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) { 983 | khashv_prep_seed64_scalar(seed_prepped, seed); 984 | } 985 | 986 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) { 987 | khashv_prep_seed128_scalar(seed_prepped, seed); 988 | } 989 | 990 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 991 | return khashv32_scalar(seed, data, data_len); 992 | } 993 | 994 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) { 995 | return khashv64_scalar(seed, data, data_len); 996 | } 997 | 998 | #endif 999 | 1000 | 1001 | #ifdef __cplusplus 1002 | } 1003 | #endif 1004 | #endif 1005 | --------------------------------------------------------------------------------
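A minimal usage sketch of the public wrappers defined at the end of both headers (`khashv_prep_seed128`, `khashv32`, `khashv64`); the seed words and message below are arbitrary placeholders, not values from the library:

```C
#include <stdio.h>
#include "khashv.h"

int main(void) {
    // Arbitrary example values; any 128-bit seed and byte buffer work.
    const uint32_t seed_words[4] = { 0x01, 0x02, 0x03, 0x04 };
    const uint8_t  msg[]         = "example message";
    size_t         msg_len       = sizeof(msg) - 1; // exclude the terminating NUL

    khashvSeed seed;
    khashv_prep_seed128(&seed, seed_words);

    // word [3] feeds the 32-bit output and words [0] and [1] the 64-bit one,
    // so the two results do not overlap.
    uint32_t h32 = khashv32(&seed, msg, msg_len);
    uint64_t h64 = khashv64(&seed, msg, msg_len);
    printf("khashv32: %08x\nkhashv64: %016llx\n", h32, (unsigned long long)h64);
    return 0;
}
```

Note that the wrappers dispatch to the vector implementations whenever `KHASH_VECTOR` is set; defining `KHASHV_SCALAR` before including the header forces the scalar fallbacks even when a vector path is available.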