├── .gitattributes
├── khashv32-seed-6bb75f13.png
├── khashv64-seed-1dcedff1a8b17e89.png
├── .gitignore
├── LICENSE
├── k-hashv-old
│   ├── README_v1.md
│   ├── test_speed_v1.c
│   └── khashv_v1.h
├── test_speed.c
├── README.md
└── khashv.h
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/khashv32-seed-6bb75f13.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Keith-Cancel/k-hashv/HEAD/khashv32-seed-6bb75f13.png
--------------------------------------------------------------------------------
/khashv64-seed-1dcedff1a8b17e89.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Keith-Cancel/k-hashv/HEAD/khashv64-seed-1dcedff1a8b17e89.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Prerequisites
2 | *.d
3 |
4 | # Object files
5 | *.o
6 | *.ko
7 | *.obj
8 | *.elf
9 |
10 | # Linker output
11 | *.ilk
12 | *.map
13 | *.exp
14 |
15 | # Precompiled Headers
16 | *.gch
17 | *.pch
18 |
19 | # Libraries
20 | *.lib
21 | *.a
22 | *.la
23 | *.lo
24 |
25 | # Shared objects (inc. Windows DLLs)
26 | *.dll
27 | *.so
28 | *.so.*
29 | *.dylib
30 |
31 | # Executables
32 | *.exe
33 | *.out
34 | *.app
35 | *.i*86
36 | *.x86_64
37 | *.hex
38 |
39 | # Debug files
40 | *.dSYM/
41 | *.su
42 | *.idb
43 | *.pdb
44 |
45 | # Kernel Module Compile Results
46 | *.mod*
47 | *.cmd
48 | .tmp_versions/
49 | modules.order
50 | Module.symvers
51 | Mkfile.old
52 | dkms.conf
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 Keith-Cancel
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/k-hashv-old/README_v1.md:
--------------------------------------------------------------------------------
1 | # K-HASHV
2 | A single header hash function with both vectorized and scalar versions. The function is quite fast when vectorized, achieving an average of **~9.6 GB/s** on a 7-year-old Xeon E3-1230 v5.
3 |
4 | Additionally, it passes all of the SMHasher hash function quality tests: https://github.com/rurban/smhasher
5 |
6 | Moreover, it is quite easy to choose a new function at runtime by simply using a new seed, as shown below:
7 | ```C
8 | #include "khashv.h"
9 |
10 | void foo() {
11 | /*
12 | code ....
13 | */
14 | khashvSeed seed;
15 | khashv_prep_seed64(&seed, a_64_bit_value);
16 | uint64_t hash = khashv64(&seed, your_data, data_len);
17 | /*
18 | code ....
19 | */
20 | }
21 | ```
22 | ### Note
23 | This is **not a cryptographic hash function**, and it should not be used for such applications.
24 |
25 | ## Performance
26 | When testing on 1.25 GB and 512 KB of random data I get the following averages:
27 |
28 | | Processor | 1.25 GB Time | 1.25 GB Speed | 512 KB Time | 512 KB Speed | OS | Compiler | Type |
29 | | --- | --- | --- | --- | --- | --- | --- | --- |
30 | | Xeon E3-1230 v5 | 0.1298 s | 9.6285 GB/s | 052.5107 us | 9.2987 GB/s | Linux | GCC 12.1.0 | Vectorized |
31 | | Xeon E3-1230 v5 | 1.1911 s | 1.0495 GB/s | 494.1932 us | 0.9880 GB/s | Linux | GCC 12.1.0 | Scalar |
32 | | Xeon E3-1230 v5 | 0.1418 s | 8.8142 GB/s | 055.9333 us | 8.7297 GB/s | Linux | Clang 14.0.6 | Vectorized |
33 | | Ryzen 9 7900 | 0.1227 s | 10.1881 GB/s | 046.0273 us | 10.6085 GB/s | Linux | GCC 12.2.1 | Vectorized |
34 | | Ryzen 9 7900 | 0.8693 s | 1.4379 GB/s | 375.0820 us | 1.3018 GB/s | Linux | GCC 12.2.1 | Scalar |
35 |
36 |
37 |
38 | The scalar version is slower, at a tad over 1 GB/s on my system, when compiling test_speed.c with gcc using `-O3`.
39 | On Windows, Microsoft's compiler does not seem to generate code from the intrinsics that is as performant, but the GCC mingw64 compiler generates pretty comparable numbers, for me at least.
40 |
41 | I definitely want to add other machines to this table. If you are curious how it performs on your machine, compile test_speed.c with `-O3 -march=native` and `-O3 -march=native -D KHASHV_SCALAR`.
42 |
43 | ## Functions
44 | ```C
45 | // Prepares a seed from a 32-bit value
46 | void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed)
47 | ```
48 |
49 | ```C
50 | // Prepares a seed from a 64-bit value
51 | void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed)
52 | ```
53 |
54 | ```C
55 | // Sets 128-bits to be the seed
56 | void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4])
57 | ```
58 |
59 | ```C
60 | // Produces a 32-bit hash from the input data
61 | uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len)
62 | ```
63 |
64 | ```C
65 | // Produces a 64-bit hash from the input data
66 | uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len)
67 | ```
68 |
69 | ## K-HASHV 64 Output
70 | Here is the output of the 64-bit hash of the integers \[0, 259199\] using 0x1dcedff1a8b17e89 as the seed.
71 |
72 | 
73 |
74 | ## K-HASHV 32 Output
75 |
76 | Here is the output of the 32-bit hash of the integers \[0, 518399\] using 0x6bb75f13 as the seed.
77 |
78 | 
79 |
80 | The images above were generated by essentially doing the following for each hash:
81 |
82 | ```C
83 | for(int i = 0; i < sizeof(hash_bytes); i++) {
84 | pixel[img_offset + i].r = hash_bytes[i];
85 | pixel[img_offset + i].g = hash_bytes[i];
86 | pixel[img_offset + i].b = hash_bytes[i];
87 | pixel[img_offset + i].a = 255;
88 | }
89 | ```
90 |
91 | ## Things TODO
92 | When thinking about ways to improve the code and the hash function, these are the first few things that come to mind for me.
93 |
94 | 1. The main thing would be to try to get both Clang and MSVC to output code that runs as fast as GCC's, or as close as possible. They both seem to do some silly things when compared to GCC, losing some performance, judging by the generated assembly. Microsoft's compiler is the worst, and probably the fastest fix for me to implement would be to write some assembly code. However, it would then no longer be a single header file hash function, since MSVC does not support inline assembly for 64-bit builds and would thus require a separate file.
95 |
96 | 2. Then probably consider using intrinsics for some other systems like ARM NEON, but for now there is scalar code, plus code written using GCC's vector built-ins that will generate vectorized code for the other architectures GCC supports.
97 |
98 | 3. Probably the next thing I can think of is choosing better values for S1 and S2, which are used to substitute bytes. The current values were found randomly while checking a small set of criteria, mainly treating each bit of S1 and S2 as a column, XOR-ing them to effectively create a boolean function of an 8-bit input, and making sure the whole mapping sends each input to a unique value. There are likely better values that could be chosen, and better criteria that consider all the bits at once. However, the search space is huge, effectively 2^(2\*8\*16) possible permutations for S1 and S2. That said, the current values do seem to work well in my testing.
99 |
100 | ### Suggestions
101 | I am open to any other suggestions or improvements.
102 |
--------------------------------------------------------------------------------
/k-hashv-old/test_speed_v1.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdint.h>
3 | #include <stdlib.h>
4 | #include <float.h>
5 | #include "khashv.h"
6 |
7 | #if defined(__MINGW32__) || defined(_WIN32)
8 | #include <windows.h>
9 |
10 | #define get_timer(x) QueryPerformanceCounter(&x)
11 |
12 | typedef LARGE_INTEGER timer;
13 |
14 | uint64_t time_ns(timer* start, timer* stop) {
15 | LARGE_INTEGER freq;
16 | if(!QueryPerformanceFrequency(&freq)) {
17 | return UINT64_MAX;
18 | }
19 | double ns = stop->QuadPart - start->QuadPart;
20 | double ratio = 1000000000.0; // 1 billion ns = 1 second
21 | ratio /= (double)freq.QuadPart;
22 | ns *= ratio;
23 | return (uint64_t)ns;
24 | }
25 |
26 | #else
27 | #include <time.h>
28 | #define get_timer(x) clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &x)
29 | typedef struct timespec timer;
30 |
31 | uint64_t time_ns(timer* start, timer* stop) {
32 | int secs = stop->tv_sec - start->tv_sec;
33 | if(secs > 0) {
34 | uint64_t t0_ns = start->tv_sec * 1000000000;
35 | uint64_t t1_ns = stop->tv_sec * 1000000000;
36 | t0_ns += start->tv_nsec;
37 | t1_ns += stop->tv_nsec;
38 | return t1_ns - t0_ns;
39 | }
40 | return stop->tv_nsec - start->tv_nsec;
41 | }
42 | #endif
43 |
44 | #define MB_TO_BYTES(x) (1024ULL * 1024ULL * (x))
45 |
46 | double get_gbs(double t_ns, double gigs) {
47 | t_ns /= 1000000000;
48 | return gigs / t_ns;
49 | }
50 |
51 | double get_secs(double t_ns) {
52 | return t_ns / 1000000000.0;
53 | }
54 |
55 | double get_usecs(double t_ns) {
56 | return t_ns / 1000.0;
57 | }
58 |
59 | void populate_memory(const khashvSeed* seed, uint8_t* bytes, size_t size) {
60 | printf("Populating Memory: ");
61 | // Use the hash to populate memory with pseudo random bytes
62 | uint64_t state[2] = { 0x4d9ef2f9a304588a, 0x58ca10a39947b63b };
63 | for(size_t i = 0; i < size; i += sizeof(uint64_t)) {
64 | if(i != 0 && (i & 0x1ffffff) == 0) {
65 | printf(".");
66 | fflush(stdout);
67 | }
68 | state[0] = khashv64(seed, (uint8_t*)state, sizeof(uint64_t) * 2);
69 | memcpy(bytes + i, state, sizeof(uint64_t));
70 | }
71 | puts(" Populated!");
72 | }
73 |
74 | int gig_tests(khashvSeed seed) {
75 | size_t size = MB_TO_BYTES(1280);
76 | uint8_t* bytes = malloc(size);
77 | if(bytes == NULL) {
78 | fprintf(stderr, "Can not allocate memory for test!\n");
79 | return 1;
80 | }
81 | populate_memory(&seed, bytes, size);
82 |
83 | double gigs = (double)size / (double)MB_TO_BYTES(1024);
84 | double sum = 0;
85 | double fastest = DBL_MAX;
86 |
87 | const uint32_t hashes[12] = {
88 | 0xa9ca46b1, 0x8c9f5264, 0x2094ffd9, 0x93946e70, 0x9b71dd71,
89 | 0x2abeec74, 0x6bca7368, 0x151fff30, 0xc4228495, 0xfad35669,
90 | 0x9f151590, 0x20a4045b
91 | };
92 |
93 | printf("Tests on %.3lf GB block: ", gigs);
94 | fflush(stdout);
95 | for(unsigned i = 0; i < 12; i++) {
96 | timer t0;
97 | timer t1;
98 | get_timer(t0);
99 | uint32_t h = khashv32(&seed, bytes, size);
100 | get_timer(t1);
101 |
102 | if(h != hashes[i]) {
103 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]);
104 | }
105 |
106 | double t = time_ns(&t0, &t1);
107 | if(t < fastest) {
108 | fastest = t;
109 | }
110 | sum += t;
111 | bytes[i] += 1;
112 | }
113 |
114 | double avg = sum / 12;
115 | printf(
116 | "Avg: %lf GB/s, Avg Time: %lf s, Fastest: %lf GB/s\n",
117 | get_gbs (avg, gigs),
118 | get_secs(avg),
119 | get_gbs (fastest, gigs)
120 | );
121 | fflush(stdout);
122 | free(bytes);
123 | return 0;
124 | }
125 |
126 | int half_mb_tests(khashvSeed seed) {
127 | size_t size = 1024 * 512;
128 | uint8_t* bytes = malloc(size);
129 | if(bytes == NULL) {
130 | fprintf(stderr, "Can not allocate memory for test!\n");
131 | return 1;
132 | }
133 | populate_memory(&seed, bytes, size);
134 |
135 | double gigs = (double)size / (double)MB_TO_BYTES(1024);
136 | double sum = 0;
137 | double fastest = DBL_MAX;
138 | unsigned count = 96;
139 | const uint32_t hashes[96] = {
140 | 0x249b844b, 0x852e481c, 0xf7ce4779, 0x5b1e79c0, 0xc6280b69,
141 | 0x18aaed1f, 0x360a7b70, 0x6691373f, 0x62b0e7d2, 0x503f2a13,
142 | 0x55784198, 0x0449e145, 0xc1fec259, 0xfdde4bcc, 0x3d040585,
143 | 0x2d54b62c, 0x70f06c7e, 0xcc7a642f, 0xe784348b, 0xe360bb8a,
144 | 0xd4653bab, 0x129aac4f, 0xdf09ac90, 0xc770d23f, 0x1865b60c,
145 | 0x366d8ca9, 0x80b13f6f, 0x7317d810, 0x7816b809, 0x919adedb,
146 | 0x92713259, 0xb15e9216, 0x4cca4cd2, 0xb0bda9b9, 0xa3eb6a63,
147 | 0x1801f592, 0x7f6ebdfe, 0xcfd5f33c, 0x000c7082, 0x17265e0b,
148 | 0x6ba10359, 0x8c74f4eb, 0x803f3c08, 0x4ba6860d, 0x0716f9fb,
149 | 0x6e3c84ae, 0xe77a48f4, 0xc2374c75, 0x97f403ee, 0x3010b84b,
150 | 0x560ba778, 0x83103235, 0xfd4adabf, 0xa436bcf0, 0xaa8f96dc,
151 | 0x29922bec, 0xd5468b54, 0x4b1921b8, 0x2a8ce2d5, 0x86e336f4,
152 | 0x5fab2354, 0x0e07c225, 0xb181782a, 0xe799459f, 0xcf9541fd,
153 | 0xcd510976, 0xe70010ea, 0x6202cb22, 0x7d253b79, 0x4d047b53,
154 | 0xbd26b2ba, 0xc1df8a17, 0x48a6ed87, 0xa980b22c, 0x16b27278,
155 | 0xb5736e7c, 0x368bd0b9, 0xeee76414, 0xfe58e49d, 0xf3500e6d,
156 | 0xb57df9f5, 0xb52a7ed6, 0xaca79612, 0xccc9f98a, 0xa7140bd0,
157 | 0x7e45d2f9, 0xb91ddced, 0x9444f706, 0xa477bfb2, 0xcf7e1d5b,
158 | 0xd95eab3c, 0x737fa6e5, 0x5f548e79, 0x46539426, 0xef41aa94,
159 | 0xc0357213
160 | };
161 |
162 | printf("Tests on 512 KB block: ");
163 | fflush(stdout);
164 | for(unsigned i = 0; i < count; i++) {
165 | timer t0;
166 | timer t1;
167 | get_timer(t0);
168 | uint32_t h = khashv32(&seed, bytes, size);
169 | get_timer(t1);
170 |
171 | if(h != hashes[i]) {
172 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]);
173 | }
174 |
175 | double t = time_ns(&t0, &t1);
176 | if(t < fastest) {
177 | fastest = t;
178 | }
179 | sum += t;
180 | bytes[i] += 1;
181 | }
182 |
183 | double avg = sum / count;
184 | printf(
185 | "Avg: %lf GB/s, Avg Time: %lf us, Fastest: %lf GB/s\n",
186 | get_gbs (avg, gigs),
187 | get_usecs(avg),
188 | get_gbs (fastest, gigs)
189 | );
190 | fflush(stdout);
191 | free(bytes);
192 | return 0;
193 | }
194 |
195 | int main(int argc, char** argv) {
196 | khashvSeed seed;
197 | khashv_prep_seed64(&seed, 0xa9c163c960d480fb);
198 |
199 | if(gig_tests(seed)) {
200 | return 1;
201 | }
202 | if(half_mb_tests(seed)) {
203 | return 1;
204 | }
205 | return 0;
206 | }
--------------------------------------------------------------------------------
/test_speed.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdint.h>
3 | #include <stdlib.h>
4 | #include <float.h>
5 | #include <math.h>
6 | #include "khashv.h"
7 |
8 | #if defined(__MINGW32__) || defined(_WIN32)
9 | #include <windows.h>
10 |
11 | #define get_timer(x) QueryPerformanceCounter(&x)
12 |
13 | typedef LARGE_INTEGER timer;
14 |
15 | uint64_t time_ns(timer* start, timer* stop) {
16 | LARGE_INTEGER freq;
17 | if(!QueryPerformanceFrequency(&freq)) {
18 | return UINT64_MAX;
19 | }
20 | double ns = stop->QuadPart - start->QuadPart;
21 | double ratio = 1000000000.0; // 1 billion ns = 1 second
22 | ratio /= (double)freq.QuadPart;
23 | ns *= ratio;
24 | return (uint64_t)ns;
25 | }
26 |
27 | #else
28 | #include <time.h>
29 | #define get_timer(x) clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &x)
30 | typedef struct timespec timer;
31 |
32 | uint64_t time_ns(timer* start, timer* stop) {
33 | int secs = stop->tv_sec - start->tv_sec;
34 | if(secs > 0) {
35 | uint64_t t0_ns = start->tv_sec * 1000000000;
36 | uint64_t t1_ns = stop->tv_sec * 1000000000;
37 | t0_ns += start->tv_nsec;
38 | t1_ns += stop->tv_nsec;
39 | return t1_ns - t0_ns;
40 | }
41 | return stop->tv_nsec - start->tv_nsec;
42 | }
43 | #endif
44 |
45 | #define MB_TO_BYTES(x) (1024ULL * 1024ULL * (x))
46 |
47 | double get_gbs(double t_ns, double gigs) {
48 | t_ns /= 1000000000;
49 | return gigs / t_ns;
50 | }
51 |
52 | double get_secs(double t_ns) {
53 | return t_ns / 1000000000.0;
54 | }
55 |
56 | double get_usecs(double t_ns) {
57 | return t_ns / 1000.0;
58 | }
59 |
60 | void populate_memory(const khashvSeed* seed, uint8_t* bytes, size_t size) {
61 | printf("Populating Memory: ");
62 | // Use the hash to populate memory with pseudo random bytes
63 | uint64_t state[2] = { 0x4d9ef2f9a304588a, 0x58ca10a39947b63b };
64 | for(size_t i = 0; i < size; i += sizeof(uint64_t)) {
65 | if(i != 0 && (i & 0x1ffffff) == 0) {
66 | printf(".");
67 | fflush(stdout);
68 | }
69 | state[0] = khashv64(seed, (uint8_t*)state, sizeof(uint64_t) * 2);
70 | memcpy(bytes + i, state, sizeof(uint64_t));
71 | }
72 | puts(" Populated!");
73 | }
74 |
75 | int gig_tests(khashvSeed seed) {
76 | size_t size = MB_TO_BYTES(1280);
77 | uint8_t* bytes = malloc(size);
78 | if(bytes == NULL) {
79 | fprintf(stderr, "Can not allocate memory for test!\n");
80 | return 1;
81 | }
82 | populate_memory(&seed, bytes, size);
83 |
84 | double gigs = (double)size / (double)MB_TO_BYTES(1024);
85 | double sum = 0;
86 | double fastest = DBL_MAX;
87 |
88 | const uint32_t hashes[12] = {
89 | 0x8b4c1a33, 0x485105dc, 0xaf1deb0e, 0x2d4a890c, 0x8349b700,
90 | 0x29a3b3b9, 0xf1ed93ef, 0x8559b73f, 0x11452eff, 0xefa5fe1f,
91 | 0x5834c363, 0xeb7224a5
92 | };
93 |
94 | printf("Tests on %.3lf GB block: ", gigs);
95 | fflush(stdout);
96 | for(unsigned i = 0; i < 12; i++) {
97 | timer t0;
98 | timer t1;
99 | get_timer(t0);
100 | uint32_t h = khashv32(&seed, bytes, size);
101 | get_timer(t1);
102 |
103 | if(h != hashes[i]) {
104 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]);
105 | }
106 |
107 | double t = time_ns(&t0, &t1);
108 | if(t < fastest) {
109 | fastest = t;
110 | }
111 | sum += t;
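112 |         // Perturb one byte so each iteration hashes slightly different
113 |         // data, matching the next entry in the expected-hash table above.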
112 | bytes[i] += 1;
113 | }
114 |
115 | double avg = sum / 12;
116 | double gbs = get_gbs (avg, gigs);
117 | double sec = get_secs(avg);
118 | double fgbs = get_gbs(fastest, gigs);
119 | gbs = round(gbs * 10000.0) / 10000.0;
120 | sec = round(sec * 10000.0) / 10000.0;
121 | fgbs = round(fgbs * 10000.0) / 10000.0;
122 | printf(
123 | "Avg: %.4lf GB/s, Avg Time: %.4lf s, Fastest: %.4lf GB/s\n",
124 | gbs,
125 | sec,
126 | fgbs
127 | );
128 | fflush(stdout);
129 | free(bytes);
130 | return 0;
131 | }
132 |
133 | int half_mb_tests(khashvSeed seed) {
134 | size_t size = 1024 * 512;
135 | uint8_t* bytes = malloc(size);
136 | if(bytes == NULL) {
137 | fprintf(stderr, "Can not allocate memory for test!\n");
138 | return 1;
139 | }
140 | populate_memory(&seed, bytes, size);
141 |
142 | double gigs = (double)size / (double)MB_TO_BYTES(1024);
143 | double sum = 0;
144 | double fastest = DBL_MAX;
145 | unsigned count = 96;
146 | const uint32_t hashes[96] = {
147 | 0x3b181e13, 0x6df3efe4, 0xa1472e2f, 0xe7fe7261, 0x85db611b,
148 | 0x95b68b46, 0xa4738539, 0xc67cd2b3, 0x4630444d, 0xb357f7a3,
149 | 0x60ba4613, 0x20d50be8, 0x5908392d, 0xd5c1411e, 0xa315f311,
150 | 0xe92b8d4a, 0x3504718c, 0x78d5d987, 0xac324986, 0xa9c146a3,
151 | 0xea4120ac, 0x1ab20115, 0xb4cf0fc0, 0x3726e7c6, 0x781b19b4,
152 | 0x897a635f, 0x49c879a6, 0x414f698e, 0xef3c3c66, 0x668de11e,
153 | 0xf6f2af8d, 0x6db89e5f, 0xa2621047, 0x26736838, 0xca8539cf,
154 | 0xe1e92796, 0xbd178553, 0x31aedc2d, 0x41f4377f, 0x0683f7a2,
155 | 0xff1d7f6f, 0x4a788c33, 0xb4823086, 0xf3b45106, 0xf2e12a97,
156 | 0x1505b0e8, 0x32d16f9d, 0xa4ccbd11, 0x61f6aa54, 0x8dc4eb8d,
157 | 0xe7ac77ca, 0xb00dd338, 0x9330ce85, 0xae721ca9, 0x236eb8a2,
158 | 0xcd7aba61, 0x2fbd751e, 0x978edc2c, 0x09ef6175, 0x78d12480,
159 | 0x08b21322, 0x02826493, 0x36244a76, 0xb7e1489c, 0x365c631f,
160 | 0x08188ea8, 0x92bd6910, 0xa7cf34d0, 0x9b91a005, 0x8c7cfc38,
161 | 0xf732ae18, 0x87f2f485, 0xa42d236d, 0x967880e3, 0xf04cb79d,
162 | 0xfd9d613f, 0xfa7ae694, 0xfb680e60, 0x2de7c7c9, 0xa5979af7,
163 | 0x6b24f6a3, 0xfebb25de, 0x3163a706, 0x7d8d0a35, 0xb5cacfcf,
164 | 0xdf774e72, 0xd06db96e, 0x16d7e8db, 0xf1e368e7, 0x21efe8d5,
165 | 0x59d6f29f, 0xb0ee28bc, 0x849b575e, 0x96887453, 0x2eabdd1f,
166 | 0x3cdc8fa8
167 | };
168 |
169 | printf("Tests on 512 KB block: ");
170 | fflush(stdout);
171 | for(unsigned i = 0; i < count; i++) {
172 | timer t0;
173 | timer t1;
174 | get_timer(t0);
175 | uint32_t h = khashv32(&seed, bytes, size);
176 | get_timer(t1);
177 |
178 | if(h != hashes[i]) {
179 | printf("Bad Hash: 0x%08x, expected: 0x%08x !!!\n", h, hashes[i]);
180 | }
181 |
182 | double t = time_ns(&t0, &t1);
183 | if(t < fastest) {
184 | fastest = t;
185 | }
186 | sum += t;
187 | bytes[i] += 1;
188 | }
189 |
190 | double avg = sum / count;
191 | double gbs = get_gbs (avg, gigs);
192 | double usec = get_usecs(avg);
193 | double fgbs = get_gbs (fastest, gigs);
194 | gbs = round(gbs * 10000.0) / 10000.0;
195 | usec = round(usec * 10000.0) / 10000.0;
196 | fgbs = round(fgbs * 10000.0) / 10000.0;
197 | printf(
198 | "Avg: %.4lf GB/s, Avg Time: %.4lf us, Fastest: %.4lf GB/s\n",
199 | gbs,
200 | usec,
201 | fgbs
202 | );
203 | fflush(stdout);
204 | free(bytes);
205 | return 0;
206 | }
207 |
208 | int main(int argc, char** argv) {
209 | khashvSeed seed;
210 | khashv_prep_seed64(&seed, 0xa9c163c960d480fb);
211 |
212 | if(gig_tests(seed)) {
213 | return 1;
214 | }
215 | if(half_mb_tests(seed)) {
216 | return 1;
217 | }
218 | return 0;
219 | }
220 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # K-HASHV 🔨
2 | A single header hash function with both vectorized and scalar versions. The function is quite fast when vectorized, achieving an average of **~10.2 GB/s** on a 9-year-old (as of 2024) Xeon E3-1230 v5. The header contains explicit intrinsics for x86_64, has a version that uses GCC's portable vector built-ins, and falls back to a scalar version for portability. The results of the function should be the same regardless of endianness.
3 |
4 | Additionally, it passes all of the SMHasher hash function quality tests: https://github.com/rurban/smhasher. It also passes [SMHasher3](https://gitlab.com/fwojcik/smhasher3/-/blob/c56f2bddc1b3e114570d5cbe383ad207673f6c99/results/README.md), a fork of SMHasher with some more stringent tests; some hashes that pass SMHasher fail in SMHasher3.
5 |
6 | Moreover, it is quite easy to choose a new function at runtime by simply using a new seed, as shown below:
7 | ```C
8 | #include "khashv.h"
9 |
10 | void foo() {
11 | /*
12 | code ....
13 | */
14 | khashvSeed seed;
15 | khashv_prep_seed64(&seed, a_64_bit_value);
16 | uint64_t hash = khashv64(&seed, your_data, data_len);
17 | /*
18 | code ....
19 | */
20 | }
21 | ```
22 |
23 | Issues, PRs and suggestions are welcome 😃
24 |
25 | ### Note
26 | This is **not a cryptographic hash function**, and it should not be used for such applications.
27 |
28 | # Table of Contents
29 | * [Performance](#performance)
30 | * [API](#api)
31 | * [khashv_prep_seed32](#khashv_prep_seed32)
32 | * [khashv_prep_seed64](#khashv_prep_seed64)
33 | * [khashv_prep_seed128](#khashv_prep_seed128)
34 | * [khashv32](#khashv32)
35 | * [khashv64](#khashv64)
36 | * [Output](#output)
37 | * [64-bit Output](#khashv64-output)
38 | * [32-bit Output](#khashv32-output)
39 | * [Notes](#notes)
40 | * [TODO](#todo)
41 | * [Copyright and License](#copyright-and-license)
42 |
43 | # Performance
44 | When testing on 1.25 GB and 512 KB of random data I get the following averages:
45 |
46 | | Processor | 1.25 GB Time | 1.25 GB Speed | 512 KB Time | 512 KB Speed | OS | Compiler | Type |
47 | | --- | --- | --- | --- | --- | --- | --- | --- |
48 | | Xeon E3-1230 v5 | 0.1226 s | 10.1987 GB/s | 045.3515 us | 10.7666 GB/s | Linux | GCC 12.2.1 | Vectorized |
49 | | Xeon E3-1230 v5 | 1.1803 s | 1.0495 GB/s | 462.9862 us | 1.0546 GB/s | Linux | GCC 12.2.1 | Scalar |
50 | | Xeon E3-1230 v5 | 0.1388 s | 9.0061 GB/s | 052.8114 us | 9.2457 GB/s | Linux | Clang 15.0.7 | Vectorized |
51 | | Ryzen 9 7900 | 0.1182 s | 10.5742 GB/s | 044.4734 us | 10.9792 GB/s | Linux | GCC 12.2.1 | Vectorized |
52 | | Ryzen 9 7900 | 0.7890 s | 1.5843 GB/s | 307.4712 us | 1.5881 GB/s | Linux | GCC 12.2.1 | Scalar |
53 |
54 |
55 |
56 | The scalar version is slower, at a tad over 1 GB/s on my system, when compiling test_speed.c with gcc using `-O3`.
57 | On Windows, Microsoft's compiler does not seem to generate code from the intrinsics that is as performant, but the GCC mingw64 compiler generates pretty comparable numbers, for me at least.
58 |
59 | I definitely want to add other machines to this table. If you are curious how it performs on your machine, compile `test_speed.c` with `-O3 -lm -march=native` and `-O3 -lm -march=native -D KHASHV_SCALAR`.
60 |
61 | # API
62 |
63 | ## khashv_prep_seed32
64 | ```C
65 | // Prepares a seed from a 32-bit value
66 | void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed)
67 | ```
68 |
69 | ## khashv_prep_seed64
70 | ```C
71 | // Prepares a seed from a 64-bit value
72 | void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed)
73 | ```
74 |
75 | ## khashv_prep_seed128
76 | ```C
77 | // Sets 128-bits to be the seed
78 | void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4])
79 | ```
80 |
81 | ## khashv32
82 | ```C
83 | // Produces a 32-bit hash from the input data
84 | uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len)
85 | ```
86 |
87 | ## khashv64
88 | ```C
89 | // Produces a 64-bit hash from the input data
90 | uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len)
91 | ```
92 |
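93 | Putting the API together, a minimal sketch (the key and message values here are made up for illustration):
94 |
95 | ```C
96 | #include <inttypes.h>
97 | #include <stdio.h>
98 | #include <string.h>
99 | #include "khashv.h"
100 |
101 | int main(void) {
102 |     // Hypothetical 128-bit key; any four 32-bit words work
103 |     const uint32_t key[4] = { 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344 };
104 |     const char*    msg    = "hello world";
105 |
106 |     khashvSeed seed;
107 |     khashv_prep_seed128(&seed, key);
108 |
109 |     uint32_t h32 = khashv32(&seed, (const uint8_t*)msg, strlen(msg));
110 |     uint64_t h64 = khashv64(&seed, (const uint8_t*)msg, strlen(msg));
111 |     printf("32-bit: 0x%08" PRIx32 "  64-bit: 0x%016" PRIx64 "\n", h32, h64);
112 |     return 0;
113 | }
114 | ```
115 |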
93 | # Output
94 | Here is the output of the hash function as images.
95 |
96 | ## khashv64 Output
97 | Here is the output of the 64-bit hash of the integers \[0, 259199\] using 0x1dcedff1a8b17e89 as the seed.
98 |
99 | 
100 |
101 | ## khashv32 Output
102 |
103 | Here is the output of the 32-bit hash of the integers \[0, 518399\] using 0x6bb75f13 as the seed.
104 |
105 | 
106 |
107 | The images above were generated by essentially doing the following for each hash:
108 |
109 | ```C
110 | for(int i = 0; i < sizeof(hash_bytes); i++) {
111 | pixel[img_offset + i].r = hash_bytes[i];
112 | pixel[img_offset + i].g = hash_bytes[i];
113 | pixel[img_offset + i].b = hash_bytes[i];
114 | pixel[img_offset + i].a = 255;
115 | }
116 | ```
117 |
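118 | As a slightly more complete sketch (the `Pixel` type, the `write_hash_pixels` name, and the offsets are hypothetical; the actual image generator is not part of this repo), hashing an integer and writing its bytes out as grayscale pixels might look like:
119 |
120 | ```C
121 | #include <stddef.h>
122 | #include <stdint.h>
123 | #include <string.h>
124 | #include "khashv.h"
125 |
126 | typedef struct { uint8_t r, g, b, a; } Pixel;
127 |
128 | // Hash the integer n and write the hash bytes as grayscale pixels
129 | // starting at pixel[img_offset].
130 | static void write_hash_pixels(Pixel* pixel, size_t img_offset,
131 |                               const khashvSeed* seed, uint32_t n) {
132 |     uint64_t hash = khashv64(seed, (const uint8_t*)&n, sizeof(n));
133 |     uint8_t  hash_bytes[sizeof(hash)];
134 |     memcpy(hash_bytes, &hash, sizeof(hash));
135 |     for (size_t i = 0; i < sizeof(hash_bytes); i++) {
136 |         pixel[img_offset + i].r = hash_bytes[i];
137 |         pixel[img_offset + i].g = hash_bytes[i];
138 |         pixel[img_offset + i].b = hash_bytes[i];
139 |         pixel[img_offset + i].a = 255;
140 |     }
141 | }
142 | ```
143 |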
118 | # TODO
119 | When thinking about things to improve the code and hash function these are the first few things that come to mind for me.
120 | 1. A faster mixing function (e.g. `khashv_mix_words_`) is probably the first thing that could be improved. If it could be made shorter/faster, it would reduce latency for smaller inputs. Any ideas or feedback on this would be appreciated.
121 |
122 | 2. The next thing would be to try to get both Clang and MSVC to output code that runs as fast as GCC's, or as close as possible. They both seem to do some silly things when compared to GCC, losing some performance, judging by the generated assembly. Microsoft's compiler is the worst, and probably the fastest fix for me to implement would be to write some assembly code. However, it would then no longer be a single header file hash function, since MSVC does not support inline assembly for 64-bit builds and would thus require a separate file.
123 |
124 | 3. Then probably consider using intrinsics for some other systems like ARM NEON, but for now there is scalar code, plus code written using GCC's vector built-ins that will generate vectorized code for the other architectures GCC supports.
125 |
126 | 4. Probably the next thing I can think of is choosing better values for S1 and S2, which are used to substitute bytes. The current values were found randomly while checking a small set of criteria, mainly treating each bit of S1 and S2 as a column, XOR-ing them to effectively create a boolean function of an 8-bit input, and making sure the whole mapping sends each input to a unique value. There are likely better values that could be chosen, and better criteria that consider all the bits at once. However, the search space is huge, effectively 2^(2\*8\*16) possible permutations for S1 and S2. That said, the current values do seem to work well in my testing. Another constant that could be looked at is the new shuffle constant in v2 that randomly permutes the bytes; quite likely a better constant exists for it as well. (See the uniqueness-check sketch after this list.)
127 |
128 | 5. Maybe write some assembly versions to get around some of the compiler differences. Also maybe a Rust version.
129 |
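130 | For item 4, the uniqueness criterion described above can be checked with a small sketch like the following (my reading of the criterion, not code from this repo): every byte value `b` must map to a distinct value of `S1[b & 0xf] ^ S2[b >> 4]`.
131 |
132 | ```C
133 | #include <stdbool.h>
134 | #include <stdint.h>
135 |
136 | // Returns true if b -> s1[b & 0xf] ^ s2[b >> 4] is a permutation of 0..255.
137 | static bool is_unique_mapping(const uint8_t s1[16], const uint8_t s2[16]) {
138 |     bool seen[256] = { false };
139 |     for (int b = 0; b < 256; b++) {
140 |         uint8_t v = s1[b & 0xf] ^ s2[b >> 4];
141 |         if (seen[v]) {
142 |             return false; // collision: two inputs map to the same output byte
143 |         }
144 |         seen[v] = true;
145 |     }
146 |     return true;
147 | }
148 | ```
149 |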
130 | # Copyright and License
131 |
132 | Copyright (C) 2023, by Keith Cancel [](mailto:admin@keith.pro).
133 |
134 | Under the MIT License
135 |
--------------------------------------------------------------------------------
/khashv.h:
--------------------------------------------------------------------------------
1 | /*
2 | MIT License
3 | Copyright (c) 2022 Keith-Cancel
4 | Permission is hereby granted, free of charge, to any person obtaining a copy
5 | of this software and associated documentation files (the "Software"), to deal
6 | in the Software without restriction, including without limitation the rights
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
18 | SOFTWARE.
19 | */
20 |
21 | #ifndef K_HASH_V_H
22 | #define K_HASH_V_H
23 | #ifdef __cplusplus
24 | extern "C" {
25 | #define restrict
26 | #endif
27 |
28 | #include <stdint.h>
29 | #include <stddef.h>
30 | #include <string.h>
31 | #include <limits.h>
32 |
33 | // For MSVC compiler, no __SSE3__ macro
34 | #if !defined(__SSE3__) && (defined(__AVX__) || defined(__AVX2__))
35 | #define __SSE3__
36 | #endif
37 | // Same deal
38 | #if !defined(__SSE4_1__) && (defined(__AVX__) || defined(__AVX2__))
39 | #define __SSE4_1__
40 | #endif
41 |
42 | #if defined(__SSE3__)
43 | #include <immintrin.h>
44 | #if defined(__MINGW32__) || defined(_WIN32)
45 | #include <intrin.h>
46 | #endif
47 | #endif
48 |
49 | #if defined(__GNUC__) && !defined(__clang__)
50 | #define KHASH_GCC_LEAST__(maj, min) (__GNUC__ > maj || __GNUC__ == maj && __GNUC_MINOR__ >= min)
51 | #else
52 | #define KHASH_GCC_LEAST__(maj, min) 0
53 | #endif
54 |
55 | #if defined(__BYTE_ORDER__) && !defined(__BYTE_ORDER)
56 | #define __BYTE_ORDER __BYTE_ORDER__
57 | #endif
58 |
59 | #if defined(__ORDER_LITTLE_ENDIAN__) && !defined(__LITTLE_ENDIAN)
60 | #define __LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__
61 | #endif
62 |
63 | #if defined(__ORDER_BIG_ENDIAN__) && !defined(__BIG_ENDIAN)
64 | #define __BIG_ENDIAN __ORDER_BIG_ENDIAN__
65 | #endif
66 |
67 |
68 | #if defined(__clang__) && defined(__has_attribute)
69 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr)
70 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(5, 0)
71 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr)
72 | #else
73 | #define KHASH_CHK_ATTRIBUTE__(attr) 0
74 | #endif
75 |
76 | #if defined(__clang__) && defined(__has_builtin)
77 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built)
78 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(10, 1)
79 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built)
80 | #else
81 | #define KHASH_CHK_BUILTIN__(built) 0
82 | #endif
83 |
84 | #if defined(_MSC_VER) && !defined(__clang__)
85 | #define KHASH_FINLINE __forceinline
86 | #define KHASH_BSWAP32(val) _byteswap_ulong(val)
87 | #endif
88 |
89 | #if !defined(KHASH_FINLINE) && (KHASH_CHK_ATTRIBUTE__(always_inline) || KHASH_GCC_LEAST__(3, 1))
90 | #define KHASH_FINLINE __attribute__((always_inline)) inline
91 | #endif
92 |
93 | #if !defined(KHASH_BSWAP32) && (KHASH_CHK_BUILTIN__(__builtin_bswap32) || KHASH_GCC_LEAST__(4, 5))
94 | #define KHASH_BSWAP32(val) __builtin_bswap32(val)
95 | #endif
96 |
97 | #if !defined(KHASH_OPT_SZ) && (KHASH_CHK_ATTRIBUTE__(optimize) || KHASH_GCC_LEAST__(4, 8))
98 | #define KHASH_OPT_SZ __attribute__((optimize("Os")))
99 | #endif
100 |
101 | #if !defined(KHASH_FINLINE)
102 | #define KHASH_FINLINE inline
103 | #endif
104 |
105 | #if !defined(KHASH_OPT_SZ)
106 | #define KHASH_OPT_SZ
107 | #endif
108 |
109 | #if !defined(KHASH_BSWAP32)
110 | #define KHASH_BSWAP32(val) ((val >> 24) | ((val >> 8) & 0xff00) | ((val << 8) & 0xff0000) | (val << 24))
111 | #endif
112 |
113 | static KHASH_FINLINE int khashv_is_little_endian() {
114 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN
115 | return 1;
116 | #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN
117 | return 0;
118 | #elif defined(__BYTE_ORDER)
119 | #error "Mixed/Middle endian machine, you will need to write a custom byteswap routine"
120 | #else
121 |     // Otherwise, hope the compiler's optimizer figures out this is constant.
122 |     // Also, since the byte order macros do not exist, this could be a
123 |     // middle-endian/mixed-endian machine; those are out there but are quite
124 |     // rare/old, so I am not gonna worry about it. There are 24, i.e.
125 |     // 4! (four factorial), total byte orders. So if the compiler does
126 |     // not define __BYTE_ORDER, the hash output will be different on
127 |     // such machines, but the hash should still work fine.
128 | unsigned int x = 1;
129 | return *((char*)(&x)) == 1;
130 | #endif
131 | }
132 |
133 | #define KHASH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
134 |
135 | struct khashv_block_s {
136 | union {
137 | uint8_t bytes[16];
138 | uint32_t words[4];
139 | #if defined(__SSE3__)
140 | __m128i vec;
141 | #endif
142 | };
143 | };
144 |
145 | typedef struct khashv_block_s khashvBlock;
146 | typedef struct khashv_block_s khashvSeed;
147 |
148 | static const khashvBlock khash_v_init = {
149 | .words = {
150 | // Really this could basically be almost anything
151 | // So just using some bytes of the SHA-256 hashes
152 | // of 1, 2, 3, and 4
153 | 0x7785459a, // SHA256 of the byte 0x01, using the last 4 bytes
154 | 0x6457d986, // SHA256 of the byte 0x02, using the last 4 bytes
155 | 0xadff29c5, // SHA256 of the byte 0x03, using the last 4 bytes
156 | 0x81c89e71, // SHA256 of the byte 0x04, using the last 4 bytes
157 | }};
158 |
159 | static const uint8_t khashv_s1[16] = {
160 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7,
161 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12,
162 | };
163 |
164 | static const uint8_t khashv_s2[16] = {
165 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5,
166 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a,
167 | };
168 |
169 | static const uint8_t khashv_xored[256] = {
170 | 0xf3, 0xb2, 0x17, 0x0c, 0x2e, 0x73, 0x35, 0x58,
171 | 0x8c, 0x7e, 0xb6, 0x5c, 0xc4, 0x4a, 0x01, 0xfd,
172 | 0xd2, 0x93, 0x36, 0x2d, 0x0f, 0x52, 0x14, 0x79,
173 | 0xad, 0x5f, 0x97, 0x7d, 0xe5, 0x6b, 0x20, 0xdc,
174 | 0x7a, 0x3b, 0x9e, 0x85, 0xa7, 0xfa, 0xbc, 0xd1,
175 | 0x05, 0xf7, 0x3f, 0xd5, 0x4d, 0xc3, 0x88, 0x74,
176 | 0xef, 0xae, 0x0b, 0x10, 0x32, 0x6f, 0x29, 0x44,
177 | 0x90, 0x62, 0xaa, 0x40, 0xd8, 0x56, 0x1d, 0xe1,
178 | 0xea, 0xab, 0x0e, 0x15, 0x37, 0x6a, 0x2c, 0x41,
179 | 0x95, 0x67, 0xaf, 0x45, 0xdd, 0x53, 0x18, 0xe4,
180 | 0x3d, 0x7c, 0xd9, 0xc2, 0xe0, 0xbd, 0xfb, 0x96,
181 | 0x42, 0xb0, 0x78, 0x92, 0x0a, 0x84, 0xcf, 0x33,
182 | 0x5e, 0x1f, 0xba, 0xa1, 0x83, 0xde, 0x98, 0xf5,
183 | 0x21, 0xd3, 0x1b, 0xf1, 0x69, 0xe7, 0xac, 0x50,
184 | 0xb9, 0xf8, 0x5d, 0x46, 0x64, 0x39, 0x7f, 0x12,
185 | 0xc6, 0x34, 0xfc, 0x16, 0x8e, 0x00, 0x4b, 0xb7,
186 | 0x0d, 0x4c, 0xe9, 0xf2, 0xd0, 0x8d, 0xcb, 0xa6,
187 | 0x72, 0x80, 0x48, 0xa2, 0x3a, 0xb4, 0xff, 0x03,
188 | 0xb1, 0xf0, 0x55, 0x4e, 0x6c, 0x31, 0x77, 0x1a,
189 | 0xce, 0x3c, 0xf4, 0x1e, 0x86, 0x08, 0x43, 0xbf,
190 | 0x47, 0x06, 0xa3, 0xb8, 0x9a, 0xc7, 0x81, 0xec,
191 | 0x38, 0xca, 0x02, 0xe8, 0x70, 0xfe, 0xb5, 0x49,
192 | 0xda, 0x9b, 0x3e, 0x25, 0x07, 0x5a, 0x1c, 0x71,
193 | 0xa5, 0x57, 0x9f, 0x75, 0xed, 0x63, 0x28, 0xd4,
194 | 0x6e, 0x2f, 0x8a, 0x91, 0xb3, 0xee, 0xa8, 0xc5,
195 | 0x11, 0xe3, 0x2b, 0xc1, 0x59, 0xd7, 0x9c, 0x60,
196 | 0x24, 0x65, 0xc0, 0xdb, 0xf9, 0xa4, 0xe2, 0x8f,
197 | 0x5b, 0xa9, 0x61, 0x8b, 0x13, 0x9d, 0xd6, 0x2a,
198 | 0x89, 0xc8, 0x6d, 0x76, 0x54, 0x09, 0x4f, 0x22,
199 | 0xf6, 0x04, 0xcc, 0x26, 0xbe, 0x30, 0x7b, 0x87,
200 | 0x66, 0x27, 0x82, 0x99, 0xbb, 0xe6, 0xa0, 0xcd,
201 | 0x19, 0xeb, 0x23, 0xc9, 0x51, 0xdf, 0x94, 0x68,
202 | };
203 |
204 | /* Scalar Code */
205 |
206 | static KHASH_FINLINE void khashv_bswap_be_block_scalar(khashvBlock* in) {
207 |     // Byte swapping is only needed if we are not on a little endian system
208 | if (khashv_is_little_endian()) {
209 | return;
210 | }
211 | for(int i = 0; i < 4; i++) {
212 | in->words[i] = KHASH_BSWAP32(in->words[i]);
213 | }
214 | }
215 |
216 | static KHASH_FINLINE void khashv_rotr_5_bytes_scalar(khashvBlock* in) {
217 | khashv_bswap_be_block_scalar(in);
218 | khashvBlock tmp1;
219 | khashvBlock tmp2;
220 | // Avoid aliasing issues by using memcpy between these union values.
221 | memcpy(tmp1.bytes, in->words, 16);
222 | for(int i = 0; i < 16; i++) {
223 | tmp2.bytes[i] = tmp1.bytes[(i + 5) & 0xf];
224 | }
225 | memcpy(in->words, tmp2.bytes, 16);
226 | khashv_bswap_be_block_scalar(in);
227 | }
228 |
229 | static KHASH_FINLINE void khashv_shuffle_bytes_scalar(khashvBlock* in) {
230 | static const uint8_t shuffle[16] = {
231 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8,
232 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1
233 | };
234 | khashv_bswap_be_block_scalar(in);
235 | khashvBlock tmp1;
236 | khashvBlock tmp2;
237 | // Avoid aliasing issues by using memcpy between these union values.
238 | memcpy(tmp1.bytes, in->words, 16);
239 | for(int i = 0; i < 16; i++) {
240 | tmp2.bytes[i] = tmp1.bytes[shuffle[i]];
241 | }
242 | memcpy(in->words, tmp2.bytes, 16);
243 | khashv_bswap_be_block_scalar(in);
244 | }
245 |
246 | static KHASH_FINLINE void khashv_shl_13_block_scalar(khashvBlock* in) {
247 | for(int i = 0; i < 4; i++) {
248 | in->words[i] <<= 13;
249 | }
250 | }
251 |
252 | static KHASH_FINLINE void khashv_shr_3_block_scalar(khashvBlock* in) {
253 | for(int i = 0; i < 4; i++) {
254 | in->words[i] >>= 3;
255 | }
256 | }
257 |
258 | static KHASH_FINLINE void khashv_add_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) {
259 | for(int i = 0; i < 4; i++) {
260 | a->words[i] += b->words[i];
261 | }
262 | }
263 |
264 | static KHASH_FINLINE void khashv_xor_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) {
265 | for(int i = 0; i < 4; i++) {
266 | a->words[i] ^= b->words[i];
267 | }
268 | }
269 |
270 | // GCC and Clang were vectorizing this quite poorly with -O3.
271 | // They could not detect that only a PSHUFB was needed and instead
272 | // were generating tons of inserts and extracts from the vector
273 | // registers. Thus it was running slower than code that was not being
274 | // vectorized on my machine. So I specify the optimization level directly.
275 | // I tried a few other things to get GCC and Clang to generate more sane
276 | // code or code using PSHUFB, but this seemed the cleanest.
277 | // Example of what I mean: https://godbolt.org/z/PMnzsThPc
278 | // Compared to this: https://godbolt.org/z/dWfjr7GWP
279 | /*static KHASH_OPT_SZ void khashv_sub16(khashvBlock* tmp, const uint8_t sub[16]) {
280 | #if defined(__clang__)
281 | // Stop clang from being annoying!!!
282 | // The auto-vectorized code was worse at the time of writing this
283 | #pragma nounroll
284 | #pragma clang loop vectorize(disable)
285 | #pragma clang loop interleave(disable)
286 | #endif
287 | for (int i = 0; i < 16; i++) {
288 | tmp->bytes[i] = sub[tmp->bytes[i]];
289 | }
290 | }
291 |
292 | static KHASH_FINLINE void khashv_replace_scalar(khashvBlock* replace) {
293 | khashvBlock tmp;
294 | for (int i = 0; i < 16; i++) {
295 | tmp.bytes[i] = (replace->bytes[i] >> 4);
296 | replace->bytes[i] &= 0x0f;
297 | }
298 | khashv_sub16(replace, khashv_s1);
299 | khashv_sub16(&tmp, khashv_s2);
300 | for (int i = 0; i < 16; i++) {
301 | replace->bytes[i] ^= tmp.bytes[i];
302 | }
303 | }*/
304 | // Similar issue as with the commented-out code above, so stop the
305 | // optimizers from going crazy
306 | static KHASH_OPT_SZ void khashv_replace_scalar(khashvBlock* replace) {
307 | khashvBlock tmp;
308 | memcpy(tmp.bytes, replace->words, 16);
309 | #if defined(__clang__)
310 | // Stop clang from being annoying!!!
311 | // The auto-vectorized code was worse at the time of writing this
312 | #pragma nounroll
313 | #pragma clang loop vectorize(disable)
314 | #pragma clang loop interleave(disable)
315 | #endif
316 | for(int i = 0; i < 16; i++) {
317 | tmp.bytes[i] = khashv_xored[tmp.bytes[i]];
318 | }
319 | memcpy(replace->words, tmp.bytes, 16);
320 | }
321 |
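322 | // Note: the precomputed khashv_xored table is just the two 16-entry
323 | // S-boxes combined, i.e. for every byte b,
324 | //     khashv_xored[b] == khashv_s1[b & 0xf] ^ khashv_s2[b >> 4]
325 | // An illustrative sanity check of that relation (disabled by default):
326 | #if 0
327 | #include <assert.h>
328 | static void khashv_check_xored_table(void) {
329 |     for (int b = 0; b < 256; b++) {
330 |         assert(khashv_xored[b] == (khashv_s1[b & 0xf] ^ khashv_s2[b >> 4]));
331 |     }
332 | }
333 | #endif
334 |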
322 | static KHASH_FINLINE void khashv_mix_words_scalar(khashvBlock* in) {
323 | unsigned rots[4] = { 5, 7, 11, 17 };
324 | khashvBlock tmp = { 0 };
325 |
326 | tmp = *in;
327 | khashv_shr_3_block_scalar(&tmp);
328 | khashv_xor_block_scalar(in, &tmp);
329 |
330 | for (int i = 0; i < 4; i++) {
331 | unsigned rot = rots[i];
332 | tmp = *in;
333 | khashv_rotr_5_bytes_scalar(&tmp);
334 | khashv_add_block_scalar(&tmp, in);
335 | for (int j = 0; j < 4; j++) {
336 | tmp.words[j] = KHASH_ROTR32(tmp.words[j], rot);
337 | }
338 | khashv_xor_block_scalar(in, &tmp);
339 | }
340 | }
341 |
342 | static void khashv_hash_scalar(khashvBlock* hash, const uint8_t* data, size_t data_len) {
343 | hash->words[0] ^= data_len;
344 |     // In case size_t is bigger than 32 bits
345 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295
346 | hash->words[1] ^= data_len >> 32;
347 | #endif
348 |
349 | khashvBlock tmp_1;
350 | khashvBlock tmp_2;
351 | khashvBlock tmp_h = *hash;
352 |
353 | const uint8_t* end = data + (data_len & ~((size_t)15));
354 |
355 | while (data < end) {
356 | memcpy(&tmp_2, data, 16);
357 | khashv_replace_scalar(&tmp_2);
358 | memcpy(&tmp_1.words, tmp_2.bytes, 16);
359 |
360 | khashv_bswap_be_block_scalar(&tmp_1);
361 |
362 | tmp_2 = tmp_1;
363 | //khashv_shl_13_block_scalar(&tmp_2);
364 | //khashv_add_block_scalar(&tmp_2, &tmp_1);
365 | for(int i = 0; i < 4; i++) {
366 | tmp_2.words[i] *= 8193;
367 | }
368 | khashv_xor_block_scalar(&tmp_h, &tmp_2);
369 | khashv_rotr_5_bytes_scalar(&tmp_h);
370 | khashv_add_block_scalar(&tmp_h, &tmp_1);
371 |
372 | tmp_1 = tmp_h;
373 | khashv_shuffle_bytes_scalar(&tmp_1);
374 | khashv_add_block_scalar(&tmp_h, &tmp_1);
375 |
376 | data += 16;
377 | }
378 |
379 | unsigned trailing = data_len & 0xf;
380 | if(trailing) {
381 | memset(&tmp_2, 0, 16);
382 |
383 | memcpy(&tmp_2.bytes, data, trailing);
384 | khashv_replace_scalar(&tmp_2);
385 | memcpy(&tmp_1.words, tmp_2.bytes, 16);
386 |
387 | khashv_bswap_be_block_scalar(&tmp_1);
388 |
389 | tmp_2 = tmp_1;
390 | //khashv_shl_13_block_scalar(&tmp_2);
391 | //khashv_add_block_scalar(&tmp_2, &tmp_1);
392 | for(int i = 0; i < 4; i++) {
393 | tmp_2.words[i] *= 8193;
394 | }
395 | khashv_xor_block_scalar(&tmp_h, &tmp_2);
396 | khashv_rotr_5_bytes_scalar(&tmp_h);
397 | khashv_add_block_scalar(&tmp_h, &tmp_1);
398 |
399 | tmp_1 = tmp_h;
400 | khashv_shuffle_bytes_scalar(&tmp_1);
401 | khashv_add_block_scalar(&tmp_h, &tmp_1);
402 |
403 | }
404 | khashv_mix_words_scalar(&tmp_h);
405 | *hash = tmp_h;
406 | }
407 |
408 | static inline void khashv_prep_seed32_scalar(khashvSeed* seed_prepped, uint32_t seed) {
409 | *seed_prepped = khash_v_init;
410 | seed_prepped->words[0] ^= seed;
411 | khashv_mix_words_scalar(seed_prepped);
412 | }
413 |
414 | static inline void khashv_prep_seed64_scalar(khashvSeed* seed_prepped, uint64_t seed) {
415 | *seed_prepped = khash_v_init;
416 | seed_prepped->words[0] ^= seed;
417 | khashv_mix_words_scalar(seed_prepped);
418 |     // Do it again with the other part to make it different from the 32-bit seed.
419 | seed_prepped->words[1] ^= seed >> 32;
420 | khashv_mix_words_scalar(seed_prepped);
421 | }
422 |
423 | static inline void khashv_prep_seed128_scalar(khashvSeed* seed_prepped, const uint32_t seed[4]) {
424 | for(int i = 0; i < 4; i++) {
425 | seed_prepped->words[i] = seed[i];
426 | }
427 | }
428 |
429 | static inline uint32_t khashv32_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
430 | khashvBlock h = *seed;
431 | khashv_hash_scalar(&h, data, data_len);
432 | return h.words[3];
433 | }
434 |
435 | static inline uint64_t khashv64_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
436 | khashvBlock h = *seed;
437 | khashv_hash_scalar(&h, data, data_len);
438 | uint64_t r = h.words[1];
439 | r <<= 32;
440 | r |= h.words[0];
441 | return r;
442 | }
443 |
444 | /* Vectorization for Intel/AMD */
445 |
446 | #if defined(__SSE3__)
447 |
448 | #define KHASH_VECTOR 1
449 |
450 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(11, 0))
451 | static KHASH_FINLINE __m128i _mm_loadu_si32(const void* data) {
452 | uint32_t val;
453 | memcpy(&val, data, sizeof(uint32_t));
454 | return _mm_cvtsi32_si128(val);
455 | }
456 | static KHASH_FINLINE __m128i _mm_loadu_si16(const void* data) {
457 | uint32_t val = 0;
458 | memcpy(&val, data, sizeof(uint16_t));
459 | return _mm_cvtsi32_si128(val);
460 | }
461 | #endif
462 |
463 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(9, 1))
464 | static KHASH_FINLINE __m128i _mm_loadu_si64(const void* data) {
465 | uint64_t val = 0;
466 | memcpy(&val, data, sizeof(uint64_t));
467 | return _mm_cvtsi64_si128(val);
468 | }
469 | #endif
470 |
471 | static KHASH_FINLINE __m128i khashv_mix_words_vector(__m128i val) {
472 | __m128i tmp1;
473 | __m128i tmp2;
474 |
475 | tmp1 = _mm_srli_epi32(val, 3);
476 | val = _mm_xor_si128(tmp1, val);
477 |
478 | tmp1 = _mm_alignr_epi8(val, val, 5);
479 | tmp1 = _mm_add_epi32(val, tmp1);
480 | #if defined(__AVX512VL__)
481 | tmp1 = _mm_ror_epi32(tmp1, 5);
482 | val = _mm_xor_si128(val, tmp1);
483 | #else
484 | tmp2 = _mm_srli_epi32(tmp1, 5);
485 | tmp1 = _mm_slli_epi32(tmp1, 27);
486 | tmp1 = _mm_or_si128(tmp1, tmp2);
487 | val = _mm_xor_si128(val, tmp1);
488 | #endif
489 |
490 | tmp1 = _mm_alignr_epi8(val, val, 5);
491 | tmp1 = _mm_add_epi32(val, tmp1);
492 | #if defined(__AVX512VL__)
493 | tmp1 = _mm_ror_epi32(tmp1, 7);
494 | val = _mm_xor_si128(val, tmp1);
495 | #else
496 | tmp2 = _mm_srli_epi32(tmp1, 7);
497 | tmp1 = _mm_slli_epi32(tmp1, 25);
498 | val = _mm_xor_si128(val, tmp2);
499 | val = _mm_xor_si128(val, tmp1);
500 | #endif
501 |
502 | tmp1 = _mm_alignr_epi8(val, val, 5);
503 | tmp1 = _mm_add_epi32(tmp1, val);
504 | #if defined(__AVX512VL__)
505 | tmp1 = _mm_ror_epi32(tmp1, 11);
506 | val = _mm_xor_si128(val, tmp1);
507 | #else
508 | tmp2 = _mm_srli_epi32(tmp1, 11);
509 | tmp1 = _mm_slli_epi32(tmp1, 21);
510 | val = _mm_xor_si128(val, tmp2);
511 | val = _mm_xor_si128(val, tmp1);
512 | #endif
513 |
514 | tmp1 = _mm_alignr_epi8(val, val, 5);
515 | tmp1 = _mm_add_epi32(tmp1, val);
516 | #if defined(__AVX512VL__)
517 | tmp1 = _mm_ror_epi32(tmp1, 17);
518 | val = _mm_xor_si128(val, tmp1);
519 | #else
520 | tmp2 = _mm_srli_epi32(tmp1, 17);
521 | tmp1 = _mm_slli_epi32(tmp1, 15);
522 | val = _mm_xor_si128(val, tmp2);
523 | val = _mm_xor_si128(val, tmp1);
524 | #endif
525 |
526 | return val;
527 | }
528 |
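529 | // Loads the trailing 1 to 16 bytes of the input into an __m128i,
530 | // zero-padding the rest, without reading past the end of the buffer.
531 | // Each case builds the vector from the widest loads that fit the
532 | // remaining length.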
529 | static KHASH_FINLINE __m128i khashv_part_load_vector(const uint8_t* data, size_t len) {
530 | __m128i tmp = { 0 };
531 | __m128i tmp2 = { 0 };
532 | switch(len) {
533 | case 1:
534 | #if defined(__SSE4_1__)
535 | tmp = _mm_insert_epi8(tmp, data[0], 0);
536 | #else
537 | tmp = _mm_cvtsi32_si128(data[0]);
538 | #endif
539 | break;
540 | case 2:
541 | tmp = _mm_loadu_si16(data);
542 | break;
543 | case 3:
544 | tmp = _mm_loadu_si16(data);
545 | #if defined(__SSE4_1__)
546 | tmp = _mm_insert_epi8(tmp, data[2], 2);
547 | #else
548 | tmp = _mm_insert_epi16(tmp, data[2], 1);
549 | #endif
550 | break;
551 | case 4:
552 | tmp = _mm_loadu_si32(data);
553 | break;
554 | case 5:
555 | tmp = _mm_loadu_si32(data);
556 | #if defined(__SSE4_1__)
557 | tmp = _mm_insert_epi8(tmp, data[4], 4);
558 | #else
559 | tmp = _mm_insert_epi16(tmp, data[4], 2);
560 | #endif
561 | break;
562 | case 6:
563 | tmp = _mm_loadu_si32(data);
564 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2);
565 | break;
566 | case 7:
567 | tmp = _mm_loadu_si32(data);
568 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2);
569 | #if defined(__SSE4_1__)
570 | tmp = _mm_insert_epi8(tmp, data[6], 6);
571 | #else
572 | tmp = _mm_insert_epi16(tmp, data[6], 3);
573 | #endif
574 | break;
575 | case 8:
576 | tmp = _mm_loadu_si64(data);
577 | break;
578 | case 9:
579 | tmp = _mm_loadu_si64(data);
580 | #if defined(__SSE4_1__)
581 | tmp = _mm_insert_epi8(tmp, data[8], 8);
582 | #else
583 | tmp = _mm_insert_epi16(tmp, data[8], 4);
584 | #endif
585 | break;
586 | case 10:
587 | tmp = _mm_loadu_si64(data);
588 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4);
589 | break;
590 | case 11:
591 | tmp = _mm_loadu_si64(data);
592 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4);
593 | #if defined(__SSE4_1__)
594 | tmp = _mm_insert_epi8(tmp, data[10], 10);
595 | #else
596 | tmp = _mm_insert_epi16(tmp, data[10], 5);
597 | #endif
598 | break;
599 | case 12:
600 | tmp = _mm_loadu_si64(data);
601 | #if defined(__SSE4_1__)
602 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
603 | #else
604 | tmp2 = _mm_loadu_si32(data + 8);
605 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
606 | tmp = _mm_or_si128(tmp, tmp2);
607 | #endif
608 | break;
609 | case 13:
610 | tmp = _mm_loadu_si64(data);
611 | #if defined(__SSE4_1__)
612 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
613 | tmp = _mm_insert_epi8(tmp, data[12], 12);
614 | #else
615 | tmp2 = _mm_loadu_si32(data + 8);
616 | tmp2 = _mm_insert_epi16(tmp2, data[12], 2);
617 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
618 | tmp = _mm_or_si128(tmp, tmp2);
619 | #endif
620 | break;
621 | case 14:
622 | tmp = _mm_loadu_si64(data);
623 | #if defined(__SSE4_1__)
624 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
625 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6);
626 | #else
627 | tmp2 = _mm_loadu_si32(data + 8);
628 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6);
629 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
630 | tmp = _mm_or_si128(tmp, tmp2);
631 | #endif
632 | break;
633 | case 15:
634 | tmp = _mm_loadu_si64(data);
635 | #if defined(__SSE4_1__)
636 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
637 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6);
638 | tmp = _mm_insert_epi8(tmp, data[14], 14);
639 | #else
640 | tmp2 = _mm_loadu_si32(data + 8);
641 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 6);
642 | tmp2 = _mm_insert_epi16(tmp2, data[14], 7);
643 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
644 | tmp = _mm_or_si128(tmp, tmp2);
645 | #endif
646 | break;
647 | case 16:
648 | tmp = _mm_loadu_si128((__m128i*)data);
649 | break;
650 | }
651 | return tmp;
652 | }
653 |
654 | static const uint8_t khashv_shuff[16] = {
655 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8,
656 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1
657 | };
658 |
659 | static __m128i khashv_hash_vector(__m128i hash, const uint8_t* data, size_t data_len) {
660 | const __m128i s1 = _mm_loadu_si128((const __m128i*)khashv_s1);
661 | const __m128i s2 = _mm_loadu_si128((const __m128i*)khashv_s2);
662 | const __m128i shuff = _mm_loadu_si128((const __m128i*)khashv_shuff);
663 | const __m128i mask = _mm_set1_epi32(0x0f0f0f0f);
664 |
665 | __m128i tmp_1;
666 | __m128i tmp_2;
667 |
668 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295
669 | tmp_1 = _mm_cvtsi64_si128(data_len);
670 | #else
671 | tmp_1 = _mm_cvtsi32_si128(data_len);
672 | #endif
673 | hash = _mm_xor_si128(tmp_1, hash);
674 |
675 | const uint8_t* end = data + (data_len & ~((size_t)15));
676 | const uint8_t* end2 = data + data_len;
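677 |     // An input of 16 bytes or less is handled entirely by the
678 |     // partial-load path below (case 16 included); the loop only runs
679 |     // while the total input is longer than one block.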
677 | while(data_len > 16 && data < end) {
678 | tmp_1 = _mm_lddqu_si128((const __m128i*)data);
679 | tmp_2 = _mm_srli_epi32 (tmp_1, 4);
680 |
681 | tmp_1 = _mm_and_si128 (tmp_1, mask);
682 | tmp_2 = _mm_and_si128 (tmp_2, mask);
683 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1);
684 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2);
685 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2);
686 |
687 | tmp_2 = _mm_slli_epi32 (tmp_1, 13);
688 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2);
689 | tmp_2 = _mm_xor_si128 (hash, tmp_2);
690 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5);
691 | hash = _mm_add_epi32 (tmp_2, tmp_1);
692 |
693 | tmp_1 = _mm_shuffle_epi8(hash, shuff);
694 | hash = _mm_add_epi32(hash, tmp_1);
695 |
696 | data += 16;
697 | }
698 | uintptr_t trailing = end2 - data;
699 | if(trailing) {
700 | tmp_1 = khashv_part_load_vector(data, trailing);
701 | tmp_2 = _mm_srli_epi32 (tmp_1, 4);
702 |
703 | tmp_1 = _mm_and_si128 (tmp_1, mask);
704 | tmp_2 = _mm_and_si128 (tmp_2, mask);
705 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1);
706 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2);
707 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2);
708 |
709 | tmp_2 = _mm_slli_epi32 (tmp_1, 13);
710 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2);
711 | tmp_2 = _mm_xor_si128 (hash, tmp_2);
712 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5);
713 | hash = _mm_add_epi32 (tmp_2, tmp_1);
714 |
715 | tmp_1 = _mm_shuffle_epi8(hash, shuff);
716 | hash = _mm_add_epi32(hash, tmp_1);
717 | }
718 | hash = khashv_mix_words_vector(hash);
719 | return hash;
720 | }
721 |
722 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) {
723 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init);
724 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed));
725 | seed_prepped->vec = khashv_mix_words_vector(s);
726 | }
727 |
728 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) {
729 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init);
730 | __m128i t = _mm_cvtsi32_si128(seed >> 32);
731 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed));
732 | s = khashv_mix_words_vector(s);
733 | s = _mm_xor_si128(s, _mm_shuffle_epi32(t, 0xf3));
734 | seed_prepped->vec = khashv_mix_words_vector(s);
735 | }
736 |
737 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) {
738 | seed_prepped->vec = _mm_loadu_si128((const __m128i*)seed);
739 | }
740 |
741 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
742 | __m128i h = khashv_hash_vector(seed->vec, data, data_len);
743 |     // Using word[3] avoids any overlap with the 64-bit hash,
744 |     // which uses words [0] and [1]; this ensures the two
745 |     // outputs behave differently when used.
746 | #if defined(__SSE4_1__)
747 | return _mm_extract_epi32(h, 3);
748 | #else
749 | h = _mm_shuffle_epi32(h, 0xff);
750 | return _mm_cvtsi128_si32(h);
751 | #endif
752 | }
753 |
754 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
755 | __m128i h = khashv_hash_vector(seed->vec, data, data_len);
756 | return _mm_cvtsi128_si64(h);
757 | }
758 |
759 | #endif
760 |
761 | /* Vectorization via GCC's vector builtins */
762 | // Handy since it allows vectorization without explicit intrinsics
763 | // for a particular CPU.
764 |
765 | #if !defined(KHASH_VECTOR) && KHASH_GCC_LEAST__(6, 1)
766 |
767 | #define KHASH_VECTOR 1
768 |
769 | typedef uint8_t kv16ui __attribute__((vector_size(16)));
770 | typedef uint32_t kv4ui __attribute__((vector_size(16)));
771 |
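772 | // Note: __builtin_shuffle(table, idx) selects table[idx[i] % 16] for each
773 | // lane, giving the same table-lookup effect as the x86 PSHUFB used in the
774 | // intrinsics path (hence the inputs being masked/shifted to 4 bits first).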
772 | static KHASH_FINLINE kv16ui khashv_sub_s1_gcc(kv16ui in) {
773 | const kv16ui mask = {
774 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
775 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf
776 | };
777 | const kv16ui sub = {
778 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7,
779 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12,
780 | };
781 | in &= mask;
782 | return __builtin_shuffle(sub, in);
783 | }
784 |
785 | static KHASH_FINLINE kv16ui khashv_sub_s2_gcc(kv16ui in) {
786 | const kv16ui sub = {
787 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5,
788 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a,
789 | };
790 | in >>= 4;
791 | return __builtin_shuffle(sub, in);
792 | }
793 |
794 | static KHASH_FINLINE kv4ui khashv_rotr_5_bytes_gcc(kv4ui input) {
795 | const kv16ui rotrLE = {
796 | 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc,
797 | 0xd, 0xe, 0xf, 0x0, 0x1, 0x2, 0x3, 0x4
798 | };
799 | const kv16ui rotrBE = {
800 | 0xb, 0x4, 0x5, 0x6, 0xf, 0x8, 0x9, 0xa,
801 | 0x3, 0xc, 0xd, 0xe, 0x7, 0x0, 0x1, 0x2
802 | };
803 | kv16ui tmp;
804 | memcpy(&tmp, &input, 16);
805 | if (khashv_is_little_endian()) {
806 | tmp = __builtin_shuffle(tmp, rotrLE);
807 | } else {
808 | tmp = __builtin_shuffle(tmp, rotrBE);
809 | }
810 | memcpy(&input, &tmp, 16);
811 | return input;
812 | }
813 |
814 | static KHASH_FINLINE kv4ui khashv_shuffle_bytes_gcc(kv4ui input) {
815 | const kv16ui shuffLE = {
816 | 0x7, 0xe, 0x9, 0x0, 0xc, 0xf, 0xd, 0x8,
817 | 0x5, 0xb, 0x6, 0x3, 0x4, 0x2, 0xa, 0x1
818 | };
819 | const kv16ui shuffBE = {
820 | 0x3, 0xa, 0xd, 0x4, 0xb, 0xe, 0xc, 0xf,
821 | 0x0, 0x5, 0x8, 0x6, 0x2, 0x9, 0x1, 0x7,
822 | };
823 | kv16ui tmp;
824 | memcpy(&tmp, &input, 16);
825 | if (khashv_is_little_endian()) {
826 | tmp = __builtin_shuffle(tmp, shuffLE);
827 | } else {
828 | tmp = __builtin_shuffle(tmp, shuffBE);
829 | }
830 | memcpy(&input, &tmp, 16);
831 | return input;
832 | }
833 |
834 | static KHASH_FINLINE kv4ui khash_byteswap_vec32_gcc( kv4ui input ) {
835 | const kv16ui bswap32 = {
836 | 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4,
837 | 0xb, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc,
838 | };
839 | kv16ui b;
840 |
841 | memcpy(&b, &input, 16);
842 | b = __builtin_shuffle(b, bswap32);
843 | memcpy(&input, &b, 16);
844 | return input;
845 | }
846 |
847 | static KHASH_FINLINE kv4ui khashv_replace_gcc(kv4ui input) {
848 | kv16ui s1;
849 | kv16ui s2;
850 | memcpy(&s1, &input, 16);
851 | s2 = khashv_sub_s2_gcc(s1);
852 | s1 = khashv_sub_s1_gcc(s1);
853 | s1 ^= s2;
854 | memcpy(&input, &s1, 16);
855 | return input;
856 | }
857 |
858 | static KHASH_FINLINE kv4ui khashv_mix_words_gcc(kv4ui val) {
859 | const unsigned rots[4] = { 5, 7, 11, 17 };
860 | kv4ui tmp = val >> 3;
861 | val ^= tmp;
862 | for (int i = 0; i < 4; i++) {
863 | unsigned rot = rots[i];
864 | kv4ui tmp = val;
865 | tmp = khashv_rotr_5_bytes_gcc(tmp);
866 | tmp += val;
867 | tmp = (tmp >> rot) | (tmp << (32 - rot));
868 | val ^= tmp;
869 | }
870 | return val;
871 | }
872 |
873 | static KHASH_FINLINE kv4ui khashv_hash_block_gcc(kv4ui hash, kv4ui input) {
874 | kv4ui tmp_1 = khashv_replace_gcc(input);
875 | if (!khashv_is_little_endian()) {
876 | tmp_1 = khash_byteswap_vec32_gcc(tmp_1);
877 | }
878 | kv4ui tmp_2 = tmp_1 * 8193;
879 | tmp_2 ^= hash;
880 | tmp_2 = khashv_rotr_5_bytes_gcc(tmp_2);
881 | hash = tmp_1 + tmp_2;
882 |
883 | tmp_1 = khashv_shuffle_bytes_gcc(hash);
884 | hash = hash + tmp_1;
885 | return hash;
886 | }
887 |
888 | static KHASH_FINLINE kv4ui khashv_hash_gcc(kv4ui hash, const uint8_t* data, size_t data_len) {
889 | hash[0] ^= data_len;
890 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295
891 | hash[1] ^= data_len >> 32;
892 | #endif
893 |
894 | kv4ui data_v;
895 | const uint8_t* end = data + (data_len & ~((size_t)15));
896 | while (data < end) {
897 | memcpy(&data_v, data, 16);
898 | hash = khashv_hash_block_gcc(hash, data_v);
899 | data += 16;
900 | }
901 |
902 | unsigned trailing = data_len & 0xf;
903 | if(trailing) {
904 | memset(&data_v, 0, 16);
905 | memcpy(&data_v, data, trailing);
906 | hash = khashv_hash_block_gcc(hash, data_v);
907 | }
908 | return khashv_mix_words_gcc(hash);
909 | }
910 |
911 |
912 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) {
913 | kv4ui s;
914 | memcpy(&s, &khash_v_init, 16);
915 | s[0] ^= seed;
916 | s = khashv_mix_words_gcc(s);
917 | memcpy(seed_prepped, &s, 16);
918 | }
919 |
920 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) {
921 | kv4ui s;
922 | memcpy(&s, &khash_v_init, 16);
923 | s[0] ^= seed;
924 | s = khashv_mix_words_gcc(s);
925 | s[1] ^= seed >> 32;
926 | s = khashv_mix_words_gcc(s);
927 | memcpy(seed_prepped, &s, 16);
928 | }
929 |
930 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) {
931 | memcpy(seed_prepped->words, seed, 16);
932 | }
933 |
934 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
935 | kv4ui h;
936 | memcpy(&h, seed, 16);
937 | h = khashv_hash_gcc(h, data, data_len);
938 | return h[3];
939 | }
940 |
941 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
942 | kv4ui h;
943 | memcpy(&h, seed, 16);
944 | h = khashv_hash_gcc(h, data, data_len);
945 | uint64_t ret;
946 | if (khashv_is_little_endian()) {
947 | memcpy(&ret, &h, 8);
948 | } else {
949 | ret = h[1];
950 | ret = (ret << 32) | h[0];
951 | }
952 | return ret;
953 | }
954 |
955 | #endif
956 |
957 | #if defined(KHASH_VECTOR) && !defined(KHASHV_SCALAR)
958 |
959 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) {
960 | khashv_prep_seed32_vector(seed_prepped, seed);
961 | }
962 |
963 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) {
964 | khashv_prep_seed64_vector(seed_prepped, seed);
965 | }
966 |
967 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) {
968 | khashv_prep_seed128_vector(seed_prepped, seed);
969 | }
970 |
971 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
972 | return khashv32_vector(seed, data, data_len);
973 | }
974 |
975 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
976 | return khashv64_vector(seed, data, data_len);
977 | }
978 |
979 | #else
980 |
981 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) {
982 | khashv_prep_seed32_scalar(seed_prepped, seed);
983 | }
984 |
985 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) {
986 | khashv_prep_seed64_scalar(seed_prepped, seed);
987 | }
988 |
989 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) {
990 | khashv_prep_seed128_scalar(seed_prepped, seed);
991 | }
992 |
993 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
994 | return khashv32_scalar(seed, data, data_len);
995 | }
996 |
997 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
998 | return khashv64_scalar(seed, data, data_len);
999 | }
1000 |
1001 | #endif
1002 |
1003 |
1004 | #ifdef __cplusplus
1005 | }
1006 | #endif
1007 | #endif
1008 |
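1009 | /*
1010 |  * Usage sketch (illustrative only, not part of the library): a seed is
1011 |  * prepared once with one of the khashv_prep_seed* functions and can then
1012 |  * be reused for any number of khashv32/khashv64 calls. The function name
1013 |  * and the seed constant below are hypothetical example values.
1014 |  *
1015 |  *     #include "khashv.h"
1016 |  *
1017 |  *     uint64_t hash_message(const uint8_t* msg, size_t len) {
1018 |  *         khashvSeed seed;
1019 |  *         khashv_prep_seed64(&seed, 0x0123456789abcdefULL);
1020 |  *         return khashv64(&seed, msg, len);
1021 |  *     }
1022 |  */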
--------------------------------------------------------------------------------
/k-hashv-old/khashv_v1.h:
--------------------------------------------------------------------------------
1 | /*
2 | MIT License
3 | Copyright (c) 2022 Keith-Cancel
4 | Permission is hereby granted, free of charge, to any person obtaining a copy
5 | of this software and associated documentation files (the "Software"), to deal
6 | in the Software without restriction, including without limitation the rights
7 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8 | copies of the Software, and to permit persons to whom the Software is
9 | furnished to do so, subject to the following conditions:
10 | The above copyright notice and this permission notice shall be included in all
11 | copies or substantial portions of the Software.
12 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
13 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
14 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
15 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
18 | SOFTWARE.
19 | */
20 |
21 | #ifndef K_HASH_V_H
22 | #define K_HASH_V_H
23 | #ifdef __cplusplus
24 | extern "C" {
25 | #define restrict
26 | #endif
27 |
28 | #include <stdint.h>
29 | #include <stddef.h>
30 | #include <string.h>
31 | #include <limits.h>
32 |
33 | // For MSVC compiler, no __SSE3__ macro
34 | #if !defined(__SSE3__) && (defined(__AVX__) || defined(__AVX2__))
35 | #define __SSE3__
36 | #endif
37 | // Same deal
38 | #if !defined(__SSE4_1__) && (defined(__AVX__) || defined(__AVX2__))
39 | #define __SSE4_1__
40 | #endif
41 |
42 | #if defined(__SSE3__)
43 | #include <immintrin.h>
44 | #if defined(__MINGW32__) || defined(_WIN32)
45 | #include <intrin.h>
46 | #endif
47 | #endif
48 |
49 | #if defined(__GNUC__) && !defined(__clang__)
50 | #define KHASH_GCC_LEAST__(maj, min) (__GNUC__ > maj || __GNUC__ == maj && __GNUC_MINOR__ >= min)
51 | #else
52 | #define KHASH_GCC_LEAST__(maj, min) 0
53 | #endif
54 |
55 | #if defined(__BYTE_ORDER__) && !defined(__BYTE_ORDER)
56 | #define __BYTE_ORDER __BYTE_ORDER__
57 | #endif
58 |
59 | #if defined(__ORDER_LITTLE_ENDIAN__) && !defined(__LITTLE_ENDIAN)
60 | #define __LITTLE_ENDIAN __ORDER_LITTLE_ENDIAN__
61 | #endif
62 |
63 | #if defined(__ORDER_BIG_ENDIAN__) && !defined(__BIG_ENDIAN)
64 | #define __BIG_ENDIAN __ORDER_BIG_ENDIAN__
65 | #endif
66 |
67 |
68 | #if defined(__clang__) && defined(__has_attribute)
69 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr)
70 | #elif defined(__has_attribute) && KHASH_GCC_LEAST__(5, 0)
71 | #define KHASH_CHK_ATTRIBUTE__(attr) __has_attribute(attr)
72 | #else
73 | #define KHASH_CHK_ATTRIBUTE__(attr) 0
74 | #endif
75 |
76 | #if defined(__clang__) && defined(__has_builtin)
77 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built)
78 | #elif defined(__has_builtin) && KHASH_GCC_LEAST__(10, 1)
79 | #define KHASH_CHK_BUILTIN__(built) __has_builtin(built)
80 | #else
81 | #define KHASH_CHK_BUILTIN__(built) 0
82 | #endif
83 |
84 | #if defined(_MSC_VER) && !defined(__clang__)
85 | #define KHASH_FINLINE __forceinline
86 | #define KHASH_BSWAP32(val) _byteswap_ulong(val)
87 | #endif
88 |
89 | #if !defined(KHASH_FINLINE) && (KHASH_CHK_ATTRIBUTE__(always_inline) || KHASH_GCC_LEAST__(3, 1))
90 | #define KHASH_FINLINE __attribute__((always_inline)) inline
91 | #endif
92 |
93 | #if !defined(KHASH_BSWAP32) && (KHASH_CHK_BUILTIN__(__builtin_bswap32) || KHASH_GCC_LEAST__(4, 5))
94 | #define KHASH_BSWAP32(val) __builtin_bswap32(val)
95 | #endif
96 |
97 | #if !defined(KHASH_OPT_SZ) && (KHASH_CHK_ATTRIBUTE__(optimize) || KHASH_GCC_LEAST__(4, 8))
98 | #define KHASH_OPT_SZ __attribute__((optimize("Os")))
99 | #endif
100 |
101 | #if !defined(KHASH_FINLINE)
102 | #define KHASH_FINLINE inline
103 | #endif
104 |
105 | #if !defined(KHASH_OPT_SZ)
106 | #define KHASH_OPT_SZ
107 | #endif
108 |
109 | #if !defined(KHASH_BSWAP32)
110 | #define KHASH_BSWAP32(val) (((val) >> 24) | (((val) >> 8) & 0xff00) | (((val) << 8) & 0xff0000) | ((val) << 24))
111 | #endif
112 |
113 | static KHASH_FINLINE int khashv_is_little_endian() {
114 | #if defined(__BYTE_ORDER) && __BYTE_ORDER == __LITTLE_ENDIAN
115 | return 1;
116 | #elif defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN
117 | return 0;
118 | #elif defined(__BYTE_ORDER)
119 | #error "Mixed/Middle endian machine, you will need to write a custom byteswap routine"
120 | #else
121 | // Otherwise hope the compiler's optimizer figures out this is constant.
122 | // Since the byte order macro is not defined here, note there are also
123 | // middle-endian/mixed-endian machines out there, but they are quite
124 | // rare/old, so I am not going to worry about them, even though there
125 | // are 4! (four factorial) = 24 possible byte orderings in total. If the
126 | // compiler does not define __BYTE_ORDER, the hash output will differ
127 | // on such machines, but the hash should still work fine.
128 | unsigned int x = 1;
129 | return *((char*)(&x)) == 1;
130 | #endif
131 | }
132 |
133 | #define KHASH_ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
134 |
135 | struct khashv_block_s {
136 | union {
137 | uint8_t bytes[16];
138 | uint32_t words[4];
139 | #if defined(__SSE3__)
140 | __m128i vec;
141 | #endif
142 | };
143 | };
144 |
145 | typedef struct khashv_block_s khashvBlock;
146 | typedef struct khashv_block_s khashvSeed;
147 |
148 | static const khashvBlock khash_v_init = {
149 | .words = {
150 | // Really this could basically be almost anything
151 | // So just using some bytes of the SHA-256 hashes
152 | // of 1, 2, 3, and 4
153 | 0x7785459a, // SHA256 of the byte 0x01, using the last 4 bytes
154 | 0x6457d986, // SHA256 of the byte 0x02, using the last 4 bytes
155 | 0xadff29c5, // SHA256 of the byte 0x03, using the last 4 bytes
156 | 0x81c89e71, // SHA256 of the byte 0x04, using the last 4 bytes
157 | }};
158 |
159 | static const uint8_t khashv_s1[16] = {
160 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7,
161 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12,
162 | };
163 |
164 | static const uint8_t khashv_s2[16] = {
165 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5,
166 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a,
167 | };
168 |
169 | static const uint8_t khashv_xored[256] = {
170 | 0xf3, 0xb2, 0x17, 0x0c, 0x2e, 0x73, 0x35, 0x58,
171 | 0x8c, 0x7e, 0xb6, 0x5c, 0xc4, 0x4a, 0x01, 0xfd,
172 | 0xd2, 0x93, 0x36, 0x2d, 0x0f, 0x52, 0x14, 0x79,
173 | 0xad, 0x5f, 0x97, 0x7d, 0xe5, 0x6b, 0x20, 0xdc,
174 | 0x7a, 0x3b, 0x9e, 0x85, 0xa7, 0xfa, 0xbc, 0xd1,
175 | 0x05, 0xf7, 0x3f, 0xd5, 0x4d, 0xc3, 0x88, 0x74,
176 | 0xef, 0xae, 0x0b, 0x10, 0x32, 0x6f, 0x29, 0x44,
177 | 0x90, 0x62, 0xaa, 0x40, 0xd8, 0x56, 0x1d, 0xe1,
178 | 0xea, 0xab, 0x0e, 0x15, 0x37, 0x6a, 0x2c, 0x41,
179 | 0x95, 0x67, 0xaf, 0x45, 0xdd, 0x53, 0x18, 0xe4,
180 | 0x3d, 0x7c, 0xd9, 0xc2, 0xe0, 0xbd, 0xfb, 0x96,
181 | 0x42, 0xb0, 0x78, 0x92, 0x0a, 0x84, 0xcf, 0x33,
182 | 0x5e, 0x1f, 0xba, 0xa1, 0x83, 0xde, 0x98, 0xf5,
183 | 0x21, 0xd3, 0x1b, 0xf1, 0x69, 0xe7, 0xac, 0x50,
184 | 0xb9, 0xf8, 0x5d, 0x46, 0x64, 0x39, 0x7f, 0x12,
185 | 0xc6, 0x34, 0xfc, 0x16, 0x8e, 0x00, 0x4b, 0xb7,
186 | 0x0d, 0x4c, 0xe9, 0xf2, 0xd0, 0x8d, 0xcb, 0xa6,
187 | 0x72, 0x80, 0x48, 0xa2, 0x3a, 0xb4, 0xff, 0x03,
188 | 0xb1, 0xf0, 0x55, 0x4e, 0x6c, 0x31, 0x77, 0x1a,
189 | 0xce, 0x3c, 0xf4, 0x1e, 0x86, 0x08, 0x43, 0xbf,
190 | 0x47, 0x06, 0xa3, 0xb8, 0x9a, 0xc7, 0x81, 0xec,
191 | 0x38, 0xca, 0x02, 0xe8, 0x70, 0xfe, 0xb5, 0x49,
192 | 0xda, 0x9b, 0x3e, 0x25, 0x07, 0x5a, 0x1c, 0x71,
193 | 0xa5, 0x57, 0x9f, 0x75, 0xed, 0x63, 0x28, 0xd4,
194 | 0x6e, 0x2f, 0x8a, 0x91, 0xb3, 0xee, 0xa8, 0xc5,
195 | 0x11, 0xe3, 0x2b, 0xc1, 0x59, 0xd7, 0x9c, 0x60,
196 | 0x24, 0x65, 0xc0, 0xdb, 0xf9, 0xa4, 0xe2, 0x8f,
197 | 0x5b, 0xa9, 0x61, 0x8b, 0x13, 0x9d, 0xd6, 0x2a,
198 | 0x89, 0xc8, 0x6d, 0x76, 0x54, 0x09, 0x4f, 0x22,
199 | 0xf6, 0x04, 0xcc, 0x26, 0xbe, 0x30, 0x7b, 0x87,
200 | 0x66, 0x27, 0x82, 0x99, 0xbb, 0xe6, 0xa0, 0xcd,
201 | 0x19, 0xeb, 0x23, 0xc9, 0x51, 0xdf, 0x94, 0x68,
202 | };
203 |
204 | /* Scalar Code */
205 |
206 | static KHASH_FINLINE void khashv_bswap_be_block_scalar(khashvBlock* in) {
207 | // Byte swapping is only needed if we are not on a little-endian system
208 | if (khashv_is_little_endian()) {
209 | return;
210 | }
211 | for(int i = 0; i < 4; i++) {
212 | in->words[i] = KHASH_BSWAP32(in->words[i]);
213 | }
214 | }
215 |
216 | static KHASH_FINLINE void khashv_rotr_5_bytes_scalar(khashvBlock* in) {
217 | khashv_bswap_be_block_scalar(in);
218 | khashvBlock tmp1;
219 | khashvBlock tmp2;
220 | // Avoid aliasing issues by using memcpy between these union values.
221 | memcpy(tmp1.bytes, in->words, 16);
222 | for(int i = 0; i < 16; i++) {
223 | tmp2.bytes[i] = tmp1.bytes[(i + 5) & 0xf];
224 | }
225 | memcpy(in->words, tmp2.bytes, 16);
226 | khashv_bswap_be_block_scalar(in);
227 | }
228 |
229 | static KHASH_FINLINE void khashv_rotr_9_bytes_scalar(khashvBlock* in) {
230 | khashv_bswap_be_block_scalar(in);
231 | khashvBlock tmp1;
232 | khashvBlock tmp2;
233 | // Avoid aliasing issues by using memcpy between these union values.
234 | memcpy(tmp1.bytes, in->words, 16);
235 | for(int i = 0; i < 16; i++) {
236 | tmp2.bytes[i] = tmp1.bytes[(i + 9) & 0xf];
237 | }
238 | memcpy(in->words, tmp2.bytes, 16);
239 | khashv_bswap_be_block_scalar(in);
240 | }
241 |
242 | static KHASH_FINLINE void khashv_shl_13_block_scalar(khashvBlock* in) {
243 | for(int i = 0; i < 4; i++) {
244 | in->words[i] <<= 13;
245 | }
246 | }
247 |
248 | static KHASH_FINLINE void khashv_shr_3_block_scalar(khashvBlock* in) {
249 | for(int i = 0; i < 4; i++) {
250 | in->words[i] >>= 3;
251 | }
252 | }
253 |
254 | static KHASH_FINLINE void khashv_add_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) {
255 | for(int i = 0; i < 4; i++) {
256 | a->words[i] += b->words[i];
257 | }
258 | }
259 |
260 | static KHASH_FINLINE void khashv_xor_block_scalar(khashvBlock* restrict a, const khashvBlock* restrict b) {
261 | for(int i = 0; i < 4; i++) {
262 | a->words[i] ^= b->words[i];
263 | }
264 | }
265 |
266 | // GCC and Clang were vectorizing this quite poorly with -O3.
267 | // They could not detect that only a PSHUFB was needed and instead
268 | // were generating tons of inserts and extracts from the vector
269 | // registers. Thus it was running slower than code that was not being
270 | // vectorized on my machine. So I specify the optimization level directly.
271 | // Tried a few other things to get GCC and Clang to generate more sane
272 | // code or code using PSHUFB, but this seemed the cleanest.
273 | // Example of what I mean: https://godbolt.org/z/PMnzsThPc
274 | // Compared to this: https://godbolt.org/z/dWfjr7GWP
275 | /*static KHASH_OPT_SZ void khashv_sub16(khashvBlock* tmp, const uint8_t sub[16]) {
276 | #if defined(__clang__)
277 | // Stop clang from being annoying!!!
278 | // The auto-vectorized code was worse at the time of writing this
279 | #pragma nounroll
280 | #pragma clang loop vectorize(disable)
281 | #pragma clang loop interleave(disable)
282 | #endif
283 | for (int i = 0; i < 16; i++) {
284 | tmp->bytes[i] = sub[tmp->bytes[i]];
285 | }
286 | }
287 |
288 | static KHASH_FINLINE void khashv_replace_scalar(khashvBlock* replace) {
289 | khashvBlock tmp;
290 | for (int i = 0; i < 16; i++) {
291 | tmp.bytes[i] = (replace->bytes[i] >> 4);
292 | replace->bytes[i] &= 0x0f;
293 | }
294 | khashv_sub16(replace, khashv_s1);
295 | khashv_sub16(&tmp, khashv_s2);
296 | for (int i = 0; i < 16; i++) {
297 | replace->bytes[i] ^= tmp.bytes[i];
298 | }
299 | }*/
300 | // Same issue as the commented-out code above, so stop the optimizers
301 | // from going crazy.
302 | static KHASH_OPT_SZ void khashv_replace_scalar(khashvBlock* replace) {
303 | khashvBlock tmp;
304 | memcpy(tmp.bytes, replace->words, 16);
305 | #if defined(__clang__)
306 | // Stop clang from being annoying!!!
307 | // The auto-vectorized code was worse at the time of writing this
308 | #pragma nounroll
309 | #pragma clang loop vectorize(disable)
310 | #pragma clang loop interleave(disable)
311 | #endif
312 | for(int i = 0; i < 16; i++) {
313 | tmp.bytes[i] = khashv_xored[tmp.bytes[i]];
314 | }
315 | memcpy(replace->words, tmp.bytes, 16);
316 | }
317 |
318 | static KHASH_FINLINE void khashv_mix_words_scalar(khashvBlock* in) {
319 | unsigned rots[4] = { 5, 7, 11, 17 };
320 | khashvBlock tmp = { 0 };
321 | for (int i = 0; i < 4; i++) {
322 | unsigned rot = rots[i];
323 | tmp = *in;
324 | khashv_rotr_5_bytes_scalar(&tmp);
325 | khashv_add_block_scalar(&tmp, in);
326 | for (int j = 0; j < 4; j++) {
327 | tmp.words[j] = KHASH_ROTR32(tmp.words[j], rot);
328 | }
329 | khashv_xor_block_scalar(in, &tmp);
330 | }
331 | }
332 |
333 | static void khashv_hash_scalar(khashvBlock* hash, const uint8_t* data, size_t data_len) {
334 | hash->words[0] ^= data_len;
335 | // In case size_t is bigger than 32 bits, fold in the upper bits too
336 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295
337 | hash->words[1] ^= data_len >> 32;
338 | #endif
339 |
340 | khashvBlock tmp_1;
341 | khashvBlock tmp_2;
342 | khashvBlock tmp_h = *hash;
343 |
344 | const uint8_t* end = data + (data_len & ~((size_t)15));
345 |
346 | while (data < end) {
347 | memcpy(&tmp_2, data, 16);
348 | khashv_replace_scalar(&tmp_2);
349 | memcpy(&tmp_1.words, tmp_2.bytes, 16);
350 |
351 | khashv_bswap_be_block_scalar(&tmp_1);
352 |
353 | tmp_2 = tmp_1;
354 | //khashv_shl_13_block_scalar(&tmp_2);
355 | //khashv_add_block_scalar(&tmp_2, &tmp_1);
356 | for(int i = 0; i < 4; i++) {
357 | tmp_2.words[i] *= 8193;
358 | }
359 | khashv_xor_block_scalar(&tmp_h, &tmp_2);
360 | khashv_rotr_5_bytes_scalar(&tmp_h);
361 | khashv_add_block_scalar(&tmp_h, &tmp_1);
362 |
363 | tmp_2 = tmp_h;
364 | khashv_shr_3_block_scalar(&tmp_2);
365 | khashv_rotr_9_bytes_scalar(&tmp_h);
366 | khashv_add_block_scalar(&tmp_h, &tmp_2);
367 |
368 | data += 16;
369 | }
370 |
371 | unsigned trailing = data_len & 0xf;
372 | if(trailing) {
373 | memset(&tmp_2, 0, 16);
374 |
375 | memcpy(&tmp_2.bytes, data, trailing);
376 | khashv_replace_scalar(&tmp_2);
377 | memcpy(&tmp_1.words, tmp_2.bytes, 16);
378 |
379 | khashv_bswap_be_block_scalar(&tmp_1);
380 |
381 | tmp_2 = tmp_1;
382 | //khashv_shl_13_block_scalar(&tmp_2);
383 | //khashv_add_block_scalar(&tmp_2, &tmp_1);
384 | for(int i = 0; i < 4; i++) {
385 | tmp_2.words[i] *= 8193;
386 | }
387 | khashv_xor_block_scalar(&tmp_h, &tmp_2);
388 | khashv_rotr_5_bytes_scalar(&tmp_h);
389 | khashv_add_block_scalar(&tmp_h, &tmp_1);
390 |
391 | tmp_2 = tmp_h;
392 | khashv_shr_3_block_scalar(&tmp_2);
393 | khashv_rotr_9_bytes_scalar(&tmp_h);
394 | khashv_add_block_scalar(&tmp_h, &tmp_2);
395 |
396 | }
397 | khashv_mix_words_scalar(&tmp_h);
398 | *hash = tmp_h;
399 | }
400 |
401 | static inline void khashv_prep_seed32_scalar(khashvSeed* seed_prepped, uint32_t seed) {
402 | *seed_prepped = khash_v_init;
403 | seed_prepped->words[0] ^= seed;
404 | khashv_mix_words_scalar(seed_prepped);
405 | }
406 |
407 | static inline void khashv_prep_seed64_scalar(khashvSeed* seed_prepped, uint64_t seed) {
408 | *seed_prepped = khash_v_init;
409 | seed_prepped->words[0] ^= seed;
410 | khashv_mix_words_scalar(seed_prepped);
411 | // Do it again with the other part to make it different than the 32 bit seed.
412 | seed_prepped->words[1] ^= seed >> 32;
413 | khashv_mix_words_scalar(seed_prepped);
414 | }
415 |
416 | static inline void khashv_prep_seed128_scalar(khashvSeed* seed_prepped, const uint32_t seed[4]) {
417 | for(int i = 0; i < 4; i++) {
418 | seed_prepped->words[i] = seed[i];
419 | }
420 | }
421 |
422 | static inline uint32_t khashv32_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
423 | khashvBlock h = *seed;
424 | khashv_hash_scalar(&h, data, data_len);
425 | return h.words[3];
426 | }
427 |
428 | static inline uint64_t khashv64_scalar(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
429 | khashvBlock h = *seed;
430 | khashv_hash_scalar(&h, data, data_len);
431 | uint64_t r = h.words[1];
432 | r <<= 32;
433 | r |= h.words[0];
434 | return r;
435 | }
436 |
437 | /* Vectorization for Intel/AMD */
438 |
439 | #if defined(__SSE3__)
440 |
441 | #define KHASH_VECTOR 1
442 |
443 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(11, 0))
444 | static KHASH_FINLINE __m128i _mm_loadu_si32(const void* data) {
445 | uint32_t val;
446 | memcpy(&val, data, sizeof(uint32_t));
447 | return _mm_cvtsi32_si128(val);
448 | }
449 | static KHASH_FINLINE __m128i _mm_loadu_si16(const void* data) {
450 | uint32_t val = 0;
451 | memcpy(&val, data, sizeof(uint16_t));
452 | return _mm_cvtsi32_si128(val);
453 | }
454 | #endif
455 |
456 | #if !defined(_MSC_VER) && !defined(__clang__) && !(KHASH_GCC_LEAST__(9, 1))
457 | static KHASH_FINLINE __m128i _mm_loadu_si64(const void* data) {
458 | uint64_t val = 0;
459 | memcpy(&val, data, sizeof(uint64_t));
460 | return _mm_cvtsi64_si128(val);
461 | }
462 | #endif
463 |
464 | static KHASH_FINLINE __m128i khashv_mix_words_vector(__m128i val) {
465 | __m128i tmp1;
466 | __m128i tmp2;
467 |
468 | tmp1 = _mm_alignr_epi8(val, val, 5);
469 | tmp1 = _mm_add_epi32(val, tmp1);
470 | #if defined(__AVX512VL__)
471 | tmp1 = _mm_ror_epi32(tmp1, 5);
472 | val = _mm_xor_si128(val, tmp1);
473 | #else
474 | tmp2 = _mm_srli_epi32(tmp1, 5);
475 | tmp1 = _mm_slli_epi32(tmp1, 27);
476 | val = _mm_xor_si128(val, tmp2);
477 | val = _mm_xor_si128(val, tmp1);
478 | #endif
479 |
480 | tmp1 = _mm_alignr_epi8(val, val, 5);
481 | tmp1 = _mm_add_epi32(val, tmp1);
482 | #if defined(__AVX512VL__)
483 | tmp1 = _mm_ror_epi32(tmp1, 7);
484 | val = _mm_xor_si128(val, tmp1);
485 | #else
486 | tmp2 = _mm_srli_epi32(tmp1, 7);
487 | tmp1 = _mm_slli_epi32(tmp1, 25);
488 | val = _mm_xor_si128(val, tmp2);
489 | val = _mm_xor_si128(val, tmp1);
490 | #endif
491 |
492 | tmp1 = _mm_alignr_epi8(val, val, 5);
493 | tmp1 = _mm_add_epi32(tmp1, val);
494 | #if defined(__AVX512VL__)
495 | tmp1 = _mm_ror_epi32(tmp1, 11);
496 | val = _mm_xor_si128(val, tmp1);
497 | #else
498 | tmp2 = _mm_srli_epi32(tmp1, 11);
499 | tmp1 = _mm_slli_epi32(tmp1, 21);
500 | val = _mm_xor_si128(val, tmp2);
501 | val = _mm_xor_si128(val, tmp1);
502 | #endif
503 |
504 | tmp1 = _mm_alignr_epi8(val, val, 5);
505 | tmp1 = _mm_add_epi32(tmp1, val);
506 | #if defined(__AVX512VL__)
507 | tmp1 = _mm_ror_epi32(tmp1, 17);
508 | val = _mm_xor_si128(val, tmp1);
509 | #else
510 | tmp2 = _mm_srli_epi32(tmp1, 17);
511 | tmp1 = _mm_slli_epi32(tmp1, 15);
512 | val = _mm_xor_si128(val, tmp2);
513 | val = _mm_xor_si128(val, tmp1);
514 | #endif
515 |
516 | return val;
517 | }
518 |
519 | static KHASH_FINLINE __m128i khashv_part_load_vector(const uint8_t* data, size_t len) {
520 | __m128i tmp = { 0 };
521 | __m128i tmp2 = { 0 };
522 | switch(len) {
523 | case 1:
524 | #if defined(__SSE4_1__)
525 | tmp = _mm_insert_epi8(tmp, data[0], 0);
526 | #else
527 | tmp = _mm_cvtsi32_si128(data[0]);
528 | #endif
529 | break;
530 | case 2:
531 | tmp = _mm_loadu_si16(data);
532 | break;
533 | case 3:
534 | tmp = _mm_loadu_si16(data);
535 | #if defined(__SSE4_1__)
536 | tmp = _mm_insert_epi8(tmp, data[2], 2);
537 | #else
538 | tmp = _mm_insert_epi16(tmp, data[2], 1);
539 | #endif
540 | break;
541 | case 4:
542 | tmp = _mm_loadu_si32(data);
543 | break;
544 | case 5:
545 | tmp = _mm_loadu_si32(data);
546 | #if defined(__SSE4_1__)
547 | tmp = _mm_insert_epi8(tmp, data[4], 4);
548 | #else
549 | tmp = _mm_insert_epi16(tmp, data[4], 2);
550 | #endif
551 | break;
552 | case 6:
553 | tmp = _mm_loadu_si32(data);
554 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2);
555 | break;
556 | case 7:
557 | tmp = _mm_loadu_si32(data);
558 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 4), 2);
559 | #if defined(__SSE4_1__)
560 | tmp = _mm_insert_epi8(tmp, data[6], 6);
561 | #else
562 | tmp = _mm_insert_epi16(tmp, data[6], 3);
563 | #endif
564 | break;
565 | case 8:
566 | tmp = _mm_loadu_si64(data);
567 | break;
568 | case 9:
569 | tmp = _mm_loadu_si64(data);
570 | #if defined(__SSE4_1__)
571 | tmp = _mm_insert_epi8(tmp, data[8], 8);
572 | #else
573 | tmp = _mm_insert_epi16(tmp, data[8], 4);
574 | #endif
575 | break;
576 | case 10:
577 | tmp = _mm_loadu_si64(data);
578 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4);
579 | break;
580 | case 11:
581 | tmp = _mm_loadu_si64(data);
582 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 8), 4);
583 | #if defined(__SSE4_1__)
584 | tmp = _mm_insert_epi8(tmp, data[10], 10);
585 | #else
586 | tmp = _mm_insert_epi16(tmp, data[10], 5);
587 | #endif
588 | break;
589 | case 12:
590 | tmp = _mm_loadu_si64(data);
591 | #if defined(__SSE4_1__)
592 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
593 | #else
594 | tmp2 = _mm_loadu_si32(data + 8);
595 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
596 | tmp = _mm_or_si128(tmp, tmp2);
597 | #endif
598 | break;
599 | case 13:
600 | tmp = _mm_loadu_si64(data);
601 | #if defined(__SSE4_1__)
602 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
603 | tmp = _mm_insert_epi8(tmp, data[12], 12);
604 | #else
605 | tmp2 = _mm_loadu_si32(data + 8);
606 | tmp2 = _mm_insert_epi16(tmp2, data[12], 2);
607 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
608 | tmp = _mm_or_si128(tmp, tmp2);
609 | #endif
610 | break;
611 | case 14:
612 | tmp = _mm_loadu_si64(data);
613 | #if defined(__SSE4_1__)
614 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
615 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6);
616 | #else
617 | tmp2 = _mm_loadu_si32(data + 8);
618 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 2); // lane 2 sits in word 1, which the 0x4f shuffle moves to word 3
619 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
620 | tmp = _mm_or_si128(tmp, tmp2);
621 | #endif
622 | break;
623 | case 15:
624 | tmp = _mm_loadu_si64(data);
625 | #if defined(__SSE4_1__)
626 | tmp = _mm_insert_epi32(tmp, *(uint32_t*)(data + 8), 2);
627 | tmp = _mm_insert_epi16(tmp, *(uint16_t*)(data + 12), 6);
628 | tmp = _mm_insert_epi8(tmp, data[14], 14);
629 | #else
630 | tmp2 = _mm_loadu_si32(data + 8);
631 | tmp2 = _mm_insert_epi16(tmp2, *(uint16_t*)(data + 12), 2); // lanes 2-3 sit in word 1,
632 | tmp2 = _mm_insert_epi16(tmp2, data[14], 3);                 // which the 0x4f shuffle moves to word 3
633 | tmp2 = _mm_shuffle_epi32(tmp2, 0x4f);
634 | tmp = _mm_or_si128(tmp, tmp2);
635 | #endif
636 | break;
637 | case 16:
638 | tmp = _mm_loadu_si64(data);
639 | #if defined(__SSE4_1__)
640 | tmp = _mm_insert_epi64(tmp, *(uint64_t*)(data + 8), 1);
641 | #else
642 | tmp2 = _mm_loadu_si64(data + 8);
643 | tmp = _mm_unpacklo_epi64(tmp, tmp2);
644 | #endif
645 | break;
646 | }
647 | return tmp;
648 | }
649 |
650 | static const uint8_t khashv_shuff[16] = {
651 | 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00,
652 | 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
653 | };
654 |
655 | static __m128i khashv_hash_vector(__m128i hash, const uint8_t* data, size_t data_len) {
656 | const __m128i s1 = _mm_loadu_si128((const __m128i*)khashv_s1);
657 | const __m128i s2 = _mm_loadu_si128((const __m128i*)khashv_s2);
658 | const __m128i shuff = _mm_loadu_si128((const __m128i*)khashv_shuff);
659 | const __m128i mask = _mm_set1_epi32(0x0f0f0f0f);
660 |
661 | __m128i tmp_1;
662 | __m128i tmp_2;
663 |
664 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295
665 | tmp_1 = _mm_cvtsi64_si128(data_len);
666 | #else
667 | tmp_1 = _mm_cvtsi32_si128(data_len);
668 | #endif
669 | hash = _mm_xor_si128(tmp_1, hash);
670 |
671 | const uint8_t* end = data + (data_len & ~((size_t)15));
672 | const uint8_t* end2 = data + data_len;
673 | while(data_len > 16 && data < end) {
674 | tmp_1 = _mm_lddqu_si128((const __m128i*)data);
675 | tmp_2 = _mm_srli_epi32 (tmp_1, 4);
676 |
677 | tmp_1 = _mm_and_si128 (tmp_1, mask);
678 | tmp_2 = _mm_and_si128 (tmp_2, mask);
679 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1);
680 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2);
681 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2);
682 |
683 | tmp_2 = _mm_slli_epi32 (tmp_1, 13);
684 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2);
685 | tmp_2 = _mm_xor_si128 (hash, tmp_2);
686 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5);
687 | hash = _mm_add_epi32 (tmp_2, tmp_1);
688 |
689 | tmp_2 = _mm_srli_epi32(hash, 3);
690 | tmp_1 = _mm_shuffle_epi8(hash, shuff);
691 | hash = _mm_add_epi32 (tmp_2, tmp_1);
692 |
693 | data += 16;
694 | }
695 | uintptr_t trailing = end2 - data;
696 | if(trailing) {
697 | tmp_1 = khashv_part_load_vector(data, trailing);
698 | tmp_2 = _mm_srli_epi32 (tmp_1, 4);
699 |
700 | tmp_1 = _mm_and_si128 (tmp_1, mask);
701 | tmp_2 = _mm_and_si128 (tmp_2, mask);
702 | tmp_1 = _mm_shuffle_epi8(s1, tmp_1);
703 | tmp_2 = _mm_shuffle_epi8(s2, tmp_2);
704 | tmp_1 = _mm_xor_si128 (tmp_1, tmp_2);
705 |
706 | tmp_2 = _mm_slli_epi32 (tmp_1, 13);
707 | tmp_2 = _mm_add_epi32 (tmp_1, tmp_2);
708 | tmp_2 = _mm_xor_si128 (hash, tmp_2);
709 | tmp_2 = _mm_alignr_epi8(tmp_2, tmp_2, 5);
710 | hash = _mm_add_epi32 (tmp_2, tmp_1);
711 |
712 | tmp_2 = _mm_srli_epi32(hash, 3);
713 | tmp_1 = _mm_shuffle_epi8(hash, shuff);
714 | hash = _mm_add_epi32 (tmp_2, tmp_1);
715 | }
716 | hash = khashv_mix_words_vector(hash);
717 | return hash;
718 | }
719 |
720 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) {
721 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init);
722 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed));
723 | seed_prepped->vec = khashv_mix_words_vector(s);
724 | }
725 |
726 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) {
727 | __m128i s = _mm_loadu_si128((const __m128i*)&khash_v_init);
728 | __m128i t = _mm_cvtsi32_si128(seed >> 32);
729 | s = _mm_xor_si128(s, _mm_cvtsi32_si128(seed));
730 | s = khashv_mix_words_vector(s);
731 | s = _mm_xor_si128(s, _mm_shuffle_epi32(t, 0xf3));
732 | seed_prepped->vec = khashv_mix_words_vector(s);
733 | }
734 |
735 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) {
736 | seed_prepped->vec = _mm_loadu_si128((const __m128i*)seed);
737 | }
738 |
739 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
740 | __m128i h = khashv_hash_vector(seed->vec, data, data_len);
741 | // Using word [3] avoids any overlap with the 64-bit hash,
742 | // which uses words [0] and [1]; this ensures the two hash
743 | // outputs behave differently when used.
744 | #if defined(__SSE4_1__)
745 | return _mm_extract_epi32(h, 3);
746 | #else
747 | h = _mm_shuffle_epi32(h, 0xff);
748 | return _mm_cvtsi128_si32(h);
749 | #endif
750 | }
751 |
752 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
753 | __m128i h = khashv_hash_vector(seed->vec, data, data_len);
754 | return _mm_cvtsi128_si64(h);
755 | }
756 |
757 | #endif
758 |
759 | /* Vectorization via GCC's vector builtins */
760 | // Handy since they allow vectorization without writing explicit
761 | // intrinsics for a particular CPU.
762 |
763 | #if !defined(KHASH_VECTOR) && KHASH_GCC_LEAST__(6, 1)
764 |
765 | #define KHASH_VECTOR 1
766 |
767 | typedef uint8_t kv16ui __attribute__((vector_size(16)));
768 | typedef uint32_t kv4ui __attribute__((vector_size(16)));
769 |
770 | static KHASH_FINLINE kv16ui khashv_sub_s1_gcc(kv16ui in) {
771 | const kv16ui mask = {
772 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf,
773 | 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf
774 | };
775 | const kv16ui sub = {
776 | 0x1c, 0x5d, 0xf8, 0xe3, 0xc1, 0x9c, 0xda, 0xb7,
777 | 0x63, 0x91, 0x59, 0xb3, 0x2b, 0xa5, 0xee, 0x12,
778 | };
779 | in &= mask;
780 | return __builtin_shuffle(sub, in);
781 | }
782 |
783 | static KHASH_FINLINE kv16ui khashv_sub_s2_gcc(kv16ui in) {
784 | const kv16ui sub = {
785 | 0xef, 0xce, 0x66, 0xf3, 0xf6, 0x21, 0x42, 0xa5,
786 | 0x11, 0xad, 0x5b, 0xc6, 0x72, 0x38, 0x95, 0x7a,
787 | };
788 | in >>= 4;
789 | return __builtin_shuffle(sub, in);
790 | }
791 |
792 | static KHASH_FINLINE kv4ui khashv_rotr_5_bytes_gcc(kv4ui input) {
793 | const kv16ui rotrLE = {
794 | 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc,
795 | 0xd, 0xe, 0xf, 0x0, 0x1, 0x2, 0x3, 0x4
796 | };
797 | const kv16ui rotrBE = {
798 | 0xb, 0x4, 0x5, 0x6, 0xf, 0x8, 0x9, 0xa,
799 | 0x3, 0xc, 0xd, 0xe, 0x7, 0x0, 0x1, 0x2
800 | };
801 | kv16ui tmp;
802 | memcpy(&tmp, &input, 16);
803 | if (khashv_is_little_endian()) {
804 | tmp = __builtin_shuffle(tmp, rotrLE);
805 | } else {
806 | tmp = __builtin_shuffle(tmp, rotrBE);
807 | }
808 | memcpy(&input, &tmp, 16);
809 | return input;
810 | }
811 |
812 | static KHASH_FINLINE kv4ui khashv_rotr_9_bytes_gcc(kv4ui input) {
813 | const kv16ui rotrLE = {
814 | 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x0,
815 | 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
816 | };
817 | const kv16ui rotrBE = {
818 | 0xf, 0x8, 0x9, 0xa, 0x3, 0xc, 0xd, 0xe,
819 | 0x7, 0x0, 0x1, 0x2, 0xb, 0x4, 0x5, 0x6,
820 | };
821 | kv16ui tmp;
822 | memcpy(&tmp, &input, 16);
823 | if (khashv_is_little_endian()) {
824 | tmp = __builtin_shuffle(tmp, rotrLE);
825 | } else {
826 | tmp = __builtin_shuffle(tmp, rotrBE);
827 | }
828 | memcpy(&input, &tmp, 16);
829 | return input;
830 | }
831 |
832 | static KHASH_FINLINE kv4ui khash_byteswap_vec32_gcc( kv4ui input ) {
833 | const kv16ui bswap32 = {
834 | 0x3, 0x2, 0x1, 0x0, 0x7, 0x6, 0x5, 0x4,
835 | 0xb, 0xa, 0x9, 0x8, 0xf, 0xe, 0xd, 0xc,
836 | };
837 | kv16ui b;
838 |
839 | memcpy(&b, &input, 16);
840 | b = __builtin_shuffle(b, bswap32);
841 | memcpy(&input, &b, 16);
842 | return input;
843 | }
844 |
845 | static KHASH_FINLINE kv4ui khashv_replace_gcc(kv4ui input) {
846 | kv16ui s1;
847 | kv16ui s2;
848 | memcpy(&s1, &input, 16);
849 | s2 = khashv_sub_s2_gcc(s1);
850 | s1 = khashv_sub_s1_gcc(s1);
851 | s1 ^= s2;
852 | memcpy(&input, &s1, 16);
853 | return input;
854 | }
855 |
856 | static KHASH_FINLINE kv4ui khashv_mix_words_gcc(kv4ui val) {
857 | const unsigned rots[4] = { 5, 7, 11, 17 };
858 | for (int i = 0; i < 4; i++) {
859 | unsigned rot = rots[i];
860 | kv4ui tmp = val;
861 | tmp = khashv_rotr_5_bytes_gcc(tmp);
862 | tmp += val;
863 | tmp = (tmp >> rot) | (tmp << (32 - rot));
864 | val ^= tmp;
865 | }
866 | return val;
867 | }
868 |
869 | static KHASH_FINLINE kv4ui khashv_hash_block_gcc(kv4ui hash, kv4ui input) {
870 | kv4ui tmp_1 = khashv_replace_gcc(input);
871 | if (!khashv_is_little_endian()) {
872 | tmp_1 = khash_byteswap_vec32_gcc(tmp_1);
873 | }
874 | kv4ui tmp_2 = tmp_1 * 8193;
875 | tmp_2 ^= hash;
876 | tmp_2 = khashv_rotr_5_bytes_gcc(tmp_2);
877 | hash = tmp_1 + tmp_2;
878 |
879 | tmp_2 = hash >> 3;
880 | tmp_1 = khashv_rotr_9_bytes_gcc(hash);
881 | hash = tmp_1 + tmp_2;
882 | return hash;
883 | }
884 |
885 | static KHASH_FINLINE kv4ui khashv_hash_gcc(kv4ui hash, const uint8_t* data, size_t data_len) {
886 | hash[0] ^= data_len;
887 | #if defined(SIZE_MAX) && SIZE_MAX > 4294967295
888 | hash[1] ^= data_len >> 32;
889 | #endif
890 |
891 | kv4ui data_v;
892 | const uint8_t* end = data + (data_len & ~((size_t)15));
893 | while (data < end) {
894 | memcpy(&data_v, data, 16);
895 | hash = khashv_hash_block_gcc(hash, data_v);
896 | data += 16;
897 | }
898 |
899 | unsigned trailing = data_len & 0xf;
900 | if(trailing) {
901 | memset(&data_v, 0, 16);
902 | memcpy(&data_v, data, trailing);
903 | hash = khashv_hash_block_gcc(hash, data_v);
904 | }
905 | return khashv_mix_words_gcc(hash);
906 | }
907 |
908 |
909 | static void khashv_prep_seed32_vector(khashvSeed* seed_prepped, uint32_t seed) {
910 | kv4ui s;
911 | memcpy(&s, &khash_v_init, 16);
912 | s[0] ^= seed;
913 | s = khashv_mix_words_gcc(s);
914 | memcpy(seed_prepped, &s, 16);
915 | }
916 |
917 | static void khashv_prep_seed64_vector(khashvSeed* seed_prepped, uint64_t seed) {
918 | kv4ui s;
919 | memcpy(&s, &khash_v_init, 16);
920 | s[0] ^= seed;
921 | s = khashv_mix_words_gcc(s);
922 | s[1] ^= seed >> 32;
923 | s = khashv_mix_words_gcc(s);
924 | memcpy(seed_prepped, &s, 16);
925 | }
926 |
927 | static void khashv_prep_seed128_vector(khashvSeed* seed_prepped, const uint32_t seed[4]) {
928 | memcpy(seed_prepped->words, seed, 16);
929 | }
930 |
931 | static uint32_t khashv32_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
932 | kv4ui h;
933 | memcpy(&h, seed, 16);
934 | h = khashv_hash_gcc(h, data, data_len);
935 | return h[3];
936 | }
937 |
938 | static uint64_t khashv64_vector(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
939 | kv4ui h;
940 | memcpy(&h, seed, 16);
941 | h = khashv_hash_gcc(h, data, data_len);
942 | uint64_t ret;
943 | if (khashv_is_little_endian()) {
944 | memcpy(&ret, &h, 8);
945 | } else {
946 | ret = h[1];
947 | ret = (ret << 32) | h[0];
948 | }
949 | return ret;
950 | }
951 |
952 | #endif
953 |
954 | #if defined(KHASH_VECTOR) && !defined(KHASHV_SCALAR)
955 |
956 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) {
957 | khashv_prep_seed32_vector(seed_prepped, seed);
958 | }
959 |
960 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) {
961 | khashv_prep_seed64_vector(seed_prepped, seed);
962 | }
963 |
964 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) {
965 | khashv_prep_seed128_vector(seed_prepped, seed);
966 | }
967 |
968 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
969 | return khashv32_vector(seed, data, data_len);
970 | }
971 |
972 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
973 | return khashv64_vector(seed, data, data_len);
974 | }
975 |
976 | #else
977 |
978 | static inline void khashv_prep_seed32(khashvSeed* seed_prepped, uint32_t seed) {
979 | khashv_prep_seed32_scalar(seed_prepped, seed);
980 | }
981 |
982 | static inline void khashv_prep_seed64(khashvSeed* seed_prepped, uint64_t seed) {
983 | khashv_prep_seed64_scalar(seed_prepped, seed);
984 | }
985 |
986 | static inline void khashv_prep_seed128(khashvSeed* seed_prepped, const uint32_t seed[4]) {
987 | khashv_prep_seed128_scalar(seed_prepped, seed);
988 | }
989 |
990 | static inline uint32_t khashv32(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
991 | return khashv32_scalar(seed, data, data_len);
992 | }
993 |
994 | static inline uint64_t khashv64(const khashvSeed* seed, const uint8_t* data, size_t data_len) {
995 | return khashv64_scalar(seed, data, data_len);
996 | }
997 |
998 | #endif
999 |
1000 |
1001 | #ifdef __cplusplus
1002 | }
1003 | #endif
1004 | #endif
1005 |
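1006 | /*
1007 |  * Usage sketch (illustrative only, not part of the header): the dispatch
1008 |  * above means callers hit the same entry points whether the scalar or the
1009 |  * vector implementation was selected at compile time. The key values and
1010 |  * function name below are hypothetical examples.
1011 |  *
1012 |  *     #include "khashv_v1.h"
1013 |  *
1014 |  *     uint32_t hash32_example(const uint8_t* buf, size_t len) {
1015 |  *         const uint32_t key[4] = { 0x01234567, 0x89abcdef,
1016 |  *                                   0xfedcba98, 0x76543210 };
1017 |  *         khashvSeed seed;
1018 |  *         khashv_prep_seed128(&seed, key);
1019 |  *         return khashv32(&seed, buf, len);
1020 |  *     }
1021 |  */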
--------------------------------------------------------------------------------