├── siphash.h ├── README.md ├── siphash_impl.h ├── Makefile ├── siphash.c ├── siphash_sse2.c ├── test.c └── siphash_ssse3.c /siphash.h: -------------------------------------------------------------------------------- 1 | #ifndef SIPHASH_H 2 | #define SIPHASH_H 3 | 4 | #if defined(_MSC_VER) 5 | typedef unsigned __int64 uint64_t; 6 | #else 7 | #include 8 | #include 9 | #endif 10 | 11 | #ifdef __cplusplus /* If this is a C++ compiler, use C linkage */ 12 | extern "C" { 13 | #endif 14 | 15 | uint64_t siphash(const unsigned char key[16], const unsigned char *m, size_t len); 16 | 17 | #ifdef __cplusplus /* If this is a C++ compiler, end C linkage */ 18 | } 19 | #endif 20 | 21 | 22 | #endif // SIPHASH_H 23 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Basic+SSE2+SSSE3 implementation of [SipHash-2-4](http://131002.net/siphash/) based off the paper. On an E5200, icc gives 2 | the best speeds for all SSE versions & 32bit versions, gcc gives the best speed for 64bit basic 3 | 4 | done to see how difficult implementing the spec was as their source is not up (yet), and curious about 32bit performance 5 | 6 | Performance on an E5200 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 |
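| Best Impl.      | 7 bytes    | 128 bytes   | 1024 bytes |
|-----------------|------------|-------------|------------|
| icc 32bit       | 202 cycles | 1015 cycles | 6.7 cpb    |
| icc 32bit sse2  | 135 cycles | 688 cycles  | 4.48 cpb   |
| icc 32bit ssse3 | 125 cycles | 614 cycles  | 3.98 cpb   |
| gcc 64bit       | 61 cycles  | 276 cycles  | 1.81 cpb   |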
17 | 18 | #### License 19 | 20 | Public Domain 21 | -------------------------------------------------------------------------------- /siphash_impl.h: -------------------------------------------------------------------------------- 1 | #ifndef SIPHASH_IMPL_H 2 | #define SIPHASH_IMPL_H 3 | 4 | #include "siphash.h" 5 | 6 | #if defined(_MSC_VER) 7 | #include 8 | 9 | #define INLINE __forceinline 10 | #define NOINLINE __declspec(noinline) 11 | #define ROTL64(a,b) _rotl64(a,b) 12 | #define MM16 __declspec(align(16)) 13 | 14 | typedef unsigned int uint32_t; 15 | 16 | #if (_MSC_VER >= 1500) 17 | #define __SSSE3__ 18 | #endif 19 | #if (_MSC_VER > 1200) || defined(_mm_free) 20 | #define __SSE2__ 21 | #endif 22 | #else 23 | #define INLINE __attribute__((always_inline)) 24 | #define NOINLINE __attribute__((noinline)) 25 | #define ROTL64(a,b) (((a)<<(b))|((a)>>(64-b))) 26 | #define MM16 __attribute__((aligned(16))) 27 | #endif 28 | 29 | #if defined(__SSE2__) 30 | #include 31 | typedef __m128i xmmi; 32 | typedef __m64 qmm; 33 | 34 | typedef union packedelem64_t { 35 | uint64_t u[2]; 36 | xmmi v; 37 | } packedelem64; 38 | 39 | typedef union packedelem8_t { 40 | unsigned char u[16]; 41 | xmmi v; 42 | } packedelem8; 43 | #endif 44 | 45 | #if defined(__SSSE3__) 46 | #include 47 | #endif 48 | 49 | #endif // SIPHASH_IMPL_H 50 | 51 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | deafult: gcc64 2 | gcc64: gcc64_siphash gcc64_siphash_sse2 gcc64_siphash_ssse3 3 | icc64: icc64_siphash icc64_siphash_sse2 icc64_siphash_ssse3 4 | gcc32: gcc32_siphash gcc32_siphash_sse2 gcc32_siphash_ssse3 5 | icc32: icc32_siphash icc32_siphash_sse2 icc32_siphash_ssse3 6 | 7 | gcc64_siphash: 8 | gcc siphash.c test.c -m64 -O3 -o test_gcc64_siphash -Wall 9 | gcc64_siphash_sse2: 10 | gcc siphash_sse2.c test.c -m64 -msse2 -O3 -o test_gcc64_siphash_sse2 -Wall 11 | gcc64_siphash_ssse3: 12 
| gcc siphash_ssse3.c test.c -m64 -mssse3 -O3 -o test_gcc64_siphash_ssse3 -Wall 13 | 14 | icc64_siphash: 15 | icc siphash.c test.c -m64 -O3 -o test_icc64_siphash -Wall 16 | icc64_siphash_sse2: 17 | icc siphash_sse2.c test.c -m64 -msse2 -O3 -o test_icc64_siphash_sse2 -Wall 18 | icc64_siphash_ssse3: 19 | icc siphash_ssse3.c test.c -m64 -mssse3 -O3 -o test_icc64_siphash_ssse3 -Wall 20 | 21 | gcc32_siphash: 22 | gcc siphash.c test.c -m32 -O3 -o test_gcc32_siphash -Wall 23 | gcc32_siphash_sse2: 24 | gcc siphash_sse2.c test.c -m32 -msse2 -O3 -o test_gcc32_siphash_sse2 -Wall 25 | gcc32_siphash_ssse3: 26 | gcc siphash_ssse3.c test.c -m32 -mssse3 -O3 -o test_gcc32_siphash_ssse3 -Wall 27 | 28 | icc32_siphash: 29 | icc siphash.c test.c -m32 -O3 -o test_icc32_siphash -Wall 30 | icc32_siphash_sse2: 31 | icc siphash_sse2.c test.c -m32 -msse2 -O3 -o test_icc32_siphash_sse2 -Wall 32 | icc32_siphash_ssse3: 33 | icc siphash_ssse3.c test.c -m32 -mssse3 -O3 -o test_icc32_siphash_ssse3 -Wall 34 | 35 | clean: 36 | rm -f test_* 37 | -------------------------------------------------------------------------------- /siphash.c: -------------------------------------------------------------------------------- 1 | #include "siphash.h" 2 | #include "siphash_impl.h" 3 | 4 | 5 | static uint64_t INLINE 6 | U8TO64_LE(const unsigned char *p) { 7 | return *(const uint64_t *)p; 8 | } 9 | 10 | /* 11 | static void INLINE 12 | U64TO8_LE(unsigned char *p, const uint64_t v) { 13 | *(uint64_t *)p = v; 14 | } 15 | */ 16 | 17 | uint64_t 18 | siphash(const unsigned char key[16], const unsigned char *m, size_t len) { 19 | uint64_t v0, v1, v2, v3; 20 | uint64_t mi, k0, k1; 21 | uint64_t last7; 22 | size_t i, blocks; 23 | 24 | k0 = U8TO64_LE(key + 0); 25 | k1 = U8TO64_LE(key + 8); 26 | v0 = k0 ^ 0x736f6d6570736575ull; 27 | v1 = k1 ^ 0x646f72616e646f6dull; 28 | v2 = k0 ^ 0x6c7967656e657261ull; 29 | v3 = k1 ^ 0x7465646279746573ull; 30 | 31 | last7 = (uint64_t)(len & 0xff) << 56; 32 | 33 | #define sipcompress() \ 34 
| v0 += v1; v2 += v3; \ 35 | v1 = ROTL64(v1,13); v3 = ROTL64(v3,16); \ 36 | v1 ^= v0; v3 ^= v2; \ 37 | v0 = ROTL64(v0,32); \ 38 | v2 += v1; v0 += v3; \ 39 | v1 = ROTL64(v1,17); v3 = ROTL64(v3,21); \ 40 | v1 ^= v2; v3 ^= v0; \ 41 | v2 = ROTL64(v2,32); 42 | 43 | for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { 44 | mi = U8TO64_LE(m + i); 45 | v3 ^= mi; 46 | sipcompress() 47 | sipcompress() 48 | v0 ^= mi; 49 | } 50 | 51 | switch (len - blocks) { 52 | case 7: last7 |= (uint64_t)m[i + 6] << 48; 53 | case 6: last7 |= (uint64_t)m[i + 5] << 40; 54 | case 5: last7 |= (uint64_t)m[i + 4] << 32; 55 | case 4: last7 |= (uint64_t)m[i + 3] << 24; 56 | case 3: last7 |= (uint64_t)m[i + 2] << 16; 57 | case 2: last7 |= (uint64_t)m[i + 1] << 8; 58 | case 1: last7 |= (uint64_t)m[i + 0] ; 59 | case 0: 60 | default:; 61 | }; 62 | v3 ^= last7; 63 | sipcompress() 64 | sipcompress() 65 | v0 ^= last7; 66 | v2 ^= 0xff; 67 | sipcompress() 68 | sipcompress() 69 | sipcompress() 70 | sipcompress() 71 | return v0 ^ v1 ^ v2 ^ v3; 72 | } 73 | 74 | -------------------------------------------------------------------------------- /siphash_sse2.c: -------------------------------------------------------------------------------- 1 | #include "siphash_impl.h" 2 | 3 | /* 0,2,1,3 */ 4 | static const packedelem64 siphash_init[2] = { 5 | {{0x736f6d6570736575ull,0x6c7967656e657261ull}}, 6 | {{0x646f72616e646f6dull,0x7465646279746573ull}} 7 | }; 8 | 9 | static const packedelem64 siphash_final = { 10 | {0x0000000000000000ull,0x00000000000000ffull} 11 | }; 12 | 13 | uint64_t 14 | siphash(const unsigned char key[16], const unsigned char *m, size_t len) { 15 | xmmi k,v02,v20,v13,v11,v33,mi; 16 | uint64_t last7; 17 | uint32_t lo, hi; 18 | size_t i, blocks; 19 | 20 | k = _mm_loadu_si128((xmmi *)(key + 0)); 21 | v02 = siphash_init[0].v; 22 | v13 = siphash_init[1].v; 23 | v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); 24 | v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); 25 | 26 | last7 = (uint64_t)(len & 
0xff) << 56; 27 | 28 | #define sipcompress() \ 29 | v11 = v13; \ 30 | v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ 31 | v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \ 32 | v02 = _mm_add_epi64(v02, v13); \ 33 | v33 = _mm_or_si128(_mm_slli_epi64(v33, 16), _mm_srli_epi64(v33, 64-16)); \ 34 | v13 = _mm_unpacklo_epi64(v11, v33); \ 35 | v13 = _mm_xor_si128(v13, v02); \ 36 | v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ 37 | v11 = v13; \ 38 | v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ 39 | v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \ 40 | v20 = _mm_add_epi64(v20, v13); \ 41 | v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \ 42 | v13 = _mm_unpacklo_epi64(v11, v33); \ 43 | v13 = _mm_unpacklo_epi64(v11, v33); \ 44 | v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ 45 | v13 = _mm_xor_si128(v13, v20); 46 | 47 | for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { 48 | mi = _mm_loadl_epi64((xmmi *)(m + i)); 49 | v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); 50 | sipcompress() 51 | sipcompress() 52 | v02 = _mm_xor_si128(v02, mi); 53 | } 54 | 55 | switch (len - blocks) { 56 | case 7: last7 |= (uint64_t)m[i + 6] << 48; 57 | case 6: last7 |= (uint64_t)m[i + 5] << 40; 58 | case 5: last7 |= (uint64_t)m[i + 4] << 32; 59 | case 4: last7 |= (uint64_t)m[i + 3] << 24; 60 | case 3: last7 |= (uint64_t)m[i + 2] << 16; 61 | case 2: last7 |= (uint64_t)m[i + 1] << 8; 62 | case 1: last7 |= (uint64_t)m[i + 0] ; 63 | case 0: 64 | default:; 65 | }; 66 | 67 | mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32))); 68 | v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); 69 | sipcompress() 70 | sipcompress() 71 | v02 = _mm_xor_si128(v02, mi); 72 | v02 = _mm_xor_si128(v02, siphash_final.v); 73 | sipcompress() 74 | sipcompress() 75 | sipcompress() 76 | sipcompress() 77 | 78 | v02 = _mm_xor_si128(v02, v13); 79 | v02 = _mm_xor_si128(v02, 
_mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2))); 80 | lo = _mm_cvtsi128_si32(v02); 81 | hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4)); 82 | return ((uint64_t)hi << 32) | lo; 83 | } 84 | -------------------------------------------------------------------------------- /test.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "siphash.h" 3 | #include "siphash_impl.h" 4 | 5 | #if defined(_MSC_VER) 6 | static uint64_t INLINE 7 | get_ticks(void) { 8 | return __rdtsc(); 9 | } 10 | #else 11 | static uint64_t INLINE 12 | get_ticks(void) { 13 | uint32_t lo, hi; 14 | __asm__ __volatile__("rdtsc" : "=a" (lo), "=d" (hi)); 15 | return ((uint64_t)lo | ((uint64_t)hi << 32)); 16 | } 17 | #endif 18 | 19 | static const uint64_t test_vectors[64] = { 20 | 0x726fdb47dd0e0e31ull,0x74f839c593dc67fdull,0x0d6c8009d9a94f5aull,0x85676696d7fb7e2dull, 21 | 0xcf2794e0277187b7ull,0x18765564cd99a68dull,0xcbc9466e58fee3ceull,0xab0200f58b01d137ull, 22 | 0x93f5f5799a932462ull,0x9e0082df0ba9e4b0ull,0x7a5dbbc594ddb9f3ull,0xf4b32f46226bada7ull, 23 | 0x751e8fbc860ee5fbull,0x14ea5627c0843d90ull,0xf723ca908e7af2eeull,0xa129ca6149be45e5ull, 24 | 0x3f2acc7f57c29bdbull,0x699ae9f52cbe4794ull,0x4bc1b3f0968dd39cull,0xbb6dc91da77961bdull, 25 | 0xbed65cf21aa2ee98ull,0xd0f2cbb02e3b67c7ull,0x93536795e3a33e88ull,0xa80c038ccd5ccec8ull, 26 | 0xb8ad50c6f649af94ull,0xbce192de8a85b8eaull,0x17d835b85bbb15f3ull,0x2f2e6163076bcfadull, 27 | 0xde4daaaca71dc9a5ull,0xa6a2506687956571ull,0xad87a3535c49ef28ull,0x32d892fad841c342ull, 28 | 0x7127512f72f27cceull,0xa7f32346f95978e3ull,0x12e0b01abb051238ull,0x15e034d40fa197aeull, 29 | 0x314dffbe0815a3b4ull,0x027990f029623981ull,0xcadcd4e59ef40c4dull,0x9abfd8766a33735cull, 30 | 0x0e3ea96b5304a7d0ull,0xad0c42d6fc585992ull,0x187306c89bc215a9ull,0xd4a60abcf3792b95ull, 31 | 0xf935451de4f21df2ull,0xa9538f0419755787ull,0xdb9acddff56ca510ull,0xd06c98cd5c0975ebull, 32 | 
0xe612a3cb9ecba951ull,0xc766e62cfcadaf96ull,0xee64435a9752fe72ull,0xa192d576b245165aull, 33 | 0x0a8787bf8ecb74b2ull,0x81b3e73d20b49b6full,0x7fa8220ba3b2eceaull,0x245731c13ca42499ull, 34 | 0xb78dbfaf3a8d83bdull,0xea1ad565322a1a0bull,0x60e61c23a3795013ull,0x6606d7e446282b93ull, 35 | 0x6ca4ecb15c5f91e1ull,0x9f626da15c9625f3ull,0xe51b38608ef25f57ull,0x958a324ceb064572ull 36 | }; 37 | 38 | int main() { 39 | unsigned char key[16], msg[1024]; 40 | static const size_t outer_reps = 80, inner_reps = 128; 41 | size_t i, j, len; 42 | uint64_t t, sum, tempsum, res; 43 | double cycles; 44 | 45 | for (i = 0; i < 16; i++) 46 | key[i] = i; 47 | 48 | for (i = 0; i < 64; i++) { 49 | msg[i] = i; 50 | res = siphash(key, msg, i); 51 | if (res != test_vectors[i]) { 52 | printf("test vector %u failed\n", (uint32_t)i); 53 | return 1; 54 | } 55 | } 56 | 57 | for (i = 0; i < 500000; i++) 58 | key[0] += (unsigned char)siphash(key, msg, 1024); 59 | 60 | for (len = 1; len <= 1024; len++) { 61 | sum = 1000000000000000ull; 62 | for (j = 0; j < outer_reps; j++) { 63 | t = get_ticks(); 64 | for (i = 0; i < inner_reps; i++) 65 | key[0] += (unsigned char)siphash(key, msg, len); 66 | tempsum = (get_ticks() - t); 67 | if (tempsum < sum) 68 | sum = tempsum; 69 | } 70 | 71 | cycles = (double)sum / (double)inner_reps; 72 | printf("%u bytes, %.4f cycles/byte, %.4f cycles\n", (uint32_t)len, cycles / len, cycles); 73 | } 74 | 75 | return 0; 76 | } 77 | -------------------------------------------------------------------------------- /siphash_ssse3.c: -------------------------------------------------------------------------------- 1 | #include "siphash_impl.h" 2 | 3 | /* 0,2,1,3 */ 4 | static const packedelem64 siphash_init[2] = { 5 | {{0x736f6d6570736575ull,0x6c7967656e657261ull}}, 6 | {{0x646f72616e646f6dull,0x7465646279746573ull}} 7 | }; 8 | 9 | static const packedelem64 siphash_final = { 10 | {0x0000000000000000ull,0x00000000000000ffull} 11 | }; 12 | 13 | static const packedelem8 siphash_rot16v3 = { 14 | 
{14,15,8,9,10,11,12,13,8,9,10,11,12,13,14,15} 15 | }; 16 | 17 | uint64_t 18 | siphash(const unsigned char key[16], const unsigned char *m, size_t len) { 19 | xmmi k,v02,v20,v13,v11,v33,mi; 20 | uint64_t last7; 21 | uint32_t lo, hi; 22 | size_t i, blocks; 23 | 24 | k = _mm_loadu_si128((xmmi *)(key + 0)); 25 | v02 = siphash_init[0].v; 26 | v13 = siphash_init[1].v; 27 | v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k)); 28 | v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k)); 29 | 30 | last7 = (uint64_t)(len & 0xff) << 56; 31 | 32 | #define sipcompress() \ 33 | v11 = v13; \ 34 | v33 = v13; \ 35 | v11 = _mm_or_si128(_mm_slli_epi64(v11, 13), _mm_srli_epi64(v11, 64-13)); \ 36 | v02 = _mm_add_epi64(v02, v13); \ 37 | v33 = _mm_shuffle_epi8(v33, siphash_rot16v3.v); \ 38 | v13 = _mm_unpacklo_epi64(v11, v33); \ 39 | v13 = _mm_xor_si128(v13, v02); \ 40 | v20 = _mm_shuffle_epi32(v02, _MM_SHUFFLE(0,1,3,2)); \ 41 | v11 = v13; \ 42 | v33 = _mm_shuffle_epi32(v13, _MM_SHUFFLE(1,0,3,2)); \ 43 | v11 = _mm_or_si128(_mm_slli_epi64(v11, 17), _mm_srli_epi64(v11, 64-17)); \ 44 | v20 = _mm_add_epi64(v20, v13); \ 45 | v33 = _mm_or_si128(_mm_slli_epi64(v33, 21), _mm_srli_epi64(v33, 64-21)); \ 46 | v13 = _mm_unpacklo_epi64(v11, v33); \ 47 | v13 = _mm_unpacklo_epi64(v11, v33); \ 48 | v02 = _mm_shuffle_epi32(v20, _MM_SHUFFLE(0,1,3,2)); \ 49 | v13 = _mm_xor_si128(v13, v20); 50 | 51 | for (i = 0, blocks = (len & ~7); i < blocks; i += 8) { 52 | mi = _mm_loadl_epi64((xmmi *)(m + i)); 53 | v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); 54 | sipcompress() 55 | sipcompress() 56 | v02 = _mm_xor_si128(v02, mi); 57 | } 58 | 59 | switch (len - blocks) { 60 | case 7: last7 |= (uint64_t)m[i + 6] << 48; 61 | case 6: last7 |= (uint64_t)m[i + 5] << 40; 62 | case 5: last7 |= (uint64_t)m[i + 4] << 32; 63 | case 4: last7 |= (uint64_t)m[i + 3] << 24; 64 | case 3: last7 |= (uint64_t)m[i + 2] << 16; 65 | case 2: last7 |= (uint64_t)m[i + 1] << 8; 66 | case 1: last7 |= (uint64_t)m[i + 0] ; 67 | case 0: 68 | 
default:; 69 | }; 70 | 71 | mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128((uint32_t)last7),_mm_cvtsi32_si128((uint32_t)(last7 >> 32))); 72 | v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8)); 73 | sipcompress() 74 | sipcompress() 75 | v02 = _mm_xor_si128(v02, mi); 76 | v02 = _mm_xor_si128(v02, siphash_final.v); 77 | sipcompress() 78 | sipcompress() 79 | sipcompress() 80 | sipcompress() 81 | 82 | v02 = _mm_xor_si128(v02, v13); 83 | v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2))); 84 | lo = _mm_cvtsi128_si32(v02); 85 | hi = _mm_cvtsi128_si32(_mm_srli_si128(v02, 4)); 86 | return ((uint64_t)hi << 32) | lo; 87 | } 88 | --------------------------------------------------------------------------------