├── README.md
└── simd.c


/README.md:
--------------------------------------------------------------------------------
 1 | # simd_interleave
 2 | 
 3 | This is an attempt to get the fastest code to interleave bits from two source arrays of integers.  The problem was posed on [Daniel Lemire's Blog](https://lemire.me/blog/2018/01/09/how-fast-can-you-bit-interleave-32-bit-integers-simd-edition/).  
 4 | 
 5 | This version redefines the problem slightly to make it easier -- instead of merging pairs of consecutive ints, we take them from two separate arrays.  
 6 | 
 7 | Like other approaches, the algorithm interleaves zero bits into each source and then merges the two results with a bitwise OR.  
 8 | 
 9 | The zero-interleaving is accomplished by mapping 4-bit units (nybbles) to bytes from a lookup table using the pshufb instruction.  Two vectors of nybbles ("even" and "odd") are extracted from each source, mapped to bytes, and then the bytes are interleaved to get the output word -- the key is that byte-level operations are all that is needed after the mapping.
10 | 
11 | To interleave two inputs, the bitwise OR is done immediately after the nybble-to-byte mapping in each vector, and then the same bytewise interleaving is applied to the merged bytes.  
12 | 
13 | 


--------------------------------------------------------------------------------
/simd.c:
--------------------------------------------------------------------------------
  1 | #include <immintrin.h>
  2 | #include <stdbool.h>
  3 | #include <stdint.h>
  4 | #include <stdio.h>
  5 | #include <stdlib.h>
  6 | #include <string.h>
  7 | 
  8 | #define RDTSC_START(cycles)                                             \
  9 |   do {                                                                \
 10 |     uint32_t cyc_high, cyc_low;                                     \
 11 |     __asm volatile("cpuid\n"                                        \
 12 | 		                          "rdtsc\n"                                        \
 13 | 		                          "mov %%edx, %0\n"                                \
 14 | 		   "mov %%eax, %1" :                                \
 15 | 		   "=r" (cyc_high),                                 \
 16 | 		   "=r"(cyc_low) :                                  \
 17 | 		   : /* no read only */                             \
 18 | 		   "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
 19 | 		   );                                               \
 20 |     (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
 21 |   } while (0)
 22 | 
 23 | #define RDTSC_STOP(cycles)                                              \
 24 |   do {                                                                \
 25 |     uint32_t cyc_high, cyc_low;                                     \
 26 |     __asm volatile("rdtscp\n"                                       \
 27 | 		                          "mov %%edx, %0\n"                                \
 28 | 		                          "mov %%eax, %1\n"                                \
 29 | 		   "cpuid" :                                        \
 30 | 		   "=r"(cyc_high),                                  \
 31 | 		   "=r"(cyc_low) :                                  \
 32 | 		   /* no read only registers */ :                   \
 33 | 		   "%rax", "%rbx", "%rcx", "%rdx" /* clobbers */    \
 34 | 		   );                                               \
 35 |     (cycles) = ((uint64_t)cyc_high << 32) | cyc_low;                \
 36 |   } while (0)
 37 | 
 38 | void dump( __m256i reg, char * msg ) {
 39 |   printf("%s ", msg);
 40 |   unsigned char c[32];
 41 |   _mm256_storeu_si256((__m256i *)c, reg);
 42 |   for( int i = 31; i >=0; i-- )
 43 |     printf("%02hhX ", c[i] );
 44 |   printf("\n");
 45 | }
 46 | 
 47 | #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC)
 48 | 
 49 | // this maps to multiple instructions in avx, but one in avx2 (?)
 50 | static inline void
 51 | __attribute__((__always_inline__))
 52 | _mm256_storeu2_m128i(__m128i* const hiaddr, __m128i* const loaddr, const __m256i a)
 53 | {
 54 |   _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
 55 |   _mm_storeu_si128(hiaddr, _mm256_extracti128_si256(a, 1));
 56 | }
 57 | #endif
 58 | 
 59 | static inline void morton_vec8(uint32_t *in0, uint32_t *in1,  uint64_t *out ) {
 60 | 
 61 |   // nybble -> byte lookups
 62 |   __m256i m0 = _mm256_set_epi8(85, 84, 81, 80, 69, 68, 65, 64, 21, 20, 17,
 63 | 		       16, 5, 4, 1, 0, 85, 84, 81, 80, 69, 68, 65,
 64 | 		       64, 21, 20, 17, 16, 5, 4, 1, 0);
 65 | 
 66 |   __m256i m1 = _mm256_slli_epi64( m0, 1);
 67 | 
 68 |   __m256i nybble_mask = _mm256_set1_epi8(0x0F);
 69 | 
 70 |   __m256i word0 = _mm256_loadu_si256((__m256i *)in0);
 71 |   __m256i word1 = _mm256_loadu_si256((__m256i *)in1);
 72 |   
 73 |   __m256i even0 = _mm256_and_si256( word0, nybble_mask);  // 1 / .33 ?
 74 |   __m256i odd0  = _mm256_and_si256( _mm256_srli_epi64( word0, 4), nybble_mask);
 75 | 
 76 |   __m256i even1 = _mm256_and_si256( word1, nybble_mask);
 77 |   __m256i odd1  = _mm256_and_si256( _mm256_srli_epi64( word1, 4), nybble_mask);
 78 |   
 79 |   // lookup
 80 |     
 81 |   even0 = _mm256_shuffle_epi8( m0, even0 );  // 1 / 1
 82 |   odd0  = _mm256_shuffle_epi8( m0, odd0  );
 83 |   
 84 |   even1 = _mm256_shuffle_epi8( m1, even1 );  
 85 |   odd1  = _mm256_shuffle_epi8( m1, odd1  );
 86 | 
 87 |   // combine
 88 |   
 89 |   __m256i even = _mm256_or_si256( even0, even1 );  // 1 / .33
 90 |   __m256i odd  = _mm256_or_si256( odd0, odd1 );
 91 | 
 92 |   // byte interleave
 93 |   __m256i los = _mm256_unpacklo_epi8( even, odd );  // 1 / 1
 94 |   __m256i his = _mm256_unpackhi_epi8( even, odd );
 95 | 
 96 |   // AC,BD => AB,CD
 97 |   // interleaved 128i stores to 8 x uint64_t
 98 |   _mm256_storeu2_m128i ((__m128i*) (out+4),(__m128i*) out, los);
 99 |   _mm256_storeu2_m128i ((__m128i*) (out+6),(__m128i*) (out+2), his);
100 | }
101 | 
/*
 * Benchmark driver: fills two arrays with 0..N-1, interleaves them eight
 * elements at a time with morton_vec8 while counting cycles with the RDTSC
 * macros, then prints the cycle count, a checksum of the output (so the
 * work cannot be optimized away) and a few sample results.
 *
 * Returns EXIT_FAILURE if the buffers cannot be allocated, 0 otherwise.
 */
int main(void) {
  /* Element count per input array; must stay a multiple of 8 because
     morton_vec8 always processes eight elements per call. */
  enum { N = 1024 };

  uint32_t *in0 = malloc(N * sizeof *in0);
  uint32_t *in1 = malloc(N * sizeof *in1);
  uint64_t *out = malloc(N * sizeof *out);
  if (in0 == NULL || in1 == NULL || out == NULL) {
    fprintf(stderr, "out of memory\n");
    free(in0);
    free(in1);
    free(out);
    return EXIT_FAILURE;
  }

  for (uint32_t i = 0; i < N; i++) {
    in0[i] = i;
    in1[i] = i;
  }

  uint64_t start, end;

  RDTSC_START(start);

  for (int i = 0; i < N; i += 8) {
    morton_vec8(in0 + i, in1 + i, out + i);
  }

  RDTSC_STOP(end);

  /* uint64_t is not `long` on every platform; cast and print with %llu so
     the format always matches the argument type. */
  printf("N=%d\ncycles = %llu\nper int=%f\n", N,
         (unsigned long long)(end - start), (end - start) / (N * 2.0));

  /* prevent optimizing out */
  uint64_t sum = 0;
  for (int i = 0; i < N; i++) {
    sum += out[i];
  }

  printf("sum %llu\n", (unsigned long long)sum);

  for (int i = 0; i < 8; i++) {
    printf("%llu ", (unsigned long long)out[i]);
  }

  printf("\nlast (%u, %u) => %llu\n", (unsigned)in0[N - 1],
         (unsigned)in1[N - 1], (unsigned long long)out[N - 1]);

  printf("\n\n");

  free(in0);
  free(in1);
  free(out);
  return 0;
}
140 | 


--------------------------------------------------------------------------------