├── FastMemcpy.c ├── FastMemcpy.h ├── FastMemcpy_Avx.c ├── FastMemcpy_Avx.h ├── LICENSE └── README.md /FastMemcpy.c: -------------------------------------------------------------------------------- 1 | //===================================================================== 2 | // 3 | // FastMemcpy.c - skywind3000@163.com, 2015 4 | // 5 | // feature: 6 | // 50% speed up in avg. vs standard memcpy (tested in vc2012/gcc4.9) 7 | // 8 | //===================================================================== 9 | #include <stdio.h> 10 | #include <stdlib.h> 11 | #include <string.h> 12 | #include <time.h> 13 | 14 | #if (defined(_WIN32) || defined(WIN32)) 15 | #include <windows.h> 16 | #include <mmsystem.h> 17 | #ifdef _MSC_VER 18 | #pragma comment(lib, "winmm.lib") 19 | #endif 20 | #elif defined(__unix) 21 | #include <sys/time.h> 22 | #include <unistd.h> 23 | #else 24 | #error it can only be compiled under windows or unix 25 | #endif 26 | 27 | #include "FastMemcpy.h" 28 | 29 | unsigned int gettime() 30 | { 31 | #if (defined(_WIN32) || defined(WIN32)) 32 | return timeGetTime(); 33 | #else 34 | static struct timezone tz={ 0,0 }; 35 | struct timeval time; 36 | gettimeofday(&time,&tz); 37 | return (time.tv_sec * 1000 + time.tv_usec / 1000); 38 | #endif 39 | } 40 | 41 | void sleepms(unsigned int millisec) 42 | { 43 | #if defined(_WIN32) || defined(WIN32) 44 | Sleep(millisec); 45 | #else 46 | usleep(millisec * 1000); 47 | #endif 48 | } 49 | 50 | 51 | void benchmark(int dstalign, int srcalign, size_t size, int times) 52 | { 53 | char *DATA1 = (char*)malloc(size + 64); 54 | char *DATA2 = (char*)malloc(size + 64); 55 | size_t LINEAR1 = ((size_t)DATA1); 56 | size_t LINEAR2 = ((size_t)DATA2); 57 | char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); 58 | char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); 59 | char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); 60 | char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); 61 | unsigned int t1, t2; 62 | int k; 63 | 64 | sleepms(100); 65 | t1 = gettime(); 66 | for (k = times; k > 0; k--) { 67 | memcpy(dst, src, size); 68 | } 69 | t1 = gettime() - t1; 70 | sleepms(100); 71 | t2 = gettime(); 72 | for (k = times; k > 0; k--) { 73 | memcpy_fast(dst, src, size); 74 | } 75 | t2 = gettime() - t2; 76 | 77 | free(DATA1); 78 | free(DATA2); 79 | 80 | printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", 81 | dstalign? "aligned" : "unalign", 82 | srcalign?
"aligned" : "unalign", (int)t2, (int)t1); 83 | } 84 | 85 | 86 | void bench(int copysize, int times) 87 | { 88 | printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); 89 | benchmark(1, 1, copysize, times); 90 | benchmark(1, 0, copysize, times); 91 | benchmark(0, 1, copysize, times); 92 | benchmark(0, 0, copysize, times); 93 | printf("\n"); 94 | } 95 | 96 | 97 | void random_bench(int maxsize, int times) 98 | { 99 | static char A[11 * 1024 * 1024 + 2]; 100 | static char B[11 * 1024 * 1024 + 2]; 101 | static int random_offsets[0x10000]; 102 | static int random_sizes[0x8000]; 103 | unsigned int i, p1, p2; 104 | unsigned int t1, t2; 105 | for (i = 0; i < 0x10000; i++) { // generate random offsets 106 | random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); 107 | } 108 | for (i = 0; i < 0x8000; i++) { // generate random sizes 109 | random_sizes[i] = 1 + rand() % maxsize; 110 | } 111 | sleepms(100); 112 | t1 = gettime(); 113 | for (p1 = 0, p2 = 0, i = 0; i < times; i++) { 114 | int offset1 = random_offsets[(p1++) & 0xffff]; 115 | int offset2 = random_offsets[(p1++) & 0xffff]; 116 | int size = random_sizes[(p2++) & 0x7fff]; 117 | memcpy(A + offset1, B + offset2, size); 118 | } 119 | t1 = gettime() - t1; 120 | sleepms(100); 121 | t2 = gettime(); 122 | for (p1 = 0, p2 = 0, i = 0; i < times; i++) { 123 | int offset1 = random_offsets[(p1++) & 0xffff]; 124 | int offset2 = random_offsets[(p1++) & 0xffff]; 125 | int size = random_sizes[(p2++) & 0x7fff]; 126 | memcpy_fast(A + offset1, B + offset2, size); 127 | } 128 | t2 = gettime() - t2; 129 | printf("benchmark random access:\n"); 130 | printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); 131 | } 132 | 133 | 134 | #ifdef _MSC_VER 135 | #pragma comment(lib, "winmm.lib") 136 | #endif 137 | 138 | int main(void) 139 | { 140 | bench(32, 0x1000000); 141 | bench(64, 0x1000000); 142 | bench(512, 0x800000); 143 | bench(1024, 0x400000); 144 | bench(4096, 0x80000); 145 | bench(8192, 0x40000); 146 | bench(1024 * 1024 * 1, 0x800); 147 | bench(1024 * 1024 * 4, 0x200); 148 | bench(1024 * 1024 * 8, 0x100); 149 | 150 | random_bench(2048, 8000000); 151 | 152 | return 0; 153 | } 154 | 155 | 156 | 157 | 158 | /* 159 | benchmark(size=32 bytes, times=16777216): 160 | result(dst aligned, src aligned): memcpy_fast=78ms memcpy=260 ms 161 | result(dst aligned, src unalign): memcpy_fast=78ms memcpy=250 ms 162 | result(dst unalign, src aligned): memcpy_fast=78ms memcpy=266 ms 163 | result(dst unalign, src unalign): memcpy_fast=78ms memcpy=234 ms 164 | 165 | benchmark(size=64 bytes, times=16777216): 166 | result(dst aligned, src aligned): memcpy_fast=109ms memcpy=281 ms 167 | result(dst aligned, src unalign): memcpy_fast=109ms memcpy=328 ms 168 | result(dst unalign, src aligned): memcpy_fast=109ms memcpy=343 ms 169 | result(dst unalign, src unalign): memcpy_fast=93ms memcpy=344 ms 170 | 171 | benchmark(size=512 bytes, times=8388608): 172 | result(dst aligned, src aligned): memcpy_fast=125ms memcpy=218 ms 173 | result(dst aligned, src unalign): memcpy_fast=156ms memcpy=484 ms 174 | result(dst unalign, src aligned): memcpy_fast=172ms memcpy=546 ms 175 | result(dst unalign, src unalign): memcpy_fast=172ms memcpy=515 ms 176 | 177 | benchmark(size=1024 bytes, times=4194304): 178 | result(dst aligned, src aligned): memcpy_fast=109ms memcpy=172 ms 179 | result(dst aligned, src unalign): memcpy_fast=187ms memcpy=453 ms 180 | result(dst unalign, src aligned): memcpy_fast=172ms memcpy=437 ms 181 | result(dst unalign, src unalign): memcpy_fast=156ms memcpy=452 ms 182 | 183 | 
benchmark(size=4096 bytes, times=524288): 184 | result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms 185 | result(dst aligned, src unalign): memcpy_fast=109ms memcpy=202 ms 186 | result(dst unalign, src aligned): memcpy_fast=94ms memcpy=203 ms 187 | result(dst unalign, src unalign): memcpy_fast=110ms memcpy=218 ms 188 | 189 | benchmark(size=8192 bytes, times=262144): 190 | result(dst aligned, src aligned): memcpy_fast=62ms memcpy=78 ms 191 | result(dst aligned, src unalign): memcpy_fast=78ms memcpy=202 ms 192 | result(dst unalign, src aligned): memcpy_fast=78ms memcpy=203 ms 193 | result(dst unalign, src unalign): memcpy_fast=94ms memcpy=203 ms 194 | 195 | benchmark(size=1048576 bytes, times=2048): 196 | result(dst aligned, src aligned): memcpy_fast=203ms memcpy=191 ms 197 | result(dst aligned, src unalign): memcpy_fast=219ms memcpy=281 ms 198 | result(dst unalign, src aligned): memcpy_fast=218ms memcpy=328 ms 199 | result(dst unalign, src unalign): memcpy_fast=218ms memcpy=312 ms 200 | 201 | benchmark(size=4194304 bytes, times=512): 202 | result(dst aligned, src aligned): memcpy_fast=312ms memcpy=406 ms 203 | result(dst aligned, src unalign): memcpy_fast=296ms memcpy=421 ms 204 | result(dst unalign, src aligned): memcpy_fast=312ms memcpy=468 ms 205 | result(dst unalign, src unalign): memcpy_fast=297ms memcpy=452 ms 206 | 207 | benchmark(size=8388608 bytes, times=256): 208 | result(dst aligned, src aligned): memcpy_fast=281ms memcpy=452 ms 209 | result(dst aligned, src unalign): memcpy_fast=280ms memcpy=468 ms 210 | result(dst unalign, src aligned): memcpy_fast=298ms memcpy=514 ms 211 | result(dst unalign, src unalign): memcpy_fast=344ms memcpy=472 ms 212 | 213 | benchmark random access: 214 | memcpy_fast=515ms memcpy=1014ms 215 | 216 | */ 217 | 218 | 219 | 220 | 221 | -------------------------------------------------------------------------------- /FastMemcpy.h: -------------------------------------------------------------------------------- 1 | //===================================================================== 2 | // 3 | // FastMemcpy.c - skywind3000@163.com, 2015 4 | // 5 | // feature: 6 | // 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) 7 | // 8 | //===================================================================== 9 | #ifndef __FAST_MEMCPY_H__ 10 | #define __FAST_MEMCPY_H__ 11 | 12 | #include <stddef.h> 13 | #include <stdint.h> 14 | #include <emmintrin.h> 15 | 16 | 17 | //--------------------------------------------------------------------- 18 | // force inline for compilers 19 | //--------------------------------------------------------------------- 20 | #ifndef INLINE 21 | #ifdef __GNUC__ 22 | #if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) 23 | #define INLINE __inline__ __attribute__((always_inline)) 24 | #else 25 | #define INLINE __inline__ 26 | #endif 27 | #elif defined(_MSC_VER) 28 | #define INLINE __forceinline 29 | #elif (defined(__BORLANDC__) || defined(__WATCOMC__)) 30 | #define INLINE __inline 31 | #else 32 | #define INLINE 33 | #endif 34 | #endif 35 | 36 | 37 | 38 | //--------------------------------------------------------------------- 39 | // fast copy for different sizes 40 | //--------------------------------------------------------------------- 41 | static INLINE void memcpy_sse2_16(void *dst, const void *src) { 42 | __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); 43 | _mm_storeu_si128(((__m128i*)dst) + 0, m0); 44 | } 45 | 46 | static INLINE void memcpy_sse2_32(void *dst, const void *src) { 47 | __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); 48 | __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); 49 | _mm_storeu_si128(((__m128i*)dst) + 0, m0); 50 | _mm_storeu_si128(((__m128i*)dst) + 1, m1); 51 | } 52 | 53 | static INLINE void memcpy_sse2_64(void *dst, const void *src) { 54 | __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); 55 | __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); 56 | __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); 57 | __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); 58 | _mm_storeu_si128(((__m128i*)dst) + 0, m0); 59 | _mm_storeu_si128(((__m128i*)dst) + 1, m1); 60 | _mm_storeu_si128(((__m128i*)dst) + 2, m2); 61 | _mm_storeu_si128(((__m128i*)dst) + 3, m3); 62 | } 63 | 64 | static INLINE void memcpy_sse2_128(void *dst, const void *src) { 65 | __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); 66 | __m128i m1 = _mm_loadu_si128(((const __m128i*)src) + 1); 67 | __m128i m2 = _mm_loadu_si128(((const __m128i*)src) + 2); 68 | __m128i m3 = _mm_loadu_si128(((const __m128i*)src) + 3); 69 | __m128i m4 = _mm_loadu_si128(((const __m128i*)src) + 4); 70 | __m128i m5 = _mm_loadu_si128(((const __m128i*)src) + 5); 71 | __m128i m6 = _mm_loadu_si128(((const __m128i*)src) + 6); 72 | __m128i m7 = _mm_loadu_si128(((const __m128i*)src) + 7); 73 | _mm_storeu_si128(((__m128i*)dst) + 0, m0); 74 | _mm_storeu_si128(((__m128i*)dst) + 1, m1); 75 | _mm_storeu_si128(((__m128i*)dst) + 2, m2); 76 | _mm_storeu_si128(((__m128i*)dst) + 3, m3); 77 | _mm_storeu_si128(((__m128i*)dst) + 4, m4); 78 | _mm_storeu_si128(((__m128i*)dst) + 5, m5); 79 | _mm_storeu_si128(((__m128i*)dst) + 6, m6); 80 | _mm_storeu_si128(((__m128i*)dst) + 7, m7); 81 | } 82 | 83 | 84 | //--------------------------------------------------------------------- 85 | // tiny memory copy with jump table optimized 86 | //--------------------------------------------------------------------- 87 | static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { 88 | unsigned char *dd = ((unsigned char*)dst) + size; 89 | const unsigned char *ss = ((const unsigned char*)src) + size; 90 | 91 | switch (size) { 92 | case 64: 93 | memcpy_sse2_64(dd - 64, ss - 64); 94 | case 0:
95 | break; 96 | 97 | case 65: 98 | memcpy_sse2_64(dd - 65, ss - 65); 99 | case 1: 100 | dd[-1] = ss[-1]; 101 | break; 102 | 103 | case 66: 104 | memcpy_sse2_64(dd - 66, ss - 66); 105 | case 2: 106 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 107 | break; 108 | 109 | case 67: 110 | memcpy_sse2_64(dd - 67, ss - 67); 111 | case 3: 112 | *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); 113 | dd[-1] = ss[-1]; 114 | break; 115 | 116 | case 68: 117 | memcpy_sse2_64(dd - 68, ss - 68); 118 | case 4: 119 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 120 | break; 121 | 122 | case 69: 123 | memcpy_sse2_64(dd - 69, ss - 69); 124 | case 5: 125 | *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); 126 | dd[-1] = ss[-1]; 127 | break; 128 | 129 | case 70: 130 | memcpy_sse2_64(dd - 70, ss - 70); 131 | case 6: 132 | *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); 133 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 134 | break; 135 | 136 | case 71: 137 | memcpy_sse2_64(dd - 71, ss - 71); 138 | case 7: 139 | *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); 140 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 141 | break; 142 | 143 | case 72: 144 | memcpy_sse2_64(dd - 72, ss - 72); 145 | case 8: 146 | *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); 147 | break; 148 | 149 | case 73: 150 | memcpy_sse2_64(dd - 73, ss - 73); 151 | case 9: 152 | *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); 153 | dd[-1] = ss[-1]; 154 | break; 155 | 156 | case 74: 157 | memcpy_sse2_64(dd - 74, ss - 74); 158 | case 10: 159 | *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); 160 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 161 | break; 162 | 163 | case 75: 164 | memcpy_sse2_64(dd - 75, ss - 75); 165 | case 11: 166 | *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); 167 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 168 | break; 169 | 170 | case 76: 171 | memcpy_sse2_64(dd - 76, ss - 76); 172 | case 12: 173 | *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); 174 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 175 | break; 176 | 177 | case 77: 178 | memcpy_sse2_64(dd - 77, ss - 77); 179 | case 13: 180 | *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); 181 | *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); 182 | dd[-1] = ss[-1]; 183 | break; 184 | 185 | case 78: 186 | memcpy_sse2_64(dd - 78, ss - 78); 187 | case 14: 188 | *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); 189 | *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); 190 | break; 191 | 192 | case 79: 193 | memcpy_sse2_64(dd - 79, ss - 79); 194 | case 15: 195 | *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); 196 | *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); 197 | break; 198 | 199 | case 80: 200 | memcpy_sse2_64(dd - 80, ss - 80); 201 | case 16: 202 | memcpy_sse2_16(dd - 16, ss - 16); 203 | break; 204 | 205 | case 81: 206 | memcpy_sse2_64(dd - 81, ss - 81); 207 | case 17: 208 | memcpy_sse2_16(dd - 17, ss - 17); 209 | dd[-1] = ss[-1]; 210 | break; 211 | 212 | case 82: 213 | memcpy_sse2_64(dd - 82, ss - 82); 214 | case 18: 215 | memcpy_sse2_16(dd - 18, ss - 18); 216 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 217 | break; 218 | 219 | case 83: 220 | memcpy_sse2_64(dd - 83, ss - 83); 221 | case 19: 222 | memcpy_sse2_16(dd - 19, ss - 19); 223 | *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); 224 | dd[-1] = ss[-1]; 225 | break; 226 | 227 | case 84: 228 | memcpy_sse2_64(dd - 84, ss - 84); 229 | case 20: 230 | memcpy_sse2_16(dd - 20, ss - 20); 231 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 232 | break; 233 | 234 | case 85: 235 | 
memcpy_sse2_64(dd - 85, ss - 85); 236 | case 21: 237 | memcpy_sse2_16(dd - 21, ss - 21); 238 | *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); 239 | dd[-1] = ss[-1]; 240 | break; 241 | 242 | case 86: 243 | memcpy_sse2_64(dd - 86, ss - 86); 244 | case 22: 245 | memcpy_sse2_16(dd - 22, ss - 22); 246 | *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); 247 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 248 | break; 249 | 250 | case 87: 251 | memcpy_sse2_64(dd - 87, ss - 87); 252 | case 23: 253 | memcpy_sse2_16(dd - 23, ss - 23); 254 | *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); 255 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 256 | break; 257 | 258 | case 88: 259 | memcpy_sse2_64(dd - 88, ss - 88); 260 | case 24: 261 | memcpy_sse2_16(dd - 24, ss - 24); 262 | memcpy_sse2_16(dd - 16, ss - 16); 263 | break; 264 | 265 | case 89: 266 | memcpy_sse2_64(dd - 89, ss - 89); 267 | case 25: 268 | memcpy_sse2_16(dd - 25, ss - 25); 269 | memcpy_sse2_16(dd - 16, ss - 16); 270 | break; 271 | 272 | case 90: 273 | memcpy_sse2_64(dd - 90, ss - 90); 274 | case 26: 275 | memcpy_sse2_16(dd - 26, ss - 26); 276 | memcpy_sse2_16(dd - 16, ss - 16); 277 | break; 278 | 279 | case 91: 280 | memcpy_sse2_64(dd - 91, ss - 91); 281 | case 27: 282 | memcpy_sse2_16(dd - 27, ss - 27); 283 | memcpy_sse2_16(dd - 16, ss - 16); 284 | break; 285 | 286 | case 92: 287 | memcpy_sse2_64(dd - 92, ss - 92); 288 | case 28: 289 | memcpy_sse2_16(dd - 28, ss - 28); 290 | memcpy_sse2_16(dd - 16, ss - 16); 291 | break; 292 | 293 | case 93: 294 | memcpy_sse2_64(dd - 93, ss - 93); 295 | case 29: 296 | memcpy_sse2_16(dd - 29, ss - 29); 297 | memcpy_sse2_16(dd - 16, ss - 16); 298 | break; 299 | 300 | case 94: 301 | memcpy_sse2_64(dd - 94, ss - 94); 302 | case 30: 303 | memcpy_sse2_16(dd - 30, ss - 30); 304 | memcpy_sse2_16(dd - 16, ss - 16); 305 | break; 306 | 307 | case 95: 308 | memcpy_sse2_64(dd - 95, ss - 95); 309 | case 31: 310 | memcpy_sse2_16(dd - 31, ss - 31); 311 | memcpy_sse2_16(dd - 16, ss - 16); 312 | break; 313 | 314 | case 96: 315 | memcpy_sse2_64(dd - 96, ss - 96); 316 | case 32: 317 | memcpy_sse2_32(dd - 32, ss - 32); 318 | break; 319 | 320 | case 97: 321 | memcpy_sse2_64(dd - 97, ss - 97); 322 | case 33: 323 | memcpy_sse2_32(dd - 33, ss - 33); 324 | dd[-1] = ss[-1]; 325 | break; 326 | 327 | case 98: 328 | memcpy_sse2_64(dd - 98, ss - 98); 329 | case 34: 330 | memcpy_sse2_32(dd - 34, ss - 34); 331 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 332 | break; 333 | 334 | case 99: 335 | memcpy_sse2_64(dd - 99, ss - 99); 336 | case 35: 337 | memcpy_sse2_32(dd - 35, ss - 35); 338 | *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); 339 | dd[-1] = ss[-1]; 340 | break; 341 | 342 | case 100: 343 | memcpy_sse2_64(dd - 100, ss - 100); 344 | case 36: 345 | memcpy_sse2_32(dd - 36, ss - 36); 346 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 347 | break; 348 | 349 | case 101: 350 | memcpy_sse2_64(dd - 101, ss - 101); 351 | case 37: 352 | memcpy_sse2_32(dd - 37, ss - 37); 353 | *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); 354 | dd[-1] = ss[-1]; 355 | break; 356 | 357 | case 102: 358 | memcpy_sse2_64(dd - 102, ss - 102); 359 | case 38: 360 | memcpy_sse2_32(dd - 38, ss - 38); 361 | *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); 362 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 363 | break; 364 | 365 | case 103: 366 | memcpy_sse2_64(dd - 103, ss - 103); 367 | case 39: 368 | memcpy_sse2_32(dd - 39, ss - 39); 369 | *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); 370 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 371 
| break; 372 | 373 | case 104: 374 | memcpy_sse2_64(dd - 104, ss - 104); 375 | case 40: 376 | memcpy_sse2_32(dd - 40, ss - 40); 377 | *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); 378 | break; 379 | 380 | case 105: 381 | memcpy_sse2_64(dd - 105, ss - 105); 382 | case 41: 383 | memcpy_sse2_32(dd - 41, ss - 41); 384 | *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); 385 | dd[-1] = ss[-1]; 386 | break; 387 | 388 | case 106: 389 | memcpy_sse2_64(dd - 106, ss - 106); 390 | case 42: 391 | memcpy_sse2_32(dd - 42, ss - 42); 392 | *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); 393 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 394 | break; 395 | 396 | case 107: 397 | memcpy_sse2_64(dd - 107, ss - 107); 398 | case 43: 399 | memcpy_sse2_32(dd - 43, ss - 43); 400 | *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); 401 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 402 | break; 403 | 404 | case 108: 405 | memcpy_sse2_64(dd - 108, ss - 108); 406 | case 44: 407 | memcpy_sse2_32(dd - 44, ss - 44); 408 | *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); 409 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 410 | break; 411 | 412 | case 109: 413 | memcpy_sse2_64(dd - 109, ss - 109); 414 | case 45: 415 | memcpy_sse2_32(dd - 45, ss - 45); 416 | *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); 417 | *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); 418 | dd[-1] = ss[-1]; 419 | break; 420 | 421 | case 110: 422 | memcpy_sse2_64(dd - 110, ss - 110); 423 | case 46: 424 | memcpy_sse2_32(dd - 46, ss - 46); 425 | *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); 426 | *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); 427 | break; 428 | 429 | case 111: 430 | memcpy_sse2_64(dd - 111, ss - 111); 431 | case 47: 432 | memcpy_sse2_32(dd - 47, ss - 47); 433 | *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); 434 | *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); 435 | break; 436 | 437 | case 112: 438 | memcpy_sse2_64(dd - 112, ss - 112); 439 | case 48: 440 | memcpy_sse2_32(dd - 48, ss - 48); 441 | memcpy_sse2_16(dd - 16, ss - 16); 442 | break; 443 | 444 | case 113: 445 | memcpy_sse2_64(dd - 113, ss - 113); 446 | case 49: 447 | memcpy_sse2_32(dd - 49, ss - 49); 448 | memcpy_sse2_16(dd - 17, ss - 17); 449 | dd[-1] = ss[-1]; 450 | break; 451 | 452 | case 114: 453 | memcpy_sse2_64(dd - 114, ss - 114); 454 | case 50: 455 | memcpy_sse2_32(dd - 50, ss - 50); 456 | memcpy_sse2_16(dd - 18, ss - 18); 457 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 458 | break; 459 | 460 | case 115: 461 | memcpy_sse2_64(dd - 115, ss - 115); 462 | case 51: 463 | memcpy_sse2_32(dd - 51, ss - 51); 464 | memcpy_sse2_16(dd - 19, ss - 19); 465 | *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); 466 | dd[-1] = ss[-1]; 467 | break; 468 | 469 | case 116: 470 | memcpy_sse2_64(dd - 116, ss - 116); 471 | case 52: 472 | memcpy_sse2_32(dd - 52, ss - 52); 473 | memcpy_sse2_16(dd - 20, ss - 20); 474 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 475 | break; 476 | 477 | case 117: 478 | memcpy_sse2_64(dd - 117, ss - 117); 479 | case 53: 480 | memcpy_sse2_32(dd - 53, ss - 53); 481 | memcpy_sse2_16(dd - 21, ss - 21); 482 | *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); 483 | dd[-1] = ss[-1]; 484 | break; 485 | 486 | case 118: 487 | memcpy_sse2_64(dd - 118, ss - 118); 488 | case 54: 489 | memcpy_sse2_32(dd - 54, ss - 54); 490 | memcpy_sse2_16(dd - 22, ss - 22); 491 | *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); 492 | *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); 493 | break; 494 | 495 | case 119: 496 | memcpy_sse2_64(dd - 119, ss - 
119); 497 | case 55: 498 | memcpy_sse2_32(dd - 55, ss - 55); 499 | memcpy_sse2_16(dd - 23, ss - 23); 500 | *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); 501 | *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); 502 | break; 503 | 504 | case 120: 505 | memcpy_sse2_64(dd - 120, ss - 120); 506 | case 56: 507 | memcpy_sse2_32(dd - 56, ss - 56); 508 | memcpy_sse2_16(dd - 24, ss - 24); 509 | memcpy_sse2_16(dd - 16, ss - 16); 510 | break; 511 | 512 | case 121: 513 | memcpy_sse2_64(dd - 121, ss - 121); 514 | case 57: 515 | memcpy_sse2_32(dd - 57, ss - 57); 516 | memcpy_sse2_16(dd - 25, ss - 25); 517 | memcpy_sse2_16(dd - 16, ss - 16); 518 | break; 519 | 520 | case 122: 521 | memcpy_sse2_64(dd - 122, ss - 122); 522 | case 58: 523 | memcpy_sse2_32(dd - 58, ss - 58); 524 | memcpy_sse2_16(dd - 26, ss - 26); 525 | memcpy_sse2_16(dd - 16, ss - 16); 526 | break; 527 | 528 | case 123: 529 | memcpy_sse2_64(dd - 123, ss - 123); 530 | case 59: 531 | memcpy_sse2_32(dd - 59, ss - 59); 532 | memcpy_sse2_16(dd - 27, ss - 27); 533 | memcpy_sse2_16(dd - 16, ss - 16); 534 | break; 535 | 536 | case 124: 537 | memcpy_sse2_64(dd - 124, ss - 124); 538 | case 60: 539 | memcpy_sse2_32(dd - 60, ss - 60); 540 | memcpy_sse2_16(dd - 28, ss - 28); 541 | memcpy_sse2_16(dd - 16, ss - 16); 542 | break; 543 | 544 | case 125: 545 | memcpy_sse2_64(dd - 125, ss - 125); 546 | case 61: 547 | memcpy_sse2_32(dd - 61, ss - 61); 548 | memcpy_sse2_16(dd - 29, ss - 29); 549 | memcpy_sse2_16(dd - 16, ss - 16); 550 | break; 551 | 552 | case 126: 553 | memcpy_sse2_64(dd - 126, ss - 126); 554 | case 62: 555 | memcpy_sse2_32(dd - 62, ss - 62); 556 | memcpy_sse2_16(dd - 30, ss - 30); 557 | memcpy_sse2_16(dd - 16, ss - 16); 558 | break; 559 | 560 | case 127: 561 | memcpy_sse2_64(dd - 127, ss - 127); 562 | case 63: 563 | memcpy_sse2_32(dd - 63, ss - 63); 564 | memcpy_sse2_16(dd - 31, ss - 31); 565 | memcpy_sse2_16(dd - 16, ss - 16); 566 | break; 567 | 568 | case 128: 569 | memcpy_sse2_128(dd - 128, ss - 128); 570 | break; 571 | } 572 | 573 | return dst; 574 | } 575 | 576 | 577 | //--------------------------------------------------------------------- 578 | // main routine 579 | //--------------------------------------------------------------------- 580 | static void* memcpy_fast(void *destination, const void *source, size_t size) 581 | { 582 | unsigned char *dst = (unsigned char*)destination; 583 | const unsigned char *src = (const unsigned char*)source; 584 | static size_t cachesize = 0x200000; // L2-cache size 585 | size_t padding; 586 | 587 | // small memory copy 588 | if (size <= 128) { 589 | return memcpy_tiny(dst, src, size); 590 | } 591 | 592 | // align destination to 16 bytes boundary 593 | padding = (16 - (((size_t)dst) & 15)) & 15; 594 | 595 | if (padding > 0) { 596 | __m128i head = _mm_loadu_si128((const __m128i*)src); 597 | _mm_storeu_si128((__m128i*)dst, head); 598 | dst += padding; 599 | src += padding; 600 | size -= padding; 601 | } 602 | 603 | // medium size copy 604 | if (size <= cachesize) { 605 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; 606 | 607 | for (; size >= 128; size -= 128) { 608 | c0 = _mm_loadu_si128(((const __m128i*)src) + 0); 609 | c1 = _mm_loadu_si128(((const __m128i*)src) + 1); 610 | c2 = _mm_loadu_si128(((const __m128i*)src) + 2); 611 | c3 = _mm_loadu_si128(((const __m128i*)src) + 3); 612 | c4 = _mm_loadu_si128(((const __m128i*)src) + 4); 613 | c5 = _mm_loadu_si128(((const __m128i*)src) + 5); 614 | c6 = _mm_loadu_si128(((const __m128i*)src) + 6); 615 | c7 = _mm_loadu_si128(((const __m128i*)src) + 7); 616 | 
_mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); 617 | src += 128; 618 | _mm_store_si128((((__m128i*)dst) + 0), c0); 619 | _mm_store_si128((((__m128i*)dst) + 1), c1); 620 | _mm_store_si128((((__m128i*)dst) + 2), c2); 621 | _mm_store_si128((((__m128i*)dst) + 3), c3); 622 | _mm_store_si128((((__m128i*)dst) + 4), c4); 623 | _mm_store_si128((((__m128i*)dst) + 5), c5); 624 | _mm_store_si128((((__m128i*)dst) + 6), c6); 625 | _mm_store_si128((((__m128i*)dst) + 7), c7); 626 | dst += 128; 627 | } 628 | } 629 | else { // big memory copy 630 | __m128i c0, c1, c2, c3, c4, c5, c6, c7; 631 | 632 | _mm_prefetch((const char*)(src), _MM_HINT_NTA); 633 | 634 | if ((((size_t)src) & 15) == 0) { // source aligned 635 | for (; size >= 128; size -= 128) { 636 | c0 = _mm_load_si128(((const __m128i*)src) + 0); 637 | c1 = _mm_load_si128(((const __m128i*)src) + 1); 638 | c2 = _mm_load_si128(((const __m128i*)src) + 2); 639 | c3 = _mm_load_si128(((const __m128i*)src) + 3); 640 | c4 = _mm_load_si128(((const __m128i*)src) + 4); 641 | c5 = _mm_load_si128(((const __m128i*)src) + 5); 642 | c6 = _mm_load_si128(((const __m128i*)src) + 6); 643 | c7 = _mm_load_si128(((const __m128i*)src) + 7); 644 | _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); 645 | src += 128; 646 | _mm_stream_si128((((__m128i*)dst) + 0), c0); 647 | _mm_stream_si128((((__m128i*)dst) + 1), c1); 648 | _mm_stream_si128((((__m128i*)dst) + 2), c2); 649 | _mm_stream_si128((((__m128i*)dst) + 3), c3); 650 | _mm_stream_si128((((__m128i*)dst) + 4), c4); 651 | _mm_stream_si128((((__m128i*)dst) + 5), c5); 652 | _mm_stream_si128((((__m128i*)dst) + 6), c6); 653 | _mm_stream_si128((((__m128i*)dst) + 7), c7); 654 | dst += 128; 655 | } 656 | } 657 | else { // source unaligned 658 | for (; size >= 128; size -= 128) { 659 | c0 = _mm_loadu_si128(((const __m128i*)src) + 0); 660 | c1 = _mm_loadu_si128(((const __m128i*)src) + 1); 661 | c2 = _mm_loadu_si128(((const __m128i*)src) + 2); 662 | c3 = _mm_loadu_si128(((const __m128i*)src) + 3); 663 | c4 = _mm_loadu_si128(((const __m128i*)src) + 4); 664 | c5 = _mm_loadu_si128(((const __m128i*)src) + 5); 665 | c6 = _mm_loadu_si128(((const __m128i*)src) + 6); 666 | c7 = _mm_loadu_si128(((const __m128i*)src) + 7); 667 | _mm_prefetch((const char*)(src + 256), _MM_HINT_NTA); 668 | src += 128; 669 | _mm_stream_si128((((__m128i*)dst) + 0), c0); 670 | _mm_stream_si128((((__m128i*)dst) + 1), c1); 671 | _mm_stream_si128((((__m128i*)dst) + 2), c2); 672 | _mm_stream_si128((((__m128i*)dst) + 3), c3); 673 | _mm_stream_si128((((__m128i*)dst) + 4), c4); 674 | _mm_stream_si128((((__m128i*)dst) + 5), c5); 675 | _mm_stream_si128((((__m128i*)dst) + 6), c6); 676 | _mm_stream_si128((((__m128i*)dst) + 7), c7); 677 | dst += 128; 678 | } 679 | } 680 | _mm_sfence(); 681 | } 682 | 683 | memcpy_tiny(dst, src, size); 684 | 685 | return destination; 686 | } 687 | 688 | 689 | #endif 690 | 691 | 692 | 693 | -------------------------------------------------------------------------------- /FastMemcpy_Avx.c: -------------------------------------------------------------------------------- 1 | //===================================================================== 2 | // 3 | // FastMemcpy.c - skywind3000@163.com, 2015 4 | // 5 | // feature: 6 | // 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc4.9) 7 | // 8 | //===================================================================== 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | 15 | #if (defined(_WIN32) || defined(WIN32)) 16 | #include 17 | #include 18 | #ifdef _MSC_VER 19 | #pragma comment(lib, "winmm.lib") 20 | #endif 21 | #elif defined(__unix) 22 | #include 23 | #include 24 | #else 25 | #error it can only be compiled under windows or unix 26 | #endif 27 | 28 | #include "FastMemcpy_Avx.h" 29 | 30 | 31 | unsigned int gettime() 32 | { 33 | #if (defined(_WIN32) || defined(WIN32)) 34 | return timeGetTime(); 35 | #else 36 | static struct timezone tz={ 0,0 }; 37 | struct timeval time; 38 | gettimeofday(&time,&tz); 39 | return (time.tv_sec * 1000 + time.tv_usec / 1000); 40 | #endif 41 | } 42 | 43 | void sleepms(unsigned int millisec) 44 | { 45 | #if defined(_WIN32) || defined(WIN32) 46 | Sleep(millisec); 47 | #else 48 | usleep(millisec * 1000); 49 | #endif 50 | } 51 | 52 | 53 | 54 | void benchmark(int dstalign, int srcalign, size_t size, int times) 55 | { 56 | char *DATA1 = (char*)malloc(size + 64); 57 | char *DATA2 = (char*)malloc(size + 64); 58 | size_t LINEAR1 = ((size_t)DATA1); 59 | size_t LINEAR2 = ((size_t)DATA2); 60 | char *ALIGN1 = (char*)(((64 - (LINEAR1 & 63)) & 63) + LINEAR1); 61 | char *ALIGN2 = (char*)(((64 - (LINEAR2 & 63)) & 63) + LINEAR2); 62 | char *dst = (dstalign)? ALIGN1 : (ALIGN1 + 1); 63 | char *src = (srcalign)? ALIGN2 : (ALIGN2 + 3); 64 | unsigned int t1, t2; 65 | int k; 66 | 67 | sleepms(100); 68 | t1 = gettime(); 69 | for (k = times; k > 0; k--) { 70 | memcpy(dst, src, size); 71 | } 72 | t1 = gettime() - t1; 73 | sleepms(100); 74 | t2 = gettime(); 75 | for (k = times; k > 0; k--) { 76 | memcpy_fast(dst, src, size); 77 | } 78 | t2 = gettime() - t2; 79 | 80 | free(DATA1); 81 | free(DATA2); 82 | 83 | printf("result(dst %s, src %s): memcpy_fast=%dms memcpy=%d ms\n", 84 | dstalign? "aligned" : "unalign", 85 | srcalign? 
"aligned" : "unalign", (int)t2, (int)t1); 86 | } 87 | 88 | 89 | void bench(int copysize, int times) 90 | { 91 | printf("benchmark(size=%d bytes, times=%d):\n", copysize, times); 92 | benchmark(1, 1, copysize, times); 93 | benchmark(1, 0, copysize, times); 94 | benchmark(0, 1, copysize, times); 95 | benchmark(0, 0, copysize, times); 96 | printf("\n"); 97 | } 98 | 99 | 100 | void random_bench(int maxsize, int times) 101 | { 102 | static char A[11 * 1024 * 1024 + 2]; 103 | static char B[11 * 1024 * 1024 + 2]; 104 | static int random_offsets[0x10000]; 105 | static int random_sizes[0x8000]; 106 | unsigned int i, p1, p2; 107 | unsigned int t1, t2; 108 | for (i = 0; i < 0x10000; i++) { // generate random offsets 109 | random_offsets[i] = rand() % (10 * 1024 * 1024 + 1); 110 | } 111 | for (i = 0; i < 0x8000; i++) { // generate random sizes 112 | random_sizes[i] = 1 + rand() % maxsize; 113 | } 114 | sleepms(100); 115 | t1 = gettime(); 116 | for (p1 = 0, p2 = 0, i = 0; i < times; i++) { 117 | int offset1 = random_offsets[(p1++) & 0xffff]; 118 | int offset2 = random_offsets[(p1++) & 0xffff]; 119 | int size = random_sizes[(p2++) & 0x7fff]; 120 | memcpy(A + offset1, B + offset2, size); 121 | } 122 | t1 = gettime() - t1; 123 | sleepms(100); 124 | t2 = gettime(); 125 | for (p1 = 0, p2 = 0, i = 0; i < times; i++) { 126 | int offset1 = random_offsets[(p1++) & 0xffff]; 127 | int offset2 = random_offsets[(p1++) & 0xffff]; 128 | int size = random_sizes[(p2++) & 0x7fff]; 129 | memcpy_fast(A + offset1, B + offset2, size); 130 | } 131 | t2 = gettime() - t2; 132 | printf("benchmark random access:\n"); 133 | printf("memcpy_fast=%dms memcpy=%dms\n\n", (int)t2, (int)t1); 134 | } 135 | 136 | 137 | #ifdef _MSC_VER 138 | #pragma comment(lib, "winmm.lib") 139 | #endif 140 | 141 | int main(void) 142 | { 143 | #if 1 144 | bench(32, 0x1000000); 145 | bench(64, 0x1000000); 146 | bench(512, 0x800000); 147 | bench(1024, 0x400000); 148 | #endif 149 | bench(4096, 0x80000); 150 | bench(8192, 0x40000); 151 | #if 1 152 | bench(1024 * 1024 * 1, 0x800); 153 | bench(1024 * 1024 * 4, 0x200); 154 | #endif 155 | bench(1024 * 1024 * 8, 0x100); 156 | 157 | random_bench(2048, 8000000); 158 | 159 | return 0; 160 | } 161 | 162 | 163 | 164 | 165 | /* 166 | 167 | */ 168 | 169 | 170 | 171 | 172 | -------------------------------------------------------------------------------- /FastMemcpy_Avx.h: -------------------------------------------------------------------------------- 1 | //===================================================================== 2 | // 3 | // FastMemcpy.c - skywind3000@163.com, 2015 4 | // 5 | // feature: 6 | // 50% speed up in avg. 
vs standard memcpy (tested in vc2012/gcc5.1) 7 | // 8 | //===================================================================== 9 | #ifndef __FAST_MEMCPY_H__ 10 | #define __FAST_MEMCPY_H__ 11 | 12 | #include <stddef.h> 13 | #include <stdint.h> 14 | #include <immintrin.h> 15 | 16 | 17 | //--------------------------------------------------------------------- 18 | // force inline for compilers 19 | //--------------------------------------------------------------------- 20 | #ifndef INLINE 21 | #ifdef __GNUC__ 22 | #if (__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 1)) 23 | #define INLINE __inline__ __attribute__((always_inline)) 24 | #else 25 | #define INLINE __inline__ 26 | #endif 27 | #elif defined(_MSC_VER) 28 | #define INLINE __forceinline 29 | #elif (defined(__BORLANDC__) || defined(__WATCOMC__)) 30 | #define INLINE __inline 31 | #else 32 | #define INLINE 33 | #endif 34 | #endif 35 | 36 | 37 | 38 | //--------------------------------------------------------------------- 39 | // fast copy for different sizes 40 | //--------------------------------------------------------------------- 41 | static INLINE void memcpy_avx_16(void *dst, const void *src) { 42 | #if 1 43 | __m128i m0 = _mm_loadu_si128(((const __m128i*)src) + 0); 44 | _mm_storeu_si128(((__m128i*)dst) + 0, m0); 45 | #else 46 | *((uint64_t*)((char*)dst + 0)) = *((uint64_t*)((const char*)src + 0)); 47 | *((uint64_t*)((char*)dst + 8)) = *((uint64_t*)((const char*)src + 8)); 48 | #endif 49 | } 50 | 51 | static INLINE void memcpy_avx_32(void *dst, const void *src) { 52 | __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); 53 | _mm256_storeu_si256(((__m256i*)dst) + 0, m0); 54 | } 55 | 56 | static INLINE void memcpy_avx_64(void *dst, const void *src) { 57 | __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); 58 | __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); 59 | _mm256_storeu_si256(((__m256i*)dst) + 0, m0); 60 | _mm256_storeu_si256(((__m256i*)dst) + 1, m1); 61 | } 62 | 63 | static INLINE void memcpy_avx_128(void *dst, const void *src) { 64 | __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); 65 | __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); 66 | __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); 67 | __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); 68 | _mm256_storeu_si256(((__m256i*)dst) + 0, m0); 69 | _mm256_storeu_si256(((__m256i*)dst) + 1, m1); 70 | _mm256_storeu_si256(((__m256i*)dst) + 2, m2); 71 | _mm256_storeu_si256(((__m256i*)dst) + 3, m3); 72 | } 73 | 74 | static INLINE void memcpy_avx_256(void *dst, const void *src) { 75 | __m256i m0 = _mm256_loadu_si256(((const __m256i*)src) + 0); 76 | __m256i m1 = _mm256_loadu_si256(((const __m256i*)src) + 1); 77 | __m256i m2 = _mm256_loadu_si256(((const __m256i*)src) + 2); 78 | __m256i m3 = _mm256_loadu_si256(((const __m256i*)src) + 3); 79 | __m256i m4 = _mm256_loadu_si256(((const __m256i*)src) + 4); 80 | __m256i m5 = _mm256_loadu_si256(((const __m256i*)src) + 5); 81 | __m256i m6 = _mm256_loadu_si256(((const __m256i*)src) + 6); 82 | __m256i m7 = _mm256_loadu_si256(((const __m256i*)src) + 7); 83 | _mm256_storeu_si256(((__m256i*)dst) + 0, m0); 84 | _mm256_storeu_si256(((__m256i*)dst) + 1, m1); 85 | _mm256_storeu_si256(((__m256i*)dst) + 2, m2); 86 | _mm256_storeu_si256(((__m256i*)dst) + 3, m3); 87 | _mm256_storeu_si256(((__m256i*)dst) + 4, m4); 88 | _mm256_storeu_si256(((__m256i*)dst) + 5, m5); 89 | _mm256_storeu_si256(((__m256i*)dst) + 6, m6); 90 | _mm256_storeu_si256(((__m256i*)dst) + 7, m7); 91 | } 92 | 93 | 94 |
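//---------------------------------------------------------------------
// note on memcpy_tiny() below: it switches on the exact byte count
// (0..256), which compilers typically lower to a dense jump table.
// dd/ss point one byte past the end of dst/src, so every case copies
// blocks anchored at the tail: sizes 129..255 first issue a single
// memcpy_avx_128() and fall through to the case for the remaining
// bytes, and odd sizes are covered by a few overlapping block copies
// built from the helpers above (e.g. 100 bytes = two 64-byte copies at
// dd - 100 and dd - 64 whose middles overlap). the overlap is harmless
// because, as with the standard memcpy, source and destination are
// assumed not to overlap each other.
//---------------------------------------------------------------------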
//--------------------------------------------------------------------- 95 | // tiny memory copy with jump table optimized 96 | //--------------------------------------------------------------------- 97 | static INLINE void *memcpy_tiny(void *dst, const void *src, size_t size) { 98 | unsigned char *dd = ((unsigned char*)dst) + size; 99 | const unsigned char *ss = ((const unsigned char*)src) + size; 100 | 101 | switch (size) { 102 | case 128: memcpy_avx_128(dd - 128, ss - 128); 103 | case 0: break; 104 | case 129: memcpy_avx_128(dd - 129, ss - 129); 105 | case 1: dd[-1] = ss[-1]; break; 106 | case 130: memcpy_avx_128(dd - 130, ss - 130); 107 | case 2: *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; 108 | case 131: memcpy_avx_128(dd - 131, ss - 131); 109 | case 3: *((uint16_t*)(dd - 3)) = *((uint16_t*)(ss - 3)); dd[-1] = ss[-1]; break; 110 | case 132: memcpy_avx_128(dd - 132, ss - 132); 111 | case 4: *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 112 | case 133: memcpy_avx_128(dd - 133, ss - 133); 113 | case 5: *((uint32_t*)(dd - 5)) = *((uint32_t*)(ss - 5)); dd[-1] = ss[-1]; break; 114 | case 134: memcpy_avx_128(dd - 134, ss - 134); 115 | case 6: *((uint32_t*)(dd - 6)) = *((uint32_t*)(ss - 6)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; 116 | case 135: memcpy_avx_128(dd - 135, ss - 135); 117 | case 7: *((uint32_t*)(dd - 7)) = *((uint32_t*)(ss - 7)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 118 | case 136: memcpy_avx_128(dd - 136, ss - 136); 119 | case 8: *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 120 | case 137: memcpy_avx_128(dd - 137, ss - 137); 121 | case 9: *((uint64_t*)(dd - 9)) = *((uint64_t*)(ss - 9)); dd[-1] = ss[-1]; break; 122 | case 138: memcpy_avx_128(dd - 138, ss - 138); 123 | case 10: *((uint64_t*)(dd - 10)) = *((uint64_t*)(ss - 10)); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; 124 | case 139: memcpy_avx_128(dd - 139, ss - 139); 125 | case 11: *((uint64_t*)(dd - 11)) = *((uint64_t*)(ss - 11)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 126 | case 140: memcpy_avx_128(dd - 140, ss - 140); 127 | case 12: *((uint64_t*)(dd - 12)) = *((uint64_t*)(ss - 12)); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 128 | case 141: memcpy_avx_128(dd - 141, ss - 141); 129 | case 13: *((uint64_t*)(dd - 13)) = *((uint64_t*)(ss - 13)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 130 | case 142: memcpy_avx_128(dd - 142, ss - 142); 131 | case 14: *((uint64_t*)(dd - 14)) = *((uint64_t*)(ss - 14)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 132 | case 143: memcpy_avx_128(dd - 143, ss - 143); 133 | case 15: *((uint64_t*)(dd - 15)) = *((uint64_t*)(ss - 15)); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 134 | case 144: memcpy_avx_128(dd - 144, ss - 144); 135 | case 16: memcpy_avx_16(dd - 16, ss - 16); break; 136 | case 145: memcpy_avx_128(dd - 145, ss - 145); 137 | case 17: memcpy_avx_16(dd - 17, ss - 17); dd[-1] = ss[-1]; break; 138 | case 146: memcpy_avx_128(dd - 146, ss - 146); 139 | case 18: memcpy_avx_16(dd - 18, ss - 18); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; 140 | case 147: memcpy_avx_128(dd - 147, ss - 147); 141 | case 19: memcpy_avx_16(dd - 19, ss - 19); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 142 | case 148: memcpy_avx_128(dd - 148, ss - 148); 143 | case 20: memcpy_avx_16(dd - 20, ss - 20); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 144 | case 149: memcpy_avx_128(dd - 149, ss - 149); 145 | case 21: memcpy_avx_16(dd - 21, ss 
- 21); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 146 | case 150: memcpy_avx_128(dd - 150, ss - 150); 147 | case 22: memcpy_avx_16(dd - 22, ss - 22); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 148 | case 151: memcpy_avx_128(dd - 151, ss - 151); 149 | case 23: memcpy_avx_16(dd - 23, ss - 23); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 150 | case 152: memcpy_avx_128(dd - 152, ss - 152); 151 | case 24: memcpy_avx_16(dd - 24, ss - 24); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 152 | case 153: memcpy_avx_128(dd - 153, ss - 153); 153 | case 25: memcpy_avx_16(dd - 25, ss - 25); memcpy_avx_16(dd - 16, ss - 16); break; 154 | case 154: memcpy_avx_128(dd - 154, ss - 154); 155 | case 26: memcpy_avx_16(dd - 26, ss - 26); memcpy_avx_16(dd - 16, ss - 16); break; 156 | case 155: memcpy_avx_128(dd - 155, ss - 155); 157 | case 27: memcpy_avx_16(dd - 27, ss - 27); memcpy_avx_16(dd - 16, ss - 16); break; 158 | case 156: memcpy_avx_128(dd - 156, ss - 156); 159 | case 28: memcpy_avx_16(dd - 28, ss - 28); memcpy_avx_16(dd - 16, ss - 16); break; 160 | case 157: memcpy_avx_128(dd - 157, ss - 157); 161 | case 29: memcpy_avx_16(dd - 29, ss - 29); memcpy_avx_16(dd - 16, ss - 16); break; 162 | case 158: memcpy_avx_128(dd - 158, ss - 158); 163 | case 30: memcpy_avx_16(dd - 30, ss - 30); memcpy_avx_16(dd - 16, ss - 16); break; 164 | case 159: memcpy_avx_128(dd - 159, ss - 159); 165 | case 31: memcpy_avx_16(dd - 31, ss - 31); memcpy_avx_16(dd - 16, ss - 16); break; 166 | case 160: memcpy_avx_128(dd - 160, ss - 160); 167 | case 32: memcpy_avx_32(dd - 32, ss - 32); break; 168 | case 161: memcpy_avx_128(dd - 161, ss - 161); 169 | case 33: memcpy_avx_32(dd - 33, ss - 33); dd[-1] = ss[-1]; break; 170 | case 162: memcpy_avx_128(dd - 162, ss - 162); 171 | case 34: memcpy_avx_32(dd - 34, ss - 34); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; 172 | case 163: memcpy_avx_128(dd - 163, ss - 163); 173 | case 35: memcpy_avx_32(dd - 35, ss - 35); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 174 | case 164: memcpy_avx_128(dd - 164, ss - 164); 175 | case 36: memcpy_avx_32(dd - 36, ss - 36); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 176 | case 165: memcpy_avx_128(dd - 165, ss - 165); 177 | case 37: memcpy_avx_32(dd - 37, ss - 37); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 178 | case 166: memcpy_avx_128(dd - 166, ss - 166); 179 | case 38: memcpy_avx_32(dd - 38, ss - 38); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 180 | case 167: memcpy_avx_128(dd - 167, ss - 167); 181 | case 39: memcpy_avx_32(dd - 39, ss - 39); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 182 | case 168: memcpy_avx_128(dd - 168, ss - 168); 183 | case 40: memcpy_avx_32(dd - 40, ss - 40); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 184 | case 169: memcpy_avx_128(dd - 169, ss - 169); 185 | case 41: memcpy_avx_32(dd - 41, ss - 41); memcpy_avx_16(dd - 16, ss - 16); break; 186 | case 170: memcpy_avx_128(dd - 170, ss - 170); 187 | case 42: memcpy_avx_32(dd - 42, ss - 42); memcpy_avx_16(dd - 16, ss - 16); break; 188 | case 171: memcpy_avx_128(dd - 171, ss - 171); 189 | case 43: memcpy_avx_32(dd - 43, ss - 43); memcpy_avx_16(dd - 16, ss - 16); break; 190 | case 172: memcpy_avx_128(dd - 172, ss - 172); 191 | case 44: memcpy_avx_32(dd - 44, ss - 44); memcpy_avx_16(dd - 16, ss - 16); break; 192 | case 173: memcpy_avx_128(dd - 173, ss - 173); 193 | case 45: memcpy_avx_32(dd - 45, ss - 45); memcpy_avx_16(dd - 16, ss - 16); break; 194 | case 174: 
memcpy_avx_128(dd - 174, ss - 174); 195 | case 46: memcpy_avx_32(dd - 46, ss - 46); memcpy_avx_16(dd - 16, ss - 16); break; 196 | case 175: memcpy_avx_128(dd - 175, ss - 175); 197 | case 47: memcpy_avx_32(dd - 47, ss - 47); memcpy_avx_16(dd - 16, ss - 16); break; 198 | case 176: memcpy_avx_128(dd - 176, ss - 176); 199 | case 48: memcpy_avx_32(dd - 48, ss - 48); memcpy_avx_16(dd - 16, ss - 16); break; 200 | case 177: memcpy_avx_128(dd - 177, ss - 177); 201 | case 49: memcpy_avx_32(dd - 49, ss - 49); memcpy_avx_32(dd - 32, ss - 32); break; 202 | case 178: memcpy_avx_128(dd - 178, ss - 178); 203 | case 50: memcpy_avx_32(dd - 50, ss - 50); memcpy_avx_32(dd - 32, ss - 32); break; 204 | case 179: memcpy_avx_128(dd - 179, ss - 179); 205 | case 51: memcpy_avx_32(dd - 51, ss - 51); memcpy_avx_32(dd - 32, ss - 32); break; 206 | case 180: memcpy_avx_128(dd - 180, ss - 180); 207 | case 52: memcpy_avx_32(dd - 52, ss - 52); memcpy_avx_32(dd - 32, ss - 32); break; 208 | case 181: memcpy_avx_128(dd - 181, ss - 181); 209 | case 53: memcpy_avx_32(dd - 53, ss - 53); memcpy_avx_32(dd - 32, ss - 32); break; 210 | case 182: memcpy_avx_128(dd - 182, ss - 182); 211 | case 54: memcpy_avx_32(dd - 54, ss - 54); memcpy_avx_32(dd - 32, ss - 32); break; 212 | case 183: memcpy_avx_128(dd - 183, ss - 183); 213 | case 55: memcpy_avx_32(dd - 55, ss - 55); memcpy_avx_32(dd - 32, ss - 32); break; 214 | case 184: memcpy_avx_128(dd - 184, ss - 184); 215 | case 56: memcpy_avx_32(dd - 56, ss - 56); memcpy_avx_32(dd - 32, ss - 32); break; 216 | case 185: memcpy_avx_128(dd - 185, ss - 185); 217 | case 57: memcpy_avx_32(dd - 57, ss - 57); memcpy_avx_32(dd - 32, ss - 32); break; 218 | case 186: memcpy_avx_128(dd - 186, ss - 186); 219 | case 58: memcpy_avx_32(dd - 58, ss - 58); memcpy_avx_32(dd - 32, ss - 32); break; 220 | case 187: memcpy_avx_128(dd - 187, ss - 187); 221 | case 59: memcpy_avx_32(dd - 59, ss - 59); memcpy_avx_32(dd - 32, ss - 32); break; 222 | case 188: memcpy_avx_128(dd - 188, ss - 188); 223 | case 60: memcpy_avx_32(dd - 60, ss - 60); memcpy_avx_32(dd - 32, ss - 32); break; 224 | case 189: memcpy_avx_128(dd - 189, ss - 189); 225 | case 61: memcpy_avx_32(dd - 61, ss - 61); memcpy_avx_32(dd - 32, ss - 32); break; 226 | case 190: memcpy_avx_128(dd - 190, ss - 190); 227 | case 62: memcpy_avx_32(dd - 62, ss - 62); memcpy_avx_32(dd - 32, ss - 32); break; 228 | case 191: memcpy_avx_128(dd - 191, ss - 191); 229 | case 63: memcpy_avx_32(dd - 63, ss - 63); memcpy_avx_32(dd - 32, ss - 32); break; 230 | case 192: memcpy_avx_128(dd - 192, ss - 192); 231 | case 64: memcpy_avx_64(dd - 64, ss - 64); break; 232 | case 193: memcpy_avx_128(dd - 193, ss - 193); 233 | case 65: memcpy_avx_64(dd - 65, ss - 65); dd[-1] = ss[-1]; break; 234 | case 194: memcpy_avx_128(dd - 194, ss - 194); 235 | case 66: memcpy_avx_64(dd - 66, ss - 66); *((uint16_t*)(dd - 2)) = *((uint16_t*)(ss - 2)); break; 236 | case 195: memcpy_avx_128(dd - 195, ss - 195); 237 | case 67: memcpy_avx_64(dd - 67, ss - 67); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 238 | case 196: memcpy_avx_128(dd - 196, ss - 196); 239 | case 68: memcpy_avx_64(dd - 68, ss - 68); *((uint32_t*)(dd - 4)) = *((uint32_t*)(ss - 4)); break; 240 | case 197: memcpy_avx_128(dd - 197, ss - 197); 241 | case 69: memcpy_avx_64(dd - 69, ss - 69); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 242 | case 198: memcpy_avx_128(dd - 198, ss - 198); 243 | case 70: memcpy_avx_64(dd - 70, ss - 70); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 244 | case 199: memcpy_avx_128(dd - 
199, ss - 199); 245 | case 71: memcpy_avx_64(dd - 71, ss - 71); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 246 | case 200: memcpy_avx_128(dd - 200, ss - 200); 247 | case 72: memcpy_avx_64(dd - 72, ss - 72); *((uint64_t*)(dd - 8)) = *((uint64_t*)(ss - 8)); break; 248 | case 201: memcpy_avx_128(dd - 201, ss - 201); 249 | case 73: memcpy_avx_64(dd - 73, ss - 73); memcpy_avx_16(dd - 16, ss - 16); break; 250 | case 202: memcpy_avx_128(dd - 202, ss - 202); 251 | case 74: memcpy_avx_64(dd - 74, ss - 74); memcpy_avx_16(dd - 16, ss - 16); break; 252 | case 203: memcpy_avx_128(dd - 203, ss - 203); 253 | case 75: memcpy_avx_64(dd - 75, ss - 75); memcpy_avx_16(dd - 16, ss - 16); break; 254 | case 204: memcpy_avx_128(dd - 204, ss - 204); 255 | case 76: memcpy_avx_64(dd - 76, ss - 76); memcpy_avx_16(dd - 16, ss - 16); break; 256 | case 205: memcpy_avx_128(dd - 205, ss - 205); 257 | case 77: memcpy_avx_64(dd - 77, ss - 77); memcpy_avx_16(dd - 16, ss - 16); break; 258 | case 206: memcpy_avx_128(dd - 206, ss - 206); 259 | case 78: memcpy_avx_64(dd - 78, ss - 78); memcpy_avx_16(dd - 16, ss - 16); break; 260 | case 207: memcpy_avx_128(dd - 207, ss - 207); 261 | case 79: memcpy_avx_64(dd - 79, ss - 79); memcpy_avx_16(dd - 16, ss - 16); break; 262 | case 208: memcpy_avx_128(dd - 208, ss - 208); 263 | case 80: memcpy_avx_64(dd - 80, ss - 80); memcpy_avx_16(dd - 16, ss - 16); break; 264 | case 209: memcpy_avx_128(dd - 209, ss - 209); 265 | case 81: memcpy_avx_64(dd - 81, ss - 81); memcpy_avx_32(dd - 32, ss - 32); break; 266 | case 210: memcpy_avx_128(dd - 210, ss - 210); 267 | case 82: memcpy_avx_64(dd - 82, ss - 82); memcpy_avx_32(dd - 32, ss - 32); break; 268 | case 211: memcpy_avx_128(dd - 211, ss - 211); 269 | case 83: memcpy_avx_64(dd - 83, ss - 83); memcpy_avx_32(dd - 32, ss - 32); break; 270 | case 212: memcpy_avx_128(dd - 212, ss - 212); 271 | case 84: memcpy_avx_64(dd - 84, ss - 84); memcpy_avx_32(dd - 32, ss - 32); break; 272 | case 213: memcpy_avx_128(dd - 213, ss - 213); 273 | case 85: memcpy_avx_64(dd - 85, ss - 85); memcpy_avx_32(dd - 32, ss - 32); break; 274 | case 214: memcpy_avx_128(dd - 214, ss - 214); 275 | case 86: memcpy_avx_64(dd - 86, ss - 86); memcpy_avx_32(dd - 32, ss - 32); break; 276 | case 215: memcpy_avx_128(dd - 215, ss - 215); 277 | case 87: memcpy_avx_64(dd - 87, ss - 87); memcpy_avx_32(dd - 32, ss - 32); break; 278 | case 216: memcpy_avx_128(dd - 216, ss - 216); 279 | case 88: memcpy_avx_64(dd - 88, ss - 88); memcpy_avx_32(dd - 32, ss - 32); break; 280 | case 217: memcpy_avx_128(dd - 217, ss - 217); 281 | case 89: memcpy_avx_64(dd - 89, ss - 89); memcpy_avx_32(dd - 32, ss - 32); break; 282 | case 218: memcpy_avx_128(dd - 218, ss - 218); 283 | case 90: memcpy_avx_64(dd - 90, ss - 90); memcpy_avx_32(dd - 32, ss - 32); break; 284 | case 219: memcpy_avx_128(dd - 219, ss - 219); 285 | case 91: memcpy_avx_64(dd - 91, ss - 91); memcpy_avx_32(dd - 32, ss - 32); break; 286 | case 220: memcpy_avx_128(dd - 220, ss - 220); 287 | case 92: memcpy_avx_64(dd - 92, ss - 92); memcpy_avx_32(dd - 32, ss - 32); break; 288 | case 221: memcpy_avx_128(dd - 221, ss - 221); 289 | case 93: memcpy_avx_64(dd - 93, ss - 93); memcpy_avx_32(dd - 32, ss - 32); break; 290 | case 222: memcpy_avx_128(dd - 222, ss - 222); 291 | case 94: memcpy_avx_64(dd - 94, ss - 94); memcpy_avx_32(dd - 32, ss - 32); break; 292 | case 223: memcpy_avx_128(dd - 223, ss - 223); 293 | case 95: memcpy_avx_64(dd - 95, ss - 95); memcpy_avx_32(dd - 32, ss - 32); break; 294 | case 224: memcpy_avx_128(dd - 224, ss - 224); 295 | 
case 96: memcpy_avx_64(dd - 96, ss - 96); memcpy_avx_32(dd - 32, ss - 32); break; 296 | case 225: memcpy_avx_128(dd - 225, ss - 225); 297 | case 97: memcpy_avx_64(dd - 97, ss - 97); memcpy_avx_64(dd - 64, ss - 64); break; 298 | case 226: memcpy_avx_128(dd - 226, ss - 226); 299 | case 98: memcpy_avx_64(dd - 98, ss - 98); memcpy_avx_64(dd - 64, ss - 64); break; 300 | case 227: memcpy_avx_128(dd - 227, ss - 227); 301 | case 99: memcpy_avx_64(dd - 99, ss - 99); memcpy_avx_64(dd - 64, ss - 64); break; 302 | case 228: memcpy_avx_128(dd - 228, ss - 228); 303 | case 100: memcpy_avx_64(dd - 100, ss - 100); memcpy_avx_64(dd - 64, ss - 64); break; 304 | case 229: memcpy_avx_128(dd - 229, ss - 229); 305 | case 101: memcpy_avx_64(dd - 101, ss - 101); memcpy_avx_64(dd - 64, ss - 64); break; 306 | case 230: memcpy_avx_128(dd - 230, ss - 230); 307 | case 102: memcpy_avx_64(dd - 102, ss - 102); memcpy_avx_64(dd - 64, ss - 64); break; 308 | case 231: memcpy_avx_128(dd - 231, ss - 231); 309 | case 103: memcpy_avx_64(dd - 103, ss - 103); memcpy_avx_64(dd - 64, ss - 64); break; 310 | case 232: memcpy_avx_128(dd - 232, ss - 232); 311 | case 104: memcpy_avx_64(dd - 104, ss - 104); memcpy_avx_64(dd - 64, ss - 64); break; 312 | case 233: memcpy_avx_128(dd - 233, ss - 233); 313 | case 105: memcpy_avx_64(dd - 105, ss - 105); memcpy_avx_64(dd - 64, ss - 64); break; 314 | case 234: memcpy_avx_128(dd - 234, ss - 234); 315 | case 106: memcpy_avx_64(dd - 106, ss - 106); memcpy_avx_64(dd - 64, ss - 64); break; 316 | case 235: memcpy_avx_128(dd - 235, ss - 235); 317 | case 107: memcpy_avx_64(dd - 107, ss - 107); memcpy_avx_64(dd - 64, ss - 64); break; 318 | case 236: memcpy_avx_128(dd - 236, ss - 236); 319 | case 108: memcpy_avx_64(dd - 108, ss - 108); memcpy_avx_64(dd - 64, ss - 64); break; 320 | case 237: memcpy_avx_128(dd - 237, ss - 237); 321 | case 109: memcpy_avx_64(dd - 109, ss - 109); memcpy_avx_64(dd - 64, ss - 64); break; 322 | case 238: memcpy_avx_128(dd - 238, ss - 238); 323 | case 110: memcpy_avx_64(dd - 110, ss - 110); memcpy_avx_64(dd - 64, ss - 64); break; 324 | case 239: memcpy_avx_128(dd - 239, ss - 239); 325 | case 111: memcpy_avx_64(dd - 111, ss - 111); memcpy_avx_64(dd - 64, ss - 64); break; 326 | case 240: memcpy_avx_128(dd - 240, ss - 240); 327 | case 112: memcpy_avx_64(dd - 112, ss - 112); memcpy_avx_64(dd - 64, ss - 64); break; 328 | case 241: memcpy_avx_128(dd - 241, ss - 241); 329 | case 113: memcpy_avx_64(dd - 113, ss - 113); memcpy_avx_64(dd - 64, ss - 64); break; 330 | case 242: memcpy_avx_128(dd - 242, ss - 242); 331 | case 114: memcpy_avx_64(dd - 114, ss - 114); memcpy_avx_64(dd - 64, ss - 64); break; 332 | case 243: memcpy_avx_128(dd - 243, ss - 243); 333 | case 115: memcpy_avx_64(dd - 115, ss - 115); memcpy_avx_64(dd - 64, ss - 64); break; 334 | case 244: memcpy_avx_128(dd - 244, ss - 244); 335 | case 116: memcpy_avx_64(dd - 116, ss - 116); memcpy_avx_64(dd - 64, ss - 64); break; 336 | case 245: memcpy_avx_128(dd - 245, ss - 245); 337 | case 117: memcpy_avx_64(dd - 117, ss - 117); memcpy_avx_64(dd - 64, ss - 64); break; 338 | case 246: memcpy_avx_128(dd - 246, ss - 246); 339 | case 118: memcpy_avx_64(dd - 118, ss - 118); memcpy_avx_64(dd - 64, ss - 64); break; 340 | case 247: memcpy_avx_128(dd - 247, ss - 247); 341 | case 119: memcpy_avx_64(dd - 119, ss - 119); memcpy_avx_64(dd - 64, ss - 64); break; 342 | case 248: memcpy_avx_128(dd - 248, ss - 248); 343 | case 120: memcpy_avx_64(dd - 120, ss - 120); memcpy_avx_64(dd - 64, ss - 64); break; 344 | case 249: memcpy_avx_128(dd - 249, ss - 
249); 345 | case 121: memcpy_avx_64(dd - 121, ss - 121); memcpy_avx_64(dd - 64, ss - 64); break; 346 | case 250: memcpy_avx_128(dd - 250, ss - 250); 347 | case 122: memcpy_avx_64(dd - 122, ss - 122); memcpy_avx_64(dd - 64, ss - 64); break; 348 | case 251: memcpy_avx_128(dd - 251, ss - 251); 349 | case 123: memcpy_avx_64(dd - 123, ss - 123); memcpy_avx_64(dd - 64, ss - 64); break; 350 | case 252: memcpy_avx_128(dd - 252, ss - 252); 351 | case 124: memcpy_avx_64(dd - 124, ss - 124); memcpy_avx_64(dd - 64, ss - 64); break; 352 | case 253: memcpy_avx_128(dd - 253, ss - 253); 353 | case 125: memcpy_avx_64(dd - 125, ss - 125); memcpy_avx_64(dd - 64, ss - 64); break; 354 | case 254: memcpy_avx_128(dd - 254, ss - 254); 355 | case 126: memcpy_avx_64(dd - 126, ss - 126); memcpy_avx_64(dd - 64, ss - 64); break; 356 | case 255: memcpy_avx_128(dd - 255, ss - 255); 357 | case 127: memcpy_avx_64(dd - 127, ss - 127); memcpy_avx_64(dd - 64, ss - 64); break; 358 | case 256: memcpy_avx_256(dd - 256, ss - 256); break; 359 | } 360 | 361 | return dst; 362 | } 363 | 364 | 365 | //--------------------------------------------------------------------- 366 | // main routine 367 | //--------------------------------------------------------------------- 368 | static void* memcpy_fast(void *destination, const void *source, size_t size) 369 | { 370 | unsigned char *dst = (unsigned char*)destination; 371 | const unsigned char *src = (const unsigned char*)source; 372 | static size_t cachesize = 0x200000; // L3-cache size 373 | size_t padding; 374 | 375 | // small memory copy 376 | if (size <= 256) { 377 | memcpy_tiny(dst, src, size); 378 | _mm256_zeroupper(); 379 | return destination; 380 | } 381 | 382 | // align destination to 16 bytes boundary 383 | padding = (32 - (((size_t)dst) & 31)) & 31; 384 | 385 | #if 0 386 | if (padding > 0) { 387 | __m256i head = _mm256_loadu_si256((const __m256i*)src); 388 | _mm256_storeu_si256((__m256i*)dst, head); 389 | dst += padding; 390 | src += padding; 391 | size -= padding; 392 | } 393 | #else 394 | __m256i head = _mm256_loadu_si256((const __m256i*)src); 395 | _mm256_storeu_si256((__m256i*)dst, head); 396 | dst += padding; 397 | src += padding; 398 | size -= padding; 399 | #endif 400 | 401 | // medium size copy 402 | if (size <= cachesize) { 403 | __m256i c0, c1, c2, c3, c4, c5, c6, c7; 404 | 405 | for (; size >= 256; size -= 256) { 406 | c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); 407 | c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); 408 | c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); 409 | c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); 410 | c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); 411 | c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); 412 | c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); 413 | c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); 414 | _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); 415 | src += 256; 416 | _mm256_storeu_si256((((__m256i*)dst) + 0), c0); 417 | _mm256_storeu_si256((((__m256i*)dst) + 1), c1); 418 | _mm256_storeu_si256((((__m256i*)dst) + 2), c2); 419 | _mm256_storeu_si256((((__m256i*)dst) + 3), c3); 420 | _mm256_storeu_si256((((__m256i*)dst) + 4), c4); 421 | _mm256_storeu_si256((((__m256i*)dst) + 5), c5); 422 | _mm256_storeu_si256((((__m256i*)dst) + 6), c6); 423 | _mm256_storeu_si256((((__m256i*)dst) + 7), c7); 424 | dst += 256; 425 | } 426 | } 427 | else { // big memory copy 428 | __m256i c0, c1, c2, c3, c4, c5, c6, c7; 429 | /* __m256i c0, c1, c2, c3, c4, c5, c6, c7; */ 430 | 431 | _mm_prefetch((const 
char*)(src), _MM_HINT_NTA); 432 | 433 | if ((((size_t)src) & 31) == 0) { // source aligned 434 | for (; size >= 256; size -= 256) { 435 | c0 = _mm256_load_si256(((const __m256i*)src) + 0); 436 | c1 = _mm256_load_si256(((const __m256i*)src) + 1); 437 | c2 = _mm256_load_si256(((const __m256i*)src) + 2); 438 | c3 = _mm256_load_si256(((const __m256i*)src) + 3); 439 | c4 = _mm256_load_si256(((const __m256i*)src) + 4); 440 | c5 = _mm256_load_si256(((const __m256i*)src) + 5); 441 | c6 = _mm256_load_si256(((const __m256i*)src) + 6); 442 | c7 = _mm256_load_si256(((const __m256i*)src) + 7); 443 | _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); 444 | src += 256; 445 | _mm256_stream_si256((((__m256i*)dst) + 0), c0); 446 | _mm256_stream_si256((((__m256i*)dst) + 1), c1); 447 | _mm256_stream_si256((((__m256i*)dst) + 2), c2); 448 | _mm256_stream_si256((((__m256i*)dst) + 3), c3); 449 | _mm256_stream_si256((((__m256i*)dst) + 4), c4); 450 | _mm256_stream_si256((((__m256i*)dst) + 5), c5); 451 | _mm256_stream_si256((((__m256i*)dst) + 6), c6); 452 | _mm256_stream_si256((((__m256i*)dst) + 7), c7); 453 | dst += 256; 454 | } 455 | } 456 | else { // source unaligned 457 | for (; size >= 256; size -= 256) { 458 | c0 = _mm256_loadu_si256(((const __m256i*)src) + 0); 459 | c1 = _mm256_loadu_si256(((const __m256i*)src) + 1); 460 | c2 = _mm256_loadu_si256(((const __m256i*)src) + 2); 461 | c3 = _mm256_loadu_si256(((const __m256i*)src) + 3); 462 | c4 = _mm256_loadu_si256(((const __m256i*)src) + 4); 463 | c5 = _mm256_loadu_si256(((const __m256i*)src) + 5); 464 | c6 = _mm256_loadu_si256(((const __m256i*)src) + 6); 465 | c7 = _mm256_loadu_si256(((const __m256i*)src) + 7); 466 | _mm_prefetch((const char*)(src + 512), _MM_HINT_NTA); 467 | src += 256; 468 | _mm256_stream_si256((((__m256i*)dst) + 0), c0); 469 | _mm256_stream_si256((((__m256i*)dst) + 1), c1); 470 | _mm256_stream_si256((((__m256i*)dst) + 2), c2); 471 | _mm256_stream_si256((((__m256i*)dst) + 3), c3); 472 | _mm256_stream_si256((((__m256i*)dst) + 4), c4); 473 | _mm256_stream_si256((((__m256i*)dst) + 5), c5); 474 | _mm256_stream_si256((((__m256i*)dst) + 6), c6); 475 | _mm256_stream_si256((((__m256i*)dst) + 7), c7); 476 | dst += 256; 477 | } 478 | } 479 | _mm_sfence(); 480 | } 481 | 482 | memcpy_tiny(dst, src, size); 483 | _mm256_zeroupper(); 484 | 485 | return destination; 486 | } 487 | 488 | 489 | #endif 490 | 491 | 492 | 493 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 Linwei 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | Build SSE
2 | =====
3 | 
4 | with gcc:
5 | > gcc -O3 -msse2 FastMemcpy.c -o FastMemcpy
6 | 
7 | with msvc:
8 | > cl -nologo -arch:SSE2 -O2 FastMemcpy.c
9 | 
10 | Build AVX
11 | =====
12 | 
13 | with gcc:
14 | > gcc -O3 -mavx FastMemcpy_Avx.c -o FastMemcpy_Avx
15 | 
16 | with msvc:
17 | > cl -nologo -arch:AVX -O2 FastMemcpy_Avx.c
18 | 
19 | Features
20 | ========
21 | 
22 | * 50% average speedup vs. the standard memcpy (tested with msvc 2012 and gcc 4.9)
23 | * small copies optimized with a jump table
24 | * medium copies optimized with SSE2 vector loads and stores
25 | * huge copies optimized with cache prefetch & movntdq (non-temporal stores)
26 | 
27 | Reference
28 | =========
29 | 
30 | [Using Block Prefetch for Optimized Memory Performance](http://files.rsdn.ru/23380/AMD_block_prefetch_paper.pdf)
31 | 
32 | The article only covers aligned, huge memory copies; you need to handle the other cases yourself.
33 | 
34 | 
35 | Results
36 | =======
37 | 
38 | ```
39 | result: gcc4.9 (msvc 2012 got a similar result):
40 | 
41 | benchmark(size=32 bytes, times=16777216):
42 | result(dst aligned, src aligned): memcpy_fast=81ms memcpy=281 ms
43 | result(dst aligned, src unalign): memcpy_fast=88ms memcpy=254 ms
44 | result(dst unalign, src aligned): memcpy_fast=87ms memcpy=245 ms
45 | result(dst unalign, src unalign): memcpy_fast=81ms memcpy=258 ms
46 | 
47 | benchmark(size=64 bytes, times=16777216):
48 | result(dst aligned, src aligned): memcpy_fast=91ms memcpy=364 ms
49 | result(dst aligned, src unalign): memcpy_fast=95ms memcpy=336 ms
50 | result(dst unalign, src aligned): memcpy_fast=96ms memcpy=353 ms
51 | result(dst unalign, src unalign): memcpy_fast=99ms memcpy=346 ms
52 | 
53 | benchmark(size=512 bytes, times=8388608):
54 | result(dst aligned, src aligned): memcpy_fast=124ms memcpy=242 ms
55 | result(dst aligned, src unalign): memcpy_fast=166ms memcpy=555 ms
56 | result(dst unalign, src aligned): memcpy_fast=168ms memcpy=602 ms
57 | result(dst unalign, src unalign): memcpy_fast=174ms memcpy=614 ms
58 | 
59 | benchmark(size=1024 bytes, times=4194304):
60 | result(dst aligned, src aligned): memcpy_fast=119ms memcpy=171 ms
61 | result(dst aligned, src unalign): memcpy_fast=182ms memcpy=442 ms
62 | result(dst unalign, src aligned): memcpy_fast=163ms memcpy=466 ms
63 | result(dst unalign, src unalign): memcpy_fast=168ms memcpy=472 ms
64 | 
65 | benchmark(size=4096 bytes, times=524288):
66 | result(dst aligned, src aligned): memcpy_fast=68ms memcpy=82 ms
67 | result(dst aligned, src unalign): memcpy_fast=94ms memcpy=226 ms
68 | result(dst unalign, src aligned): memcpy_fast=134ms memcpy=216 ms
69 | result(dst unalign, src unalign): memcpy_fast=84ms memcpy=188 ms
70 | 
71 | benchmark(size=8192 bytes, times=262144):
72 | result(dst aligned, src aligned): memcpy_fast=55ms memcpy=70 ms
73 | result(dst aligned, src unalign): memcpy_fast=75ms memcpy=192 ms
74 | result(dst unalign, src aligned): memcpy_fast=79ms memcpy=223 ms
75 | result(dst unalign, src unalign): memcpy_fast=91ms memcpy=219 ms
76 | 
77 | benchmark(size=1048576 bytes, times=2048):
78 | result(dst aligned, src aligned): memcpy_fast=181ms memcpy=165 ms
79 | result(dst aligned, src unalign): memcpy_fast=192ms memcpy=303 ms
80 | result(dst unalign, src aligned): memcpy_fast=218ms memcpy=310 ms
81 | result(dst unalign, src unalign): memcpy_fast=183ms memcpy=307 ms
82 | 
83 | benchmark(size=4194304 bytes, times=512):
84 | result(dst aligned, src aligned): memcpy_fast=263ms memcpy=398 ms
85 | result(dst aligned, src unalign): memcpy_fast=269ms memcpy=433 ms
86 | result(dst unalign, src aligned): memcpy_fast=306ms memcpy=497 ms
87 | result(dst unalign, src unalign): memcpy_fast=285ms memcpy=417 ms
88 | 
89 | benchmark(size=8388608 bytes, times=256):
90 | result(dst aligned, src aligned): memcpy_fast=287ms memcpy=421 ms
91 | result(dst aligned, src unalign): memcpy_fast=288ms memcpy=430 ms
92 | result(dst unalign, src aligned): memcpy_fast=285ms memcpy=510 ms
93 | result(dst unalign, src unalign): memcpy_fast=291ms memcpy=440 ms
94 | 
95 | benchmark random access:
96 | memcpy_fast=487ms memcpy=1000ms
97 | 
98 | ```
99 | 
100 | 
101 | About
102 | =====
103 | 
104 | skywind
105 | 
106 | http://www.skywind.me
107 | 
--------------------------------------------------------------------------------
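
A minimal usage sketch, assuming the build flags from the Build sections above and a source file of your own (the file and program names below are placeholders). memcpy_fast() is defined directly in the header, takes the same arguments as memcpy() and returns the destination pointer, so it can be used as a drop-in replacement — this mirrors how the bundled benchmark calls it.

```c
/* usage sketch -- assumes compilation per the README, e.g.
 *   gcc -O3 -msse2 main.c -o demo    (SSE2 build, FastMemcpy.h)
 *   gcc -O3 -mavx  main.c -o demo    (AVX build,  FastMemcpy_Avx.h)
 */
#include <stdio.h>
#include <string.h>
#include "FastMemcpy.h"        /* or "FastMemcpy_Avx.h" for the AVX build */

int main(void)
{
    char src[512];
    char dst[512];

    memset(src, 'x', sizeof(src));

    /* same signature as memcpy(): returns the destination pointer */
    memcpy_fast(dst, src, sizeof(src));

    printf("%.4s\n", dst);     /* prints "xxxx" */
    return 0;
}
```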