├── docs ├── hist.png ├── align.png ├── memcpy_r.png ├── memset_r.png ├── memcpy_bench.png ├── memset_bench.png └── annotated_glibc.txt ├── .gitignore ├── src ├── utils │ ├── CMakeLists.txt │ └── hist_tool.c ├── memcpy │ ├── CMakeLists.txt │ ├── impl.c │ ├── bench_memcpy.cc │ ├── impl.S │ ├── test_memcpy.cc │ └── folly.S └── memset │ ├── shims.c │ ├── CMakeLists.txt │ ├── test_memset.cc │ ├── bench_memset.cc │ ├── impl.S │ └── impl.c ├── include ├── decl.h ├── types.h └── utils.h ├── CMakeLists.txt └── README.md /docs/hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/hist.png -------------------------------------------------------------------------------- /docs/align.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/align.png -------------------------------------------------------------------------------- /docs/memcpy_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memcpy_r.png -------------------------------------------------------------------------------- /docs/memset_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memset_r.png -------------------------------------------------------------------------------- /docs/memcpy_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memcpy_bench.png -------------------------------------------------------------------------------- /docs/memset_bench.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memset_bench.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.swn 3 | *.swo 4 | *.swp 5 | *~ 6 | .DS_Store 7 | *.so 8 | *.dylib 9 | 10 | GPATH 11 | GRTAGS 12 | GTAGS 13 | tags 14 | 15 | compile_commands.json 16 | 17 | toolchain/ 18 | llvm-project/ 19 | gcc-project/ 20 | build*/ 21 | .vscode/ 22 | .vim/ 23 | .idea/ 24 | -------------------------------------------------------------------------------- /src/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(hist_tool SHARED 2 | hist_tool.c 3 | ) 4 | 5 | set_target_properties(hist_tool PROPERTIES 6 | VERSION ${PROJECT_VERSION} 7 | SOVERSION 1 8 | ) 9 | 10 | target_compile_options(hist_tool PRIVATE "-fno-builtin") 11 | 12 | install(TARGETS hist_tool LIBRARY DESTINATION bin) 13 | -------------------------------------------------------------------------------- /src/memcpy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_memcpy 2 | test_memcpy.cc 3 | folly.S 4 | impl.S 5 | impl.c 6 | ) 7 | 8 | target_link_libraries(test_memcpy PUBLIC) 9 | 10 | add_executable(bench_memcpy 11 | bench_memcpy.cc 12 | folly.S 13 | impl.S 14 | impl.c 15 | ) 16 | 17 | install(TARGETS bench_memcpy DESTINATION bin) 18 | install(TARGETS test_memcpy DESTINATION bin) 19 | 20 | -------------------------------------------------------------------------------- /src/memset/shims.c: -------------------------------------------------------------------------------- 1 | #include "decl.h" 2 | 3 | //////////////////////////////////////////////////////////////////////////////// 4 | /// This is a small utility that swaps the builtin call to memset with the 5 | /// local implementation of memset, implemented in this 
project. 6 | /// The shared object can be loaded using LD_PRELOAD (on Linux) or 7 | /// DYLD_INSERT_LIBRARIES (on Mac). 8 | //////////////////////////////////////////////////////////////////////////////// 9 | 10 | void *memset(void *s, int c, size_t n) { return local_memset(s, c, n); } 11 | -------------------------------------------------------------------------------- /src/memset/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(mem_shim SHARED 2 | shims.c 3 | impl.S 4 | impl.c 5 | ) 6 | 7 | set_target_properties(mem_shim PROPERTIES 8 | VERSION ${PROJECT_VERSION} 9 | SOVERSION 1 10 | ) 11 | 12 | add_executable(bench_memset 13 | bench_memset.cc 14 | impl.S 15 | impl.c 16 | ) 17 | 18 | add_executable(test_memset 19 | test_memset.cc 20 | impl.S 21 | impl.c 22 | ) 23 | 24 | target_link_libraries(bench_memset PUBLIC) 25 | target_link_libraries(test_memset PUBLIC) 26 | 27 | install(TARGETS bench_memset DESTINATION bin) 28 | install(TARGETS test_memset DESTINATION bin) 29 | install(TARGETS mem_shim LIBRARY DESTINATION bin) 30 | 31 | -------------------------------------------------------------------------------- /include/decl.h: -------------------------------------------------------------------------------- 1 | #ifndef DECLS 2 | #define DECLS 3 | 4 | #include 5 | 6 | #ifdef __cplusplus 7 | 8 | using memset_ty = void *(void *s, int c, size_t n); 9 | using memcpy_ty = void *(void *dest, const void *src, size_t n); 10 | 11 | extern "C" { 12 | #endif 13 | 14 | void *memcpy(void *dest, const void *src, size_t n); 15 | void *__folly_memcpy(void *dest, const void *src, size_t n); 16 | void *libc_memcpy(void *dest, const void *src, size_t n); 17 | void *local_memcpy(void *dest, const void *src, size_t n); 18 | void *asm_memcpy(void *dest, const void *src, size_t n); 19 | 20 | void *memset(void *s, int c, size_t n); 21 | void *libc_memset(void *s, int c, size_t n); 22 | void *local_memset(void *s, int c, size_t n); 
23 | void *asm_memset(void *s, int c, size_t n); 24 | void *musl_memset(void *s, int c, size_t n); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | #endif // DECLS 31 | -------------------------------------------------------------------------------- /include/types.h: -------------------------------------------------------------------------------- 1 | #ifndef TYPES 2 | #define TYPES 3 | 4 | #include 5 | 6 | #define NO_INLINE __attribute__((noinline)) 7 | 8 | #ifdef __clang__ 9 | typedef char char8 __attribute__((ext_vector_type(8), aligned(1))); 10 | typedef char char16 __attribute__((ext_vector_type(16), aligned(1))); 11 | typedef char char32 __attribute__((ext_vector_type(32), aligned(1))); 12 | typedef char char32a __attribute__((ext_vector_type(32), aligned(32))); 13 | 14 | #else 15 | // __GNUC__ 16 | typedef char char8 __attribute__((vector_size(8), aligned(1))); 17 | typedef char char16 __attribute__((vector_size(16), aligned(1))); 18 | typedef char char32 __attribute__((vector_size(32), aligned(1))); 19 | typedef char char32a __attribute__((vector_size(32), aligned(32))); 20 | #endif 21 | 22 | typedef uint32_t __attribute__((aligned(1))) u32; 23 | typedef uint64_t __attribute__((aligned(1))) u64; 24 | 25 | #endif // TYPES 26 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7) 2 | 3 | project(bpf_tracer VERSION 1.0.0 DESCRIPTION "Memset benchmarks") 4 | 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CXX_STANDARD_REQUIRED ON) 7 | set(CMAKE_CXX_EXTENSIONS OFF) 8 | 9 | # Export a JSON file with the compilation commands that external tools can use 10 | # to analyze the source code of the project. 
11 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 12 | 13 | enable_language(C ASM) 14 | 15 | # Disable exceptions 16 | SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-fno-rtti ") 17 | 18 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 19 | message(STATUS "No build type selected, default to Release") 20 | set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Build type (default RelWithDebInfo)" FORCE) 21 | endif() 22 | 23 | add_compile_options(-Wall -g3 -O3 -march=native) 24 | 25 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -march=native") 26 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -fno-omit-frame-pointer -O0") 27 | 28 | # Place all of the binaries in the build directory. 29 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 30 | set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 31 | 32 | include_directories(BEFORE 33 | ${CMAKE_CURRENT_BINARY_DIR}/include 34 | ${CMAKE_CURRENT_SOURCE_DIR}/include 35 | ) 36 | 37 | add_subdirectory(src/memset/) 38 | add_subdirectory(src/memcpy/) 39 | add_subdirectory(src/utils/) 40 | -------------------------------------------------------------------------------- /src/memcpy/impl.c: -------------------------------------------------------------------------------- 1 | #include "types.h" 2 | 3 | #include 4 | #include 5 | 6 | void *local_memcpy(void *dest, const void *src, size_t n) { 7 | char *d = (char *)dest; 8 | const char *s = (char *)src; 9 | 10 | if (n < 5) { 11 | if (n == 0) 12 | return dest; 13 | d[0] = s[0]; 14 | d[n - 1] = s[n - 1]; 15 | if (n <= 2) 16 | return dest; 17 | d[1] = s[1]; 18 | d[2] = s[2]; 19 | return dest; 20 | } 21 | 22 | if (n <= 16) { 23 | if (n >= 8) { 24 | const char *first_s = s; 25 | const char *last_s = s + n - 8; 26 | char *first_d = d; 27 | char *last_d = d + n - 8; 28 | *((u64 *)first_d) = *((u64 *)first_s); 29 | *((u64 *)last_d) = *((u64 *)last_s); 30 | return dest; 31 | } 32 | 33 | const char *first_s = s; 34 | const char *last_s = s + n - 4; 35 | 
char *first_d = d; 36 | char *last_d = d + n - 4; 37 | *((u32 *)first_d) = *((u32 *)first_s); 38 | *((u32 *)last_d) = *((u32 *)last_s); 39 | return dest; 40 | } 41 | 42 | if (n <= 32) { 43 | const char *first_s = s; 44 | const char *last_s = s + n - 16; 45 | char *first_d = d; 46 | char *last_d = d + n - 16; 47 | 48 | *((char16 *)first_d) = *((char16 *)first_s); 49 | *((char16 *)last_d) = *((char16 *)last_s); 50 | return dest; 51 | } 52 | 53 | const char *last_word_s = s + n - 32; 54 | char *last_word_d = d + n - 32; 55 | 56 | // Stamp the 32-byte chunks. 57 | do { 58 | *((char32 *)d) = *((char32 *)s); 59 | d += 32; 60 | s += 32; 61 | } while (d < last_word_d); 62 | 63 | // Stamp the last unaligned 32 bytes of the buffer. 64 | *((char32 *)last_word_d) = *((char32 *)last_word_s); 65 | return dest; 66 | } 67 | -------------------------------------------------------------------------------- /src/utils/hist_tool.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | //////////////////////////////////////////////////////////////////////////////// 7 | /// This is a small utility that records calls to some methods and creates a 8 | /// histogram of the lengths of calls to memset. It prints the histogram when 9 | /// the program is terminated. The shared object can be loaded using LD_PRELOAD 10 | /// (on Linux) or DYLD_INSERT_LIBRARIES (on Mac). 
11 | //////////////////////////////////////////////////////////////////////////////// 12 | 13 | uint32_t memset_len_dist[32] = { 14 | 0, 15 | }; 16 | uint32_t memcpy_len_dist[32] = { 17 | 0, 18 | }; 19 | uint32_t align_dist[32] = { 20 | 0, 21 | }; 22 | 23 | 24 | const int tab32[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 25 | 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 26 | 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; 27 | 28 | int log2_32(uint32_t value) { 29 | value |= value >> 1; 30 | value |= value >> 2; 31 | value |= value >> 4; 32 | value |= value >> 8; 33 | value |= value >> 16; 34 | return tab32[(uint32_t)(value * 0x07C4ACDD) >> 27]; 35 | } 36 | 37 | void __attribute__((destructor)) print_hitograms() { 38 | FILE *ff = fopen("/tmp/hist.txt", "a+"); 39 | if (!ff) { 40 | return; 41 | } 42 | pid_t pid = getpid(); 43 | 44 | fprintf(ff, "Histogram for (%d):\n", pid); 45 | fprintf(ff, "size, memset, memcpy, alignment:\n"); 46 | for (int i = 0; i < 32; i++) { 47 | fprintf(ff, "%d, %d, %d, %d,\n", i, memset_len_dist[i], memcpy_len_dist[i], align_dist[i]); 48 | } 49 | fclose(ff); 50 | } 51 | 52 | void *memcpy(void *dest, const void *src, size_t len) { 53 | memcpy_len_dist[log2_32(len)]++; 54 | align_dist[(unsigned long)dest % 32]++; 55 | align_dist[(unsigned long)src % 32]++; 56 | char *d = (char *)dest; 57 | char *s = (char *)src; 58 | for (size_t i = 0; i < len; i++) { 59 | d[i] = s[i]; 60 | } 61 | return dest; 62 | } 63 | 64 | void *memset(void *s, int c, size_t len) { 65 | memset_len_dist[log2_32(len)]++; 66 | align_dist[(unsigned long)s % 32]++; 67 | char *p = s; 68 | 69 | for (int i = 0; i < len; i++) { 70 | p[i] = c; 71 | } 72 | return s; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/memset/test_memset.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "decl.h" 6 | #include "utils.h" 7 | 8 | 
//////////////////////////////////////////////////////////////////////////////// 9 | // This is a small program that checks if some memset implementation is correct. 10 | // The tool currently checks libc, musl and the local implementation. 11 | //////////////////////////////////////////////////////////////////////////////// 12 | 13 | #define MAGIC_VALUE0 'X' 14 | #define MAGIC_VALUE1 'O' 15 | 16 | void print_buffer(const char *start, const char *end, char val, 17 | const char *ptr) { 18 | const char *it = start; 19 | while (it != end) { 20 | std::cout << *it; 21 | it++; 22 | } 23 | std::cout << "\n"; 24 | it = start; 25 | while (it != ptr) { 26 | std::cout << " "; 27 | it++; 28 | } 29 | std::cout << "^\n"; 30 | std::cout << "Filling a buffer of length " << end - start << "."; 31 | std::cout << " Expected \"" << val << "\" at index " << ptr - start << "\n"; 32 | } 33 | 34 | void assert_uniform_value(const char *start, const char *end, char val) { 35 | const char *ptr = start; 36 | while (ptr != end) { 37 | if (val != *ptr) { 38 | print_buffer(start, end, val, ptr); 39 | fflush(stdout); 40 | abort(); 41 | } 42 | ptr++; 43 | } 44 | } 45 | 46 | void test_impl(memset_ty handle, const std::string &name, unsigned chunk_size) { 47 | std::vector memory(chunk_size + 512, MAGIC_VALUE0); 48 | // Start mem-setting the array at different offsets. 49 | for (int offset = 0; offset < 128; offset++) { 50 | const char *buffer_start = &*memory.begin(); 51 | const char *buffer_end = &*memory.end(); 52 | 53 | const char *region_start = &memory[offset]; 54 | const char *region_end = region_start + chunk_size; 55 | 56 | assert_uniform_value(buffer_start, buffer_end, MAGIC_VALUE0); 57 | 58 | (handle)((void *)region_start, MAGIC_VALUE1, chunk_size); 59 | 60 | // Check the chunk. 61 | assert_uniform_value(region_start, region_end, MAGIC_VALUE1); 62 | // Check before chunk. 63 | assert_uniform_value(buffer_start, region_start, MAGIC_VALUE0); 64 | // Check after chunk. 
65 | assert_uniform_value(region_end, buffer_end, MAGIC_VALUE0); 66 | 67 | // Reset the buffer: 68 | std::fill(memory.begin(), memory.end(), MAGIC_VALUE0); 69 | assert_uniform_value(buffer_start, buffer_end, MAGIC_VALUE0); 70 | } 71 | } 72 | 73 | int main(int argc, char **argv) { 74 | std::cout << "Testing memset... \n"; 75 | 76 | #define TEST(FUNC, SIZE) test_impl(FUNC, #FUNC, SIZE); 77 | 78 | for (int i = 0; i < 1024; i++) { 79 | TEST(libc_memset, i); 80 | TEST(local_memset, i); 81 | TEST(musl_memset, i); 82 | TEST(asm_memset, i); 83 | } 84 | std::cout << "Done.\n"; 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /src/memset/bench_memset.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "decl.h" 9 | #include "utils.h" 10 | 11 | //////////////////////////////////////////////////////////////////////////////// 12 | // This is a small program that compares two memset implementations and records 13 | // the output in a csv file. 14 | //////////////////////////////////////////////////////////////////////////////// 15 | 16 | #define ITER (1000L * 1000L * 10L) 17 | #define SAMPLES (20) 18 | 19 | DoomRNG RNG; 20 | 21 | /// Measure a single implementation \p handle. 22 | uint64_t measure(memset_ty handle, unsigned size, unsigned align, 23 | unsigned offset, void *ptr) { 24 | Stopwatch T; 25 | for (unsigned i = 0; i < SAMPLES; i++) { 26 | T.start(); 27 | for (size_t j = 0; j < ITER; j++) { 28 | (handle)(ptr, 0, size); 29 | } 30 | T.stop(); 31 | } 32 | return T.get_median(); 33 | } 34 | 35 | // Allocate memory and benchmark each implementation at a specific size \p size. 
36 | void bench_impl(const std::vector &toTest, unsigned size, 37 | unsigned align, unsigned offset) { 38 | std::vector memory(size + 256, 0); 39 | void *ptr = align_pointer(&memory[0], align, offset); 40 | 41 | std::cout << size << ", "; 42 | for (auto handle : toTest) { 43 | u_int64_t res = measure(handle, size, align, offset, ptr); 44 | std::cout << res << ", "; 45 | } 46 | std::cout << std::endl; 47 | } 48 | 49 | /// Try to allocate buffers at random offsets and in random sizes. 50 | /// The sizes and the offsets are in the range 0..256. 51 | void bench_rand_range(const std::vector &toTest) { 52 | std::vector memory(1024, 0); 53 | void *ptr = &memory[0]; 54 | 55 | for (auto handle : toTest) { 56 | Stopwatch T; 57 | sleep(1); 58 | for (unsigned i = 0; i < SAMPLES; i++) { 59 | RNG.rand_reset(); 60 | T.start(); 61 | for (size_t j = 0; j < ITER; j++) { 62 | (handle)((char *)ptr + RNG.next_u8_random(), 0, RNG.next_u8_random()); 63 | } 64 | T.stop(); 65 | } 66 | 67 | std::cout << T.get_median() << ", "; 68 | } 69 | std::cout << std::endl; 70 | } 71 | 72 | // To measure the call overhead. 
73 | void *nop(void *s, int c, size_t n) { return s; } 74 | 75 | int main(int argc, char **argv) { 76 | std::cout << std::setprecision(3); 77 | std::cout << std::fixed; 78 | 79 | std::vector toTest = {musl_memset, libc_memset, &memset, 80 | local_memset, asm_memset, &nop}; 81 | 82 | std::cout << "Batches of random sizes:\n"; 83 | std::cout << " musl, libc@plt, libc, c_memset, asm_memset, nop,\n"; 84 | bench_rand_range(toTest); 85 | 86 | std::cout << "\nFixed size:\n"; 87 | std::cout << "size, musl, libc@plt, libc, c_memset, asm_memset, nop,\n"; 88 | 89 | for (int i = 0; i < 512; i++) { 90 | bench_impl(toTest, i, 16, 0); 91 | } 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /src/memcpy/bench_memcpy.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "decl.h" 9 | #include "utils.h" 10 | 11 | //////////////////////////////////////////////////////////////////////////////// 12 | // This is a small program that compares two memcpy implementations and records 13 | // the output in a csv file. 14 | //////////////////////////////////////////////////////////////////////////////// 15 | 16 | #define ITER (1000L * 1000L * 10L) 17 | #define SAMPLES (20) 18 | 19 | DoomRNG RNG; 20 | 21 | /// Measure a single implementation \p handle. 22 | uint64_t measure(memcpy_ty handle, void *dest, void *src, unsigned size) { 23 | Stopwatch T; 24 | for (unsigned i = 0; i < SAMPLES; i++) { 25 | T.start(); 26 | for (size_t j = 0; j < ITER; j++) { 27 | (handle)(dest, src, size); 28 | } 29 | T.stop(); 30 | } 31 | return T.get_median(); 32 | } 33 | 34 | // Allocate memory and benchmark each implementation at a specific size \p size. 
35 | void bench_impl(const std::vector &toTest, unsigned size, 36 | unsigned align, unsigned offset) { 37 | std::vector dest(size + 256, 0); 38 | std::vector src(size + 256, 0); 39 | 40 | char *src_ptr = (char *)align_pointer(&src[0], align, offset); 41 | char *dest_ptr = (char *)align_pointer(&dest[0], align, offset); 42 | 43 | std::cout << size << ", "; 44 | for (auto handle : toTest) { 45 | u_int64_t res = measure(handle, dest_ptr, src_ptr, size); 46 | std::cout << res << ", "; 47 | } 48 | std::cout << std::endl; 49 | } 50 | 51 | /// Allocate and copy buffers at random offsets and in random sizes. 52 | /// The sizes and the offsets are in the range 0..256. 53 | void bench_rand_range(const std::vector &toTest) { 54 | std::vector dest(4096, 1); 55 | std::vector src(4096, 0); 56 | const char *src_p = &src[0]; 57 | char *dest_p = &dest[0]; 58 | 59 | for (auto handle : toTest) { 60 | Stopwatch T; 61 | sleep(1); 62 | for (unsigned i = 0; i < SAMPLES; i++) { 63 | RNG.rand_reset(); 64 | T.start(); 65 | for (size_t j = 0; j < ITER; j++) { 66 | char *to = dest_p + RNG.next_u8_random(); 67 | const char *from = src_p + RNG.next_u8_random(); 68 | (handle)(to, from, RNG.next_u8_random()); 69 | } 70 | T.stop(); 71 | } 72 | 73 | std::cout << T.get_median() << ", "; 74 | } 75 | std::cout << std::endl; 76 | } 77 | 78 | // To measure the call overhead. 
79 | void *nop(void *dest, const void *src, size_t n) { return dest; } 80 | 81 | int main(int argc, char **argv) { 82 | std::cout << std::setprecision(3); 83 | std::cout << std::fixed; 84 | 85 | std::vector toTest = { 86 | &libc_memcpy, &memcpy, &__folly_memcpy, &local_memcpy, &asm_memcpy, &nop}; 87 | 88 | std::cout << "Batches of random sizes:\n"; 89 | std::cout << "libc@plt, libc, folly, c_memcpy, asm_memcpy, nop,\n"; 90 | 91 | bench_rand_range(toTest); 92 | 93 | std::cout << "\nFixed size:\n"; 94 | std::cout << "size, libc@plt, libc, folly, c_memcpy, asm_memcpy, nop,\n"; 95 | 96 | for (int i = 0; i < 512; i++) { 97 | bench_impl(toTest, i, 16, 0); 98 | } 99 | 100 | return 0; 101 | } 102 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "types.h" 9 | 10 | /// Aligns the pointer \p ptr, to alignment \p alignment and offset \p offset 11 | /// within the word. 12 | void *align_pointer(void *ptr, unsigned alignment, unsigned offset) { 13 | size_t p = (size_t)ptr; 14 | while (p % alignment) 15 | ++p; 16 | return (void *)(p + (size_t)offset); 17 | } 18 | 19 | using time_point = std::chrono::steady_clock::time_point; 20 | 21 | class Stopwatch { 22 | /// The time of the last sample; 23 | time_point begin_; 24 | /// A list of recorded intervals. 
25 | std::vector intervals_; 26 | 27 | public: 28 | NO_INLINE 29 | Stopwatch() : begin_() {} 30 | 31 | NO_INLINE 32 | void start() { begin_ = std::chrono::steady_clock::now(); } 33 | 34 | NO_INLINE 35 | void stop() { 36 | time_point end = std::chrono::steady_clock::now(); 37 | uint64_t interval = 38 | std::chrono::duration_cast(end - begin_) 39 | .count(); 40 | intervals_.push_back(interval); 41 | } 42 | 43 | NO_INLINE 44 | uint64_t get_median() { 45 | std::sort(intervals_.begin(), intervals_.end()); 46 | return intervals_[intervals_.size() / 2]; 47 | } 48 | }; 49 | 50 | uint8_t random_bytes[320] = { 51 | 227, 138, 244, 198, 73, 247, 185, 248, 229, 75, 24, 215, 159, 230, 136, 52 | 246, 200, 144, 65, 67, 109, 86, 118, 61, 209, 103, 188, 213, 187, 8, 53 | 210, 121, 214, 178, 232, 59, 153, 92, 209, 239, 44, 85, 156, 172, 237, 54 | 41, 150, 195, 247, 202, 249, 142, 208, 133, 21, 204, 114, 38, 51, 150, 55 | 194, 46, 184, 138, 50, 250, 190, 180, 161, 5, 211, 191, 62, 137, 142, 56 | 122, 63, 72, 233, 125, 189, 51, 238, 51, 116, 10, 44, 18, 240, 41, 57 | 157, 81, 183, 252, 214, 17, 81, 12, 44, 119, 77, 97, 101, 80, 106, 58 | 128, 190, 89, 160, 104, 244, 192, 46, 69, 73, 255, 45, 213, 190, 86, 59 | 18, 89, 34, 46, 134, 145, 166, 128, 87, 97, 192, 71, 105, 94, 51, 60 | 30, 7, 9, 0, 40, 0, 187, 205, 189, 151, 159, 107, 105, 180, 182, 61 | 233, 52, 209, 108, 186, 31, 184, 254, 170, 71, 162, 31, 80, 226, 75, 62 | 125, 214, 125, 247, 197, 149, 132, 247, 157, 253, 101, 107, 1, 127, 236, 63 | 249, 242, 152, 169, 123, 240, 129, 230, 135, 25, 57, 227, 130, 189, 76, 64 | 254, 33, 193, 39, 82, 177, 143, 31, 17, 20, 195, 219, 165, 171, 198, 65 | 125, 119, 216, 143, 55, 210, 17, 88, 150, 126, 38, 160, 71, 214, 10, 66 | 162, 158, 6, 234, 233, 119, 221, 167, 62, 146, 50, 150, 176, 142, 167, 67 | 201, 250, 195, 26, 156, 96, 36, 177, 95, 23, 7, 63, 55, 142, 80, 68 | 227, 73, 124, 93, 211, 231, 166, 182, 57, 145, 55, 242, 213, 246, 30, 69 | 146, 247, 19, 229, 34, 210, 37, 147, 242, 103, 125, 
91, 171, 51, 22, 70 | 126, 248, 149, 19, 60, 89, 5, 241, 132, 72, 217, 195, 11, 173, 247, 71 | 47, 144, 222, 94, 51, 166, 192, 50, 109, 62, 42, 126, 111, 204, 141, 72 | 66, 73 | }; 74 | 75 | /// Implements a doom-style random number generator. 76 | struct DoomRNG { 77 | // Points to the current random number. 78 | unsigned rand_curr = 0; 79 | 80 | void rand_reset() { rand_curr = 0; } 81 | 82 | uint8_t next_u8_random() { return random_bytes[rand_curr++ % 320]; } 83 | }; 84 | 85 | #endif // UTILS_H 86 | -------------------------------------------------------------------------------- /docs/annotated_glibc.txt: -------------------------------------------------------------------------------- 1 | <+0>: endbr64 2 | <+4>: vmovd %esi, %xmm0 3 | <+8>: movq %rdi, %rax 4 | <+11>: vpbroadcastb %xmm0, %ymm0 5 | <+16>: cmpq $0x20, %rdx 6 | <+20>: jb 0xBELOW_32____ ; <+190> 7 | <+26>: cmpq $0x40, %rdx 8 | <+30>: ja 0xABOVE_64____ ; <+46> 9 | <+32>: vmovdqu %ymm0, -0x20(%rdi,%rdx) 10 | <+38>: vmovdqu %ymm0, (%rdi) 11 | <+42>: vzeroupper 12 | <+45>: retq 13 | 0xABOVE_64____ <+46>: cmpq $0x800, %rdx ; imm = 0x800 14 | <+53>: ja 0xABOVE_2048__ ; ___lldb_unnamed_symbol1097$$libc.so.6 + 4 15 | <+55>: cmpq $0x80, %rdx 16 | <+62>: ja 0xABOVE_128___ ; <+89> 17 | 0xSZ_64_TO_128 <+64>: vmovdqu %ymm0, (%rdi) 18 | <+68>: vmovdqu %ymm0, 0x20(%rdi) 19 | <+73>: vmovdqu %ymm0, -0x20(%rdi,%rdx) 20 | <+79>: vmovdqu %ymm0, -0x40(%rdi,%rdx) 21 | 0xEXIT_EXIT___ <+85>: vzeroupper 22 | <+88>: retq 23 | 0xABOVE_128___ <+89>: leaq 0x80(%rdi), %rcx 24 | <+96>: vmovdqu %ymm0, (%rdi) 25 | <+100>: andq $-0x80, %rcx 26 | <+104>: vmovdqu %ymm0, -0x20(%rdi,%rdx) 27 | <+110>: vmovdqu %ymm0, 0x20(%rdi) 28 | <+115>: vmovdqu %ymm0, -0x40(%rdi,%rdx) 29 | <+121>: vmovdqu %ymm0, 0x40(%rdi) 30 | <+126>: vmovdqu %ymm0, -0x60(%rdi,%rdx) 31 | <+132>: vmovdqu %ymm0, 0x60(%rdi) 32 | <+137>: vmovdqu %ymm0, -0x80(%rdi,%rdx) 33 | <+143>: addq %rdi, %rdx 34 | <+146>: andq $-0x80, %rdx 35 | <+150>: cmpq %rdx, %rcx 36 | <+153>: je 
0xEXIT_EXIT___ ; <+85> 37 | 0xLOOP_4x32B__ <+155>: vmovdqa %ymm0, (%rcx) 38 | <+159>: vmovdqa %ymm0, 0x20(%rcx) 39 | <+164>: vmovdqa %ymm0, 0x40(%rcx) 40 | <+169>: vmovdqa %ymm0, 0x60(%rcx) 41 | <+174>: addq $0x80, %rcx 42 | <+181>: cmpq %rcx, %rdx 43 | <+184>: jne 0xLOOP_4x32B__ ; <+155> 44 | <+186>: vzeroupper 45 | <+189>: retq 46 | 0xBELOW_32____ <+190>: cmpb $0x10, %dl 47 | <+193>: jae 0xBELOW_16____ ; <+223> 48 | <+195>: vmovq %xmm0, %rcx 49 | <+200>: cmpb $0x8, %dl 50 | <+203>: jae 0xABOVE_8_____ ; <+237> 51 | <+205>: cmpb $0x4, %dl 52 | <+208>: jae 0xABOVE_4_____ ; <+249> 53 | <+210>: cmpb $0x1, %dl 54 | <+213>: ja 0xABOVE_1_____ ; <+259> 55 | <+215>: jb 0xIS_ZERO_CASE ; <+219> 56 | <+217>: movb %cl, (%rdi) 57 | 0xIS_ZERO_CASE <+219>: vzeroupper 58 | <+222>: retq 59 | 0xBELOW_16____ <+223>: vmovdqu %xmm0, -0x10(%rdi,%rdx) 60 | <+229>: vmovdqu %xmm0, (%rdi) 61 | <+233>: vzeroupper 62 | <+236>: retq 63 | 0xABOVE_8_____ <+237>: movq %rcx, -0x8(%rdi,%rdx) 64 | <+242>: movq %rcx, (%rdi) 65 | <+245>: vzeroupper 66 | <+248>: retq 67 | 0xABOVE_4____ <+249>: movl %ecx, -0x4(%rdi,%rdx) 68 | <+253>: movl %ecx, (%rdi) 69 | <+255>: vzeroupper 70 | <+258>: retq 71 | 0xABOVE_1_____ <+259>: movw %cx, -0x2(%rdi,%rdx) 72 | <+264>: movw %cx, (%rdi) 73 | <+267>: vzeroupper 74 | <+270>: retq 75 | <+271>: nop 76 | 77 | -------------------------------------------------------------------------------- /src/memcpy/impl.S: -------------------------------------------------------------------------------- 1 | #if defined(__APPLE__) 2 | .text 3 | .global _libc_memcpy 4 | .p2align 4, 0x90 5 | _libc_memcpy: 6 | jmp _memcpy 7 | 8 | #else 9 | 10 | .text 11 | .global libc_memcpy 12 | .p2align 4, 0x90 13 | libc_memcpy: 14 | jmp memcpy 15 | #endif 16 | 17 | #define LABEL(x) .L##x 18 | #if defined(__APPLE__) 19 | .text 20 | .global _asm_memcpy 21 | .p2align 5, 0x90 22 | _asm_memcpy: 23 | #else 24 | .text 25 | .global asm_memcpy 26 | .p2align 5, 0x90 27 | asm_memcpy: 28 | #endif 29 | 30 | // RDI 
is the dest 31 | // RSI is the src 32 | // RDX is length 33 | mov %rdi, %rax 34 | cmp $64,%rdx 35 | ja LABEL(over_64) 36 | cmp $16,%rdx 37 | jae LABEL(16_to_64) 38 | 39 | LABEL(below_16): 40 | cmp $4,%rdx 41 | jbe LABEL(0_to_4) 42 | cmp $8,%rdx 43 | jbe LABEL(in_4_to_8) 44 | LABEL(8_to_16): 45 | movq (%rsi), %rcx 46 | movq %rcx, (%rax) 47 | movq -8(%rsi,%rdx), %rcx 48 | movq %rcx, -8(%rax,%rdx) 49 | retq 50 | 51 | LABEL(0_to_4): 52 | // Copy the first two bytes: 53 | cmp $0,%rdx 54 | je LABEL(exit) 55 | movb (%rsi), %cl 56 | movb %cl, (%rdi) 57 | movb -1(%rsi,%rdx), %cl 58 | movb %cl, -1(%rdi,%rdx) 59 | cmp $2,%rdx 60 | jbe LABEL(exit) 61 | // Copy the second two bytes, if n > 2. 62 | movb 1(%rsi), %cl 63 | movb %cl, 1(%rdi) 64 | movb 2(%rsi), %cl 65 | movb %cl, 2(%rdi) 66 | retq 67 | LABEL(in_4_to_8): 68 | movl (%rsi), %ecx 69 | movl %ecx, (%rdi) 70 | movl -4(%rsi,%rdx), %ecx 71 | movl %ecx, -4(%rdi,%rdx) 72 | LABEL(exit): 73 | retq 74 | 75 | LABEL(16_to_64): 76 | cmp $32, %rdx 77 | jbe LABEL(16_to_32) 78 | 79 | LABEL(32_to_64): 80 | vmovdqu (%rsi), %ymm0 81 | vmovdqu %ymm0, (%rdi) 82 | vmovdqu -32(%rsi,%rdx), %ymm0 83 | vmovdqu %ymm0, -32(%rdi,%rdx) 84 | vzeroupper 85 | retq 86 | 87 | LABEL(16_to_32): 88 | movups (%rsi), %xmm0 89 | movups %xmm0, (%rdi) 90 | movups -16(%rsi,%rdx), %xmm0 91 | movups %xmm0, -16(%rdi,%rdx) 92 | retq 93 | 94 | // Handle buffers over 64 bytes: 95 | LABEL(over_64): 96 | cmp $128, %rdx 97 | ja LABEL(over_128) 98 | 99 | // Copy the last wide word. 100 | vmovups -32(%rsi,%rdx), %ymm0 101 | 102 | // Handle cases in the range 64 to 128. This is two unconditional 103 | // stores (64), 1 conditional store (32), and the one 32 byte store at 104 | // the end. 
105 | vmovups (%rsi), %ymm1 106 | vmovups 32(%rsi), %ymm2 107 | 108 | cmp $96, %rdx 109 | jbe LABEL(64_to_128_done) 110 | vmovups 64(%rsi), %ymm3 111 | vmovups %ymm3, 64(%rax) 112 | 113 | .align 4 114 | LABEL(64_to_128_done): 115 | vmovups %ymm1, (%rax) 116 | vmovups %ymm2, 32(%rax) 117 | // Store the last wide word. 118 | vmovups %ymm0, -32(%rax,%rdx) 119 | vzeroupper 120 | retq 121 | 122 | LABEL(over_128): 123 | // Compute the last writeable destination. 124 | lea -128(%rdx), %rcx 125 | xor %r8, %r8 126 | .align 16 127 | LABEL(over_128_copy_loop): 128 | vmovdqu (%rsi, %r8), %ymm0 129 | vmovdqu 32(%rsi, %r8), %ymm1 130 | vmovdqu 64(%rsi, %r8), %ymm2 131 | vmovdqu 96(%rsi, %r8), %ymm3 132 | vmovdqu %ymm0, (%rdi, %r8) 133 | vmovdqu %ymm1, 32(%rdi, %r8) 134 | vmovdqu %ymm2, 64(%rdi, %r8) 135 | vmovdqu %ymm3, 96(%rdi, %r8) 136 | add $128, %r8 137 | cmp %rcx, %r8 138 | jb LABEL(over_128_copy_loop) 139 | 140 | // Handle the tail: 141 | lea -32(%rdx), %rcx 142 | cmp %r8, %rcx 143 | jb LABEL(over_128_done) 144 | vmovdqu (%rsi, %r8), %ymm0 145 | vmovdqu %ymm0, (%rdi, %r8) 146 | add $32, %r8 147 | 148 | cmp %r8, %rcx 149 | jb LABEL(over_128_done) 150 | vmovdqu (%rsi, %r8), %ymm0 151 | vmovdqu %ymm0, (%rdi, %r8) 152 | add $32, %r8 153 | 154 | cmp %r8, %rcx 155 | jb LABEL(over_128_done) 156 | vmovdqu (%rsi, %r8), %ymm0 157 | vmovdqu %ymm0, (%rdi, %r8) 158 | 159 | LABEL(over_128_done): 160 | // Copy the last 32 bytes 161 | vmovdqu -32(%rsi, %rdx), %ymm0 162 | vmovdqu %ymm0, -32(%rdi, %rdx) 163 | 164 | vzeroupper 165 | retq 166 | -------------------------------------------------------------------------------- /src/memcpy/test_memcpy.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "decl.h" 6 | #include "utils.h" 7 | 8 | //////////////////////////////////////////////////////////////////////////////// 9 | // This is a small program that checks if some memcpy implementation is correct. 
10 | //////////////////////////////////////////////////////////////////////////////// 11 | 12 | #define MAGIC_VALUE0 '#' 13 | #define MAGIC_VALUE1 '=' 14 | 15 | void print_buffer(const char *start, const char *end, char val, 16 | const char *ptr) { 17 | const char *it = start; 18 | while (it != end) { 19 | std::cout << *it; 20 | it++; 21 | } 22 | std::cout << "\n"; 23 | it = start; 24 | while (it != ptr) { 25 | std::cout << " "; 26 | it++; 27 | } 28 | std::cout << "^\n"; 29 | std::cout << "Filling a buffer of length " << end - start << "."; 30 | std::cout << " Expected \"" << val << "\" at index " << ptr - start 31 | << std::endl; 32 | } 33 | 34 | void print_buffer_match(const char *start0, const char *start1, size_t len, 35 | size_t error_at) { 36 | 37 | for (size_t i = 0; i < len; i++) { 38 | std::cout << start0[i]; 39 | } 40 | std::cout << "\n"; 41 | for (size_t i = 0; i < len; i++) { 42 | std::cout << start1[i]; 43 | } 44 | std::cout << "\n"; 45 | 46 | for (size_t i = 0; i < error_at; i++) { 47 | std::cout << " "; 48 | } 49 | std::cout << "^\n"; 50 | std::cout << "Comparing buffers of length " << len << "."; 51 | std::cout << " Invalid value at index " << error_at << "." << std::endl; 52 | } 53 | 54 | // Make sure that the whole buffer, from \p start to \p end, is set to \p val. 55 | void assert_uniform_value(const char *start, const char *end, char val) { 56 | const char *ptr = start; 57 | while (ptr != end) { 58 | if (val != *ptr) { 59 | print_buffer(start, end, val, ptr); 60 | abort(); 61 | } 62 | ptr++; 63 | } 64 | } 65 | 66 | // Make sure that two buffers contain the same memory content. 
67 | void assert_buffers_match(const char *buff1, const char *buff2, size_t len) { 68 | for (size_t i = 0; i < len; i++) { 69 | if (buff1[i] != buff2[i]) { 70 | print_buffer_match(buff1, buff2, len, i); 71 | abort(); 72 | } 73 | } 74 | } 75 | 76 | void test_impl(memcpy_ty handle, const std::string &name, unsigned chunk_size) { 77 | std::vector src(chunk_size + 512); 78 | std::vector dest(chunk_size + 512, MAGIC_VALUE0); 79 | 80 | // Fill the buffer with a running counter of printable chars. 81 | for (unsigned i = 0; i < src.size(); i++) { 82 | src[i] = 'A' + (i % 26); 83 | } 84 | 85 | // Start copying memory at different offsets. 86 | for (int src_offset = 0; src_offset < 32; src_offset++) { 87 | for (int dest_offset = 0; dest_offset < 32; dest_offset++) { 88 | const char *dest_start = &*dest.begin(); 89 | const char *dest_end = &*dest.end(); 90 | 91 | const char *src_region_start = &src[src_offset]; 92 | char *dest_region_start = &dest[dest_offset]; 93 | char *dest_region_end = &dest[dest_offset + chunk_size]; 94 | 95 | void *res = 96 | (handle)((void *)dest_region_start, src_region_start, chunk_size); 97 | if (res != dest_region_start) { 98 | std::cout << "Invalid return value." << std::endl; 99 | abort(); 100 | } 101 | 102 | // Check the chunk. 103 | assert_buffers_match(dest_region_start, src_region_start, chunk_size); 104 | // Check before chunk. 105 | assert_uniform_value(dest_start, dest_region_start, MAGIC_VALUE0); 106 | // Check after chunk. 107 | assert_uniform_value(dest_region_end, dest_end, MAGIC_VALUE0); 108 | 109 | // Reset the dest buffer: 110 | std::fill(dest.begin(), dest.end(), MAGIC_VALUE0); 111 | } 112 | } 113 | } 114 | 115 | int main(int argc, char **argv) { 116 | std::cout << "Testing memcpy... 
\n"; 117 | 118 | #define TEST(FUNC, SIZE) test_impl(FUNC, #FUNC, SIZE); 119 | 120 | for (int i = 0; i < 1024; i++) { 121 | TEST(&memcpy, i); 122 | TEST(&__folly_memcpy, i); 123 | TEST(&local_memcpy, i); 124 | TEST(&asm_memcpy, i); 125 | } 126 | 127 | std::cout << "Done.\n"; 128 | 129 | return 0; 130 | } 131 | -------------------------------------------------------------------------------- /src/memset/impl.S: -------------------------------------------------------------------------------- 1 | #if defined(__APPLE__) 2 | .text 3 | .global _libc_memset 4 | .p2align 4, 0x90 5 | _libc_memset: 6 | jmp _memset 7 | 8 | #else 9 | 10 | .text 11 | .global libc_memset 12 | .p2align 4, 0x90 13 | libc_memset: 14 | jmp memset 15 | #endif 16 | 17 | #define LABEL(x) .L##x 18 | #if defined(__APPLE__) 19 | .text 20 | .global _asm_memset 21 | .p2align 5, 0x90 22 | _asm_memset: 23 | #else 24 | .text 25 | .global asm_memset 26 | .p2align 5, 0x90 27 | asm_memset: 28 | #endif 29 | 30 | // RDI is the buffer 31 | // RSI is the value 32 | // RDX is length 33 | vmovd %esi, %xmm0 34 | vpbroadcastb %xmm0,%ymm0 35 | mov %rdi,%rax 36 | cmp $0x40,%rdx 37 | jae LABEL(above_64) 38 | LABEL(below_64): 39 | cmp $0x20, %rdx 40 | jb LABEL(below_32) 41 | vmovdqu %ymm0,(%rdi) 42 | vmovdqu %ymm0,-0x20(%rdi,%rdx) 43 | vzeroupper 44 | retq 45 | LABEL(below_32): 46 | cmp $0x10, %rdx 47 | jae LABEL(in_16_to_32) 48 | LABEL(below_16): 49 | cmp $0x4, %rdx 50 | jbe LABEL(below_4) 51 | LABEL(in_4_to_16): 52 | // Scalar stores from this point. 53 | vmovq %xmm0, %rsi 54 | cmp $0x7, %rdx 55 | jbe LABEL(in_4_to_8) 56 | // two 8-wide stores, up to 16 bytes. 
57 | mov %rsi, -0x8(%rdi, %rdx) 58 | mov %rsi,(%rdi) 59 | vzeroupper 60 | retq 61 | .align 4 62 | LABEL(below_4): 63 | test %rdx, %rdx 64 | je LABEL(exit) 65 | mov %sil, (%rdi) 66 | mov %sil, -0x1(%rdi,%rdx) 67 | cmp $0x2, %rdx 68 | jbe LABEL(exit) 69 | mov %sil, 0x1(%rdi) 70 | mov %sil, 0x2(%rdi) 71 | mov %rdi,%rax 72 | .align 4 73 | LABEL(exit): 74 | vzeroupper 75 | retq 76 | LABEL(in_4_to_8): 77 | // two 4-wide stores, upto 8 bytes. 78 | mov %esi,-0x4(%rdi,%rdx) 79 | mov %esi,(%rdi) 80 | vzeroupper 81 | retq 82 | LABEL(in_16_to_32): 83 | vmovups %xmm0,(%rdi) 84 | vmovups %xmm0,-0x10(%rdi,%rdx) 85 | vzeroupper 86 | retq 87 | LABEL(above_64): 88 | cmp $0xb0, %rdx 89 | ja LABEL(above_192) 90 | cmp $0x80, %rdx 91 | jbe LABEL(in_64_to_128) 92 | // Do some work filling unaligned 32bit words. 93 | // last_word -> rsi 94 | lea -0x20(%rdi,%rdx),%rsi 95 | // rcx -> fill pointer. 96 | 97 | // We have at least 128 bytes to store. 98 | vmovdqu %ymm0,(%rdi) 99 | vmovdqu %ymm0, 0x20(%rdi) 100 | vmovdqu %ymm0, 0x40(%rdi) 101 | add $0x60,%rdi 102 | .align 8 103 | LABEL(fill_32): 104 | vmovdqu %ymm0,(%rdi) 105 | add $0x20,%rdi 106 | cmp %rdi,%rsi 107 | ja LABEL(fill_32) 108 | // Stamp the last unaligned store. 109 | vmovdqu %ymm0,(%rsi) 110 | vzeroupper 111 | retq 112 | LABEL(in_64_to_128): 113 | // last_word -> rsi 114 | vmovdqu %ymm0,(%rdi) 115 | vmovdqu %ymm0, 0x20(%rdi) 116 | vmovdqu %ymm0,-0x40(%rdi,%rdx) 117 | vmovdqu %ymm0,-0x20(%rdi,%rdx) 118 | vzeroupper 119 | retq 120 | 121 | LABEL(above_192): 122 | // rdi is the buffer address 123 | // rsi is the value 124 | // rdx is length 125 | // Store the first unaligned 32 bytes. 126 | vmovdqu %ymm0,(%rdi) 127 | 128 | // The first aligned word is stored in %rsi. 129 | mov %rdi,%rsi 130 | and $0xffffffffffffffe0,%rsi 131 | lea 0x20(%rsi),%rsi 132 | 133 | // Compute the address of the last unaligned word into rdi. 134 | lea -0x20(%rdx), %rdx 135 | add %rdx, %rdi 136 | 137 | // Check if we can do a full 5x32B stamp. 
138 | lea 0xa0(%rsi),%rcx 139 | cmp %rcx, %rdi 140 | jb LABEL(stamp_4) 141 | .align 8 142 | LABEL(fill_192): 143 | vmovdqa %ymm0,(%rsi) 144 | vmovdqa %ymm0,0x20(%rsi) 145 | vmovdqa %ymm0,0x40(%rsi) 146 | vmovdqa %ymm0,0x60(%rsi) 147 | vmovdqa %ymm0,0x80(%rsi) 148 | add $0xa0, %rsi 149 | lea 0xa0(%rsi),%rcx 150 | cmp %rcx, %rdi 151 | ja LABEL(fill_192) 152 | 153 | LABEL(fill_192_tail): 154 | cmp %rsi, %rdi 155 | jb LABEL(fill_192_done) 156 | vmovdqa %ymm0, (%rsi) 157 | 158 | lea 0x20(%rsi),%rcx 159 | cmp %rcx, %rdi 160 | jb LABEL(fill_192_done) 161 | vmovdqa %ymm0, 0x20(%rsi) 162 | 163 | lea 0x40(%rsi),%rcx 164 | cmp %rcx, %rdi 165 | jb LABEL(fill_192_done) 166 | vmovdqa %ymm0, 0x40(%rsi) 167 | 168 | lea 0x60(%rsi),%rcx 169 | cmp %rcx, %rdi 170 | jb LABEL(fill_192_done) 171 | vmovdqa %ymm0, 0x60(%rsi) 172 | 173 | LABEL(last_wide_store): 174 | lea 0x80(%rsi),%rcx 175 | cmp %rcx, %rdi 176 | jb LABEL(fill_192_done) 177 | vmovdqa %ymm0, 0x80(%rsi) 178 | LABEL(fill_192_done): 179 | // Stamp the last word. 180 | vmovdqu %ymm0,(%rdi) 181 | vzeroupper 182 | ret 183 | LABEL(stamp_4): 184 | vmovdqa %ymm0,(%rsi) 185 | vmovdqa %ymm0,0x20(%rsi) 186 | vmovdqa %ymm0,0x40(%rsi) 187 | vmovdqa %ymm0,0x60(%rsi) 188 | jmp LABEL(last_wide_store) 189 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Memset and Memcpy implementations 2 | 3 | *UPDATE*: Ilya Albrecht landed the memset implementation from this repo into [Folly](https://github.com/facebook/folly/blob/main/folly/memset.S). 4 | 5 | This repository contains high-performance implementations of memset and memcpy. 6 | These implementations outperform the folly and glibc implementations. This 7 | repository contains several reference implementations in C and assembly. The 8 | high-performance implementations are found in the files called "impl.S". 
9 | 10 | Before reading the source code in this repository you probably want to read an 11 | excellent blog [post](https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines/) 12 | by Joe Bialek about his work to optimize memset for windows. 13 | 14 | The charts below compare the code in this repo with other implementations: 15 | folly, musl, and glibc. The glibc implementations are measured with and without 16 | the elf indirection, as suggested by Dave Zarzycki. 17 | 18 | ## Memset 19 | ![Memset](docs/memset_bench.png) 20 | 21 | ## Memcpy 22 | ![Memcpy](docs/memcpy_bench.png) 23 | 24 | The chart below compares the performance of different memset implementations on 25 | buffers of varying sizes and offsets. Unlike the hot loop that hammers a single 26 | value, this benchmark is more realistic and takes into account mispredicted 27 | branches and the performance of the cpu decoder. The buffers are in the size 28 | range 0 to 256. The random function is made of pre-computed random values, to 29 | lower the overhead of the random function. This was suggested by Yann Collet. 30 | The 'nop' function is used to compute the benchmark setup and call overhead. The 31 | numbers below represent the implementation execution time minus the nop function 32 | time. 33 | 34 | ![memset](docs/memset_r.png) ![memcpy](docs/memcpy_r.png) 35 | 36 | The size of the buffer that memset and memcpy mutate is typically small. The 37 | picture below presents the buffer length distribution in google-chrome. Vim, 38 | Python, and even server workloads have a similar distribution. The values in the 39 | chart represent the power of two buffer size (10 represents the values between 40 | 512 and 1024). 41 | 42 | ![Histogram](docs/hist.png) 43 | 44 | 45 | The chart below presents a histogram of pointer alignment (from the game 46 | minecraft). Most of the pointers that are passed to memset and memcpy are 47 | aligned to 8-byte values. 
Some programs have histograms that are not as sharp, 48 | meaning that there are more values that are not aligned to 4 or 8-byte boundary. 49 | 50 | ![Pointer Alignment](docs/align.png) 51 | 52 | 53 | Memcpy and Memset are frequently called by low-level high-performance libraries. 54 | Here is an example of a stack trace from the Firefox codebase: 55 | 56 | ``` 57 | (gdb) bt 58 | #0 __memmove_avx_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:225 59 | #1 in memcpy (__dest=, __src=, __len=40) at /usr/include/x86_64-linux-gnu/bits/string_fortified.h:34 60 | #2 mozilla::BufferList::ReadBytes(mozilla::BufferList::IterImpl&, char*, unsigned long) const 61 | #3 Pickle::ReadBytesInto(PickleIterator*, void*, unsigned int) const (this=, iter=, data=, length=) 62 | #4 in IPC::Message::ReadFooter(void*, unsigned int, bool) (this=, buffer=, buffer_len=40, truncate=true) 63 | #5 in mozilla::ipc::NodeController::DeserializeEventMessage(mozilla::UniquePtr >) (this=, aMessage=...) 64 | #6 in mozilla::ipc::NodeController::OnEventMessage(mojo::core::ports::NodeName const&, mozilla::UniquePtr >) 65 | #7 in mozilla::ipc::NodeChannel::OnMessageReceived(IPC::Message&&) (this=, aMessage=...) 66 | #8 in IPC::Channel::ChannelImpl::ProcessIncomingMessages() (this=) 67 | #9 in IPC::Channel::ChannelImpl::OnFileCanReadWithoutBlocking(int) (this=, fd=) 68 | #10 in base::MessagePumpLibevent::OnLibeventNotification(int, short, void*) (fd=, flags=, context=) 69 | #11 in event_persist_closure (base=, ev=) at /build/firefox-HSiFn6/firefox-94.0+build3/ipc/chromium/src/third_party/libevent/event.c:1580 70 | #12 event_process_active_single_queue (base=, activeq=, max_to_process=, endtime=) 71 | 72 | ``` 73 | 74 | The repository contains a few utilities for testing and measuring the 75 | performance and correctness of memset and memcpy. 76 | 77 | ## Test tool 78 | 79 | This is a small test harness that verifies the correctness of the 80 | implementations. 
It's really easy to make mistakes with off-by-one errors and 81 | run into alignment issues. The exhaustive tester catches these issues. 82 | 83 | This is a sample output: 84 | ``` 85 | OOOOOOOOOOOXX 86 | ^ 87 | Filling a buffer of length 13. Expected "O" at index 11 88 | ``` 89 | 90 | ## Benchmark tool 91 | 92 | The benchmark tool measures the performance of the system libc and the local 93 | implementation. The benchmarking tool runs each of the implementations in a loop 94 | millions of times. It runs the benchmark several times and picks the least noisy 95 | results. It's a good idea to run the benchmark tool and compare some 96 | implementation to itself to assess the noise level in the system. The 97 | benchmarking tool uses a trampoline to prevent the compiler from inlining and 98 | expanding the memset. 99 | 100 | ## Histogram tool 101 | 102 | The histogram tool is a shared object that records calls to memset and 103 | memcpy and creates a histogram of the length parameter. It prints the histogram 104 | when the program exits cleanly. The shared object can be loaded using 105 | LD\_PRELOAD (on Linux) or DYLD\_INSERT\_LIBRARIES (on Mac). Each bucket in the 106 | output represents the log2 size of the buffer, and each value represents the 107 | number of hits for the bucket. 108 | 109 | ## Proxy tool 110 | 111 | This is a small utility that swaps the builtin call to memset and memcpy with 112 | the local implementation from this project. The shared object can be loaded 113 | using LD\_PRELOAD (on Linux) or DYLD\_INSERT\_LIBRARIES (on Mac). 
114 | 115 | -------------------------------------------------------------------------------- /src/memset/impl.c: -------------------------------------------------------------------------------- 1 | #include "types.h" 2 | 3 | #include 4 | #include 5 | 6 | // Handle memsets of sizes 0..32 7 | static inline void *small_memset(void *s, int c, size_t n) { 8 | if (n < 5) { 9 | if (n == 0) 10 | return s; 11 | char *p = s; 12 | p[0] = c; 13 | p[n - 1] = c; 14 | if (n <= 2) 15 | return s; 16 | p[1] = c; 17 | p[2] = c; 18 | return s; 19 | } 20 | 21 | if (n <= 16) { 22 | uint64_t val8 = ((uint64_t)0x0101010101010101L * ((uint8_t)c)); 23 | if (n >= 8) { 24 | char *first = s; 25 | char *last = s + n - 8; 26 | *((u64 *)first) = val8; 27 | *((u64 *)last) = val8; 28 | return s; 29 | } 30 | 31 | uint32_t val4 = val8; 32 | char *first = s; 33 | char *last = s + n - 4; 34 | *((u32 *)first) = val4; 35 | *((u32 *)last) = val4; 36 | return s; 37 | } 38 | 39 | char X = c; 40 | char *p = s; 41 | char16 val16 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; 42 | char *last = s + n - 16; 43 | *((char16 *)last) = val16; 44 | *((char16 *)p) = val16; 45 | return s; 46 | } 47 | 48 | static inline void *huge_memset(void *s, int c, size_t n) { 49 | char *p = s; 50 | char X = c; 51 | char32 val32 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, 52 | X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; 53 | 54 | // Stamp the first 32byte store. 55 | *((char32 *)p) = val32; 56 | 57 | char *first_aligned = p + 32 - ((uint64_t)p % 32); 58 | char *buffer_end = p + n; 59 | char *last_word = buffer_end - 32; 60 | 61 | // Align the next stores. 62 | p = first_aligned; 63 | 64 | // Unroll the body of the loop to increase parallelism. 
65 | while (p + (32 * 5) < buffer_end) { 66 | *((char32a *)p) = val32; 67 | p += 32; 68 | *((char32a *)p) = val32; 69 | p += 32; 70 | *((char32a *)p) = val32; 71 | p += 32; 72 | *((char32a *)p) = val32; 73 | p += 32; 74 | *((char32a *)p) = val32; 75 | p += 32; 76 | } 77 | 78 | // Complete the last few iterations: 79 | #define TRY_STAMP_32_BYTES \ 80 | if (p < last_word) { \ 81 | *((char32a *)p) = val32; \ 82 | p += 32; \ 83 | } 84 | 85 | TRY_STAMP_32_BYTES 86 | TRY_STAMP_32_BYTES 87 | TRY_STAMP_32_BYTES 88 | TRY_STAMP_32_BYTES 89 | 90 | // Stamp the last unaligned word. 91 | *((char32 *)last_word) = val32; 92 | return s; 93 | } 94 | 95 | void *local_memset(void *s, int c, size_t n) { 96 | char *p = s; 97 | char X = c; 98 | 99 | if (n < 32) { 100 | return small_memset(s, c, n); 101 | } 102 | 103 | if (n > 160) { 104 | return huge_memset(s, c, n); 105 | } 106 | 107 | char32 val32 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, 108 | X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; 109 | 110 | char *last_word = s + n - 32; 111 | 112 | // Stamp the 32-byte chunks. 113 | do { 114 | *((char32 *)p) = val32; 115 | p += 32; 116 | } while (p < last_word); 117 | 118 | // Stamp the last unaligned 32 bytes of the buffer. 119 | *((char32 *)last_word) = val32; 120 | return s; 121 | } 122 | 123 | /// This a memset implementation that was copied from musl. We only use it for 124 | /// benchmarking. 125 | 126 | void *musl_memset(void *dest, int c, size_t n) { 127 | unsigned char *s = dest; 128 | size_t k; 129 | 130 | /* Fill head and tail with minimal branching. Each 131 | * conditional ensures that all the subsequently used 132 | * offsets are well-defined and in the dest region. 
*/ 133 | 134 | if (!n) 135 | return dest; 136 | s[0] = c; 137 | s[n - 1] = c; 138 | if (n <= 2) 139 | return dest; 140 | s[1] = c; 141 | s[2] = c; 142 | s[n - 2] = c; 143 | s[n - 3] = c; 144 | if (n <= 6) 145 | return dest; 146 | s[3] = c; 147 | s[n - 4] = c; 148 | if (n <= 8) 149 | return dest; 150 | 151 | /* Advance pointer to align it at a 4-byte boundary, 152 | * and truncate n to a multiple of 4. The previous code 153 | * already took care of any head/tail that get cut off 154 | * by the alignment. */ 155 | 156 | k = -(uintptr_t)s & 3; 157 | s += k; 158 | n -= k; 159 | n &= -4; 160 | 161 | #ifdef __GNUC__ 162 | typedef uint32_t __attribute__((__may_alias__)) u32; 163 | typedef uint64_t __attribute__((__may_alias__)) u64; 164 | 165 | u32 c32 = ((u32)-1) / 255 * (unsigned char)c; 166 | 167 | /* In preparation to copy 32 bytes at a time, aligned on 168 | * an 8-byte bounary, fill head/tail up to 28 bytes each. 169 | * As in the initial byte-based head/tail fill, each 170 | * conditional below ensures that the subsequent offsets 171 | * are valid (e.g. !(n<=24) implies n>=28). */ 172 | 173 | *(u32 *)(s + 0) = c32; 174 | *(u32 *)(s + n - 4) = c32; 175 | if (n <= 8) 176 | return dest; 177 | *(u32 *)(s + 4) = c32; 178 | *(u32 *)(s + 8) = c32; 179 | *(u32 *)(s + n - 12) = c32; 180 | *(u32 *)(s + n - 8) = c32; 181 | if (n <= 24) 182 | return dest; 183 | *(u32 *)(s + 12) = c32; 184 | *(u32 *)(s + 16) = c32; 185 | *(u32 *)(s + 20) = c32; 186 | *(u32 *)(s + 24) = c32; 187 | *(u32 *)(s + n - 28) = c32; 188 | *(u32 *)(s + n - 24) = c32; 189 | *(u32 *)(s + n - 20) = c32; 190 | *(u32 *)(s + n - 16) = c32; 191 | 192 | /* Align to a multiple of 8 so we can fill 64 bits at a time, 193 | * and avoid writing the same bytes twice as much as is 194 | * practical without introducing additional branching. 
*/ 195 | 196 | k = 24 + ((uintptr_t)s & 4); 197 | s += k; 198 | n -= k; 199 | 200 | /* If this loop is reached, 28 tail bytes have already been 201 | * filled, so any remainder when n drops below 32 can be 202 | * safely ignored. */ 203 | 204 | u64 c64 = c32 | ((u64)c32 << 32); 205 | for (; n >= 32; n -= 32, s += 32) { 206 | *(u64 *)(s + 0) = c64; 207 | *(u64 *)(s + 8) = c64; 208 | *(u64 *)(s + 16) = c64; 209 | *(u64 *)(s + 24) = c64; 210 | } 211 | #else 212 | /* Pure C fallback with no aliasing violations. */ 213 | for (; n; n--, s++) 214 | *s = c; 215 | #endif 216 | 217 | return dest; 218 | } 219 | -------------------------------------------------------------------------------- /src/memcpy/folly.S: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * __folly_memcpy: An optimized memcpy implementation that uses prefetch and 19 | * AVX2 instructions. 20 | * 21 | * This implementation of memcpy acts as a memmove, but it is not optimized for 22 | * this purpose. While overlapping copies are undefined in memcpy, this 23 | * implementation acts like memmove for sizes up through 256 bytes and will 24 | * detect overlapping copies and call memmove for overlapping copies of 257 or 25 | * more bytes. 26 | * 27 | * This implementation uses prefetch to avoid dtlb misses. 
This can 28 | * substantially reduce dtlb store misses in cases where the destination 29 | * location is absent from L1 cache and where the copy size is small enough 30 | * that the hardware prefetcher doesn't have a large impact. 31 | * 32 | * The number of branches is limited by the use of overlapping copies. This 33 | * helps with copies where the source and destination cache lines are already 34 | * present in L1 because there are fewer instructions to execute and fewer 35 | * branches to potentially mispredict. 36 | * 37 | * Vector operations up to 32-bytes are used (avx2 instruction set). Larger 38 | * mov operations (avx512) are not used. 39 | * 40 | * Large copies make use of aligned store operations. This operation is 41 | * observed to always be faster than rep movsb, so the rep movsb instruction 42 | * is not used. 43 | * 44 | * If the copy size is humongous and the source and destination are both 45 | * aligned, this memcpy will use non-temporal operations. This can have 46 | * a substantial speedup for copies where data is absent from L1, but it 47 | * is significantly slower if the source and destination data were already 48 | * in L1. The use of non-temporal operations also has the effect that after 49 | * the copy is complete, the data will be moved out of L1, even if the data was 50 | * present before the copy started. 51 | * 52 | * @author Logan Evans 53 | */ 54 | 55 | #if defined(__AVX2__) 56 | 57 | // This threshold is half of L1 cache on a Skylake machine, which means that 58 | // potentially all of L1 will be populated by this copy once it is executed 59 | // (dst and src are cached for temporal copies). 
60 | #define NON_TEMPORAL_STORE_THRESHOLD $32768 61 | 62 | .file "memcpy.S" 63 | .section .text,"ax" 64 | 65 | .type __folly_memcpy_short, @function 66 | __folly_memcpy_short: 67 | .cfi_startproc 68 | 69 | .L_GE1_LE7: 70 | cmp $1, %rdx 71 | je .L_EQ1 72 | 73 | cmp $4, %rdx 74 | jae .L_GE4_LE7 75 | 76 | .L_GE2_LE3: 77 | movw (%rsi), %r8w 78 | movw -2(%rsi,%rdx), %r9w 79 | movw %r8w, (%rdi) 80 | movw %r9w, -2(%rdi,%rdx) 81 | ret 82 | 83 | .align 2 84 | .L_EQ1: 85 | movb (%rsi), %r8b 86 | movb %r8b, (%rdi) 87 | ret 88 | 89 | // Aligning the target of a jump to an even address has a measurable 90 | // speedup in microbenchmarks. 91 | .align 2 92 | .L_GE4_LE7: 93 | movl (%rsi), %r8d 94 | movl -4(%rsi,%rdx), %r9d 95 | movl %r8d, (%rdi) 96 | movl %r9d, -4(%rdi,%rdx) 97 | ret 98 | 99 | .cfi_endproc 100 | .size __folly_memcpy_short, .-__folly_memcpy_short 101 | 102 | // memcpy is an alternative entrypoint into the function named __folly_memcpy. 103 | // The compiler is able to call memcpy since the name is global while 104 | // stacktraces will show __folly_memcpy since that is the name of the function. 105 | // This is intended to aid in debugging by making it obvious which version of 106 | // memcpy is being used. 
107 | .align 64 108 | .globl __folly_memcpy 109 | .type __folly_memcpy, @function 110 | 111 | __folly_memcpy: 112 | .cfi_startproc 113 | 114 | mov %rdi, %rax 115 | 116 | test %rdx, %rdx 117 | je .L_EQ0 118 | 119 | prefetchw (%rdi) 120 | prefetchw -1(%rdi,%rdx) 121 | 122 | cmp $8, %rdx 123 | jb .L_GE1_LE7 124 | 125 | .L_GE8: 126 | cmp $32, %rdx 127 | ja .L_GE33 128 | 129 | .L_GE8_LE32: 130 | cmp $16, %rdx 131 | ja .L_GE17_LE32 132 | 133 | .L_GE8_LE16: 134 | mov (%rsi), %r8 135 | mov -8(%rsi,%rdx), %r9 136 | mov %r8, (%rdi) 137 | mov %r9, -8(%rdi,%rdx) 138 | .L_EQ0: 139 | ret 140 | 141 | .align 2 142 | .L_GE17_LE32: 143 | movdqu (%rsi), %xmm0 144 | movdqu -16(%rsi,%rdx), %xmm1 145 | movdqu %xmm0, (%rdi) 146 | movdqu %xmm1, -16(%rdi,%rdx) 147 | ret 148 | 149 | .align 2 150 | .L_GE193_LE256: 151 | vmovdqu %ymm3, 96(%rdi) 152 | vmovdqu %ymm4, -128(%rdi,%rdx) 153 | 154 | .L_GE129_LE192: 155 | vmovdqu %ymm2, 64(%rdi) 156 | vmovdqu %ymm5, -96(%rdi,%rdx) 157 | 158 | .L_GE65_LE128: 159 | vmovdqu %ymm1, 32(%rdi) 160 | vmovdqu %ymm6, -64(%rdi,%rdx) 161 | 162 | .L_GE33_LE64: 163 | vmovdqu %ymm0, (%rdi) 164 | vmovdqu %ymm7, -32(%rdi,%rdx) 165 | 166 | vzeroupper 167 | ret 168 | 169 | .align 2 170 | .L_GE33: 171 | vmovdqu (%rsi), %ymm0 172 | vmovdqu -32(%rsi,%rdx), %ymm7 173 | 174 | cmp $64, %rdx 175 | jbe .L_GE33_LE64 176 | 177 | prefetchw 64(%rdi) 178 | 179 | vmovdqu 32(%rsi), %ymm1 180 | vmovdqu -64(%rsi,%rdx), %ymm6 181 | 182 | cmp $128, %rdx 183 | jbe .L_GE65_LE128 184 | 185 | prefetchw 128(%rdi) 186 | 187 | vmovdqu 64(%rsi), %ymm2 188 | vmovdqu -96(%rsi,%rdx), %ymm5 189 | 190 | cmp $192, %rdx 191 | jbe .L_GE129_LE192 192 | 193 | prefetchw 192(%rdi) 194 | 195 | vmovdqu 96(%rsi), %ymm3 196 | vmovdqu -128(%rsi,%rdx), %ymm4 197 | 198 | cmp $256, %rdx 199 | jbe .L_GE193_LE256 200 | 201 | .L_GE257: 202 | prefetchw 256(%rdi) 203 | 204 | // Check if there is an overlap. If there is an overlap then the caller 205 | // has a bug since this is undefined behavior. 
However, for legacy 206 | // reasons this behavior is expected by some callers. 207 | // 208 | // All copies through 256 bytes will operate as a memmove since for 209 | // those sizes all reads are performed before any writes. 210 | // 211 | // This check uses the idea that there is an overlap if 212 | // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)), 213 | // or equivalently, there is no overlap if 214 | // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi). 215 | // 216 | // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many 217 | // bytes remain to be copied. 218 | lea (%rsi,%rdx), %r9 219 | cmp %rdi, %r9 220 | jbe .L_NO_OVERLAP 221 | lea (%rdi,%rdx), %r8 222 | cmp %rsi, %r8 223 | // This is a forward jump so that the branch predictor will not predict 224 | // a memmove. 225 | ja .L_MEMMOVE 226 | 227 | .align 2 228 | .L_NO_OVERLAP: 229 | vmovdqu %ymm0, (%rdi) 230 | vmovdqu %ymm1, 32(%rdi) 231 | vmovdqu %ymm2, 64(%rdi) 232 | vmovdqu %ymm3, 96(%rdi) 233 | 234 | // Align %rdi to a 32 byte boundary. 235 | // %rcx = 128 - 31 & %rdi 236 | mov $128, %rcx 237 | and $31, %rdi 238 | sub %rdi, %rcx 239 | 240 | lea (%rsi,%rcx), %rsi 241 | lea (%rax,%rcx), %rdi 242 | sub %rcx, %rdx 243 | 244 | // %r8 is the end condition for the loop. 
245 | lea -128(%rsi,%rdx), %r8 246 | 247 | cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx 248 | jae .L_NON_TEMPORAL_LOOP 249 | 250 | .align 2 251 | .L_ALIGNED_DST_LOOP: 252 | prefetchw 128(%rdi) 253 | prefetchw 192(%rdi) 254 | 255 | vmovdqu (%rsi), %ymm0 256 | vmovdqu 32(%rsi), %ymm1 257 | vmovdqu 64(%rsi), %ymm2 258 | vmovdqu 96(%rsi), %ymm3 259 | add $128, %rsi 260 | 261 | vmovdqa %ymm0, (%rdi) 262 | vmovdqa %ymm1, 32(%rdi) 263 | vmovdqa %ymm2, 64(%rdi) 264 | vmovdqa %ymm3, 96(%rdi) 265 | add $128, %rdi 266 | 267 | cmp %r8, %rsi 268 | jb .L_ALIGNED_DST_LOOP 269 | 270 | .L_ALIGNED_DST_LOOP_END: 271 | sub %rsi, %r9 272 | mov %r9, %rdx 273 | 274 | vmovdqu %ymm4, -128(%rdi,%rdx) 275 | vmovdqu %ymm5, -96(%rdi,%rdx) 276 | vmovdqu %ymm6, -64(%rdi,%rdx) 277 | vmovdqu %ymm7, -32(%rdi,%rdx) 278 | 279 | vzeroupper 280 | ret 281 | 282 | .align 2 283 | .L_NON_TEMPORAL_LOOP: 284 | testb $31, %sil 285 | jne .L_ALIGNED_DST_LOOP 286 | // This is prefetching the source data unlike ALIGNED_DST_LOOP which 287 | // prefetches the destination data. This choice is again informed by 288 | // benchmarks. With a non-temporal store the entirety of the cache line 289 | // is being written so the previous data can be discarded without being 290 | // fetched. 
291 | prefetchnta 128(%rsi) 292 | prefetchnta 196(%rsi) 293 | 294 | vmovntdqa (%rsi), %ymm0 295 | vmovntdqa 32(%rsi), %ymm1 296 | vmovntdqa 64(%rsi), %ymm2 297 | vmovntdqa 96(%rsi), %ymm3 298 | add $128, %rsi 299 | 300 | vmovntdq %ymm0, (%rdi) 301 | vmovntdq %ymm1, 32(%rdi) 302 | vmovntdq %ymm2, 64(%rdi) 303 | vmovntdq %ymm3, 96(%rdi) 304 | add $128, %rdi 305 | 306 | cmp %r8, %rsi 307 | jb .L_NON_TEMPORAL_LOOP 308 | 309 | sfence 310 | jmp .L_ALIGNED_DST_LOOP_END 311 | 312 | .L_MEMMOVE: 313 | call memmove 314 | ret 315 | 316 | .cfi_endproc 317 | .size __folly_memcpy, .-__folly_memcpy 318 | 319 | #ifdef FOLLY_MEMCPY_IS_MEMCPY 320 | .weak memcpy 321 | memcpy = __folly_memcpy 322 | #endif 323 | 324 | .ident "GCC: (GNU) 4.8.2" 325 | #ifdef __linux__ 326 | .section .note.GNU-stack,"",@progbits 327 | #endif 328 | 329 | #endif 330 | --------------------------------------------------------------------------------