├── docs ├── hist.png ├── align.png ├── memcpy_r.png ├── memset_r.png ├── memcpy_bench.png ├── memset_bench.png └── annotated_glibc.txt ├── .gitignore ├── src ├── utils │ ├── CMakeLists.txt │ └── hist_tool.c ├── memcpy │ ├── CMakeLists.txt │ ├── impl.c │ ├── bench_memcpy.cc │ ├── impl.S │ ├── test_memcpy.cc │ └── folly.S └── memset │ ├── shims.c │ ├── CMakeLists.txt │ ├── test_memset.cc │ ├── bench_memset.cc │ ├── impl.S │ └── impl.c ├── include ├── decl.h ├── types.h └── utils.h ├── CMakeLists.txt └── README.md /docs/hist.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/hist.png -------------------------------------------------------------------------------- /docs/align.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/align.png -------------------------------------------------------------------------------- /docs/memcpy_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memcpy_r.png -------------------------------------------------------------------------------- /docs/memset_r.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memset_r.png -------------------------------------------------------------------------------- /docs/memcpy_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memcpy_bench.png -------------------------------------------------------------------------------- /docs/memset_bench.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nadavrot/memset_benchmark/HEAD/docs/memset_bench.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.swn 3 | *.swo 4 | *.swp 5 | *~ 6 | .DS_Store 7 | *.so 8 | *.dylib 9 | 10 | GPATH 11 | GRTAGS 12 | GTAGS 13 | tags 14 | 15 | compile_commands.json 16 | 17 | toolchain/ 18 | llvm-project/ 19 | gcc-project/ 20 | build*/ 21 | .vscode/ 22 | .vim/ 23 | .idea/ 24 | -------------------------------------------------------------------------------- /src/utils/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(hist_tool SHARED 2 | hist_tool.c 3 | ) 4 | 5 | set_target_properties(hist_tool PROPERTIES 6 | VERSION ${PROJECT_VERSION} 7 | SOVERSION 1 8 | ) 9 | 10 | target_compile_options(hist_tool PRIVATE "-fno-builtin") 11 | 12 | install(TARGETS hist_tool LIBRARY DESTINATION bin) 13 | -------------------------------------------------------------------------------- /src/memcpy/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_executable(test_memcpy 2 | test_memcpy.cc 3 | folly.S 4 | impl.S 5 | impl.c 6 | ) 7 | 8 | target_link_libraries(test_memcpy PUBLIC) 9 | 10 | add_executable(bench_memcpy 11 | bench_memcpy.cc 12 | folly.S 13 | impl.S 14 | impl.c 15 | ) 16 | 17 | install(TARGETS bench_memcpy DESTINATION bin) 18 | install(TARGETS test_memcpy DESTINATION bin) 19 | 20 | -------------------------------------------------------------------------------- /src/memset/shims.c: -------------------------------------------------------------------------------- 1 | #include "decl.h" 2 | 3 | //////////////////////////////////////////////////////////////////////////////// 4 | /// This is a small utility that swaps the builtin call to memset with the 5 | /// local implementation of memset, implemented in this 
project. 6 | /// The shared object can be loaded using LD_PRELOAD (on Linux) or 7 | /// DYLD_INSERT_LIBRARIES (on Mac). 8 | //////////////////////////////////////////////////////////////////////////////// 9 | 10 | void *memset(void *s, int c, size_t n) { return local_memset(s, c, n); } 11 | -------------------------------------------------------------------------------- /src/memset/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | add_library(mem_shim SHARED 2 | shims.c 3 | impl.S 4 | impl.c 5 | ) 6 | 7 | set_target_properties(mem_shim PROPERTIES 8 | VERSION ${PROJECT_VERSION} 9 | SOVERSION 1 10 | ) 11 | 12 | add_executable(bench_memset 13 | bench_memset.cc 14 | impl.S 15 | impl.c 16 | ) 17 | 18 | add_executable(test_memset 19 | test_memset.cc 20 | impl.S 21 | impl.c 22 | ) 23 | 24 | target_link_libraries(bench_memset PUBLIC) 25 | target_link_libraries(test_memset PUBLIC) 26 | 27 | install(TARGETS bench_memset DESTINATION bin) 28 | install(TARGETS test_memset DESTINATION bin) 29 | install(TARGETS mem_shim LIBRARY DESTINATION bin) 30 | 31 | -------------------------------------------------------------------------------- /include/decl.h: -------------------------------------------------------------------------------- 1 | #ifndef DECLS 2 | #define DECLS 3 | 4 | #include 5 | 6 | #ifdef __cplusplus 7 | 8 | using memset_ty = void *(void *s, int c, size_t n); 9 | using memcpy_ty = void *(void *dest, const void *src, size_t n); 10 | 11 | extern "C" { 12 | #endif 13 | 14 | void *memcpy(void *dest, const void *src, size_t n); 15 | void *__folly_memcpy(void *dest, const void *src, size_t n); 16 | void *libc_memcpy(void *dest, const void *src, size_t n); 17 | void *local_memcpy(void *dest, const void *src, size_t n); 18 | void *asm_memcpy(void *dest, const void *src, size_t n); 19 | 20 | void *memset(void *s, int c, size_t n); 21 | void *libc_memset(void *s, int c, size_t n); 22 | void *local_memset(void *s, int c, size_t n); 
23 | void *asm_memset(void *s, int c, size_t n); 24 | void *musl_memset(void *s, int c, size_t n); 25 | 26 | #ifdef __cplusplus 27 | } 28 | #endif 29 | 30 | #endif // DECLS 31 | -------------------------------------------------------------------------------- /include/types.h: -------------------------------------------------------------------------------- 1 | #ifndef TYPES 2 | #define TYPES 3 | 4 | #include 5 | 6 | #define NO_INLINE __attribute__((noinline)) 7 | 8 | #ifdef __clang__ 9 | typedef char char8 __attribute__((ext_vector_type(8), aligned(1))); 10 | typedef char char16 __attribute__((ext_vector_type(16), aligned(1))); 11 | typedef char char32 __attribute__((ext_vector_type(32), aligned(1))); 12 | typedef char char32a __attribute__((ext_vector_type(32), aligned(32))); 13 | 14 | #else 15 | // __GNUC__ 16 | typedef char char8 __attribute__((vector_size(8), aligned(1))); 17 | typedef char char16 __attribute__((vector_size(16), aligned(1))); 18 | typedef char char32 __attribute__((vector_size(32), aligned(1))); 19 | typedef char char32a __attribute__((vector_size(32), aligned(32))); 20 | #endif 21 | 22 | typedef uint32_t __attribute__((aligned(1))) u32; 23 | typedef uint64_t __attribute__((aligned(1))) u64; 24 | 25 | #endif // TYPES 26 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 3.7) 2 | 3 | project(bpf_tracer VERSION 1.0.0 DESCRIPTION "Memset benchmarks") 4 | 5 | set(CMAKE_CXX_STANDARD 14) 6 | set(CXX_STANDARD_REQUIRED ON) 7 | set(CMAKE_CXX_EXTENSIONS OFF) 8 | 9 | # Export a JSON file with the compilation commands that external tools can use 10 | # to analyze the source code of the project. 
11 | set(CMAKE_EXPORT_COMPILE_COMMANDS ON) 12 | 13 | enable_language(C ASM) 14 | 15 | # Disable exceptions 16 | SET (CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "-fno-rtti ") 17 | 18 | if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) 19 | message(STATUS "No build type selected, default to Release") 20 | set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Build type (default RelWithDebInfo)" FORCE) 21 | endif() 22 | 23 | add_compile_options(-Wall -g3 -O3 -march=native) 24 | 25 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wall -march=native") 26 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall -fno-omit-frame-pointer -O0") 27 | 28 | # Place all of the binaries in the build directory. 29 | set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 30 | set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) 31 | 32 | include_directories(BEFORE 33 | ${CMAKE_CURRENT_BINARY_DIR}/include 34 | ${CMAKE_CURRENT_SOURCE_DIR}/include 35 | ) 36 | 37 | add_subdirectory(src/memset/) 38 | add_subdirectory(src/memcpy/) 39 | add_subdirectory(src/utils/) 40 | -------------------------------------------------------------------------------- /src/memcpy/impl.c: -------------------------------------------------------------------------------- 1 | #include "types.h" 2 | 3 | #include 4 | #include 5 | 6 | void *local_memcpy(void *dest, const void *src, size_t n) { 7 | char *d = (char *)dest; 8 | const char *s = (char *)src; 9 | 10 | if (n < 5) { 11 | if (n == 0) 12 | return dest; 13 | d[0] = s[0]; 14 | d[n - 1] = s[n - 1]; 15 | if (n <= 2) 16 | return dest; 17 | d[1] = s[1]; 18 | d[2] = s[2]; 19 | return dest; 20 | } 21 | 22 | if (n <= 16) { 23 | if (n >= 8) { 24 | const char *first_s = s; 25 | const char *last_s = s + n - 8; 26 | char *first_d = d; 27 | char *last_d = d + n - 8; 28 | *((u64 *)first_d) = *((u64 *)first_s); 29 | *((u64 *)last_d) = *((u64 *)last_s); 30 | return dest; 31 | } 32 | 33 | const char *first_s = s; 34 | const char *last_s = s + n - 4; 35 | 
char *first_d = d; 36 | char *last_d = d + n - 4; 37 | *((u32 *)first_d) = *((u32 *)first_s); 38 | *((u32 *)last_d) = *((u32 *)last_s); 39 | return dest; 40 | } 41 | 42 | if (n <= 32) { 43 | const char *first_s = s; 44 | const char *last_s = s + n - 16; 45 | char *first_d = d; 46 | char *last_d = d + n - 16; 47 | 48 | *((char16 *)first_d) = *((char16 *)first_s); 49 | *((char16 *)last_d) = *((char16 *)last_s); 50 | return dest; 51 | } 52 | 53 | const char *last_word_s = s + n - 32; 54 | char *last_word_d = d + n - 32; 55 | 56 | // Stamp the 32-byte chunks. 57 | do { 58 | *((char32 *)d) = *((char32 *)s); 59 | d += 32; 60 | s += 32; 61 | } while (d < last_word_d); 62 | 63 | // Stamp the last unaligned 32 bytes of the buffer. 64 | *((char32 *)last_word_d) = *((char32 *)last_word_s); 65 | return dest; 66 | } 67 | -------------------------------------------------------------------------------- /src/utils/hist_tool.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | //////////////////////////////////////////////////////////////////////////////// 7 | /// This is a small utility that records calls to some methods and creates a 8 | /// histogram of the lengths of calls to memset. It prints the histogram when 9 | /// the program is terminated. The shared object can be loaded using LD_PRELOAD 10 | /// (on Linux) or DYLD_INSERT_LIBRARIES (on Mac). 
11 | //////////////////////////////////////////////////////////////////////////////// 12 | 13 | uint32_t memset_len_dist[32] = { 14 | 0, 15 | }; 16 | uint32_t memcpy_len_dist[32] = { 17 | 0, 18 | }; 19 | uint32_t align_dist[32] = { 20 | 0, 21 | }; 22 | 23 | 24 | const int tab32[32] = {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 25 | 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 26 | 24, 7, 19, 27, 23, 6, 26, 5, 4, 31}; 27 | 28 | int log2_32(uint32_t value) { 29 | value |= value >> 1; 30 | value |= value >> 2; 31 | value |= value >> 4; 32 | value |= value >> 8; 33 | value |= value >> 16; 34 | return tab32[(uint32_t)(value * 0x07C4ACDD) >> 27]; 35 | } 36 | 37 | void __attribute__((destructor)) print_hitograms() { 38 | FILE *ff = fopen("/tmp/hist.txt", "a+"); 39 | if (!ff) { 40 | return; 41 | } 42 | pid_t pid = getpid(); 43 | 44 | fprintf(ff, "Histogram for (%d):\n", pid); 45 | fprintf(ff, "size, memset, memcpy, alignment:\n"); 46 | for (int i = 0; i < 32; i++) { 47 | fprintf(ff, "%d, %d, %d, %d,\n", i, memset_len_dist[i], memcpy_len_dist[i], align_dist[i]); 48 | } 49 | fclose(ff); 50 | } 51 | 52 | void *memcpy(void *dest, const void *src, size_t len) { 53 | memcpy_len_dist[log2_32(len)]++; 54 | align_dist[(unsigned long)dest % 32]++; 55 | align_dist[(unsigned long)src % 32]++; 56 | char *d = (char *)dest; 57 | char *s = (char *)src; 58 | for (size_t i = 0; i < len; i++) { 59 | d[i] = s[i]; 60 | } 61 | return dest; 62 | } 63 | 64 | void *memset(void *s, int c, size_t len) { 65 | memset_len_dist[log2_32(len)]++; 66 | align_dist[(unsigned long)s % 32]++; 67 | char *p = s; 68 | 69 | for (int i = 0; i < len; i++) { 70 | p[i] = c; 71 | } 72 | return s; 73 | } 74 | 75 | -------------------------------------------------------------------------------- /src/memset/test_memset.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "decl.h" 6 | #include "utils.h" 7 | 8 | 
//////////////////////////////////////////////////////////////////////////////// 9 | // This is a small program that checks if some memset implementation is correct. 10 | // The tool currently checks libc, musl and the local implementation. 11 | //////////////////////////////////////////////////////////////////////////////// 12 | 13 | #define MAGIC_VALUE0 'X' 14 | #define MAGIC_VALUE1 'O' 15 | 16 | void print_buffer(const char *start, const char *end, char val, 17 | const char *ptr) { 18 | const char *it = start; 19 | while (it != end) { 20 | std::cout << *it; 21 | it++; 22 | } 23 | std::cout << "\n"; 24 | it = start; 25 | while (it != ptr) { 26 | std::cout << " "; 27 | it++; 28 | } 29 | std::cout << "^\n"; 30 | std::cout << "Filling a buffer of length " << end - start << "."; 31 | std::cout << " Expected \"" << val << "\" at index " << ptr - start << "\n"; 32 | } 33 | 34 | void assert_uniform_value(const char *start, const char *end, char val) { 35 | const char *ptr = start; 36 | while (ptr != end) { 37 | if (val != *ptr) { 38 | print_buffer(start, end, val, ptr); 39 | fflush(stdout); 40 | abort(); 41 | } 42 | ptr++; 43 | } 44 | } 45 | 46 | void test_impl(memset_ty handle, const std::string &name, unsigned chunk_size) { 47 | std::vector memory(chunk_size + 512, MAGIC_VALUE0); 48 | // Start mem-setting the array at different offsets. 49 | for (int offset = 0; offset < 128; offset++) { 50 | const char *buffer_start = &*memory.begin(); 51 | const char *buffer_end = &*memory.end(); 52 | 53 | const char *region_start = &memory[offset]; 54 | const char *region_end = region_start + chunk_size; 55 | 56 | assert_uniform_value(buffer_start, buffer_end, MAGIC_VALUE0); 57 | 58 | (handle)((void *)region_start, MAGIC_VALUE1, chunk_size); 59 | 60 | // Check the chunk. 61 | assert_uniform_value(region_start, region_end, MAGIC_VALUE1); 62 | // Check before chunk. 63 | assert_uniform_value(buffer_start, region_start, MAGIC_VALUE0); 64 | // Check after chunk. 
65 | assert_uniform_value(region_end, buffer_end, MAGIC_VALUE0); 66 | 67 | // Reset the buffer: 68 | std::fill(memory.begin(), memory.end(), MAGIC_VALUE0); 69 | assert_uniform_value(buffer_start, buffer_end, MAGIC_VALUE0); 70 | } 71 | } 72 | 73 | int main(int argc, char **argv) { 74 | std::cout << "Testing memset... \n"; 75 | 76 | #define TEST(FUNC, SIZE) test_impl(FUNC, #FUNC, SIZE); 77 | 78 | for (int i = 0; i < 1024; i++) { 79 | TEST(libc_memset, i); 80 | TEST(local_memset, i); 81 | TEST(musl_memset, i); 82 | TEST(asm_memset, i); 83 | } 84 | std::cout << "Done.\n"; 85 | 86 | return 0; 87 | } 88 | -------------------------------------------------------------------------------- /src/memset/bench_memset.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "decl.h" 9 | #include "utils.h" 10 | 11 | //////////////////////////////////////////////////////////////////////////////// 12 | // This is a small program that compares two memset implementations and records 13 | // the output in a csv file. 14 | //////////////////////////////////////////////////////////////////////////////// 15 | 16 | #define ITER (1000L * 1000L * 10L) 17 | #define SAMPLES (20) 18 | 19 | DoomRNG RNG; 20 | 21 | /// Measure a single implementation \p handle. 22 | uint64_t measure(memset_ty handle, unsigned size, unsigned align, 23 | unsigned offset, void *ptr) { 24 | Stopwatch T; 25 | for (unsigned i = 0; i < SAMPLES; i++) { 26 | T.start(); 27 | for (size_t j = 0; j < ITER; j++) { 28 | (handle)(ptr, 0, size); 29 | } 30 | T.stop(); 31 | } 32 | return T.get_median(); 33 | } 34 | 35 | // Allocate memory and benchmark each implementation at a specific size \p size. 
36 | void bench_impl(const std::vector &toTest, unsigned size, 37 | unsigned align, unsigned offset) { 38 | std::vector memory(size + 256, 0); 39 | void *ptr = align_pointer(&memory[0], align, offset); 40 | 41 | std::cout << size << ", "; 42 | for (auto handle : toTest) { 43 | u_int64_t res = measure(handle, size, align, offset, ptr); 44 | std::cout << res << ", "; 45 | } 46 | std::cout << std::endl; 47 | } 48 | 49 | /// Try to allocate buffers at random offsets and in random sizes. 50 | /// The sizes and the offsets are in the range 0..256. 51 | void bench_rand_range(const std::vector &toTest) { 52 | std::vector memory(1024, 0); 53 | void *ptr = &memory[0]; 54 | 55 | for (auto handle : toTest) { 56 | Stopwatch T; 57 | sleep(1); 58 | for (unsigned i = 0; i < SAMPLES; i++) { 59 | RNG.rand_reset(); 60 | T.start(); 61 | for (size_t j = 0; j < ITER; j++) { 62 | (handle)((char *)ptr + RNG.next_u8_random(), 0, RNG.next_u8_random()); 63 | } 64 | T.stop(); 65 | } 66 | 67 | std::cout << T.get_median() << ", "; 68 | } 69 | std::cout << std::endl; 70 | } 71 | 72 | // To measure the call overhead. 
73 | void *nop(void *s, int c, size_t n) { return s; } 74 | 75 | int main(int argc, char **argv) { 76 | std::cout << std::setprecision(3); 77 | std::cout << std::fixed; 78 | 79 | std::vector toTest = {musl_memset, libc_memset, &memset, 80 | local_memset, asm_memset, &nop}; 81 | 82 | std::cout << "Batches of random sizes:\n"; 83 | std::cout << " musl, libc@plt, libc, c_memset, asm_memset, nop,\n"; 84 | bench_rand_range(toTest); 85 | 86 | std::cout << "\nFixed size:\n"; 87 | std::cout << "size, musl, libc@plt, libc, c_memset, asm_memset, nop,\n"; 88 | 89 | for (int i = 0; i < 512; i++) { 90 | bench_impl(toTest, i, 16, 0); 91 | } 92 | 93 | return 0; 94 | } 95 | -------------------------------------------------------------------------------- /src/memcpy/bench_memcpy.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #include "decl.h" 9 | #include "utils.h" 10 | 11 | //////////////////////////////////////////////////////////////////////////////// 12 | // This is a small program that compares two memcpy implementations and records 13 | // the output in a csv file. 14 | //////////////////////////////////////////////////////////////////////////////// 15 | 16 | #define ITER (1000L * 1000L * 10L) 17 | #define SAMPLES (20) 18 | 19 | DoomRNG RNG; 20 | 21 | /// Measure a single implementation \p handle. 22 | uint64_t measure(memcpy_ty handle, void *dest, void *src, unsigned size) { 23 | Stopwatch T; 24 | for (unsigned i = 0; i < SAMPLES; i++) { 25 | T.start(); 26 | for (size_t j = 0; j < ITER; j++) { 27 | (handle)(dest, src, size); 28 | } 29 | T.stop(); 30 | } 31 | return T.get_median(); 32 | } 33 | 34 | // Allocate memory and benchmark each implementation at a specific size \p size. 
35 | void bench_impl(const std::vector &toTest, unsigned size, 36 | unsigned align, unsigned offset) { 37 | std::vector dest(size + 256, 0); 38 | std::vector src(size + 256, 0); 39 | 40 | char *src_ptr = (char *)align_pointer(&src[0], align, offset); 41 | char *dest_ptr = (char *)align_pointer(&dest[0], align, offset); 42 | 43 | std::cout << size << ", "; 44 | for (auto handle : toTest) { 45 | u_int64_t res = measure(handle, dest_ptr, src_ptr, size); 46 | std::cout << res << ", "; 47 | } 48 | std::cout << std::endl; 49 | } 50 | 51 | /// Allocate and copy buffers at random offsets and in random sizes. 52 | /// The sizes and the offsets are in the range 0..256. 53 | void bench_rand_range(const std::vector &toTest) { 54 | std::vector dest(4096, 1); 55 | std::vector src(4096, 0); 56 | const char *src_p = &src[0]; 57 | char *dest_p = &dest[0]; 58 | 59 | for (auto handle : toTest) { 60 | Stopwatch T; 61 | sleep(1); 62 | for (unsigned i = 0; i < SAMPLES; i++) { 63 | RNG.rand_reset(); 64 | T.start(); 65 | for (size_t j = 0; j < ITER; j++) { 66 | char *to = dest_p + RNG.next_u8_random(); 67 | const char *from = src_p + RNG.next_u8_random(); 68 | (handle)(to, from, RNG.next_u8_random()); 69 | } 70 | T.stop(); 71 | } 72 | 73 | std::cout << T.get_median() << ", "; 74 | } 75 | std::cout << std::endl; 76 | } 77 | 78 | // To measure the call overhead. 
79 | void *nop(void *dest, const void *src, size_t n) { return dest; } 80 | 81 | int main(int argc, char **argv) { 82 | std::cout << std::setprecision(3); 83 | std::cout << std::fixed; 84 | 85 | std::vector toTest = { 86 | &libc_memcpy, &memcpy, &__folly_memcpy, &local_memcpy, &asm_memcpy, &nop}; 87 | 88 | std::cout << "Batches of random sizes:\n"; 89 | std::cout << "libc@plt, libc, folly, c_memcpy, asm_memcpy, nop,\n"; 90 | 91 | bench_rand_range(toTest); 92 | 93 | std::cout << "\nFixed size:\n"; 94 | std::cout << "size, libc@plt, libc, folly, c_memcpy, asm_memcpy, nop,\n"; 95 | 96 | for (int i = 0; i < 512; i++) { 97 | bench_impl(toTest, i, 16, 0); 98 | } 99 | 100 | return 0; 101 | } 102 | -------------------------------------------------------------------------------- /include/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "types.h" 9 | 10 | /// Aligns the pointer \p ptr, to alignment \p alignment and offset \p offset 11 | /// within the word. 12 | void *align_pointer(void *ptr, unsigned alignment, unsigned offset) { 13 | size_t p = (size_t)ptr; 14 | while (p % alignment) 15 | ++p; 16 | return (void *)(p + (size_t)offset); 17 | } 18 | 19 | using time_point = std::chrono::steady_clock::time_point; 20 | 21 | class Stopwatch { 22 | /// The time of the last sample; 23 | time_point begin_; 24 | /// A list of recorded intervals. 
25 | std::vector intervals_; 26 | 27 | public: 28 | NO_INLINE 29 | Stopwatch() : begin_() {} 30 | 31 | NO_INLINE 32 | void start() { begin_ = std::chrono::steady_clock::now(); } 33 | 34 | NO_INLINE 35 | void stop() { 36 | time_point end = std::chrono::steady_clock::now(); 37 | uint64_t interval = 38 | std::chrono::duration_cast(end - begin_) 39 | .count(); 40 | intervals_.push_back(interval); 41 | } 42 | 43 | NO_INLINE 44 | uint64_t get_median() { 45 | std::sort(intervals_.begin(), intervals_.end()); 46 | return intervals_[intervals_.size() / 2]; 47 | } 48 | }; 49 | 50 | uint8_t random_bytes[320] = { 51 | 227, 138, 244, 198, 73, 247, 185, 248, 229, 75, 24, 215, 159, 230, 136, 52 | 246, 200, 144, 65, 67, 109, 86, 118, 61, 209, 103, 188, 213, 187, 8, 53 | 210, 121, 214, 178, 232, 59, 153, 92, 209, 239, 44, 85, 156, 172, 237, 54 | 41, 150, 195, 247, 202, 249, 142, 208, 133, 21, 204, 114, 38, 51, 150, 55 | 194, 46, 184, 138, 50, 250, 190, 180, 161, 5, 211, 191, 62, 137, 142, 56 | 122, 63, 72, 233, 125, 189, 51, 238, 51, 116, 10, 44, 18, 240, 41, 57 | 157, 81, 183, 252, 214, 17, 81, 12, 44, 119, 77, 97, 101, 80, 106, 58 | 128, 190, 89, 160, 104, 244, 192, 46, 69, 73, 255, 45, 213, 190, 86, 59 | 18, 89, 34, 46, 134, 145, 166, 128, 87, 97, 192, 71, 105, 94, 51, 60 | 30, 7, 9, 0, 40, 0, 187, 205, 189, 151, 159, 107, 105, 180, 182, 61 | 233, 52, 209, 108, 186, 31, 184, 254, 170, 71, 162, 31, 80, 226, 75, 62 | 125, 214, 125, 247, 197, 149, 132, 247, 157, 253, 101, 107, 1, 127, 236, 63 | 249, 242, 152, 169, 123, 240, 129, 230, 135, 25, 57, 227, 130, 189, 76, 64 | 254, 33, 193, 39, 82, 177, 143, 31, 17, 20, 195, 219, 165, 171, 198, 65 | 125, 119, 216, 143, 55, 210, 17, 88, 150, 126, 38, 160, 71, 214, 10, 66 | 162, 158, 6, 234, 233, 119, 221, 167, 62, 146, 50, 150, 176, 142, 167, 67 | 201, 250, 195, 26, 156, 96, 36, 177, 95, 23, 7, 63, 55, 142, 80, 68 | 227, 73, 124, 93, 211, 231, 166, 182, 57, 145, 55, 242, 213, 246, 30, 69 | 146, 247, 19, 229, 34, 210, 37, 147, 242, 103, 125, 
91, 171, 51, 22, 70 | 126, 248, 149, 19, 60, 89, 5, 241, 132, 72, 217, 195, 11, 173, 247, 71 | 47, 144, 222, 94, 51, 166, 192, 50, 109, 62, 42, 126, 111, 204, 141, 72 | 66, 73 | }; 74 | 75 | /// Implements a doom-style random number generator. 76 | struct DoomRNG { 77 | // Points to the current random number. 78 | unsigned rand_curr = 0; 79 | 80 | void rand_reset() { rand_curr = 0; } 81 | 82 | uint8_t next_u8_random() { return random_bytes[rand_curr++ % 320]; } 83 | }; 84 | 85 | #endif // UTILS_H 86 | -------------------------------------------------------------------------------- /docs/annotated_glibc.txt: -------------------------------------------------------------------------------- 1 | <+0>: endbr64 2 | <+4>: vmovd %esi, %xmm0 3 | <+8>: movq %rdi, %rax 4 | <+11>: vpbroadcastb %xmm0, %ymm0 5 | <+16>: cmpq $0x20, %rdx 6 | <+20>: jb 0xBELOW_32____ ; <+190> 7 | <+26>: cmpq $0x40, %rdx 8 | <+30>: ja 0xABOVE_64____ ; <+46> 9 | <+32>: vmovdqu %ymm0, -0x20(%rdi,%rdx) 10 | <+38>: vmovdqu %ymm0, (%rdi) 11 | <+42>: vzeroupper 12 | <+45>: retq 13 | 0xABOVE_64____ <+46>: cmpq $0x800, %rdx ; imm = 0x800 14 | <+53>: ja 0xABOVE_2048__ ; ___lldb_unnamed_symbol1097$$libc.so.6 + 4 15 | <+55>: cmpq $0x80, %rdx 16 | <+62>: ja 0xABOVE_128___ ; <+89> 17 | 0xSZ_64_TO_128 <+64>: vmovdqu %ymm0, (%rdi) 18 | <+68>: vmovdqu %ymm0, 0x20(%rdi) 19 | <+73>: vmovdqu %ymm0, -0x20(%rdi,%rdx) 20 | <+79>: vmovdqu %ymm0, -0x40(%rdi,%rdx) 21 | 0xEXIT_EXIT___ <+85>: vzeroupper 22 | <+88>: retq 23 | 0xABOVE_128___ <+89>: leaq 0x80(%rdi), %rcx 24 | <+96>: vmovdqu %ymm0, (%rdi) 25 | <+100>: andq $-0x80, %rcx 26 | <+104>: vmovdqu %ymm0, -0x20(%rdi,%rdx) 27 | <+110>: vmovdqu %ymm0, 0x20(%rdi) 28 | <+115>: vmovdqu %ymm0, -0x40(%rdi,%rdx) 29 | <+121>: vmovdqu %ymm0, 0x40(%rdi) 30 | <+126>: vmovdqu %ymm0, -0x60(%rdi,%rdx) 31 | <+132>: vmovdqu %ymm0, 0x60(%rdi) 32 | <+137>: vmovdqu %ymm0, -0x80(%rdi,%rdx) 33 | <+143>: addq %rdi, %rdx 34 | <+146>: andq $-0x80, %rdx 35 | <+150>: cmpq %rdx, %rcx 36 | <+153>: je 
0xEXIT_EXIT___ ; <+85> 37 | 0xLOOP_4x32B__ <+155>: vmovdqa %ymm0, (%rcx) 38 | <+159>: vmovdqa %ymm0, 0x20(%rcx) 39 | <+164>: vmovdqa %ymm0, 0x40(%rcx) 40 | <+169>: vmovdqa %ymm0, 0x60(%rcx) 41 | <+174>: addq $0x80, %rcx 42 | <+181>: cmpq %rcx, %rdx 43 | <+184>: jne 0xLOOP_4x32B__ ; <+155> 44 | <+186>: vzeroupper 45 | <+189>: retq 46 | 0xBELOW_32____ <+190>: cmpb $0x10, %dl 47 | <+193>: jae 0xBELOW_16____ ; <+223> 48 | <+195>: vmovq %xmm0, %rcx 49 | <+200>: cmpb $0x8, %dl 50 | <+203>: jae 0xABOVE_8_____ ; <+237> 51 | <+205>: cmpb $0x4, %dl 52 | <+208>: jae 0xABOVE_4_____ ; <+249> 53 | <+210>: cmpb $0x1, %dl 54 | <+213>: ja 0xABOVE_1_____ ; <+259> 55 | <+215>: jb 0xIS_ZERO_CASE ; <+219> 56 | <+217>: movb %cl, (%rdi) 57 | 0xIS_ZERO_CASE <+219>: vzeroupper 58 | <+222>: retq 59 | 0xBELOW_16____ <+223>: vmovdqu %xmm0, -0x10(%rdi,%rdx) 60 | <+229>: vmovdqu %xmm0, (%rdi) 61 | <+233>: vzeroupper 62 | <+236>: retq 63 | 0xABOVE_8_____ <+237>: movq %rcx, -0x8(%rdi,%rdx) 64 | <+242>: movq %rcx, (%rdi) 65 | <+245>: vzeroupper 66 | <+248>: retq 67 | 0xABOVE_4____ <+249>: movl %ecx, -0x4(%rdi,%rdx) 68 | <+253>: movl %ecx, (%rdi) 69 | <+255>: vzeroupper 70 | <+258>: retq 71 | 0xABOVE_1_____ <+259>: movw %cx, -0x2(%rdi,%rdx) 72 | <+264>: movw %cx, (%rdi) 73 | <+267>: vzeroupper 74 | <+270>: retq 75 | <+271>: nop 76 | 77 | -------------------------------------------------------------------------------- /src/memcpy/impl.S: -------------------------------------------------------------------------------- 1 | #if defined(__APPLE__) 2 | .text 3 | .global _libc_memcpy 4 | .p2align 4, 0x90 5 | _libc_memcpy: 6 | jmp _memcpy 7 | 8 | #else 9 | 10 | .text 11 | .global libc_memcpy 12 | .p2align 4, 0x90 13 | libc_memcpy: 14 | jmp memcpy 15 | #endif 16 | 17 | #define LABEL(x) .L##x 18 | #if defined(__APPLE__) 19 | .text 20 | .global _asm_memcpy 21 | .p2align 5, 0x90 22 | _asm_memcpy: 23 | #else 24 | .text 25 | .global asm_memcpy 26 | .p2align 5, 0x90 27 | asm_memcpy: 28 | #endif 29 | 30 | // RDI 
is the dest 31 | // RSI is the src 32 | // RDX is length 33 | mov %rdi, %rax 34 | cmp $64,%rdx 35 | ja LABEL(over_64) 36 | cmp $16,%rdx 37 | jae LABEL(16_to_64) 38 | 39 | LABEL(below_16): 40 | cmp $4,%rdx 41 | jbe LABEL(0_to_4) 42 | cmp $8,%rdx 43 | jbe LABEL(in_4_to_8) 44 | LABEL(8_to_16): 45 | movq (%rsi), %rcx 46 | movq %rcx, (%rax) 47 | movq -8(%rsi,%rdx), %rcx 48 | movq %rcx, -8(%rax,%rdx) 49 | retq 50 | 51 | LABEL(0_to_4): 52 | // Copy the first two bytes: 53 | cmp $0,%rdx 54 | je LABEL(exit) 55 | movb (%rsi), %cl 56 | movb %cl, (%rdi) 57 | movb -1(%rsi,%rdx), %cl 58 | movb %cl, -1(%rdi,%rdx) 59 | cmp $2,%rdx 60 | jbe LABEL(exit) 61 | // Copy the second two bytes, if n > 2. 62 | movb 1(%rsi), %cl 63 | movb %cl, 1(%rdi) 64 | movb 2(%rsi), %cl 65 | movb %cl, 2(%rdi) 66 | retq 67 | LABEL(in_4_to_8): 68 | movl (%rsi), %ecx 69 | movl %ecx, (%rdi) 70 | movl -4(%rsi,%rdx), %ecx 71 | movl %ecx, -4(%rdi,%rdx) 72 | LABEL(exit): 73 | retq 74 | 75 | LABEL(16_to_64): 76 | cmp $32, %rdx 77 | jbe LABEL(16_to_32) 78 | 79 | LABEL(32_to_64): 80 | vmovdqu (%rsi), %ymm0 81 | vmovdqu %ymm0, (%rdi) 82 | vmovdqu -32(%rsi,%rdx), %ymm0 83 | vmovdqu %ymm0, -32(%rdi,%rdx) 84 | vzeroupper 85 | retq 86 | 87 | LABEL(16_to_32): 88 | movups (%rsi), %xmm0 89 | movups %xmm0, (%rdi) 90 | movups -16(%rsi,%rdx), %xmm0 91 | movups %xmm0, -16(%rdi,%rdx) 92 | retq 93 | 94 | // Handle buffers over 64 bytes: 95 | LABEL(over_64): 96 | cmp $128, %rdx 97 | ja LABEL(over_128) 98 | 99 | // Copy the last wide word. 100 | vmovups -32(%rsi,%rdx), %ymm0 101 | 102 | // Handle cases in the range 64 to 128. This is two unconditional 103 | // stores (64), 1 conditional store (32), and the one 32 byte store at 104 | // the end. 
105 | vmovups (%rsi), %ymm1 106 | vmovups 32(%rsi), %ymm2 107 | 108 | cmp $96, %rdx 109 | jbe LABEL(64_to_128_done) 110 | vmovups 64(%rsi), %ymm3 111 | vmovups %ymm3, 64(%rax) 112 | 113 | .align 4 114 | LABEL(64_to_128_done): 115 | vmovups %ymm1, (%rax) 116 | vmovups %ymm2, 32(%rax) 117 | // Store the last wide word. 118 | vmovups %ymm0, -32(%rax,%rdx) 119 | vzeroupper 120 | retq 121 | 122 | LABEL(over_128): 123 | // Compute the last writeable destination. 124 | lea -128(%rdx), %rcx 125 | xor %r8, %r8 126 | .align 16 127 | LABEL(over_128_copy_loop): 128 | vmovdqu (%rsi, %r8), %ymm0 129 | vmovdqu 32(%rsi, %r8), %ymm1 130 | vmovdqu 64(%rsi, %r8), %ymm2 131 | vmovdqu 96(%rsi, %r8), %ymm3 132 | vmovdqu %ymm0, (%rdi, %r8) 133 | vmovdqu %ymm1, 32(%rdi, %r8) 134 | vmovdqu %ymm2, 64(%rdi, %r8) 135 | vmovdqu %ymm3, 96(%rdi, %r8) 136 | add $128, %r8 137 | cmp %rcx, %r8 138 | jb LABEL(over_128_copy_loop) 139 | 140 | // Handle the tail: 141 | lea -32(%rdx), %rcx 142 | cmp %r8, %rcx 143 | jb LABEL(over_128_done) 144 | vmovdqu (%rsi, %r8), %ymm0 145 | vmovdqu %ymm0, (%rdi, %r8) 146 | add $32, %r8 147 | 148 | cmp %r8, %rcx 149 | jb LABEL(over_128_done) 150 | vmovdqu (%rsi, %r8), %ymm0 151 | vmovdqu %ymm0, (%rdi, %r8) 152 | add $32, %r8 153 | 154 | cmp %r8, %rcx 155 | jb LABEL(over_128_done) 156 | vmovdqu (%rsi, %r8), %ymm0 157 | vmovdqu %ymm0, (%rdi, %r8) 158 | 159 | LABEL(over_128_done): 160 | // Copy the last 32 bytes 161 | vmovdqu -32(%rsi, %rdx), %ymm0 162 | vmovdqu %ymm0, -32(%rdi, %rdx) 163 | 164 | vzeroupper 165 | retq 166 | -------------------------------------------------------------------------------- /src/memcpy/test_memcpy.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "decl.h" 6 | #include "utils.h" 7 | 8 | //////////////////////////////////////////////////////////////////////////////// 9 | // This is a small program that checks if some memcpy implementation is correct. 
10 | //////////////////////////////////////////////////////////////////////////////// 11 | 12 | #define MAGIC_VALUE0 '#' 13 | #define MAGIC_VALUE1 '=' 14 | 15 | void print_buffer(const char *start, const char *end, char val, 16 | const char *ptr) { 17 | const char *it = start; 18 | while (it != end) { 19 | std::cout << *it; 20 | it++; 21 | } 22 | std::cout << "\n"; 23 | it = start; 24 | while (it != ptr) { 25 | std::cout << " "; 26 | it++; 27 | } 28 | std::cout << "^\n"; 29 | std::cout << "Filling a buffer of length " << end - start << "."; 30 | std::cout << " Expected \"" << val << "\" at index " << ptr - start 31 | << std::endl; 32 | } 33 | 34 | void print_buffer_match(const char *start0, const char *start1, size_t len, 35 | size_t error_at) { 36 | 37 | for (size_t i = 0; i < len; i++) { 38 | std::cout << start0[i]; 39 | } 40 | std::cout << "\n"; 41 | for (size_t i = 0; i < len; i++) { 42 | std::cout << start1[i]; 43 | } 44 | std::cout << "\n"; 45 | 46 | for (size_t i = 0; i < error_at; i++) { 47 | std::cout << " "; 48 | } 49 | std::cout << "^\n"; 50 | std::cout << "Comparing buffers of length " << len << "."; 51 | std::cout << " Invalid value at index " << error_at << "." << std::endl; 52 | } 53 | 54 | // Make sure that the whole buffer, from \p start to \p end, is set to \p val. 55 | void assert_uniform_value(const char *start, const char *end, char val) { 56 | const char *ptr = start; 57 | while (ptr != end) { 58 | if (val != *ptr) { 59 | print_buffer(start, end, val, ptr); 60 | abort(); 61 | } 62 | ptr++; 63 | } 64 | } 65 | 66 | // Make sure that two buffers contain the same memory content. 
67 | void assert_buffers_match(const char *buff1, const char *buff2, size_t len) { 68 | for (size_t i = 0; i < len; i++) { 69 | if (buff1[i] != buff2[i]) { 70 | print_buffer_match(buff1, buff2, len, i); 71 | abort(); 72 | } 73 | } 74 | } 75 | 76 | void test_impl(memcpy_ty handle, const std::string &name, unsigned chunk_size) { 77 | std::vector src(chunk_size + 512); 78 | std::vector dest(chunk_size + 512, MAGIC_VALUE0); 79 | 80 | // Fill the buffer with a running counter of printable chars. 81 | for (unsigned i = 0; i < src.size(); i++) { 82 | src[i] = 'A' + (i % 26); 83 | } 84 | 85 | // Start copying memory at different offsets. 86 | for (int src_offset = 0; src_offset < 32; src_offset++) { 87 | for (int dest_offset = 0; dest_offset < 32; dest_offset++) { 88 | const char *dest_start = &*dest.begin(); 89 | const char *dest_end = &*dest.end(); 90 | 91 | const char *src_region_start = &src[src_offset]; 92 | char *dest_region_start = &dest[dest_offset]; 93 | char *dest_region_end = &dest[dest_offset + chunk_size]; 94 | 95 | void *res = 96 | (handle)((void *)dest_region_start, src_region_start, chunk_size); 97 | if (res != dest_region_start) { 98 | std::cout << "Invalid return value." << std::endl; 99 | abort(); 100 | } 101 | 102 | // Check the chunk. 103 | assert_buffers_match(dest_region_start, src_region_start, chunk_size); 104 | // Check before chunk. 105 | assert_uniform_value(dest_start, dest_region_start, MAGIC_VALUE0); 106 | // Check after chunk. 107 | assert_uniform_value(dest_region_end, dest_end, MAGIC_VALUE0); 108 | 109 | // Reset the dest buffer: 110 | std::fill(dest.begin(), dest.end(), MAGIC_VALUE0); 111 | } 112 | } 113 | } 114 | 115 | int main(int argc, char **argv) { 116 | std::cout << "Testing memcpy... 
\n"; 117 | 118 | #define TEST(FUNC, SIZE) test_impl(FUNC, #FUNC, SIZE); 119 | 120 | for (int i = 0; i < 1024; i++) { 121 | TEST(&memcpy, i); 122 | TEST(&__folly_memcpy, i); 123 | TEST(&local_memcpy, i); 124 | TEST(&asm_memcpy, i); 125 | } 126 | 127 | std::cout << "Done.\n"; 128 | 129 | return 0; 130 | } 131 | -------------------------------------------------------------------------------- /src/memset/impl.S: -------------------------------------------------------------------------------- 1 | #if defined(__APPLE__) 2 | .text 3 | .global _libc_memset 4 | .p2align 4, 0x90 5 | _libc_memset: 6 | jmp _memset 7 | 8 | #else 9 | 10 | .text 11 | .global libc_memset 12 | .p2align 4, 0x90 13 | libc_memset: 14 | jmp memset 15 | #endif 16 | 17 | #define LABEL(x) .L##x 18 | #if defined(__APPLE__) 19 | .text 20 | .global _asm_memset 21 | .p2align 5, 0x90 22 | _asm_memset: 23 | #else 24 | .text 25 | .global asm_memset 26 | .p2align 5, 0x90 27 | asm_memset: 28 | #endif 29 | 30 | // RDI is the buffer 31 | // RSI is the value 32 | // RDX is length 33 | vmovd %esi, %xmm0 34 | vpbroadcastb %xmm0,%ymm0 35 | mov %rdi,%rax 36 | cmp $0x40,%rdx 37 | jae LABEL(above_64) 38 | LABEL(below_64): 39 | cmp $0x20, %rdx 40 | jb LABEL(below_32) 41 | vmovdqu %ymm0,(%rdi) 42 | vmovdqu %ymm0,-0x20(%rdi,%rdx) 43 | vzeroupper 44 | retq 45 | LABEL(below_32): 46 | cmp $0x10, %rdx 47 | jae LABEL(in_16_to_32) 48 | LABEL(below_16): 49 | cmp $0x4, %rdx 50 | jbe LABEL(below_4) 51 | LABEL(in_4_to_16): 52 | // Scalar stores from this point. 53 | vmovq %xmm0, %rsi 54 | cmp $0x7, %rdx 55 | jbe LABEL(in_4_to_8) 56 | // two 8-wide stores, up to 16 bytes. 
57 | mov %rsi, -0x8(%rdi, %rdx) 58 | mov %rsi,(%rdi) 59 | vzeroupper 60 | retq 61 | .align 4 62 | LABEL(below_4): 63 | test %rdx, %rdx 64 | je LABEL(exit) 65 | mov %sil, (%rdi) 66 | mov %sil, -0x1(%rdi,%rdx) 67 | cmp $0x2, %rdx 68 | jbe LABEL(exit) 69 | mov %sil, 0x1(%rdi) 70 | mov %sil, 0x2(%rdi) 71 | mov %rdi,%rax 72 | .align 4 73 | LABEL(exit): 74 | vzeroupper 75 | retq 76 | LABEL(in_4_to_8): 77 | // two 4-wide stores, upto 8 bytes. 78 | mov %esi,-0x4(%rdi,%rdx) 79 | mov %esi,(%rdi) 80 | vzeroupper 81 | retq 82 | LABEL(in_16_to_32): 83 | vmovups %xmm0,(%rdi) 84 | vmovups %xmm0,-0x10(%rdi,%rdx) 85 | vzeroupper 86 | retq 87 | LABEL(above_64): 88 | cmp $0xb0, %rdx 89 | ja LABEL(above_192) 90 | cmp $0x80, %rdx 91 | jbe LABEL(in_64_to_128) 92 | // Do some work filling unaligned 32bit words. 93 | // last_word -> rsi 94 | lea -0x20(%rdi,%rdx),%rsi 95 | // rcx -> fill pointer. 96 | 97 | // We have at least 128 bytes to store. 98 | vmovdqu %ymm0,(%rdi) 99 | vmovdqu %ymm0, 0x20(%rdi) 100 | vmovdqu %ymm0, 0x40(%rdi) 101 | add $0x60,%rdi 102 | .align 8 103 | LABEL(fill_32): 104 | vmovdqu %ymm0,(%rdi) 105 | add $0x20,%rdi 106 | cmp %rdi,%rsi 107 | ja LABEL(fill_32) 108 | // Stamp the last unaligned store. 109 | vmovdqu %ymm0,(%rsi) 110 | vzeroupper 111 | retq 112 | LABEL(in_64_to_128): 113 | // last_word -> rsi 114 | vmovdqu %ymm0,(%rdi) 115 | vmovdqu %ymm0, 0x20(%rdi) 116 | vmovdqu %ymm0,-0x40(%rdi,%rdx) 117 | vmovdqu %ymm0,-0x20(%rdi,%rdx) 118 | vzeroupper 119 | retq 120 | 121 | LABEL(above_192): 122 | // rdi is the buffer address 123 | // rsi is the value 124 | // rdx is length 125 | // Store the first unaligned 32 bytes. 126 | vmovdqu %ymm0,(%rdi) 127 | 128 | // The first aligned word is stored in %rsi. 129 | mov %rdi,%rsi 130 | and $0xffffffffffffffe0,%rsi 131 | lea 0x20(%rsi),%rsi 132 | 133 | // Compute the address of the last unaligned word into rdi. 134 | lea -0x20(%rdx), %rdx 135 | add %rdx, %rdi 136 | 137 | // Check if we can do a full 5x32B stamp. 
138 | lea 0xa0(%rsi),%rcx 139 | cmp %rcx, %rdi 140 | jb LABEL(stamp_4) 141 | .align 8 142 | LABEL(fill_192): 143 | vmovdqa %ymm0,(%rsi) 144 | vmovdqa %ymm0,0x20(%rsi) 145 | vmovdqa %ymm0,0x40(%rsi) 146 | vmovdqa %ymm0,0x60(%rsi) 147 | vmovdqa %ymm0,0x80(%rsi) 148 | add $0xa0, %rsi 149 | lea 0xa0(%rsi),%rcx 150 | cmp %rcx, %rdi 151 | ja LABEL(fill_192) 152 | 153 | LABEL(fill_192_tail): 154 | cmp %rsi, %rdi 155 | jb LABEL(fill_192_done) 156 | vmovdqa %ymm0, (%rsi) 157 | 158 | lea 0x20(%rsi),%rcx 159 | cmp %rcx, %rdi 160 | jb LABEL(fill_192_done) 161 | vmovdqa %ymm0, 0x20(%rsi) 162 | 163 | lea 0x40(%rsi),%rcx 164 | cmp %rcx, %rdi 165 | jb LABEL(fill_192_done) 166 | vmovdqa %ymm0, 0x40(%rsi) 167 | 168 | lea 0x60(%rsi),%rcx 169 | cmp %rcx, %rdi 170 | jb LABEL(fill_192_done) 171 | vmovdqa %ymm0, 0x60(%rsi) 172 | 173 | LABEL(last_wide_store): 174 | lea 0x80(%rsi),%rcx 175 | cmp %rcx, %rdi 176 | jb LABEL(fill_192_done) 177 | vmovdqa %ymm0, 0x80(%rsi) 178 | LABEL(fill_192_done): 179 | // Stamp the last word. 180 | vmovdqu %ymm0,(%rdi) 181 | vzeroupper 182 | ret 183 | LABEL(stamp_4): 184 | vmovdqa %ymm0,(%rsi) 185 | vmovdqa %ymm0,0x20(%rsi) 186 | vmovdqa %ymm0,0x40(%rsi) 187 | vmovdqa %ymm0,0x60(%rsi) 188 | jmp LABEL(last_wide_store) 189 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fast Memset and Memcpy implementations 2 | 3 | *UPDATE*: Ilya Albrecht landed the memset implementation from this repo into [Folly](https://github.com/facebook/folly/blob/main/folly/memset.S). 4 | 5 | This repository contains high-performance implementations of memset and memcpy. 6 | These implementations outperform the folly and glibc implementations. This 7 | repository contains several reference implementations in C and assembly. The 8 | high-performance implementations are found in the files called "impl.S". 
9 | 10 | Before reading the source code in this repository you probably want to read an 11 | excellent blog [post](https://msrc-blog.microsoft.com/2021/01/11/building-faster-amd64-memset-routines/) 12 | by Joe Bialek about his work to optimize memset for windows. 13 | 14 | The charts below compare the code in this repo with other implementations: 15 | folly, musl, and glibc. The glibc implementations are measured with and without 16 | the elf indirection, as suggested by Dave Zarzycki. 17 | 18 | ## Memset 19 | ![Memset](docs/memset_bench.png) 20 | 21 | ## Memcpy 22 | ![Memcpy](docs/memcpy_bench.png) 23 | 24 | The chart below compares the performance of different memset implementations on 25 | buffers of varying sizes and offsets. Unlike the hot loop that hammers a single 26 | value, this benchmark is more realistic and takes into account mispredicted 27 | branches and the performance of the cpu decoder. The buffers are in the size 28 | range 0 to 256. The random function is made of pre-computed random values, to 29 | lower the overhead of the random function. This was suggested by Yann Collet. 30 | The 'nop' function is used to compute the benchmark setup and call overhead. The 31 | numbers below represent the implementation execution time minus the nop function 32 | time. 33 | 34 | ![memset](docs/memset_r.png) ![memcpy](docs/memcpy_r.png) 35 | 36 | The size of the buffer that memset and memcpy mutate is typically small. The 37 | picture below presents the buffer length distribution in google-chrome. Vim, 38 | Python, and even server workloads have a similar distribution. The values in the 39 | chart represent the power of two buffer size (10 represents the values between 40 | 512 and 1024). 41 | 42 | ![Histogram](docs/hist.png) 43 | 44 | 45 | The chart below presents a histogram of pointer alignment (from the game 46 | minecraft). Most of the pointers that are passed to memset and memcpy are 47 | aligned to 8-byte values. 
Some programs have histograms that are not as sharp, 48 | meaning that there are more values that are not aligned to 4 or 8-byte boundary. 49 | 50 | ![Pointer Alignment](docs/align.png) 51 | 52 | 53 | Memcpy and Memset are frequently called by low-level high-performance libraries. 54 | Here is an example of a stack trace from the Firefox codebase: 55 | 56 | ``` 57 | (gdb) bt 58 | #0 __memmove_avx_unaligned_erms () at ../sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:225 59 | #1 in memcpy (__dest=, __src=, __len=40) at /usr/include/x86_64-linux-gnu/bits/string_fortified.h:34 60 | #2 mozilla::BufferList::ReadBytes(mozilla::BufferList::IterImpl&, char*, unsigned long) const 61 | #3 Pickle::ReadBytesInto(PickleIterator*, void*, unsigned int) const (this=, iter=, data=, length=) 62 | #4 in IPC::Message::ReadFooter(void*, unsigned int, bool) (this=, buffer=, buffer_len=40, truncate=true) 63 | #5 in mozilla::ipc::NodeController::DeserializeEventMessage(mozilla::UniquePtr >) (this=, aMessage=...) 64 | #6 in mozilla::ipc::NodeController::OnEventMessage(mojo::core::ports::NodeName const&, mozilla::UniquePtr >) 65 | #7 in mozilla::ipc::NodeChannel::OnMessageReceived(IPC::Message&&) (this=, aMessage=...) 66 | #8 in IPC::Channel::ChannelImpl::ProcessIncomingMessages() (this=) 67 | #9 in IPC::Channel::ChannelImpl::OnFileCanReadWithoutBlocking(int) (this=, fd=) 68 | #10 in base::MessagePumpLibevent::OnLibeventNotification(int, short, void*) (fd=, flags=, context=) 69 | #11 in event_persist_closure (base=, ev=) at /build/firefox-HSiFn6/firefox-94.0+build3/ipc/chromium/src/third_party/libevent/event.c:1580 70 | #12 event_process_active_single_queue (base=, activeq=, max_to_process=, endtime=) 71 | 72 | ``` 73 | 74 | The repository contains a few utilities for testing and measuring the 75 | performance and correctness of memset and memcpy. 76 | 77 | ## Test tool 78 | 79 | This is a small test harness that verifies the correctness of the 80 | implementations. 
It's really easy to make mistakes with off-by-one errors and 81 | run into alignment issues. The exhaustive tester catches these issues. 82 | 83 | This is a sample output: 84 | ``` 85 | OOOOOOOOOOOXX 86 | ^ 87 | Filling a buffer of length 13. Expected "O" at index 11 88 | ``` 89 | 90 | ## Benchmark tool 91 | 92 | The benchmark tool measures the performance of the system libc and the local 93 | implementation. The benchmarking tool runs each of the implementations in a loop 94 | millions of times. It runs the benchmark several times and picks the least noisy 95 | results. It's a good idea to run the benchmark tool and compare some 96 | implementation to itself to assess the noise level in the system. The 97 | benchmarking tool uses a trampoline to prevent the compiler from inlining and 98 | expanding the memset. 99 | 100 | ## Histogram tool 101 | 102 | The histogram tool is a shared object that records calls to memset and 103 | memcpy and creates a histogram of the length parameter. It prints the histogram 104 | when the program exits cleanly. The shared object can be loaded using 105 | LD\_PRELOAD (on Linux) or DYLD\_INSERT\_LIBRARIES (on Mac). Each bucket in the 106 | output represents the log2 size of the buffer, and each value represents the 107 | number of hits for the bucket. 108 | 109 | ## Proxy tool 110 | 111 | This is a small utility that swaps the builtin call to memset and memcpy with 112 | the local implementation from this project. The shared object can be loaded 113 | using LD\_PRELOAD (on Linux) or DYLD\_INSERT\_LIBRARIES (on Mac). 
114 | 115 | -------------------------------------------------------------------------------- /src/memset/impl.c: -------------------------------------------------------------------------------- 1 | #include "types.h" 2 | 3 | #include 4 | #include 5 | 6 | // Handle memsets of sizes 0..32 7 | static inline void *small_memset(void *s, int c, size_t n) { 8 | if (n < 5) { 9 | if (n == 0) 10 | return s; 11 | char *p = s; 12 | p[0] = c; 13 | p[n - 1] = c; 14 | if (n <= 2) 15 | return s; 16 | p[1] = c; 17 | p[2] = c; 18 | return s; 19 | } 20 | 21 | if (n <= 16) { 22 | uint64_t val8 = ((uint64_t)0x0101010101010101L * ((uint8_t)c)); 23 | if (n >= 8) { 24 | char *first = s; 25 | char *last = s + n - 8; 26 | *((u64 *)first) = val8; 27 | *((u64 *)last) = val8; 28 | return s; 29 | } 30 | 31 | uint32_t val4 = val8; 32 | char *first = s; 33 | char *last = s + n - 4; 34 | *((u32 *)first) = val4; 35 | *((u32 *)last) = val4; 36 | return s; 37 | } 38 | 39 | char X = c; 40 | char *p = s; 41 | char16 val16 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; 42 | char *last = s + n - 16; 43 | *((char16 *)last) = val16; 44 | *((char16 *)p) = val16; 45 | return s; 46 | } 47 | 48 | static inline void *huge_memset(void *s, int c, size_t n) { 49 | char *p = s; 50 | char X = c; 51 | char32 val32 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, 52 | X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; 53 | 54 | // Stamp the first 32byte store. 55 | *((char32 *)p) = val32; 56 | 57 | char *first_aligned = p + 32 - ((uint64_t)p % 32); 58 | char *buffer_end = p + n; 59 | char *last_word = buffer_end - 32; 60 | 61 | // Align the next stores. 62 | p = first_aligned; 63 | 64 | // Unroll the body of the loop to increase parallelism. 
65 | while (p + (32 * 5) < buffer_end) { 66 | *((char32a *)p) = val32; 67 | p += 32; 68 | *((char32a *)p) = val32; 69 | p += 32; 70 | *((char32a *)p) = val32; 71 | p += 32; 72 | *((char32a *)p) = val32; 73 | p += 32; 74 | *((char32a *)p) = val32; 75 | p += 32; 76 | } 77 | 78 | // Complete the last few iterations: 79 | #define TRY_STAMP_32_BYTES \ 80 | if (p < last_word) { \ 81 | *((char32a *)p) = val32; \ 82 | p += 32; \ 83 | } 84 | 85 | TRY_STAMP_32_BYTES 86 | TRY_STAMP_32_BYTES 87 | TRY_STAMP_32_BYTES 88 | TRY_STAMP_32_BYTES 89 | 90 | // Stamp the last unaligned word. 91 | *((char32 *)last_word) = val32; 92 | return s; 93 | } 94 | 95 | void *local_memset(void *s, int c, size_t n) { 96 | char *p = s; 97 | char X = c; 98 | 99 | if (n < 32) { 100 | return small_memset(s, c, n); 101 | } 102 | 103 | if (n > 160) { 104 | return huge_memset(s, c, n); 105 | } 106 | 107 | char32 val32 = {X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, 108 | X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X}; 109 | 110 | char *last_word = s + n - 32; 111 | 112 | // Stamp the 32-byte chunks. 113 | do { 114 | *((char32 *)p) = val32; 115 | p += 32; 116 | } while (p < last_word); 117 | 118 | // Stamp the last unaligned 32 bytes of the buffer. 119 | *((char32 *)last_word) = val32; 120 | return s; 121 | } 122 | 123 | /// This a memset implementation that was copied from musl. We only use it for 124 | /// benchmarking. 125 | 126 | void *musl_memset(void *dest, int c, size_t n) { 127 | unsigned char *s = dest; 128 | size_t k; 129 | 130 | /* Fill head and tail with minimal branching. Each 131 | * conditional ensures that all the subsequently used 132 | * offsets are well-defined and in the dest region. 
*/ 133 | 134 | if (!n) 135 | return dest; 136 | s[0] = c; 137 | s[n - 1] = c; 138 | if (n <= 2) 139 | return dest; 140 | s[1] = c; 141 | s[2] = c; 142 | s[n - 2] = c; 143 | s[n - 3] = c; 144 | if (n <= 6) 145 | return dest; 146 | s[3] = c; 147 | s[n - 4] = c; 148 | if (n <= 8) 149 | return dest; 150 | 151 | /* Advance pointer to align it at a 4-byte boundary, 152 | * and truncate n to a multiple of 4. The previous code 153 | * already took care of any head/tail that get cut off 154 | * by the alignment. */ 155 | 156 | k = -(uintptr_t)s & 3; 157 | s += k; 158 | n -= k; 159 | n &= -4; 160 | 161 | #ifdef __GNUC__ 162 | typedef uint32_t __attribute__((__may_alias__)) u32; 163 | typedef uint64_t __attribute__((__may_alias__)) u64; 164 | 165 | u32 c32 = ((u32)-1) / 255 * (unsigned char)c; 166 | 167 | /* In preparation to copy 32 bytes at a time, aligned on 168 | * an 8-byte bounary, fill head/tail up to 28 bytes each. 169 | * As in the initial byte-based head/tail fill, each 170 | * conditional below ensures that the subsequent offsets 171 | * are valid (e.g. !(n<=24) implies n>=28). */ 172 | 173 | *(u32 *)(s + 0) = c32; 174 | *(u32 *)(s + n - 4) = c32; 175 | if (n <= 8) 176 | return dest; 177 | *(u32 *)(s + 4) = c32; 178 | *(u32 *)(s + 8) = c32; 179 | *(u32 *)(s + n - 12) = c32; 180 | *(u32 *)(s + n - 8) = c32; 181 | if (n <= 24) 182 | return dest; 183 | *(u32 *)(s + 12) = c32; 184 | *(u32 *)(s + 16) = c32; 185 | *(u32 *)(s + 20) = c32; 186 | *(u32 *)(s + 24) = c32; 187 | *(u32 *)(s + n - 28) = c32; 188 | *(u32 *)(s + n - 24) = c32; 189 | *(u32 *)(s + n - 20) = c32; 190 | *(u32 *)(s + n - 16) = c32; 191 | 192 | /* Align to a multiple of 8 so we can fill 64 bits at a time, 193 | * and avoid writing the same bytes twice as much as is 194 | * practical without introducing additional branching. 
*/ 195 | 196 | k = 24 + ((uintptr_t)s & 4); 197 | s += k; 198 | n -= k; 199 | 200 | /* If this loop is reached, 28 tail bytes have already been 201 | * filled, so any remainder when n drops below 32 can be 202 | * safely ignored. */ 203 | 204 | u64 c64 = c32 | ((u64)c32 << 32); 205 | for (; n >= 32; n -= 32, s += 32) { 206 | *(u64 *)(s + 0) = c64; 207 | *(u64 *)(s + 8) = c64; 208 | *(u64 *)(s + 16) = c64; 209 | *(u64 *)(s + 24) = c64; 210 | } 211 | #else 212 | /* Pure C fallback with no aliasing violations. */ 213 | for (; n; n--, s++) 214 | *s = c; 215 | #endif 216 | 217 | return dest; 218 | } 219 | -------------------------------------------------------------------------------- /src/memcpy/folly.S: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) Facebook, Inc. and its affiliates. 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | /* 18 | * __folly_memcpy: An optimized memcpy implementation that uses prefetch and 19 | * AVX2 instructions. 20 | * 21 | * This implementation of memcpy acts as a memmove, but it is not optimized for 22 | * this purpose. While overlapping copies are undefined in memcpy, this 23 | * implementation acts like memmove for sizes up through 256 bytes and will 24 | * detect overlapping copies and call memmove for overlapping copies of 257 or 25 | * more bytes. 26 | * 27 | * This implementation uses prefetch to avoid dtlb misses. 
This can 28 | * substantially reduce dtlb store misses in cases where the destination 29 | * location is absent from L1 cache and where the copy size is small enough 30 | * that the hardware prefetcher doesn't have a large impact. 31 | * 32 | * The number of branches is limited by the use of overlapping copies. This 33 | * helps with copies where the source and destination cache lines are already 34 | * present in L1 because there are fewer instructions to execute and fewer 35 | * branches to potentially mispredict. 36 | * 37 | * Vector operations up to 32-bytes are used (avx2 instruction set). Larger 38 | * mov operations (avx512) are not used. 39 | * 40 | * Large copies make use of aligned store operations. This operation is 41 | * observed to always be faster than rep movsb, so the rep movsb instruction 42 | * is not used. 43 | * 44 | * If the copy size is humongous and the source and destination are both 45 | * aligned, this memcpy will use non-temporal operations. This can have 46 | * a substantial speedup for copies where data is absent from L1, but it 47 | * is significantly slower if the source and destination data were already 48 | * in L1. The use of non-temporal operations also has the effect that after 49 | * the copy is complete, the data will be moved out of L1, even if the data was 50 | * present before the copy started. 51 | * 52 | * @author Logan Evans 53 | */ 54 | 55 | #if defined(__AVX2__) 56 | 57 | // This threshold is half of L1 cache on a Skylake machine, which means that 58 | // potentially all of L1 will be populated by this copy once it is executed 59 | // (dst and src are cached for temporal copies). 
60 | #define NON_TEMPORAL_STORE_THRESHOLD $32768 61 | 62 | .file "memcpy.S" 63 | .section .text,"ax" 64 | 65 | .type __folly_memcpy_short, @function 66 | __folly_memcpy_short: 67 | .cfi_startproc 68 | 69 | .L_GE1_LE7: 70 | cmp $1, %rdx 71 | je .L_EQ1 72 | 73 | cmp $4, %rdx 74 | jae .L_GE4_LE7 75 | 76 | .L_GE2_LE3: 77 | movw (%rsi), %r8w 78 | movw -2(%rsi,%rdx), %r9w 79 | movw %r8w, (%rdi) 80 | movw %r9w, -2(%rdi,%rdx) 81 | ret 82 | 83 | .align 2 84 | .L_EQ1: 85 | movb (%rsi), %r8b 86 | movb %r8b, (%rdi) 87 | ret 88 | 89 | // Aligning the target of a jump to an even address has a measurable 90 | // speedup in microbenchmarks. 91 | .align 2 92 | .L_GE4_LE7: 93 | movl (%rsi), %r8d 94 | movl -4(%rsi,%rdx), %r9d 95 | movl %r8d, (%rdi) 96 | movl %r9d, -4(%rdi,%rdx) 97 | ret 98 | 99 | .cfi_endproc 100 | .size __folly_memcpy_short, .-__folly_memcpy_short 101 | 102 | // memcpy is an alternative entrypoint into the function named __folly_memcpy. 103 | // The compiler is able to call memcpy since the name is global while 104 | // stacktraces will show __folly_memcpy since that is the name of the function. 105 | // This is intended to aid in debugging by making it obvious which version of 106 | // memcpy is being used. 
107 | .align 64 108 | .globl __folly_memcpy 109 | .type __folly_memcpy, @function 110 | 111 | __folly_memcpy: 112 | .cfi_startproc 113 | 114 | mov %rdi, %rax 115 | 116 | test %rdx, %rdx 117 | je .L_EQ0 118 | 119 | prefetchw (%rdi) 120 | prefetchw -1(%rdi,%rdx) 121 | 122 | cmp $8, %rdx 123 | jb .L_GE1_LE7 124 | 125 | .L_GE8: 126 | cmp $32, %rdx 127 | ja .L_GE33 128 | 129 | .L_GE8_LE32: 130 | cmp $16, %rdx 131 | ja .L_GE17_LE32 132 | 133 | .L_GE8_LE16: 134 | mov (%rsi), %r8 135 | mov -8(%rsi,%rdx), %r9 136 | mov %r8, (%rdi) 137 | mov %r9, -8(%rdi,%rdx) 138 | .L_EQ0: 139 | ret 140 | 141 | .align 2 142 | .L_GE17_LE32: 143 | movdqu (%rsi), %xmm0 144 | movdqu -16(%rsi,%rdx), %xmm1 145 | movdqu %xmm0, (%rdi) 146 | movdqu %xmm1, -16(%rdi,%rdx) 147 | ret 148 | 149 | .align 2 150 | .L_GE193_LE256: 151 | vmovdqu %ymm3, 96(%rdi) 152 | vmovdqu %ymm4, -128(%rdi,%rdx) 153 | 154 | .L_GE129_LE192: 155 | vmovdqu %ymm2, 64(%rdi) 156 | vmovdqu %ymm5, -96(%rdi,%rdx) 157 | 158 | .L_GE65_LE128: 159 | vmovdqu %ymm1, 32(%rdi) 160 | vmovdqu %ymm6, -64(%rdi,%rdx) 161 | 162 | .L_GE33_LE64: 163 | vmovdqu %ymm0, (%rdi) 164 | vmovdqu %ymm7, -32(%rdi,%rdx) 165 | 166 | vzeroupper 167 | ret 168 | 169 | .align 2 170 | .L_GE33: 171 | vmovdqu (%rsi), %ymm0 172 | vmovdqu -32(%rsi,%rdx), %ymm7 173 | 174 | cmp $64, %rdx 175 | jbe .L_GE33_LE64 176 | 177 | prefetchw 64(%rdi) 178 | 179 | vmovdqu 32(%rsi), %ymm1 180 | vmovdqu -64(%rsi,%rdx), %ymm6 181 | 182 | cmp $128, %rdx 183 | jbe .L_GE65_LE128 184 | 185 | prefetchw 128(%rdi) 186 | 187 | vmovdqu 64(%rsi), %ymm2 188 | vmovdqu -96(%rsi,%rdx), %ymm5 189 | 190 | cmp $192, %rdx 191 | jbe .L_GE129_LE192 192 | 193 | prefetchw 192(%rdi) 194 | 195 | vmovdqu 96(%rsi), %ymm3 196 | vmovdqu -128(%rsi,%rdx), %ymm4 197 | 198 | cmp $256, %rdx 199 | jbe .L_GE193_LE256 200 | 201 | .L_GE257: 202 | prefetchw 256(%rdi) 203 | 204 | // Check if there is an overlap. If there is an overlap then the caller 205 | // has a bug since this is undefined behavior. 
However, for legacy 206 | // reasons this behavior is expected by some callers. 207 | // 208 | // All copies through 256 bytes will operate as a memmove since for 209 | // those sizes all reads are performed before any writes. 210 | // 211 | // This check uses the idea that there is an overlap if 212 | // (%rdi < (%rsi + %rdx)) && (%rsi < (%rdi + %rdx)), 213 | // or equivalently, there is no overlap if 214 | // ((%rsi + %rdx) <= %rdi) || ((%rdi + %rdx) <= %rsi). 215 | // 216 | // %r9 will be used after .L_ALIGNED_DST_LOOP to calculate how many 217 | // bytes remain to be copied. 218 | lea (%rsi,%rdx), %r9 219 | cmp %rdi, %r9 220 | jbe .L_NO_OVERLAP 221 | lea (%rdi,%rdx), %r8 222 | cmp %rsi, %r8 223 | // This is a forward jump so that the branch predictor will not predict 224 | // a memmove. 225 | ja .L_MEMMOVE 226 | 227 | .align 2 228 | .L_NO_OVERLAP: 229 | vmovdqu %ymm0, (%rdi) 230 | vmovdqu %ymm1, 32(%rdi) 231 | vmovdqu %ymm2, 64(%rdi) 232 | vmovdqu %ymm3, 96(%rdi) 233 | 234 | // Align %rdi to a 32 byte boundary. 235 | // %rcx = 128 - 31 & %rdi 236 | mov $128, %rcx 237 | and $31, %rdi 238 | sub %rdi, %rcx 239 | 240 | lea (%rsi,%rcx), %rsi 241 | lea (%rax,%rcx), %rdi 242 | sub %rcx, %rdx 243 | 244 | // %r8 is the end condition for the loop. 
245 | lea -128(%rsi,%rdx), %r8 246 | 247 | cmp NON_TEMPORAL_STORE_THRESHOLD, %rdx 248 | jae .L_NON_TEMPORAL_LOOP 249 | 250 | .align 2 251 | .L_ALIGNED_DST_LOOP: 252 | prefetchw 128(%rdi) 253 | prefetchw 192(%rdi) 254 | 255 | vmovdqu (%rsi), %ymm0 256 | vmovdqu 32(%rsi), %ymm1 257 | vmovdqu 64(%rsi), %ymm2 258 | vmovdqu 96(%rsi), %ymm3 259 | add $128, %rsi 260 | 261 | vmovdqa %ymm0, (%rdi) 262 | vmovdqa %ymm1, 32(%rdi) 263 | vmovdqa %ymm2, 64(%rdi) 264 | vmovdqa %ymm3, 96(%rdi) 265 | add $128, %rdi 266 | 267 | cmp %r8, %rsi 268 | jb .L_ALIGNED_DST_LOOP 269 | 270 | .L_ALIGNED_DST_LOOP_END: 271 | sub %rsi, %r9 272 | mov %r9, %rdx 273 | 274 | vmovdqu %ymm4, -128(%rdi,%rdx) 275 | vmovdqu %ymm5, -96(%rdi,%rdx) 276 | vmovdqu %ymm6, -64(%rdi,%rdx) 277 | vmovdqu %ymm7, -32(%rdi,%rdx) 278 | 279 | vzeroupper 280 | ret 281 | 282 | .align 2 283 | .L_NON_TEMPORAL_LOOP: 284 | testb $31, %sil 285 | jne .L_ALIGNED_DST_LOOP 286 | // This is prefetching the source data unlike ALIGNED_DST_LOOP which 287 | // prefetches the destination data. This choice is again informed by 288 | // benchmarks. With a non-temporal store the entirety of the cache line 289 | // is being written so the previous data can be discarded without being 290 | // fetched. 
291 | prefetchnta 128(%rsi) 292 | prefetchnta 196(%rsi) 293 | 294 | vmovntdqa (%rsi), %ymm0 295 | vmovntdqa 32(%rsi), %ymm1 296 | vmovntdqa 64(%rsi), %ymm2 297 | vmovntdqa 96(%rsi), %ymm3 298 | add $128, %rsi 299 | 300 | vmovntdq %ymm0, (%rdi) 301 | vmovntdq %ymm1, 32(%rdi) 302 | vmovntdq %ymm2, 64(%rdi) 303 | vmovntdq %ymm3, 96(%rdi) 304 | add $128, %rdi 305 | 306 | cmp %r8, %rsi 307 | jb .L_NON_TEMPORAL_LOOP 308 | 309 | sfence 310 | jmp .L_ALIGNED_DST_LOOP_END 311 | 312 | .L_MEMMOVE: 313 | call memmove 314 | ret 315 | 316 | .cfi_endproc 317 | .size __folly_memcpy, .-__folly_memcpy 318 | 319 | #ifdef FOLLY_MEMCPY_IS_MEMCPY 320 | .weak memcpy 321 | memcpy = __folly_memcpy 322 | #endif 323 | 324 | .ident "GCC: (GNU) 4.8.2" 325 | #ifdef __linux__ 326 | .section .note.GNU-stack,"",@progbits 327 | #endif 328 | 329 | #endif 330 | --------------------------------------------------------------------------------