├── .gitignore ├── .gitmodules ├── BUILD ├── CMakeLists.txt ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── WORKSPACE ├── cmake ├── bazel_to_cmake.py ├── bazel_to_cmake.sh ├── run_android_test.sh ├── ruyConfig.cmake.in ├── ruy_add_all_subdirs.cmake ├── ruy_cc_binary.cmake ├── ruy_cc_library.cmake ├── ruy_cc_test.cmake └── ruy_include_directories.cmake ├── doc ├── README.md ├── depgraph.sh └── depgraph.svg ├── example ├── BUILD ├── CMakeLists.txt ├── README.md ├── example.cc └── parametrized_example.cc ├── ruy ├── BUILD ├── CMakeLists.txt ├── allocator.cc ├── allocator.h ├── allocator_test.cc ├── apply_multiplier.cc ├── apply_multiplier.h ├── apply_multiplier_test.cc ├── asm_helpers.h ├── benchmark.cc ├── block_map.cc ├── block_map.h ├── block_map_test.cc ├── blocking_counter.cc ├── blocking_counter.h ├── build_defs.bzl ├── build_defs.oss.bzl ├── check_macros.h ├── check_macros_test.cc ├── context.cc ├── context.h ├── context_get_ctx.cc ├── context_get_ctx.h ├── context_test.cc ├── cpu_cache_params.h ├── cpuinfo.cc ├── cpuinfo.h ├── create_trmul_params.h ├── ctx.cc ├── ctx.h ├── ctx_impl.h ├── ctx_test.cc ├── denormal.cc ├── denormal.h ├── frontend.cc ├── frontend.h ├── gtest_wrapper.h ├── have_built_path_for.h ├── have_built_path_for_avx.cc ├── have_built_path_for_avx2_fma.cc ├── have_built_path_for_avx512.cc ├── kernel.h ├── kernel_arm.h ├── kernel_arm32.cc ├── kernel_arm64.cc ├── kernel_avx.cc ├── kernel_avx2_fma.cc ├── kernel_avx512.cc ├── kernel_common.h ├── kernel_x86.h ├── mat.h ├── matrix.h ├── matrix_test.cc ├── mul_params.h ├── mul_params_test.cc ├── opt_set.h ├── pack.h ├── pack_arm.cc ├── pack_arm.h ├── pack_avx.cc ├── pack_avx2_fma.cc ├── pack_avx512.cc ├── pack_common.h ├── pack_x86.h ├── path.h ├── perchannel_buffers_reallocation_test.cc ├── performance_advisory.h ├── platform.h ├── pmu.cc ├── pmu.h ├── prepacked_cache.cc ├── prepacked_cache.h ├── prepacked_cache_test.cc ├── prepare_packed_matrices.cc ├── prepare_packed_matrices.h ├── profiler │ ├── 
BUILD │ ├── CMakeLists.txt │ ├── README.md │ ├── instrumentation.cc │ ├── instrumentation.h │ ├── profiler.cc │ ├── profiler.h │ ├── test.cc │ ├── test_instrumented_library.cc │ ├── test_instrumented_library.h │ ├── treeview.cc │ └── treeview.h ├── reference_mul.h ├── ruy.h ├── ruy_test.bzl ├── ruy_test_ext.oss.bzl ├── side_pair.h ├── size_util.h ├── size_util_test.cc ├── strategy_controls.h ├── system_aligned_alloc.cc ├── system_aligned_alloc.h ├── test.h ├── test_fast.cc ├── test_overflow_dst_zero_point.cc ├── test_slow.cc ├── thread_pool.cc ├── thread_pool.h ├── time.h ├── trace.h ├── trmul.cc ├── trmul.h ├── trmul_params.h ├── tune.cc ├── tune.h ├── tune_test.cc ├── validate.h ├── wait.cc ├── wait.h └── wait_test.cc └── third_party ├── BUILD ├── CMakeLists.txt └── cpuinfo.BUILD /.gitignore: -------------------------------------------------------------------------------- 1 | # Visual Studio files 2 | .vs/ 3 | .vscode/ 4 | *.sdf 5 | *.opensdf 6 | *.VC.opendb 7 | *.suo 8 | *.user 9 | 10 | # macOS files 11 | .DS_Store 12 | 13 | # CMake artifacts 14 | build/ 15 | build-*/ 16 | 17 | # Bazel artifacts 18 | **/bazel-* 19 | 20 | # Emacs autosaves 21 | *~ 22 | \#*\# 23 | 24 | # Vim swap files 25 | [._]*.sw[a-p] 26 | 27 | # Source indexing files 28 | compile_commands.json 29 | .cache/clangd 30 | .clangd/ 31 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "googletest"] 2 | path = third_party/googletest 3 | url = https://github.com/google/googletest 4 | [submodule "cpuinfo"] 5 | path = third_party/cpuinfo 6 | url = https://github.com/pytorch/cpuinfo 7 | -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | # Ruy is not BLAS 2 | 3 | load("//tools/build_defs/license:license.bzl", "license") 4 | 5 | package( 6 | 
default_applicable_licenses = ["//third_party/ruy:license"], 7 | licenses = ["notice"], # Apache 2.0 8 | ) 9 | 10 | license( 11 | name = "license", 12 | package_name = "ruy", 13 | ) 14 | 15 | exports_files(["LICENSE"]) 16 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Copyright 2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | cmake_policy(SET CMP0012 NEW) 16 | cmake_policy(SET CMP0048 NEW) 17 | project(ruy CXX) 18 | cmake_minimum_required(VERSION 3.13) # Copied from IREE 19 | set(CMAKE_CXX_STANDARD 14) 20 | 21 | include(GNUInstallDirs) 22 | 23 | if (PROJECT_NAME STREQUAL CMAKE_PROJECT_NAME) 24 | set(RUY_IS_TOPLEVEL TRUE) 25 | set(RUY_MINIMAL_BUILD_DEFAULT_VALUE OFF) 26 | else() 27 | set(RUY_IS_TOPLEVEL FALSE) 28 | set(RUY_MINIMAL_BUILD_DEFAULT_VALUE ON) 29 | endif() 30 | 31 | option(RUY_MINIMAL_BUILD "Disable ruy's tests, examples, etc. Build only ruy public libraries." 
${RUY_MINIMAL_BUILD_DEFAULT_VALUE}) 32 | if (NOT RUY_MINIMAL_BUILD) 33 | enable_testing() 34 | endif() 35 | 36 | option(RUY_PROFILER "Enable ruy's built-in profiler (harms performance)" OFF) 37 | 38 | option(RUY_ENABLE_INSTALL "Enable install rule" ${RUY_IS_TOPLEVEL}) 39 | 40 | include(cmake/ruy_add_all_subdirs.cmake) 41 | include(cmake/ruy_cc_library.cmake) 42 | include(cmake/ruy_cc_binary.cmake) 43 | include(cmake/ruy_cc_test.cmake) 44 | 45 | option(RUY_FIND_CPUINFO "Use find_package to find cpuinfo" OFF) 46 | 47 | # Skip cpuinfo if it was already generated, which can happen when ruy is 48 | # a subdirectory in a wider project that already uses cpuinfo. 49 | if (NOT TARGET cpuinfo::cpuinfo) 50 | if (RUY_FIND_CPUINFO) 51 | find_package(cpuinfo REQUIRED) 52 | else() 53 | # Test if the third_party/cpuinfo submodule was checked out before 54 | # adding that subdirectory, so we can do more helpful things below in the 55 | # else() block when it's not. 56 | set(RUY_CPUINFO_CMAKELISTS_FILE "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cpuinfo/CMakeLists.txt") 57 | if (EXISTS "${RUY_CPUINFO_CMAKELISTS_FILE}") 58 | # Disabling cpuinfo's tests and benchmarks to prevent a copy of its 59 | # googletest dependency getting downloaded into a 'deps' directory in the 60 | # source tree! 61 | set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE) 62 | set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE) 63 | set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "" FORCE) 64 | add_subdirectory("third_party/cpuinfo" EXCLUDE_FROM_ALL) 65 | else() 66 | # third_party/cpuinfo is not checked out. That could be intentional when 67 | # ruy is a subdirectory in a wider project that is already providing 68 | # the cpuinfo target. Maybe that wider project's CMakeLists is ordered 69 | # in such a way that cpuinfo gets generated after ruy. In that case, 70 | # it's helpful that we continue silently. In the worst case if the cpuinfo 71 | # target never gets defined, ruy will fail to compile. 
72 | # On the other hand, if ruy is the top-level project here (not part of a 73 | # wider project) then nothing will define the cpuinfo target for us, 74 | # so we will definitely fail to compile, so we may as well fail right here. 75 | if (RUY_IS_TOPLEVEL) 76 | message(FATAL_ERROR "This file does not exist:\n${RUY_CPUINFO_CMAKELISTS_FILE}\n" 77 | "That typically means that the git submodules of the ruy " 78 | "repository haven't been checked out. Try this in the ruy " 79 | "git directory:\n git submodule update --init") 80 | endif() 81 | endif() 82 | endif() 83 | endif() 84 | 85 | # googletest is only needed for tests. Projects embedding ruy as a subdirectory 86 | # and not needing to build ruy tests may proceed without a local checkout of 87 | # third_party/googletest. 88 | if (NOT RUY_MINIMAL_BUILD 89 | AND NOT TARGET gtest 90 | AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/CMakeLists.txt") 91 | add_subdirectory("third_party/googletest" EXCLUDE_FROM_ALL) 92 | endif() 93 | 94 | add_subdirectory("ruy") 95 | 96 | if (NOT RUY_MINIMAL_BUILD) 97 | add_subdirectory("example") 98 | endif() 99 | 100 | if (RUY_ENABLE_INSTALL) 101 | install(EXPORT ${PROJECT_NAME}Targets 102 | NAMESPACE ${PROJECT_NAME}:: 103 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" 104 | ) 105 | 106 | include(CMakePackageConfigHelpers) 107 | 108 | configure_package_config_file( 109 | "cmake/${PROJECT_NAME}Config.cmake.in" 110 | "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 111 | INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" 112 | ) 113 | 114 | install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" 115 | DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" 116 | ) 117 | endif() 118 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We'd love to accept your patches 
and contributions to this project. There are 4 | just a few small guidelines you need to follow. 5 | 6 | ## Contributor License Agreement 7 | 8 | Contributions to this project must be accompanied by a Contributor License 9 | Agreement. You (or your employer) retain the copyright to your contribution; 10 | this simply gives us permission to use and redistribute your contributions as 11 | part of the project. Head over to <https://cla.developers.google.com/> to see 12 | your current agreements on file or to sign a new one. 13 | 14 | You generally only need to submit a CLA once, so if you've already submitted one 15 | (even if it was for a different project), you probably don't need to do it 16 | again. 17 | 18 | ## Code reviews 19 | 20 | All submissions, including submissions by project members, require review. We 21 | use GitHub pull requests for this purpose. Consult 22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more 23 | information on using pull requests. 24 | 25 | ## Community Guidelines 26 | 27 | This project follows [Google's Open Source Community 28 | Guidelines](https://opensource.google/conduct/). 29 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # The ruy matrix multiplication library 2 | 3 | This is not an officially supported Google product. 4 | 5 | ruy is a matrix multiplication library. Its focus is to cover the matrix 6 | multiplication needs of neural network inference engines. Its initial user has 7 | been TensorFlow Lite, where it is used by default on the ARM CPU architecture. 8 | 9 | ruy supports both floating-point and 8bit-integer-quantized matrices. 
10 | 11 | ## Efficiency 12 | 13 | ruy is designed to achieve high performance not just on very large sizes, as 14 | is the focus of many established libraries, but on whatever are the actual sizes 15 | and shapes of matrices most critical in current TensorFlow Lite applications. 16 | This often means quite small sizes, e.g. 100x100 or even 50x50, and all sorts of 17 | rectangular shapes. It's not as fast as completely specialized code for each 18 | shape, but it aims to offer a good compromise of speed across all shapes and a 19 | small binary size. 20 | 21 | ## Documentation 22 | 23 | Some documentation will eventually be available in the doc/ directory, see 24 | [doc/README.md](doc/README.md). 25 | 26 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Workspace file for the Ruy project. 
16 | 17 | workspace(name = "com_google_ruy") 18 | 19 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive") 20 | load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe") 21 | 22 | maybe( 23 | local_repository, 24 | name = "com_google_googletest", 25 | path = "third_party/googletest", 26 | ) 27 | 28 | maybe( 29 | new_local_repository, 30 | name = "cpuinfo", 31 | path = "third_party/cpuinfo", 32 | build_file = "@//third_party:cpuinfo.BUILD", 33 | ) 34 | 35 | # skylib utility for additional bazel functionality. 36 | skylib_version = "0.9.0" 37 | http_archive( 38 | name = "bazel_skylib", 39 | type = "tar.gz", 40 | url = "https://github.com/bazelbuild/bazel-skylib/releases/download/{}/bazel_skylib-{}.tar.gz".format (skylib_version, skylib_version), 41 | sha256 = "1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0", 42 | ) 43 | load("@bazel_skylib//lib:versions.bzl", "versions") 44 | versions.check(minimum_bazel_version = "2.0.0") 45 | -------------------------------------------------------------------------------- /cmake/bazel_to_cmake.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2021 Google LLC 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # https://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | this_script_dir="$(dirname "$0")" 17 | 18 | root_dir="$(git -C "${this_script_dir}" rev-parse --show-toplevel)" 19 | 20 | build_files="$(find "${root_dir}" -type f -name BUILD)" 21 | 22 | if ! command -v python3 &> /dev/null; then 23 | python_command=python 24 | else 25 | python_command=python3 26 | fi 27 | 28 | for build_file in ${build_files}; do 29 | package_dir="$(dirname "${build_file}")" 30 | if [[ "${package_dir}" == "${root_dir}" ]]; then 31 | # The root CMakeLists.txt is not generated. 32 | continue 33 | fi 34 | "${python_command}" "${this_script_dir}/bazel_to_cmake.py" "${root_dir}" "${package_dir}" > "${package_dir}/CMakeLists.txt" 35 | done 36 | -------------------------------------------------------------------------------- /cmake/run_android_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Minimal script pushing and running a file on device! 4 | # Contemporary versions of ADB properly propagate exit codes so nothing more 5 | # is needed to let CTest report test success/failure. 6 | 7 | # TODO: consider clearing temporary files after testing, although that will 8 | # get in the way of debugging and will make code more complex... also, 9 | # Ruy's test files aren't huge and people running these probably have 10 | # bigger clutter issues in their /data/local/tmp anyway. Anyway, if we want 11 | # to do this, we could copy IREE's code. 12 | 13 | device_tmpdir=/data/local/tmp 14 | 15 | adb push "$1" "${device_tmpdir}" 16 | adb shell "${device_tmpdir}/$(basename "$1")" 17 | -------------------------------------------------------------------------------- /cmake/ruyConfig.cmake.in: -------------------------------------------------------------------------------- 1 | # ruy CMake configuration file. 
2 | 3 | include(CMakeFindDependencyMacro) 4 | 5 | find_dependency(cpuinfo) 6 | 7 | @PACKAGE_INIT@ 8 | 9 | include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake") 10 | -------------------------------------------------------------------------------- /cmake/ruy_add_all_subdirs.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Forked from IREE's iree_add_all_subdirs.cmake. 16 | 17 | # ruy_add_all_subdirs() 18 | # 19 | # CMake function to add all subdirectories of the current directory that contain 20 | # a CMakeLists.txt file 21 | # 22 | # Takes no arguments. 
23 | function(ruy_add_all_subdirs) 24 | FILE(GLOB _CHILDREN RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/*) 25 | SET(_DIRLIST "") 26 | foreach(_CHILD ${_CHILDREN}) 27 | if((NOT(_CHILD MATCHES third_party)) AND 28 | (IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/${_CHILD}) AND 29 | (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${_CHILD}/CMakeLists.txt)) 30 | LIST(APPEND _DIRLIST ${_CHILD}) 31 | endif() 32 | endforeach() 33 | 34 | foreach(subdir ${_DIRLIST}) 35 | add_subdirectory(${subdir}) 36 | endforeach() 37 | endfunction() 38 | -------------------------------------------------------------------------------- /cmake/ruy_cc_binary.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Forked from IREE's iree_cc_binary.cmake. 16 | 17 | include(CMakeParseArguments) 18 | include(cmake/ruy_include_directories.cmake) 19 | 20 | # ruy_cc_binary() 21 | # 22 | # CMake function to imitate Bazel's cc_binary rule. 
23 | function(ruy_cc_binary) 24 | cmake_parse_arguments( 25 | _RULE 26 | "TESTONLY" 27 | "NAME" 28 | "SRCS;COPTS;LINKOPTS;DEPS;TAGS" 29 | ${ARGN} 30 | ) 31 | 32 | if(_RULE_TESTONLY AND RUY_MINIMAL_BUILD) 33 | return() 34 | endif() 35 | 36 | set(_NAME "${_RULE_NAME}") 37 | 38 | add_executable(${_NAME} "") 39 | target_sources(${_NAME} 40 | PRIVATE 41 | ${_RULE_SRCS} 42 | ) 43 | set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_RULE_NAME}") 44 | ruy_include_directories(${_NAME} "${_RULE_DEPS}") 45 | target_compile_options(${_NAME} 46 | PRIVATE 47 | ${_RULE_COPTS} 48 | ) 49 | target_link_options(${_NAME} 50 | PRIVATE 51 | ${_RULE_LINKOPTS} 52 | ) 53 | target_link_libraries(${_NAME} 54 | PUBLIC 55 | ${_RULE_DEPS} 56 | ) 57 | endfunction() 58 | -------------------------------------------------------------------------------- /cmake/ruy_cc_library.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Forked from IREE's iree_cc_library.cmake. 16 | 17 | include(CMakeParseArguments) 18 | include(cmake/ruy_include_directories.cmake) 19 | 20 | # ruy_cc_library() 21 | # 22 | # CMake function to imitate Bazel's cc_library rule. 
23 | function(ruy_cc_library) 24 | cmake_parse_arguments( 25 | _RULE 26 | "PUBLIC;TESTONLY" 27 | "NAME" 28 | "HDRS;SRCS;COPTS;DEFINES;LINKOPTS;DEPS" 29 | ${ARGN} 30 | ) 31 | 32 | if(_RULE_TESTONLY AND RUY_MINIMAL_BUILD) 33 | return() 34 | endif() 35 | 36 | set(_NAME "${_RULE_NAME}") 37 | 38 | # Check if this is a header-only library. 39 | if("${_RULE_SRCS}" STREQUAL "") 40 | set(_RULE_IS_INTERFACE 1) 41 | else() 42 | set(_RULE_IS_INTERFACE 0) 43 | endif() 44 | 45 | file(RELATIVE_PATH _SUBDIR ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_LIST_DIR}) 46 | 47 | if(_RULE_IS_INTERFACE) 48 | # Generating a header-only library. 49 | add_library(${_NAME} INTERFACE) 50 | set_target_properties(${_NAME} PROPERTIES PUBLIC_HEADER "${_RULE_HDRS}") 51 | target_include_directories(${_NAME} 52 | INTERFACE 53 | "$" 54 | "$" 55 | ) 56 | target_link_libraries(${_NAME} 57 | INTERFACE 58 | ${_RULE_DEPS} 59 | ${_RULE_LINKOPTS} 60 | ) 61 | target_compile_definitions(${_NAME} 62 | INTERFACE 63 | ${_RULE_DEFINES} 64 | ) 65 | else() 66 | # Generating a static binary library. 
67 | add_library(${_NAME} STATIC ${_RULE_SRCS} ${_RULE_HDRS}) 68 | set_target_properties(${_NAME} PROPERTIES PUBLIC_HEADER "${_RULE_HDRS}") 69 | ruy_include_directories(${_NAME} "${_RULE_DEPS}") 70 | target_compile_options(${_NAME} 71 | PRIVATE 72 | ${_RULE_COPTS} 73 | ) 74 | target_link_libraries(${_NAME} 75 | PUBLIC 76 | ${_RULE_DEPS} 77 | PRIVATE 78 | ${_RULE_LINKOPTS} 79 | ) 80 | target_compile_definitions(${_NAME} 81 | PUBLIC 82 | ${_RULE_DEFINES} 83 | ) 84 | endif() 85 | 86 | add_library(${PROJECT_NAME}::${_NAME} ALIAS ${_NAME}) 87 | 88 | if(NOT _RULE_TESTONLY) 89 | install( 90 | TARGETS ${_NAME} 91 | EXPORT ruyTargets 92 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 93 | PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${_SUBDIR} 94 | ) 95 | endif() 96 | endfunction() 97 | -------------------------------------------------------------------------------- /cmake/ruy_cc_test.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2019 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # Forked from IREE's iree_cc_test.cmake. 16 | 17 | include(CMakeParseArguments) 18 | include(cmake/ruy_include_directories.cmake) 19 | 20 | # ruy_cc_test() 21 | # 22 | # CMake function to imitate Bazel's cc_test rule. 
23 | function(ruy_cc_test) 24 | cmake_parse_arguments( 25 | _RULE 26 | "" 27 | "NAME" 28 | "SRCS;COPTS;LINKOPTS;DEPS;TAGS" 29 | ${ARGN} 30 | ) 31 | 32 | if(RUY_MINIMAL_BUILD) 33 | return() 34 | endif() 35 | 36 | set(_NAME "${_RULE_NAME}") 37 | 38 | add_executable(${_NAME} "") 39 | target_sources(${_NAME} 40 | PRIVATE 41 | ${_RULE_SRCS} 42 | ) 43 | set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_RULE_NAME}") 44 | ruy_include_directories(${_NAME} "${_RULE_DEPS}") 45 | target_compile_options(${_NAME} 46 | PRIVATE 47 | ${_RULE_COPTS} 48 | ) 49 | target_link_options(${_NAME} 50 | PRIVATE 51 | ${_RULE_LINKOPTS} 52 | ) 53 | target_link_libraries(${_NAME} 54 | PUBLIC 55 | ${_RULE_DEPS} 56 | ) 57 | if(ANDROID) 58 | add_test( 59 | NAME 60 | ${_NAME} 61 | COMMAND 62 | "${CMAKE_SOURCE_DIR}/cmake/run_android_test.sh" 63 | "$<TARGET_FILE:${_NAME}>" 64 | ) 65 | else() 66 | add_test( 67 | NAME 68 | ${_NAME} 69 | COMMAND 70 | "$<TARGET_FILE:${_NAME}>" 71 | ) 72 | endif() 73 | if (_RULE_TAGS) 74 | set_property(TEST ${_NAME} PROPERTY LABELS ${_RULE_TAGS}) 75 | endif() 76 | endfunction() 77 | -------------------------------------------------------------------------------- /cmake/ruy_include_directories.cmake: -------------------------------------------------------------------------------- 1 | # Copyright 2019-2021 Google LLC 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # https://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | function(ruy_include_directories NAME DEPS) 16 | target_include_directories(${NAME} 17 | PUBLIC 18 | "$" 19 | "$" 20 | ) 21 | endfunction() 22 | -------------------------------------------------------------------------------- /doc/README.md: -------------------------------------------------------------------------------- 1 | # Ruy documentation 2 | 3 | This directory will eventually contain ruy documentation. 4 | -------------------------------------------------------------------------------- /doc/depgraph.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Generates a graphviz dependency graph for :ruy, with details trimmed. 4 | # Suggested rendering: pipe to `neato` (part of graphviz standard distribution) 5 | # doc/depgraph.sh | dot -Tsvg > depgraph.svg 6 | 7 | drop=( 8 | ':platform' 9 | ':check_macros' 10 | ':asm_helpers' 11 | ':size_util' 12 | ':system_aligned_alloc' 13 | ':side_pair' 14 | ':opt_set' 15 | ':blocking_counter' 16 | ':wait' 17 | ':time' 18 | ':path' 19 | ':performance_advisory' 20 | ':tune' 21 | ':matrix' 22 | ':mat' 23 | ':mul_params' 24 | ':context_get_ctx' 25 | ':have_built_path_for' 26 | ':pack_common' 27 | ':kernel_common' 28 | ':trace' 29 | ':validate' 30 | 'profiler:instrumentation' 31 | '\bclog\b' 32 | '\bcpuinfo\b' 33 | ':apply_multiplier' 34 | '\blabel=' 35 | ) 36 | 37 | graph="$(bazel query 'kind("cc_library", deps(//ruy))' --output graph --noimplicit_deps 2>/dev/null)" 38 | 39 | graph="$(echo "${graph}" | sed 's|//ruy/\?||g')" 40 | 41 | for t in "${drop[@]}"; do 42 | graph="$(echo "${graph}" | grep -v "${t}")" 43 | done 44 | 45 | graph="$(echo "${graph}" | sed 's|//:cpuinfo_with_unstripped_include_path||g')" 46 | graph="$(echo "${graph}" | sed 's|//third_party/cpuinfo:[a-z0-9_]*|@cpuinfo|g')" 47 | 48 | frontend=( 49 | ':ruy' 50 | ':context' 51 | ':frontend' 52 | ':prepare_packed_matrices' 53 | ':create_trmul_params' 54 | ) 55 | 56 | middleend=( 57 | ':ctx' 58 | 
':trmul_params' 59 | ':trmul' 60 | ':block_map' 61 | ':cpuinfo' 62 | ':cpu_cache_params' 63 | ':allocator' 64 | ':prepacked_cache' 65 | ) 66 | 67 | backend=( 68 | ':kernel.*' 69 | ':pack.*' 70 | ) 71 | 72 | threadpool=( 73 | ':thread_pool' 74 | ) 75 | 76 | frontend_lines=() 77 | middleend_lines=() 78 | backend_lines=() 79 | threadpool_lines=() 80 | misc_lines=() 81 | arrow_lines=() 82 | 83 | while IFS= read -r line; do 84 | if [[ "${line}" =~ '->' ]]; then 85 | arrow_lines+=("${line}") 86 | else 87 | handled=false 88 | if [ $handled = false ]; then 89 | for f in "${frontend[@]}"; do 90 | if [[ "${line}" =~ ${f} ]]; then 91 | frontend_lines+=("${line}") 92 | handled=true 93 | break 94 | fi 95 | done 96 | fi 97 | if [ $handled = false ]; then 98 | for f in "${middleend[@]}"; do 99 | if [[ "${line}" =~ ${f} ]]; then 100 | middleend_lines+=("${line}") 101 | handled=true 102 | break 103 | fi 104 | done 105 | fi 106 | if [ $handled = false ]; then 107 | for f in "${backend[@]}"; do 108 | if [[ "${line}" =~ ${f} ]]; then 109 | backend_lines+=("${line}") 110 | handled=true 111 | break 112 | fi 113 | done 114 | fi 115 | if [ $handled = false ]; then 116 | for f in "${threadpool[@]}"; do 117 | if [[ "${line}" =~ ${f} ]]; then 118 | threadpool_lines+=("${line}") 119 | handled=true 120 | break 121 | fi 122 | done 123 | fi 124 | if [ $handled = false ]; then 125 | if [[ "${line}" =~ ^[[:space:]]+\" ]]; then 126 | misc_lines+=("${line}") 127 | fi 128 | fi 129 | fi 130 | done <<< "${graph}" 131 | 132 | echo "digraph ruy {" 133 | echo " splines = true" 134 | echo " node [shape=box]" 135 | for f in "${frontend_lines[@]}"; do 136 | echo " $f [style=filled, color=\"#B2EBF2\"];" 137 | done 138 | for m in "${middleend_lines[@]}"; do 139 | echo " $m [style=filled, color=\"#C8E6C9\"];" 140 | done 141 | for b in "${backend_lines[@]}"; do 142 | echo " $b [style=filled, color=\"#FFCDD2\"];" 143 | done 144 | for b in "${threadpool_lines[@]}"; do 145 | echo " $b [style=filled, 
color=\"#FFF9C4\"];" 146 | done 147 | for m in "${misc_lines[@]}"; do 148 | echo "$m" 149 | done 150 | for a in "${arrow_lines[@]}"; do 151 | echo "$a" 152 | done 153 | echo "}" 154 | -------------------------------------------------------------------------------- /example/BUILD: -------------------------------------------------------------------------------- 1 | load("//third_party/bazel_rules/rules_cc/cc:cc_binary.bzl", "cc_binary") 2 | 3 | package( 4 | default_applicable_licenses = ["//third_party/ruy:license"], 5 | licenses = ["notice"], # Apache 2.0 6 | ) 7 | 8 | # Usage examples. 9 | cc_binary( 10 | name = "example", 11 | srcs = ["example.cc"], 12 | deps = ["//ruy"], 13 | ) 14 | 15 | cc_binary( 16 | name = "parametrized_example", 17 | srcs = ["parametrized_example.cc"], 18 | deps = ["//ruy"], 19 | ) 20 | -------------------------------------------------------------------------------- /example/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # This file is generated (whence no license header). Do not edit! 2 | # To regenerate, run: 3 | # cmake/bazel_to_cmake.sh 4 | 5 | ruy_cc_binary( 6 | NAME 7 | ruy_example_example 8 | SRCS 9 | example.cc 10 | DEPS 11 | ruy 12 | ) 13 | 14 | ruy_cc_binary( 15 | NAME 16 | ruy_example_parametrized_example 17 | SRCS 18 | parametrized_example.cc 19 | DEPS 20 | ruy 21 | ) 22 | 23 | ruy_add_all_subdirs() 24 | -------------------------------------------------------------------------------- /example/README.md: -------------------------------------------------------------------------------- 1 | ## Introduction 2 | 3 | These are some examples about how to use RUY. 
4 | 5 | ## BUILD 6 | 7 | Build the example with bazel commands: 8 | ``` 9 | bazel build //ruy/example:example 10 | ``` 11 | You can find the generated target under directory: 12 | ``` 13 | ./bazel-bin/ruy/example 14 | ``` 15 | -------------------------------------------------------------------------------- /ruy/allocator.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "ruy/allocator.h" 17 | 18 | #include "ruy/opt_set.h" 19 | #include "ruy/size_util.h" 20 | #include "ruy/system_aligned_alloc.h" 21 | 22 | namespace ruy { 23 | 24 | Allocator::~Allocator() { 25 | FreeAll(); 26 | detail::SystemAlignedFree(ptr_); 27 | } 28 | 29 | void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) { 30 | if (current_ + num_bytes > size_) { 31 | return nullptr; 32 | } 33 | void* ret = static_cast(ptr_) + current_; 34 | current_ += num_bytes; 35 | return ret; 36 | } 37 | 38 | void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) { 39 | void* p = detail::SystemAlignedAlloc(num_bytes); 40 | fallback_blocks_total_size_ += num_bytes; 41 | fallback_blocks_.push_back(p); 42 | return p; 43 | } 44 | 45 | void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) { 46 | if (num_bytes == 0) { 47 | return nullptr; 48 | } 49 | const std::ptrdiff_t rounded_num_bytes = 50 | round_up_pot(num_bytes, detail::kMinimumBlockAlignment); 51 | if (void* p = AllocateFast(rounded_num_bytes)) { 52 | return p; 53 | } 54 | return AllocateSlow(rounded_num_bytes); 55 | } 56 | 57 | void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes, 58 | const void* to_avoid) { 59 | #if RUY_OPT(AVOID_ALIASING) 60 | if (num_bytes == 0) { 61 | return nullptr; 62 | } 63 | // The minimum L1D cache aliasing periodicity in bytes that we expect to 64 | // encounter on any device. This does not seem to be documented, but 65 | // empirically we observe the following: 66 | // Cortex-A53: 1024 67 | // Cortex-A55r1: 2048 68 | // Cortex-A76: not as easily observable. 69 | // Over-estimating this makes the AVOID_ALIASING optimization useless on 70 | // devices with lower periodicity. 71 | // Under-estimating this by 2x should be harmless. 
72 | // Under-estimating this by a larger factor should gradually degrade 73 | // performance due to cache aliasing causing mutual eviction between 74 | // the packed matrix data, and the source matrix data being prefetched by the 75 | // CPU ahead of the packing code execution. 76 | static constexpr std::uint32_t kMinPeriod = 1024; 77 | static_assert(is_pot(kMinPeriod), ""); 78 | void* p = AllocateBytes(num_bytes + kMinPeriod); 79 | auto unsigned_low_bits = [](const void* p) { 80 | return static_cast(reinterpret_cast(p)); 81 | }; 82 | // This relies on unsigned integer overflow wrapping around. 83 | std::uint32_t diff_modulus = 84 | (unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod; 85 | // diff_modulus is in [0, kMinPeriod). 86 | // We want it as close as possible to the middle of that interval, 87 | // kMinPeriod/2. The bad 'aliasing' case, that we are working to avoid, 88 | // is when diff_modulus is close to the ends of that interval, 0 or 89 | // kMinPeriod. So we want to add an offset of kMinPeriod/2 if it is in the 90 | // first or the last quarter of that interval. 91 | bool need_offset = 92 | diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4; 93 | return static_cast(p) + (need_offset ? (kMinPeriod / 2) : 0); 94 | #else 95 | (void)to_avoid; 96 | return AllocateBytes(num_bytes); 97 | #endif 98 | } 99 | 100 | void Allocator::FreeAll() { 101 | current_ = 0; 102 | if (fallback_blocks_.empty()) { 103 | return; 104 | } 105 | 106 | // Free all memory before reallocating `ptr_`. 107 | // This minimizes the memory high-water-mark. 108 | detail::SystemAlignedFree(ptr_); 109 | for (void* p : fallback_blocks_) { 110 | detail::SystemAlignedFree(p); 111 | } 112 | 113 | // We reallocate to the exact new size, rather than growing 114 | // exponentially like std::vector. This means a linear instead of logarithmic 115 | // bound on the number of allocations in some worst-case calling patterns.
116 | // This is considered worth it because minimizing memory usage is important 117 | // and actual calling patterns in applications that we care about still 118 | // reach the no-further-allocations steady state in a small finite number 119 | // of iterations. 120 | std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_; 121 | ptr_ = detail::SystemAlignedAlloc(new_size); 122 | size_ = new_size; 123 | 124 | fallback_blocks_.clear(); 125 | fallback_blocks_total_size_ = 0; 126 | } 127 | 128 | } // namespace ruy 129 | -------------------------------------------------------------------------------- /ruy/allocator.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | #ifndef RUY_RUY_ALLOCATOR_H_ 17 | #define RUY_RUY_ALLOCATOR_H_ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace ruy { 25 | 26 | // Specialized allocator designed to converge to a steady-state where all 27 | // allocations are bump-ptr allocations from an already-allocated buffer. 28 | // 29 | // To support these constraints, this allocator only supports two 30 | // operations. 31 | // - AllocateBytes/Allocate: allocates a pointer to storage of a 32 | // specified size, which will be aligned to kMinimumBlockAlignment. 
33 | // - FreeAll: frees all previous allocations (but retains the internal 34 | // buffer to minimize future calls into the system allocator). 35 | // 36 | // This class is specialized for supporting just those two operations 37 | // under this specific steady-state usage pattern. Extending this class 38 | // with new allocation interfaces that don't fit that pattern is probably not 39 | // the right choice. Instead, build a new class on top of 40 | // SystemAlignedAlloc/SystemAlignedFree. 41 | // 42 | // All operations happen on aligned blocks for simplicity. 43 | // 44 | // Theory of operation: 45 | // 46 | // - ptr_, current_, and size_ implement a basic bump-ptr allocator. 47 | // 48 | // - in AllocateBytes, the fast path is just a bump-ptr 49 | // allocation. If our bump-ptr allocator doesn't have enough space for an 50 | // allocation, then we allocate a block from the system allocator to 51 | // service the allocation request. We save that block in fallback_blocks_ 52 | // and track the total size of the fallback blocks in 53 | // fallback_blocks_total_size_. 54 | // 55 | // - in FreeAll, the fast path just resets the bump-ptr allocator. If 56 | // there are any fallback blocks, we free them and reallocate the 57 | // bump-ptr allocator's buffer so that the next sequence of allocations 58 | // will hopefully not need any fallback blocks. 59 | class Allocator final { 60 | public: 61 | ~Allocator(); 62 | 63 | // Allocate a buffer. 64 | void* AllocateBytes(std::ptrdiff_t num_bytes); 65 | // Allocate a buffer, trying to avoid having its address close to aliasing 66 | // the specified `to_avoid` in the L1D cache. 67 | void* AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes, 68 | const void* to_avoid); 69 | // Allocate an array of `count` elements of type T. 
70 | template 71 | T* Allocate(std::ptrdiff_t count) { 72 | return static_cast(AllocateBytes(count * sizeof(T))); 73 | } 74 | // Allocate an array of `count` elements of the given `Pointer` type's 75 | // element_type. 76 | template 77 | void Allocate(std::ptrdiff_t count, Pointer* out) { 78 | using T = typename std::pointer_traits::element_type; 79 | *out = Allocate(count); 80 | } 81 | 82 | // Free all allocated blocks. Internally consolidate allocated buffers as 83 | // explained in the class comment. 84 | void FreeAll(); 85 | 86 | private: 87 | void operator=(const Allocator&) = delete; 88 | void* AllocateFast(std::ptrdiff_t num_bytes); 89 | void* AllocateSlow(std::ptrdiff_t num_bytes); 90 | 91 | void* ptr_ = nullptr; 92 | std::ptrdiff_t current_ = 0; 93 | std::ptrdiff_t size_ = 0; 94 | std::vector fallback_blocks_; 95 | std::ptrdiff_t fallback_blocks_total_size_ = 0; 96 | }; 97 | 98 | } // namespace ruy 99 | 100 | #endif // RUY_RUY_ALLOCATOR_H_ 101 | -------------------------------------------------------------------------------- /ruy/allocator_test.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "ruy/allocator.h" 17 | 18 | #include "ruy/gtest_wrapper.h" 19 | 20 | namespace ruy { 21 | namespace { 22 | 23 | TEST(AllocatorTest, ReturnsValidMemory) { 24 | Allocator allocator; 25 | int *p; 26 | allocator.Allocate(1, &p); 27 | ASSERT_NE(p, nullptr); 28 | 29 | // If this is bogus memory, ASan will cause this test to fail. 30 | *p = 42; 31 | 32 | allocator.FreeAll(); 33 | } 34 | 35 | TEST(AllocatorTest, NoLeak) { 36 | Allocator allocator; 37 | // Allocate and free some ridiculously large total amount of memory, so 38 | // that a leak will hopefully cause some sort of resource exhaustion. 39 | // 40 | // Despite the large number of allocations, this test is actually quite 41 | // fast, since our fast-path allocation logic is very fast. 42 | constexpr int kNumAllocations = 100 * 1024; 43 | constexpr int kAllocationSize = 1024 * 1024; 44 | for (int i = 0; i < kNumAllocations; i++) { 45 | char *p; 46 | allocator.Allocate(kAllocationSize, &p); 47 | allocator.FreeAll(); 48 | } 49 | } 50 | 51 | TEST(AllocatorTest, IncreasingSizes) { 52 | Allocator allocator; 53 | // Allocate sizes that increase by small amounts across FreeAll calls. 54 | for (int i = 1; i < 100 * 1024; i++) { 55 | char *p; 56 | allocator.Allocate(i, &p); 57 | allocator.FreeAll(); 58 | } 59 | } 60 | 61 | TEST(AllocatorTest, ManySmallAllocations) { 62 | Allocator allocator; 63 | // Allocate many small allocations between FreeAll calls. 64 | for (int i = 0; i < 10 * 1024; i += 100) { 65 | for (int j = 0; j < i; j++) { 66 | char *p; 67 | allocator.Allocate(1, &p); 68 | } 69 | allocator.FreeAll(); 70 | } 71 | } 72 | 73 | TEST(AllocatorTest, DestructorHandlesMainBumpPtr) { 74 | // This is a white-box test. 
75 | Allocator allocator; 76 | allocator.AllocateBytes(1); 77 | allocator.FreeAll(); 78 | // After the call to FreeAll, the allocator will consolidate all of the memory 79 | // into the main bump-ptr allocator's block, which we then expect to be freed 80 | // in the destructor. 81 | // 82 | // We have no test assertions -- we primarily expect that this trigger a leak 83 | // checker and cause the test to fail. 84 | } 85 | 86 | TEST(AllocatorTest, DestructorHandlesFallbackBlocks) { 87 | // This is a white-box test. 88 | Allocator allocator; 89 | // Since we just created the allocator, this will allocate a fallback block, 90 | // which we then expect to be freed in the destructor. 91 | // 92 | // We have no test assertions -- we primarily expect that this trigger a leak 93 | // checker and cause the test to fail. 94 | allocator.AllocateBytes(1); 95 | } 96 | 97 | TEST(AllocatorTest, AvoidAliasing) { 98 | Allocator allocator; 99 | // Run twice with a FreeAll in between, just in case some future 100 | // change of internal logic makes that bug-prone. 
101 | for (int repeat = 0; repeat < 2; repeat++) { 102 | for (int i = 1; i < 100; i++) { 103 | const void *to_avoid = 104 | reinterpret_cast(0x1234567890123ull + 123 * i); 105 | void *ptr = allocator.AllocateBytesAvoidingAliasingWith(i * 10, to_avoid); 106 | auto unsigned_low_bits = [](const void *p) { 107 | return static_cast(reinterpret_cast(p)); 108 | }; 109 | static constexpr int kMinPeriod = 1024; 110 | std::uint32_t unsigned_diff = 111 | (unsigned_low_bits(ptr) - unsigned_low_bits(to_avoid)) % kMinPeriod; 112 | std::uint32_t unsigned_diff_mod = unsigned_diff % kMinPeriod; 113 | ASSERT_TRUE(unsigned_diff_mod >= (kMinPeriod / 4) && 114 | unsigned_diff_mod <= 3 * (kMinPeriod / 4)); 115 | } 116 | allocator.FreeAll(); 117 | } 118 | } 119 | 120 | } // namespace 121 | } // namespace ruy 122 | 123 | int main(int argc, char **argv) { 124 | ::testing::InitGoogleTest(&argc, argv); 125 | return RUN_ALL_TESTS(); 126 | } 127 | -------------------------------------------------------------------------------- /ruy/apply_multiplier.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2020 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "ruy/apply_multiplier.h" 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | 23 | namespace ruy { 24 | namespace detail { 25 | 26 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 27 | // Warning: this code is not meant to be bit-exact-normative. 28 | // Please refer to the class comment of ruy::MulParams, in mul_params.h. 29 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 30 | // Simplified multiplier application function 31 | // 32 | // Double rounding and symmetric rounding are removed compared to reference. 33 | // Double rounding seems unnecessary and can complicate implementations. 34 | // Symmetric rounding also adds implementation complexity. 35 | // 36 | // Composed of a single rounding shift right and can lead to more HW 37 | // friendly implementations. 38 | // 39 | // On NEON this can be translated to a SQDMULH + rounding shift right sequence. 
40 | // The use of SQDMULH rather than SQRDMULH gives a result that is 41 | // equivalent to a single rounded shift since the truncating shift of SQDMULH 42 | // can be combined with the rounding right shift via the formula (for k>=1): 43 | // ((x>>31)+(1<<(k-1)))>>k = (x + (1<<(30+k)))>>(31+k) 44 | // 45 | // Preconditions: 46 | // - quantized_multiplier >= 0 47 | // - shift is -31 to +7 (negative for right shift) 48 | std::int32_t MultiplyByQuantizedMultiplier(std::int32_t x, 49 | std::int32_t quantized_multiplier, 50 | int shift) { 51 | RUY_CHECK_GE(shift, -31); 52 | 53 | int total_shift = 31 - shift; 54 | 55 | std::int64_t x_64(x); 56 | std::int64_t quantized_multiplier_64(quantized_multiplier); 57 | std::int64_t round = (int64_t)1 << (total_shift - 1); 58 | int64_t result = x_64 * quantized_multiplier_64 + round; 59 | result = result >> total_shift; 60 | 61 | RUY_DCHECK_GE(result, std::numeric_limits::lowest()); 62 | RUY_DCHECK_LE(result, std::numeric_limits::max()); 63 | 64 | return static_cast(result); 65 | } 66 | 67 | } // namespace detail 68 | 69 | } // namespace ruy 70 | -------------------------------------------------------------------------------- /ruy/apply_multiplier.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2020 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License.
14 | ==============================================================================*/ 15 | 16 | // Provides a reference (portable, non-optimized) ApplyMultiplier function. 17 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 18 | // Warning: this code is not meant to be bit-exact-normative. 19 | // Please refer to the class comment of ruy::MulParams, in mul_params.h. 20 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 21 | 22 | #ifndef RUY_RUY_APPLY_MULTIPLIER_H_ 23 | #define RUY_RUY_APPLY_MULTIPLIER_H_ 24 | 25 | #include 26 | #include 27 | 28 | #include "ruy/check_macros.h" 29 | #include "ruy/mul_params.h" 30 | 31 | namespace ruy { 32 | 33 | // Applies the quantized multiplier to the `*accum` accumulator value, if 34 | // applicable, that is, if AccumScalar==int32 and DstScalar!=int32. Otherwise, 35 | // does nothing. 36 | // 37 | // This is slow, portable, 'reference' code. It should only be used in 38 | // ReferenceMul and in Path::kStandardCpp. There isn't a point in optimizing it, 39 | // either. Fast paths have that multiplier work done as part of the kernel, 40 | // typically written in assembly anyway. 41 | template 42 | void ApplyMultiplier(const MulParams& mul_params, 43 | int channel, AccumScalar* accum); 44 | 45 | namespace detail { 46 | 47 | // Copied from TF Lite code. 48 | std::int32_t MultiplyByQuantizedMultiplier(std::int32_t x, 49 | std::int32_t quantized_multiplier, 50 | int shift); 51 | 52 | // Helper to apply a fixed-point multiplier. Only 'applicable' if AccumScalar 53 | // is int32 (i.e. in all cases except floating-point) and if the destination is 54 | // not int32 (i.e. unless the user wants to get raw accumulators). 55 | template ::value && 57 | !std::is_same::value> 58 | struct ApplyMultiplierImpl {}; 59 | 60 | // Specialization in non-applicable case: do nothing. 
61 | template 62 | struct ApplyMultiplierImpl { 63 | static void Run(const MulParams&, int, AccumScalar*) { 64 | } 65 | }; 66 | 67 | template 68 | struct ApplyMultiplierImpl { 69 | static void Run(const MulParams& mul_params, 70 | int channel, AccumScalar* accum) { 71 | AccumScalar m = mul_params.multiplier_fixedpoint_perchannel() 72 | ? mul_params.multiplier_fixedpoint_perchannel()[channel] 73 | : mul_params.multiplier_fixedpoint(); 74 | int e = mul_params.multiplier_exponent_perchannel() 75 | ? mul_params.multiplier_exponent_perchannel()[channel] 76 | : mul_params.multiplier_exponent(); 77 | *accum = MultiplyByQuantizedMultiplier(*accum, m, e); 78 | } 79 | }; 80 | 81 | } // namespace detail 82 | 83 | template 84 | void ApplyMultiplier(const MulParams& mul_params, 85 | int channel, AccumScalar* accum) { 86 | detail::ApplyMultiplierImpl::Run(mul_params, channel, 87 | accum); 88 | } 89 | 90 | } // namespace ruy 91 | 92 | #endif // RUY_RUY_APPLY_MULTIPLIER_H_ 93 | -------------------------------------------------------------------------------- /ruy/asm_helpers.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | // Some helpers to write inline asm. 
17 | 18 | #ifndef RUY_RUY_ASM_HELPERS_H_ 19 | #define RUY_RUY_ASM_HELPERS_H_ 20 | 21 | #include "ruy/opt_set.h" 22 | 23 | // Enclose load-prefetch instructions in RUY_PREFETCH_LOAD() so we can 24 | // conditionally enable them based on the RUY_OPT_SET. 25 | #if RUY_OPT(PREFETCH_LOAD) 26 | #define RUY_PREFETCH_LOAD(X) X 27 | #else 28 | #define RUY_PREFETCH_LOAD(X) 29 | #endif 30 | 31 | // Enclose store-prefetch instructions in RUY_PREFETCH_STORE() so we can 32 | // conditionally enable them based on the RUY_OPT_SET. 33 | #if RUY_OPT(PREFETCH_STORE) 34 | #define RUY_PREFETCH_STORE(X) X 35 | #else 36 | #define RUY_PREFETCH_STORE(X) 37 | #endif 38 | 39 | // The usual stringification macro. 40 | #define RUY_STR(s) RUY_STR_UNEXPANDED(s) 41 | #define RUY_STR_UNEXPANDED(s) #s 42 | 43 | #endif // RUY_RUY_ASM_HELPERS_H_ 44 | -------------------------------------------------------------------------------- /ruy/blocking_counter.cc: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #include "ruy/blocking_counter.h" 17 | 18 | #include "ruy/check_macros.h" 19 | #include "ruy/wait.h" 20 | 21 | namespace ruy { 22 | 23 | void BlockingCounter::Reset(int initial_count) { 24 | int old_count_value = count_.load(std::memory_order_relaxed); 25 | RUY_DCHECK_EQ(old_count_value, 0); 26 | (void)old_count_value; 27 | count_.store(initial_count, std::memory_order_release); 28 | } 29 | 30 | bool BlockingCounter::DecrementCount() { 31 | int old_count_value = count_.fetch_sub(1, std::memory_order_acq_rel); 32 | RUY_DCHECK_GT(old_count_value, 0); 33 | int count_value = old_count_value - 1; 34 | bool hit_zero = (count_value == 0); 35 | if (hit_zero) { 36 | std::lock_guard lock(count_mutex_); 37 | count_cond_.notify_all(); 38 | } 39 | return hit_zero; 40 | } 41 | 42 | void BlockingCounter::Wait(const Duration spin_duration) { 43 | const auto& condition = [this]() { 44 | return count_.load(std::memory_order_acquire) == 0; 45 | }; 46 | ruy::Wait(condition, spin_duration, &count_cond_, &count_mutex_); 47 | } 48 | 49 | } // namespace ruy 50 | -------------------------------------------------------------------------------- /ruy/blocking_counter.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 
14 | ==============================================================================*/ 15 | 16 | #ifndef RUY_RUY_BLOCKING_COUNTER_H_ 17 | #define RUY_RUY_BLOCKING_COUNTER_H_ 18 | 19 | #include 20 | #include // NOLINT(build/c++11) // IWYU pragma: keep 21 | #include // NOLINT(build/c++11) // IWYU pragma: keep 22 | 23 | #include "ruy/time.h" 24 | 25 | namespace ruy { 26 | 27 | // A BlockingCounter lets one thread wait for N events to occur. 28 | // This is how the master thread waits for all the worker threads 29 | // to have finished working. 30 | // The waiting is done using a naive spinlock waiting for the atomic 31 | // count_ to hit the value 0. This is acceptable because in our usage 32 | // pattern, BlockingCounter is used only to synchronize threads after 33 | // short-lived tasks (performing parts of the same GEMM). It is not used 34 | // for synchronizing longer waits (resuming work on the next GEMM). 35 | class BlockingCounter { 36 | public: 37 | BlockingCounter() : count_(0) {} 38 | 39 | // Sets/resets the counter; initial_count is the number of 40 | // decrementing events that the Wait() call will be waiting for. 41 | void Reset(int initial_count); 42 | 43 | // Decrements the counter; if the counter hits zero, signals 44 | // the threads that were waiting for that, and returns true. 45 | // Otherwise (if the decremented count is still nonzero), 46 | // returns false. 47 | bool DecrementCount(); 48 | 49 | // Waits for the N other threads (N having been set by Reset()) 50 | // to hit the BlockingCounter. 51 | // 52 | // Will first spin-wait for `spin_duration` before reverting to passive wait. 53 | void Wait(const Duration spin_duration); 54 | 55 | private: 56 | std::atomic count_; 57 | 58 | // The condition variable and mutex that allow passively waiting for count_ 59 | // to reach the value zero, in the case of longer waits.
60 | std::condition_variable count_cond_; 61 | std::mutex count_mutex_; 62 | }; 63 | 64 | } // namespace ruy 65 | 66 | #endif // RUY_RUY_BLOCKING_COUNTER_H_ 67 | -------------------------------------------------------------------------------- /ruy/build_defs.bzl: -------------------------------------------------------------------------------- 1 | """Build definitions for Ruy.""" 2 | 3 | # Helper for ruy_copts(). 4 | # Returns warnings flags to use for all ruy code. 5 | def ruy_copts_warnings(): 6 | return select({ 7 | "//tools/cc_target_os:windows": [ 8 | # We run into trouble on Windows toolchains with warning flags, 9 | # as mentioned in the comments below on each flag. 10 | # We could be more aggressive in enabling supported warnings on each 11 | # Windows toolchain, but we compromise with keeping BUILD files simple 12 | # by limiting the number of config_setting's. 13 | ], 14 | "//conditions:default": [ 15 | "-Wall", 16 | # Some clang-based Windows toolchains have more warnings in -Wextra. 17 | "-Wextra", 18 | # Warn on preprocessor expansion of an undefined token, e.g. catching 19 | # typos such as `#ifdef __linus__` instead of `#ifdef __linux__`. 20 | # Not supported by MSVC. 21 | "-Wundef", 22 | ], 23 | }) 24 | 25 | # Helper for ruy_copts(). 26 | # Returns flags to use to enable NEON if applicable, for all ruy code. 27 | def ruy_copts_neon(): 28 | return select({ 29 | # OK to crash old devices that lack full NEON support. 30 | # No need to pass -mfloat-abi=softfp, that is already on. 31 | "//ruy:arm32_assuming_neon": [ 32 | "-mfpu=neon", 33 | ], 34 | "//conditions:default": [], 35 | }) 36 | 37 | # Helper for ruy_copts(). 38 | # Returns optimization flags to use for all ruy code. 39 | def ruy_copts_optimize(): 40 | return select({ 41 | # On some toolchains, typically mobile, "-c opt" is interpreted by 42 | # default as "optimize for size, not for speed". For Ruy code, 43 | # optimizing for speed is the better compromise, so we override that. 
44 | # Careful to keep debug builds debuggable, whence the select based 45 | # on the compilation mode. 46 | "//ruy:do_not_want_O3": [], 47 | "//conditions:default": ["-O3"], 48 | }) 49 | 50 | # Returns compiler flags to use for all ruy code. 51 | def ruy_copts(): 52 | return ruy_copts_warnings() + ruy_copts_neon() + ruy_copts_optimize() 53 | 54 | def ruy_copts_avx(): 55 | return select({ 56 | "//ruy:x86_64_and_not_msvc": ["-mavx"], 57 | "//conditions:default": [], 58 | }) 59 | 60 | def ruy_copts_avx2_fma(): 61 | return select({ 62 | "//ruy:x86_64_and_not_msvc": ["-mavx2", "-mfma"], 63 | "//conditions:default": [], 64 | }) 65 | 66 | def ruy_copts_avx512(): 67 | # In some clang-based toolchains, in the default compilation mode (not -c opt), 68 | # heavy spillage in the AVX512 kernels results in stack frames > 50k. This issue does not exist 69 | # in optimized builds (-c opt). 70 | return select({ 71 | "//ruy:x86_64_and_not_msvc": ["$(STACK_FRAME_UNLIMITED)", "-mavx512f", "-mavx512vl", "-mavx512cd", "-mavx512bw", "-mavx512dq"], 72 | "//conditions:default": [], 73 | }) 74 | -------------------------------------------------------------------------------- /ruy/build_defs.oss.bzl: -------------------------------------------------------------------------------- 1 | """Build definitions for Ruy that are specific to the open-source build.""" 2 | 3 | # Used for targets that #include <thread> 4 | def ruy_linkopts_thread_standard_library(): 5 | # In open source builds, GCC is a common occurrence. It requires "-pthread" 6 | # to use the C++11 <thread> standard library header. This breaks the 7 | # opensource build on Windows and probably some other platforms, so that 8 | # will need to be fixed as needed. Ideally we would like to do this based 9 | # on GCC being the compiler, but that does not seem to be easy to achieve 10 | # with Bazel.
Instead we do the following, which is copied from 11 | # https://github.com/abseil/abseil-cpp/blob/1112609635037a32435de7aa70a9188dcb591458/absl/base/BUILD.bazel#L155 12 | return select({ 13 | "//tools/cc_target_os:windows": [], 14 | "//conditions:default": ["-pthread"], 15 | }) 16 | -------------------------------------------------------------------------------- /ruy/check_macros.h: -------------------------------------------------------------------------------- 1 | /* Copyright 2019 Google LLC. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"); 4 | you may not use this file except in compliance with the License. 5 | You may obtain a copy of the License at 6 | 7 | http://www.apache.org/licenses/LICENSE-2.0 8 | 9 | Unless required by applicable law or agreed to in writing, software 10 | distributed under the License is distributed on an "AS IS" BASIS, 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | See the License for the specific language governing permissions and 13 | limitations under the License. 14 | ==============================================================================*/ 15 | 16 | // self-contained, minimal, CHECK/DCHECK macros similar to glog. 
17 | 18 | #ifndef RUY_RUY_CHECK_MACROS_H_ 19 | #define RUY_RUY_CHECK_MACROS_H_ 20 | 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | namespace ruy { 27 | namespace check_macros { 28 | 29 | constexpr int kValueBufSize = 32; 30 | 31 | template 32 | struct ToString { 33 | static void Run(const T&, char* buf) { snprintf(buf, kValueBufSize, "(?)"); } 34 | }; 35 | 36 | template <> 37 | struct ToString { 38 | static void Run(float value, char* buf) { 39 | snprintf(buf, kValueBufSize, "%.9g", static_cast(value)); 40 | } 41 | }; 42 | 43 | template <> 44 | struct ToString { 45 | static void Run(double value, char* buf) { 46 | snprintf(buf, kValueBufSize, "%.16g", value); 47 | } 48 | }; 49 | 50 | template 51 | struct ToString::value>::type> { 52 | static void Run(const T& value, char* buf) { 53 | snprintf(buf, kValueBufSize, "%lld", static_cast(value)); 54 | } 55 | }; 56 | 57 | template 58 | struct ToString { 59 | static void Run(T* value, char* buf) { 60 | snprintf(buf, kValueBufSize, "%p", value); 61 | } 62 | }; 63 | 64 | template 65 | struct ToString::value>::type> { 66 | static void Run(const T& value, char* buf) { 67 | snprintf(buf, kValueBufSize, "(enum value %d)", static_cast(value)); 68 | } 69 | }; 70 | 71 | inline void CheckImpl(bool condition, const char* file, int line, 72 | const char* macro, const char* condition_str) { 73 | if (!condition) { 74 | fprintf(stderr, "%s:%d: %s condition not satisfied: %s\n", file, line, 75 | macro, condition_str); 76 | abort(); 77 | } 78 | } 79 | 80 | template