├── .gitignore
├── .gitmodules
├── BUILD
├── CMakeLists.txt
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── WORKSPACE
├── cmake
    ├── bazel_to_cmake.py
    ├── bazel_to_cmake.sh
    ├── run_android_test.sh
    ├── ruyConfig.cmake.in
    ├── ruy_add_all_subdirs.cmake
    ├── ruy_cc_binary.cmake
    ├── ruy_cc_library.cmake
    ├── ruy_cc_test.cmake
    └── ruy_include_directories.cmake
├── doc
    ├── README.md
    ├── depgraph.sh
    └── depgraph.svg
├── example
    ├── BUILD
    ├── CMakeLists.txt
    ├── README.md
    ├── example.cc
    └── parametrized_example.cc
├── ruy
    ├── BUILD
    ├── CMakeLists.txt
    ├── allocator.cc
    ├── allocator.h
    ├── allocator_test.cc
    ├── apply_multiplier.cc
    ├── apply_multiplier.h
    ├── apply_multiplier_test.cc
    ├── asm_helpers.h
    ├── benchmark.cc
    ├── block_map.cc
    ├── block_map.h
    ├── block_map_test.cc
    ├── blocking_counter.cc
    ├── blocking_counter.h
    ├── build_defs.bzl
    ├── build_defs.oss.bzl
    ├── check_macros.h
    ├── check_macros_test.cc
    ├── context.cc
    ├── context.h
    ├── context_get_ctx.cc
    ├── context_get_ctx.h
    ├── context_test.cc
    ├── cpu_cache_params.h
    ├── cpuinfo.cc
    ├── cpuinfo.h
    ├── create_trmul_params.h
    ├── ctx.cc
    ├── ctx.h
    ├── ctx_impl.h
    ├── ctx_test.cc
    ├── denormal.cc
    ├── denormal.h
    ├── frontend.cc
    ├── frontend.h
    ├── gtest_wrapper.h
    ├── have_built_path_for.h
    ├── have_built_path_for_avx.cc
    ├── have_built_path_for_avx2_fma.cc
    ├── have_built_path_for_avx512.cc
    ├── kernel.h
    ├── kernel_arm.h
    ├── kernel_arm32.cc
    ├── kernel_arm64.cc
    ├── kernel_avx.cc
    ├── kernel_avx2_fma.cc
    ├── kernel_avx512.cc
    ├── kernel_common.h
    ├── kernel_x86.h
    ├── mat.h
    ├── matrix.h
    ├── matrix_test.cc
    ├── mul_params.h
    ├── mul_params_test.cc
    ├── opt_set.h
    ├── pack.h
    ├── pack_arm.cc
    ├── pack_arm.h
    ├── pack_avx.cc
    ├── pack_avx2_fma.cc
    ├── pack_avx512.cc
    ├── pack_common.h
    ├── pack_x86.h
    ├── path.h
    ├── perchannel_buffers_reallocation_test.cc
    ├── performance_advisory.h
    ├── platform.h
    ├── pmu.cc
    ├── pmu.h
    ├── prepacked_cache.cc
    ├── prepacked_cache.h
    ├── prepacked_cache_test.cc
    ├── prepare_packed_matrices.cc
    ├── prepare_packed_matrices.h
    ├── profiler
    │   ├── BUILD
    │   ├── CMakeLists.txt
    │   ├── README.md
    │   ├── instrumentation.cc
    │   ├── instrumentation.h
    │   ├── profiler.cc
    │   ├── profiler.h
    │   ├── test.cc
    │   ├── test_instrumented_library.cc
    │   ├── test_instrumented_library.h
    │   ├── treeview.cc
    │   └── treeview.h
    ├── reference_mul.h
    ├── ruy.h
    ├── ruy_test.bzl
    ├── ruy_test_ext.oss.bzl
    ├── side_pair.h
    ├── size_util.h
    ├── size_util_test.cc
    ├── strategy_controls.h
    ├── system_aligned_alloc.cc
    ├── system_aligned_alloc.h
    ├── test.h
    ├── test_fast.cc
    ├── test_overflow_dst_zero_point.cc
    ├── test_slow.cc
    ├── thread_pool.cc
    ├── thread_pool.h
    ├── time.h
    ├── trace.h
    ├── trmul.cc
    ├── trmul.h
    ├── trmul_params.h
    ├── tune.cc
    ├── tune.h
    ├── tune_test.cc
    ├── validate.h
    ├── wait.cc
    ├── wait.h
    └── wait_test.cc
└── third_party
    ├── BUILD
    ├── CMakeLists.txt
    └── cpuinfo.BUILD
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Visual Studio files
 2 | .vs/
 3 | .vscode/
 4 | *.sdf
 5 | *.opensdf
 6 | *.VC.opendb
 7 | *.suo
 8 | *.user
 9 | 
10 | # macOS files
11 | .DS_Store
12 | 
13 | # CMake artifacts
14 | build/
15 | build-*/
16 | 
17 | # Bazel artifacts
18 | **/bazel-*
19 | 
20 | # Emacs autosaves
21 | *~
22 | \#*\#
23 | 
24 | # Vim swap files
25 | [._]*.sw[a-p]
26 | 
27 | # Source indexing files
28 | compile_commands.json
29 | .cache/clangd
30 | .clangd/
31 | 
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "googletest"]
2 | 	path = third_party/googletest
3 | 	url = https://github.com/google/googletest
4 | [submodule "cpuinfo"]
5 | 	path = third_party/cpuinfo
6 | 	url = https://github.com/pytorch/cpuinfo
7 | 
--------------------------------------------------------------------------------
/BUILD:
--------------------------------------------------------------------------------
 1 | # Ruy is not BLAS
 2 | 
 3 | load("//tools/build_defs/license:license.bzl", "license")
 4 | 
 5 | package(
 6 |     default_applicable_licenses = ["//third_party/ruy:license"],
 7 |     licenses = ["notice"],  # Apache 2.0
 8 | )
 9 | 
10 | license(
11 |     name = "license",
12 |     package_name = "ruy",
13 | )
14 | 
15 | exports_files(["LICENSE"])
16 | 
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
  1 | # Copyright 2021 Google LLC
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | #      https://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | cmake_minimum_required(VERSION 3.13)  # Must precede project(). Copied from IREE
 16 | cmake_policy(SET CMP0012 NEW)
 17 | cmake_policy(SET CMP0048 NEW)
 18 | project(ruy CXX)
 19 | set(CMAKE_CXX_STANDARD 14)
 20 | 
 21 | include(GNUInstallDirs)
 22 | 
 23 | if (PROJECT_NAME STREQUAL CMAKE_PROJECT_NAME)
 24 |   set(RUY_IS_TOPLEVEL TRUE)
 25 |   set(RUY_MINIMAL_BUILD_DEFAULT_VALUE OFF)
 26 | else()
 27 |   set(RUY_IS_TOPLEVEL FALSE)
 28 |   set(RUY_MINIMAL_BUILD_DEFAULT_VALUE ON)
 29 | endif()
 30 | 
 31 | option(RUY_MINIMAL_BUILD "Disable ruy's tests, examples, etc. Build only ruy public libraries." ${RUY_MINIMAL_BUILD_DEFAULT_VALUE})
 32 | if (NOT RUY_MINIMAL_BUILD)
 33 |   enable_testing()
 34 | endif()
 35 | 
 36 | option(RUY_PROFILER "Enable ruy's built-in profiler (harms performance)" OFF)
 37 | 
 38 | option(RUY_ENABLE_INSTALL "Enable install rule" ${RUY_IS_TOPLEVEL})
 39 | 
 40 | include(cmake/ruy_add_all_subdirs.cmake)
 41 | include(cmake/ruy_cc_library.cmake)
 42 | include(cmake/ruy_cc_binary.cmake)
 43 | include(cmake/ruy_cc_test.cmake)
 44 | 
 45 | option(RUY_FIND_CPUINFO "Use find_package to find cpuinfo" OFF)
 46 | 
 47 | # Skip cpuinfo if it was already generated, which can happen when ruy is
 48 | # a subdirectory in a wider project that already uses cpuinfo.
 49 | if (NOT TARGET cpuinfo::cpuinfo)
 50 |   if (RUY_FIND_CPUINFO)
 51 |     find_package(cpuinfo REQUIRED)
 52 |   else()
 53 |     # Test if the third_party/cpuinfo submodule was checked out before
 54 |     # adding that subdirectory, so we can do more helpful things below in the
 55 |     # else() block when it's not.
 56 |     set(RUY_CPUINFO_CMAKELISTS_FILE "${CMAKE_CURRENT_SOURCE_DIR}/third_party/cpuinfo/CMakeLists.txt")
 57 |     if (EXISTS "${RUY_CPUINFO_CMAKELISTS_FILE}")
 58 |       # Disabling cpuinfo's tests and benchmarks to prevent a copy of its
 59 |       # googletest dependency getting downloaded into a 'deps' directory in the
 60 |       # source tree!
 61 |       set(CPUINFO_BUILD_BENCHMARKS OFF CACHE BOOL "" FORCE)
 62 |       set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE BOOL "" FORCE)
 63 |       set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE BOOL "" FORCE)
 64 |       add_subdirectory("third_party/cpuinfo" EXCLUDE_FROM_ALL)
 65 |     else()
 66 |       # third_party/cpuinfo is not checked out. That could be intentional when
 67 |       # ruy is a subdirectory in a wider project that is already providing
 68 |       # the cpuinfo target. Maybe that wider project's CMakeLists is ordered
 69 |       # in such a way that cpuinfo gets generated after ruy. In that case,
 70 |       # it's helpful that we continue silently. In the worst case if the cpuinfo
 71 |       # target never gets defined, ruy will fail to compile.
 72 |       # On the other hand, if ruy is the top-level project here (not part of a
 73 |       # wider project) then nothing will define the cpuinfo target for us,
 74 |       # so we will definitely fail to compile, so we may as well fail right here.
 75 |       if (RUY_IS_TOPLEVEL)
 76 |         message(FATAL_ERROR "This file does not exist:\n${RUY_CPUINFO_CMAKELISTS_FILE}\n"
 77 |                       "That typically means that the git submodules of the ruy "
 78 |                       "repository haven't been checked out. Try this in the ruy "
 79 |                       "git directory:\n  git submodule update --init")
 80 |       endif()
 81 |     endif()
 82 |   endif()
 83 | endif()
 84 | 
 85 | # googletest is only needed for tests. Projects embedding ruy as a subdirectory
 86 | # and not needing to build ruy tests may proceed without a local checkout of
 87 | # third_party/googletest.
 88 | if (NOT RUY_MINIMAL_BUILD
 89 |     AND NOT TARGET gtest
 90 |     AND EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/third_party/googletest/CMakeLists.txt")
 91 |   add_subdirectory("third_party/googletest" EXCLUDE_FROM_ALL)
 92 | endif()
 93 | 
 94 | add_subdirectory("ruy")
 95 | 
 96 | if (NOT RUY_MINIMAL_BUILD)
 97 |   add_subdirectory("example")
 98 | endif()
 99 | 
100 | if (RUY_ENABLE_INSTALL)
101 |   install(EXPORT ${PROJECT_NAME}Targets
102 |     NAMESPACE ${PROJECT_NAME}::
103 |     DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
104 |   )
105 | 
106 |   include(CMakePackageConfigHelpers)
107 | 
108 |   configure_package_config_file(
109 |     "cmake/${PROJECT_NAME}Config.cmake.in"
110 |     "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
111 |     INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
112 |   )
113 | 
114 |   install(FILES "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake"
115 |     DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}"
116 |   )
117 | endif()
118 | 
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # How to Contribute
 2 | 
 3 | We'd love to accept your patches and contributions to this project. There are
 4 | just a few small guidelines you need to follow.
 5 | 
 6 | ## Contributor License Agreement
 7 | 
 8 | Contributions to this project must be accompanied by a Contributor License
 9 | Agreement. You (or your employer) retain the copyright to your contribution;
10 | this simply gives us permission to use and redistribute your contributions as
11 | part of the project. Head over to <https://cla.developers.google.com/> to see
12 | your current agreements on file or to sign a new one.
13 | 
14 | You generally only need to submit a CLA once, so if you've already submitted one
15 | (even if it was for a different project), you probably don't need to do it
16 | again.
17 | 
18 | ## Code reviews
19 | 
20 | All submissions, including submissions by project members, require review. We
21 | use GitHub pull requests for this purpose. Consult
22 | [GitHub Help](https://help.github.com/articles/about-pull-requests/) for more
23 | information on using pull requests.
24 | 
25 | ## Community Guidelines
26 | 
27 | This project follows [Google's Open Source Community
28 | Guidelines](https://opensource.google/conduct/).
29 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # The ruy matrix multiplication library
 2 | 
 3 | This is not an officially supported Google product.
 4 | 
 5 | ruy is a matrix multiplication library. Its focus is to cover the matrix
 6 | multiplication needs of neural network inference engines. Its initial user has
 7 | been TensorFlow Lite, where it is used by default on the ARM CPU architecture.
 8 | 
 9 | ruy supports both floating-point and 8bit-integer-quantized matrices.
10 | 
11 | ## Efficiency
12 | 
13 | ruy is designed to achieve high performance not just on very large sizes, as
14 | is the focus of many established libraries, but on whatever are the actual sizes
15 | and shapes of matrices most critical in current TensorFlow Lite applications.
16 | This often means quite small sizes, e.g. 100x100 or even 50x50, and all sorts of
17 | rectangular shapes. It's not as fast as completely specialized code for each
18 | shape, but it aims to offer a good compromise of speed across all shapes and a
19 | small binary size.
20 | 
21 | ## Documentation
22 | 
23 | Some documentation will eventually be available in the doc/ directory, see
24 | [doc/README.md](doc/README.md).
25 | 
26 | 
--------------------------------------------------------------------------------
/WORKSPACE:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Workspace file for the Ruy project.
16 | 
17 | workspace(name = "com_google_ruy")
18 | 
19 | load("@bazel_tools//tools/build_defs/repo:http.bzl", "http_archive")
20 | load("@bazel_tools//tools/build_defs/repo:utils.bzl", "maybe")
21 | 
22 | maybe(
23 |     local_repository,
24 |     name = "com_google_googletest",
25 |     path = "third_party/googletest",
26 | )
27 | 
28 | maybe(
29 |     new_local_repository,
30 |     name = "cpuinfo",
31 |     path = "third_party/cpuinfo",
32 |     build_file = "@//third_party:cpuinfo.BUILD",
33 | )
34 | 
35 | # skylib utility for additional bazel functionality.
36 | skylib_version = "0.9.0"
37 | http_archive(
38 |     name = "bazel_skylib",
39 |     type = "tar.gz",
40 |     url = "https://github.com/bazelbuild/bazel-skylib/releases/download/{}/bazel_skylib-{}.tar.gz".format (skylib_version, skylib_version),
41 |     sha256 = "1dde365491125a3db70731e25658dfdd3bc5dbdfd11b840b3e987ecf043c7ca0",
42 | )
43 | load("@bazel_skylib//lib:versions.bzl", "versions")
44 | versions.check(minimum_bazel_version = "2.0.0")
45 | 
--------------------------------------------------------------------------------
/cmake/bazel_to_cmake.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # Copyright 2021 Google LLC
 3 | #
 4 | # Licensed under the Apache License, Version 2.0 (the "License");
 5 | # you may not use this file except in compliance with the License.
 6 | # You may obtain a copy of the License at
 7 | #
 8 | #      https://www.apache.org/licenses/LICENSE-2.0
 9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | 
16 | this_script_dir="$(dirname "$0")"
17 | 
18 | root_dir="$(git -C "${this_script_dir}" rev-parse --show-toplevel)" || exit 1
19 | 
20 | if ! command -v python3 &> /dev/null; then
21 |   python_command=python
22 | else
23 |   python_command=python3
24 | fi
25 | 
26 | # Regenerate the CMakeLists.txt of every package that has a BUILD file. The
27 | # file list is streamed NUL-delimited so paths containing whitespace survive.
28 | while IFS= read -r -d '' build_file; do
29 |     package_dir="$(dirname "${build_file}")"
30 |     if [[ "${package_dir}" == "${root_dir}" ]]; then
31 |       # The root CMakeLists.txt is not generated.
32 |       continue
33 |     fi
34 |     "${python_command}" "${this_script_dir}/bazel_to_cmake.py" "${root_dir}" "${package_dir}" > "${package_dir}/CMakeLists.txt"
35 | done < <(find "${root_dir}" -type f -name BUILD -print0)
36 | 
--------------------------------------------------------------------------------
/cmake/run_android_test.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # Minimal script pushing and running a file on device!
 4 | # Contemporary versions of ADB properly propagate exit codes so nothing more
 5 | # is needed to let CTest report test success/failure.
 6 | 
 7 | # TODO: consider clearing temporary files after testing, although that will
 8 | # get in the way of debugging and will make code more complex... also,
 9 | # Ruy's test files aren't huge and people running these probably have
10 | # bigger clutter issues in their /data/local/tmp anyway. Anyway, if we want
11 | # to do this, we could copy IREE's code.
12 | 
13 | device_tmpdir=/data/local/tmp
14 | 
15 | adb push "$1" "${device_tmpdir}" || exit $?  # Never run a stale binary if the push failed.
16 | adb shell "${device_tmpdir}/$(basename "$1")"
17 | 
--------------------------------------------------------------------------------
/cmake/ruyConfig.cmake.in:
--------------------------------------------------------------------------------
 1 | # ruy CMake configuration file.
 2 | 
 3 | include(CMakeFindDependencyMacro)
 4 | 
 5 | find_dependency(cpuinfo)
 6 | 
 7 | @PACKAGE_INIT@
 8 | 
 9 | include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@Targets.cmake")
10 | 
--------------------------------------------------------------------------------
/cmake/ruy_add_all_subdirs.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2020 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Forked from IREE's iree_add_all_subdirs.cmake.
16 | 
17 | # ruy_add_all_subdirs
18 | #
19 | # CMake function to add all subdirectories of the current directory that contain
20 | # a CMakeLists.txt file, skipping any whose name matches "third_party".
21 | #
22 | # Takes no arguments.
23 | function(ruy_add_all_subdirs)
24 |   file(GLOB _CHILDREN RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_CURRENT_SOURCE_DIR}/*")
25 |   set(_DIRLIST "")
26 |   foreach(_CHILD ${_CHILDREN})
27 |     if((NOT ("${_CHILD}" MATCHES "third_party")) AND
28 |        (IS_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/${_CHILD}") AND
29 |        (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${_CHILD}/CMakeLists.txt"))
30 |       list(APPEND _DIRLIST "${_CHILD}")
31 |     endif()
32 |   endforeach()
33 | 
34 |   foreach(subdir ${_DIRLIST})
35 |     add_subdirectory("${subdir}")
36 |   endforeach()
37 | endfunction()
38 | 
--------------------------------------------------------------------------------
/cmake/ruy_cc_binary.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Forked from IREE's iree_cc_binary.cmake.
16 | 
17 | include(CMakeParseArguments)
18 | include(cmake/ruy_include_directories.cmake)
19 | 
20 | # ruy_cc_binary()
21 | #
22 | # Declares an executable target in the manner of Bazel's cc_binary rule.
23 | function(ruy_cc_binary)
24 |   set(_FLAG_KEYWORDS "TESTONLY")
25 |   set(_SINGLE_KEYWORDS "NAME")
26 |   set(_LIST_KEYWORDS "SRCS;COPTS;LINKOPTS;DEPS;TAGS")
27 |   cmake_parse_arguments(
28 |     _RULE "${_FLAG_KEYWORDS}" "${_SINGLE_KEYWORDS}" "${_LIST_KEYWORDS}" ${ARGN}
29 |   )
30 | 
31 |   # Test-only binaries are left out of minimal builds.
32 |   if(RUY_MINIMAL_BUILD AND _RULE_TESTONLY)
33 |     return()
34 |   endif()
35 | 
36 |   set(_TARGET "${_RULE_NAME}")
37 | 
38 |   # Create the target with an empty source list, then attach the sources.
39 |   add_executable(${_TARGET} "")
40 |   target_sources(${_TARGET} PRIVATE ${_RULE_SRCS})
41 | 
42 |   # The output file is named after the rule.
43 |   set_target_properties(${_TARGET} PROPERTIES OUTPUT_NAME "${_RULE_NAME}")
44 | 
45 |   # Include paths come from the shared ruy helper.
46 |   ruy_include_directories(${_TARGET} "${_RULE_DEPS}")
47 | 
48 |   # Per-rule compile and link flags stay private to this executable.
49 |   target_compile_options(${_TARGET} PRIVATE ${_RULE_COPTS})
50 |   target_link_options(${_TARGET} PRIVATE ${_RULE_LINKOPTS})
51 | 
52 |   # Dependencies are linked with PUBLIC visibility.
53 |   target_link_libraries(${_TARGET}
54 |     PUBLIC
55 |       ${_RULE_DEPS}
56 |   )
57 | endfunction()
58 | 
--------------------------------------------------------------------------------
/cmake/ruy_cc_library.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Forked from IREE's iree_cc_library.cmake.
16 | 
17 | include(CMakeParseArguments)
18 | include(cmake/ruy_include_directories.cmake)
19 | 
20 | # ruy_cc_library()
21 | #
22 | # CMake function to imitate Bazel's cc_library rule. With no SRCS an INTERFACE
23 | # (header-only) library is created, otherwise a STATIC one. In both cases a
24 | # ${PROJECT_NAME}::<NAME> alias is added, and non-TESTONLY targets get an
25 | # install rule in the ruyTargets export set.
26 | function(ruy_cc_library)
27 |   cmake_parse_arguments(
28 |     _RULE
29 |     "PUBLIC;TESTONLY"
30 |     "NAME"
31 |     "HDRS;SRCS;COPTS;DEFINES;LINKOPTS;DEPS"
32 |     ${ARGN}
33 |   )
34 | 
35 |   if(_RULE_TESTONLY AND RUY_MINIMAL_BUILD)
36 |     return()
37 |   endif()
38 | 
39 |   set(_NAME "${_RULE_NAME}")
40 | 
41 |   # Check if this is a header-only library.
42 |   if("${_RULE_SRCS}" STREQUAL "")
43 |     set(_RULE_IS_INTERFACE 1)
44 |   else()
45 |     set(_RULE_IS_INTERFACE 0)
46 |   endif()
47 | 
48 |   # Headers are installed under a path relative to ruy's own root. Using
49 |   # PROJECT_SOURCE_DIR (not CMAKE_SOURCE_DIR) keeps that path correct when ruy
50 |   # is embedded as a subdirectory of another project.
51 |   file(RELATIVE_PATH _SUBDIR "${PROJECT_SOURCE_DIR}" "${CMAKE_CURRENT_LIST_DIR}")
52 | 
53 |   if(_RULE_IS_INTERFACE)
54 |     # Generating a header-only library.
55 |     add_library(${_NAME} INTERFACE)
56 |     set_target_properties(${_NAME} PROPERTIES PUBLIC_HEADER "${_RULE_HDRS}")
57 |     target_include_directories(${_NAME}
58 |       INTERFACE
59 |         "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
60 |         "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>"
61 |     )
62 |     target_link_libraries(${_NAME}
63 |       INTERFACE
64 |         ${_RULE_DEPS}
65 |         ${_RULE_LINKOPTS}
66 |     )
67 |     target_compile_definitions(${_NAME} INTERFACE ${_RULE_DEFINES})
68 |   else()
69 |     # Generating a static binary library.
70 |     add_library(${_NAME} STATIC ${_RULE_SRCS} ${_RULE_HDRS})
71 |     set_target_properties(${_NAME} PROPERTIES PUBLIC_HEADER "${_RULE_HDRS}")
72 |     ruy_include_directories(${_NAME} "${_RULE_DEPS}")
73 |     target_compile_options(${_NAME} PRIVATE ${_RULE_COPTS})
74 |     target_link_libraries(${_NAME}
75 |       PUBLIC
76 |         ${_RULE_DEPS}
77 |       PRIVATE
78 |         ${_RULE_LINKOPTS}
79 |     )
80 |     target_compile_definitions(${_NAME} PUBLIC ${_RULE_DEFINES})
81 |   endif()
82 | 
83 |   add_library(${PROJECT_NAME}::${_NAME} ALIAS ${_NAME})
84 | 
85 |   if(NOT _RULE_TESTONLY)
86 |     # ARCHIVE covers the STATIC libraries created above; CMake < 3.14 rejects
87 |     # install(TARGETS) of a static library without an ARCHIVE destination.
88 |     install(
89 |       TARGETS ${_NAME}
90 |       EXPORT ruyTargets
91 |       ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}
92 |       LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR}
93 |       PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/${_SUBDIR}
94 |     )
95 |   endif()
96 | endfunction()
97 | 
--------------------------------------------------------------------------------
/cmake/ruy_cc_test.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2019 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Forked from IREE's iree_cc_test.cmake.
16 | 
17 | include(CMakeParseArguments)
18 | include(cmake/ruy_include_directories.cmake)
19 | 
20 | # ruy_cc_test()
21 | #
22 | # CMake function to imitate Bazel's cc_test rule. Builds the test executable
23 | # and registers it with CTest; TAGS become the test's LABELS. Does nothing
24 | # when RUY_MINIMAL_BUILD is set.
25 | function(ruy_cc_test)
26 |   cmake_parse_arguments(
27 |     _RULE
28 |     ""
29 |     "NAME"
30 |     "SRCS;COPTS;LINKOPTS;DEPS;TAGS"
31 |     ${ARGN}
32 |   )
33 | 
34 |   if(RUY_MINIMAL_BUILD)
35 |     return()
36 |   endif()
37 | 
38 |   set(_NAME "${_RULE_NAME}")
39 | 
40 |   add_executable(${_NAME} "")
41 |   target_sources(${_NAME} PRIVATE ${_RULE_SRCS})
42 |   set_target_properties(${_NAME} PROPERTIES OUTPUT_NAME "${_RULE_NAME}")
43 |   ruy_include_directories(${_NAME} "${_RULE_DEPS}")
44 |   target_compile_options(${_NAME} PRIVATE ${_RULE_COPTS})
45 |   target_link_options(${_NAME} PRIVATE ${_RULE_LINKOPTS})
46 |   target_link_libraries(${_NAME}
47 |     PUBLIC
48 |       ${_RULE_DEPS}
49 |   )
50 | 
51 |   if(ANDROID)
52 |     # On Android the test binary is handed to the runner script, which pushes
53 |     # it to the device with adb and runs it there. PROJECT_SOURCE_DIR (rather
54 |     # than CMAKE_SOURCE_DIR) locates the script even when ruy is embedded as a
55 |     # subdirectory of another project.
56 |     add_test(
57 |       NAME
58 |         ${_NAME}
59 |       COMMAND
60 |         "${PROJECT_SOURCE_DIR}/cmake/run_android_test.sh"
61 |         "$<TARGET_FILE:${_NAME}>"
62 |     )
63 |   else()
64 |     add_test(
65 |       NAME
66 |         ${_NAME}
67 |       COMMAND
68 |         "$<TARGET_FILE:${_NAME}>"
69 |     )
70 |   endif()
71 | 
72 |   # Bazel tags are exposed as CTest labels.
73 |   if(_RULE_TAGS)
74 |     set_property(TEST ${_NAME} PROPERTY LABELS ${_RULE_TAGS})
75 |   endif()
76 | endfunction()
77 | 
--------------------------------------------------------------------------------
/cmake/ruy_include_directories.cmake:
--------------------------------------------------------------------------------
 1 | # Copyright 2019-2021 Google LLC
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #      https://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | # Adds ruy's source root (build tree) and install include directory (installed
16 | # package) to the PUBLIC include directories of NAME. DEPS is currently unused.
17 | function(ruy_include_directories NAME DEPS)
18 |   target_include_directories(${NAME} PUBLIC
19 |     "$<BUILD_INTERFACE:${PROJECT_SOURCE_DIR}>"
20 |     "$<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>")
21 | endfunction()
22 | 
--------------------------------------------------------------------------------
/doc/README.md:
--------------------------------------------------------------------------------
1 | # Ruy documentation
2 | 
3 | This directory will eventually contain ruy documentation.
4 | 
--------------------------------------------------------------------------------
/doc/depgraph.sh:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | # Generates a graphviz dependency graph for :ruy, with details trimmed.
  4 | # Suggested rendering: pipe to `dot` (part of graphviz standard distribution)
  5 | #   doc/depgraph.sh | dot -Tsvg > depgraph.svg
  6 | 
  7 | drop=(
  8 |     ':platform'
  9 |     ':check_macros'
 10 |     ':asm_helpers'
 11 |     ':size_util'
 12 |     ':system_aligned_alloc'
 13 |     ':side_pair'
 14 |     ':opt_set'
 15 |     ':blocking_counter'
 16 |     ':wait'
 17 |     ':time'
 18 |     ':path'
 19 |     ':performance_advisory'
 20 |     ':tune'
 21 |     ':matrix'
 22 |     ':mat'
 23 |     ':mul_params'
 24 |     ':context_get_ctx'
 25 |     ':have_built_path_for'
 26 |     ':pack_common'
 27 |     ':kernel_common'
 28 |     ':trace'
 29 |     ':validate'
 30 |     'profiler:instrumentation'
 31 |     '\bclog\b'
 32 |     '\bcpuinfo\b'
 33 |     ':apply_multiplier'
 34 |     '\blabel='
 35 | )
 36 | 
 37 | graph="$(bazel query 'kind("cc_library", deps(//ruy))' --output graph --noimplicit_deps 2>/dev/null)"
 38 | 
 39 | graph="$(echo "${graph}" | sed 's|//ruy/\?||g')"
 40 | 
 41 | for t in "${drop[@]}"; do
 42 |   graph="$(echo "${graph}" | grep -v "${t}")"
 43 | done
 44 | 
 45 | graph="$(echo "${graph}" | sed 's|//:cpuinfo_with_unstripped_include_path||g')"
 46 | graph="$(echo "${graph}" | sed 's|//third_party/cpuinfo:[a-z0-9_]*|@cpuinfo|g')"
 47 | 
 48 | frontend=(
 49 |     ':ruy'
 50 |     ':context'
 51 |     ':frontend'
 52 |     ':prepare_packed_matrices'
 53 |     ':create_trmul_params'
 54 | )
 55 | 
 56 | middleend=(
 57 |     ':ctx'
 58 |     ':trmul_params'
 59 |     ':trmul'
 60 |     ':block_map'
 61 |     ':cpuinfo'
 62 |     ':cpu_cache_params'
 63 |     ':allocator'
 64 |     ':prepacked_cache'
 65 | )
 66 | 
 67 | backend=(
 68 |     ':kernel.*'
 69 |     ':pack.*'
 70 | )
 71 | 
 72 | threadpool=(
 73 |     ':thread_pool'
 74 | )
 75 | 
 76 | frontend_lines=()
 77 | middleend_lines=()
 78 | backend_lines=()
 79 | threadpool_lines=()
 80 | misc_lines=()
 81 | arrow_lines=()
 82 | 
 83 | while IFS= read -r line; do
 84 |   if [[ "${line}" =~ '->' ]]; then
 85 |     arrow_lines+=("${line}")
 86 |   else
 87 |     handled=false
 88 |     if [ $handled = false ]; then
 89 |         for f in "${frontend[@]}"; do
 90 |             if [[ "${line}" =~ ${f} ]]; then
 91 |                 frontend_lines+=("${line}")
 92 |                 handled=true
 93 |                 break
 94 |             fi
 95 |         done
 96 |     fi
 97 |     if [ $handled = false ]; then
 98 |         for f in "${middleend[@]}"; do
 99 |             if [[ "${line}" =~ ${f} ]]; then
100 |                 middleend_lines+=("${line}")
101 |                 handled=true
102 |                 break
103 |             fi
104 |         done
105 |     fi
106 |     if [ $handled = false ]; then
107 |         for f in "${backend[@]}"; do
108 |             if [[ "${line}" =~ ${f} ]]; then
109 |                 backend_lines+=("${line}")
110 |                 handled=true
111 |                 break
112 |             fi
113 |         done
114 |     fi
115 |     if [ $handled = false ]; then
116 |         for f in "${threadpool[@]}"; do
117 |             if [[ "${line}" =~ ${f} ]]; then
118 |                 threadpool_lines+=("${line}")
119 |                 handled=true
120 |                 break
121 |             fi
122 |         done
123 |     fi
124 |     if [ $handled = false ]; then
125 |         if [[ "${line}" =~ ^[[:space:]]+\" ]]; then
126 |             misc_lines+=("${line}")
127 |         fi
128 |     fi
129 |   fi
130 | done <<< "${graph}"
131 | 
132 | echo "digraph ruy {"
133 | echo "  splines = true"
134 | echo "  node [shape=box]"
135 | for f in "${frontend_lines[@]}"; do
136 |   echo "  $f [style=filled, color=\"#B2EBF2\"];"
137 | done
138 | for m in "${middleend_lines[@]}"; do
139 |   echo "  $m [style=filled, color=\"#C8E6C9\"];"
140 | done
141 | for b in "${backend_lines[@]}"; do
142 |   echo "  $b [style=filled, color=\"#FFCDD2\"];"
143 | done
144 | for b in "${threadpool_lines[@]}"; do
145 |   echo "  $b [style=filled, color=\"#FFF9C4\"];"
146 | done
147 | for m in "${misc_lines[@]}"; do
148 |   echo "$m"
149 | done
150 | for a in "${arrow_lines[@]}"; do
151 |   echo "$a"
152 | done
153 | echo "}"
154 | 
--------------------------------------------------------------------------------
/example/BUILD:
--------------------------------------------------------------------------------
 1 | load("//third_party/bazel_rules/rules_cc/cc:cc_binary.bzl", "cc_binary")
 2 | 
 3 | package(
 4 |     default_applicable_licenses = ["//third_party/ruy:license"],
 5 |     licenses = ["notice"],  # Apache 2.0
 6 | )
 7 | 
 8 | # Usage examples: two standalone binaries, each built from one source file and depending on //ruy.
 9 | cc_binary(
10 |     name = "example",
11 |     srcs = ["example.cc"],
12 |     deps = ["//ruy"],
13 | )
14 | 
15 | cc_binary(
16 |     name = "parametrized_example",
17 |     srcs = ["parametrized_example.cc"],
18 |     deps = ["//ruy"],
19 | )
20 | 
--------------------------------------------------------------------------------
/example/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This file is generated (whence no license header). Do not edit!
 2 | # To regenerate, run:
 3 | #   cmake/bazel_to_cmake.sh
 4 | 
 5 | ruy_cc_binary(
 6 |   NAME
 7 |     ruy_example_example
 8 |   SRCS
 9 |     example.cc
10 |   DEPS
11 |     ruy
12 | )
13 | 
14 | ruy_cc_binary(
15 |   NAME
16 |     ruy_example_parametrized_example
17 |   SRCS
18 |     parametrized_example.cc
19 |   DEPS
20 |     ruy
21 | )
22 | 
23 | ruy_add_all_subdirs()
24 | 
--------------------------------------------------------------------------------
/example/README.md:
--------------------------------------------------------------------------------
 1 | ## Introduction
 2 | 
 3 | These are some examples of how to use RUY.
 4 | 
 5 | ## BUILD
 6 | 
 7 | Build the example with Bazel:
 8 | ```
 9 | bazel build //example:example
10 | ```
11 | You can find the built binary under the directory:
12 | ```
13 | ./bazel-bin/example
14 | ```
15 | 
--------------------------------------------------------------------------------
/ruy/allocator.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/allocator.h"
 17 | 
 18 | #include "ruy/opt_set.h"
 19 | #include "ruy/size_util.h"
 20 | #include "ruy/system_aligned_alloc.h"
 21 | 
 22 | namespace ruy {
 23 | 
 24 | Allocator::~Allocator() {
 25 |   FreeAll();  // Releases any outstanding fallback blocks.
 26 |   detail::SystemAlignedFree(ptr_);  // Then releases the main buffer.
 27 | }
 28 | 
 29 | void* Allocator::AllocateFast(std::ptrdiff_t num_bytes) {
 30 |   if (current_ + num_bytes > size_) {  // Not enough room left in the buffer.
 31 |     return nullptr;
 32 |   }
 33 |   void* ret = static_cast<char*>(ptr_) + current_;  // Byte-wise bump pointer.
 34 |   current_ += num_bytes;
 35 |   return ret;
 36 | }
 37 | 
 38 | void* Allocator::AllocateSlow(std::ptrdiff_t num_bytes) {
 39 |   void* p = detail::SystemAlignedAlloc(num_bytes);
 40 |   fallback_blocks_total_size_ += num_bytes;  // Used by FreeAll() to regrow.
 41 |   fallback_blocks_.push_back(p);  // Remembered so FreeAll() can free it.
 42 |   return p;
 43 | }
 44 | 
 45 | void* Allocator::AllocateBytes(std::ptrdiff_t num_bytes) {
 46 |   // Zero-sized requests get no storage at all.
 47 |   if (num_bytes == 0) return nullptr;
 48 |   // Round the size up to the block alignment granularity.
 49 |   const std::ptrdiff_t padded_size =
 50 |       round_up_pot(num_bytes, detail::kMinimumBlockAlignment);
 51 |   // Try the bump-pointer buffer first; fall back to the system allocator.
 52 |   void* block = AllocateFast(padded_size);
 53 |   if (block == nullptr) block = AllocateSlow(padded_size);
 54 |   return block;
 55 | }
 56 | 
 57 | void* Allocator::AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
 58 |                                                    const void* to_avoid) {
 59 | #if RUY_OPT(AVOID_ALIASING)
 60 |   if (num_bytes == 0) {
 61 |     return nullptr;
 62 |   }
 63 |   // The minimum L1D cache aliasing periodicity in bytes that we expect to
 64 |   // encounter on any device. This does not seem to be documented, but
 65 |   // empirically we observe the following:
 66 |   //   Cortex-A53:   1024
 67 |   //   Cortex-A55r1: 2048
 68 |   //   Cortex-A76:   not as easily observable.
 69 |   // Over-estimating this makes the AVOID_ALIASING optimization useless on
 70 |   // devices with lower periodicity.
 71 |   // Under-estimating this by 2x should be harmless.
 72 |   // Under-estimating this by a larger factor should gradually degrade
 73 |   // performance due to cache aliasing causing mutual eviction between
 74 |   // the packed matrix data, and the source matrix data being prefetched by the
 75 |   // CPU ahead of the packing code execution.
 76 |   static constexpr std::uint32_t kMinPeriod = 1024;
 77 |   static_assert(is_pot(kMinPeriod), "");
 78 |   void* p = AllocateBytes(num_bytes + kMinPeriod);  // Extra room for the offset.
 79 |   auto unsigned_low_bits = [](const void* p) {
 80 |     return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
 81 |   };
 82 |   // This relies on unsigned integer overflow wrapping around.
 83 |   std::uint32_t diff_modulus =
 84 |       (unsigned_low_bits(p) - unsigned_low_bits(to_avoid)) % kMinPeriod;
 85 |   // diff_modulus is in [0, kMinPeriod).
 86 |   // We want it as close as possible to the middle of that interval,
 87 |   // kMinPeriod/2. The bad 'aliasing' case, that we are working to avoid,
 88 |   // is when diff_modulus is close to the ends of that interval, 0 or
 89 |   // kMinPeriod. So we want to add an offset of kMinPeriod/2 if it is in the
 90 |   // first or the last quarter of that interval.
 91 |   bool need_offset =
 92 |       diff_modulus < kMinPeriod / 4 || diff_modulus > 3 * kMinPeriod / 4;
 93 |   return static_cast<char*>(p) + (need_offset ? (kMinPeriod / 2) : 0);
 94 | #else
 95 |   (void)to_avoid;
 96 |   return AllocateBytes(num_bytes);
 97 | #endif
 98 | }
 99 | 
100 | void Allocator::FreeAll() {
101 |   current_ = 0;  // Rewind the bump pointer.
102 |   if (fallback_blocks_.empty()) {  // Nothing spilled: keep the buffer as is.
103 |     return;
104 |   }
105 | 
106 |   // Free all memory before reallocating `ptr_`.
107 |   // This minimizes the memory high-water-mark.
108 |   detail::SystemAlignedFree(ptr_);
109 |   for (void* p : fallback_blocks_) {
110 |     detail::SystemAlignedFree(p);
111 |   }
112 | 
113 |   // We reallocate to the exact new size, rather than growing
114 |   // exponentially like std::vector. This means linear instead of logarithmic
115 |   // bound on the number of allocation in some worst-case calling patterns.
116 |   // This is considered worth it because minimizing memory usage is important
117 |   // and actual calling patterns in applications that we care about still
118 |   // reach the no-further-allocations steady state in a small finite number
119 |   // of iterations.
120 |   std::ptrdiff_t new_size = size_ + fallback_blocks_total_size_;
121 |   ptr_ = detail::SystemAlignedAlloc(new_size);
122 |   size_ = new_size;
123 | 
124 |   fallback_blocks_.clear();  // The blocks themselves were freed above.
125 |   fallback_blocks_total_size_ = 0;
126 | }
127 | 
128 | }  // namespace ruy
129 | 
--------------------------------------------------------------------------------
/ruy/allocator.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #ifndef RUY_RUY_ALLOCATOR_H_
 17 | #define RUY_RUY_ALLOCATOR_H_
 18 | 
 19 | #include <cstddef>
 20 | #include <cstdint>
 21 | #include <memory>
 22 | #include <vector>
 23 | 
 24 | namespace ruy {
 25 | 
 26 | // Specialized allocator designed to converge to a steady-state where all
 27 | // allocations are bump-ptr allocations from an already-allocated buffer.
 28 | //
 29 | // To support these constraints, this allocator only supports two
 30 | // operations.
 31 | // - AllocateBytes/Allocate: allocates a pointer to storage of a
 32 | // specified size, which will be aligned to kMinimumBlockAlignment.
 33 | // - FreeAll: frees all previous allocations (but retains the internal
 34 | // buffer to minimize future calls into the system allocator).
 35 | //
 36 | // This class is specialized for supporting just those two operations
 37 | // under this specific steady-state usage pattern. Extending this class
 38 | // with new allocation interfaces that don't fit that pattern is probably not
 39 | // the right choice. Instead, build a new class on top of
 40 | // SystemAlignedAlloc/SystemAlignedFree.
 41 | //
 42 | // All operations happen on aligned blocks for simplicity.
 43 | //
 44 | // Theory of operation:
 45 | //
 46 | // - ptr_, current_, and size_ implement a basic bump-ptr allocator.
 47 | //
 48 | // - in AllocateBytes, the fast path is just a bump-ptr
 49 | // allocation. If our bump-ptr allocator doesn't have enough space for an
 50 | // allocation, then we allocate a block from the system allocator to
 51 | // service the allocation request. We save that block in fallback_blocks_
 52 | // and track the total size of the fallback blocks in
 53 | // fallback_blocks_total_size_.
 54 | //
 55 | // - in FreeAll, the fast path just resets the bump-ptr allocator. If
 56 | // there are any fallback blocks, we free them and reallocate the
 57 | // bump-ptr allocator's buffer so that the next sequence of allocations
 58 | // will hopefully not need any fallback blocks.
 59 | class Allocator final {
 60 |  public:
 61 |   ~Allocator();
 62 | 
 63 |   // Allocate a buffer.
 64 |   void* AllocateBytes(std::ptrdiff_t num_bytes);
 65 |   // Allocate a buffer, trying to avoid having its address close to aliasing
 66 |   // the specified `to_avoid` in the L1D cache.
 67 |   void* AllocateBytesAvoidingAliasingWith(std::ptrdiff_t num_bytes,
 68 |                                           const void* to_avoid);
 69 |   // Allocate an array of `count` elements of type T.
 70 |   template <typename T>
 71 |   T* Allocate(std::ptrdiff_t count) {
 72 |     return static_cast<T*>(AllocateBytes(count * sizeof(T)));
 73 |   }
 74 |   // Allocate an array of `count` elements of the given `Pointer` type's
 75 |   // element_type.
 76 |   template <typename Pointer>
 77 |   void Allocate(std::ptrdiff_t count, Pointer* out) {
 78 |     using T = typename std::pointer_traits<Pointer>::element_type;
 79 |     *out = Allocate<T>(count);
 80 |   }
 81 | 
 82 |   // Free all allocated blocks. Internally consolidate allocated buffers as
 83 |   // explained in the class comment.
 84 |   void FreeAll();
 85 | 
 86 |  private:
 87 |   void operator=(const Allocator&) = delete;
 88 |   void* AllocateFast(std::ptrdiff_t num_bytes);
 89 |   void* AllocateSlow(std::ptrdiff_t num_bytes);
 90 | 
 91 |   void* ptr_ = nullptr;  // Main bump-ptr buffer.
 92 |   std::ptrdiff_t current_ = 0;  // Offset of the next free byte in ptr_.
 93 |   std::ptrdiff_t size_ = 0;  // Size of the ptr_ buffer, in bytes.
 94 |   std::vector<void*> fallback_blocks_;
 95 |   std::ptrdiff_t fallback_blocks_total_size_ = 0;
 96 | };
 97 | 
 98 | }  // namespace ruy
 99 | 
100 | #endif  // RUY_RUY_ALLOCATOR_H_
101 | 
--------------------------------------------------------------------------------
/ruy/allocator_test.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/allocator.h"
 17 | 
 18 | #include "ruy/gtest_wrapper.h"
 19 | 
 20 | namespace ruy {
 21 | namespace {
 22 | 
 23 | TEST(AllocatorTest, ReturnsValidMemory) {
 24 |   Allocator allocator;
 25 |   int *p;
 26 |   allocator.Allocate(1, &p);  // Request storage for a single int.
 27 |   ASSERT_NE(p, nullptr);
 28 | 
 29 |   // If this is bogus memory, ASan will cause this test to fail.
 30 |   *p = 42;
 31 | 
 32 |   allocator.FreeAll();
 33 | }
 34 | 
 35 | TEST(AllocatorTest, NoLeak) {
 36 |   Allocator allocator;
 37 |   // Allocate and free some ridiculously large total amount of memory, so
 38 |   // that a leak will hopefully cause some sort of resource exhaustion.
 39 |   //
 40 |   // Despite the large number of allocations, this test is actually quite
 41 |   // fast, since our fast-path allocation logic is very fast.
 42 |   constexpr int kNumAllocations = 100 * 1024;
 43 |   constexpr int kAllocationSize = 1024 * 1024;  // 1 MiB requested per iteration.
 44 |   for (int i = 0; i < kNumAllocations; i++) {
 45 |     char *p;
 46 |     allocator.Allocate(kAllocationSize, &p);
 47 |     allocator.FreeAll();
 48 |   }
 49 | }
 50 | 
 51 | TEST(AllocatorTest, IncreasingSizes) {
 52 |   Allocator allocator;
 53 |   // Allocate sizes that increase by small amounts across FreeAll calls.
 54 |   for (int i = 1; i < 100 * 1024; i++) {
 55 |     char *p;
 56 |     allocator.Allocate(i, &p);
 57 |     allocator.FreeAll();  // Regrows the main buffer if a fallback was needed.
 58 |   }
 59 | }
 60 | 
 61 | TEST(AllocatorTest, ManySmallAllocations) {
 62 |   Allocator allocator;
 63 |   // Allocate many small allocations between FreeAll calls.
 64 |   for (int i = 0; i < 10 * 1024; i += 100) {
 65 |     for (int j = 0; j < i; j++) {
 66 |       char *p;
 67 |       allocator.Allocate(1, &p);  // One char requested; p is never read.
 68 |     }
 69 |     allocator.FreeAll();
 70 |   }
 71 | }
 72 | 
 73 | TEST(AllocatorTest, DestructorHandlesMainBumpPtr) {
 74 |   // This is a white-box test.
 75 |   Allocator allocator;
 76 |   allocator.AllocateBytes(1);
 77 |   allocator.FreeAll();
 78 |   // After the call to FreeAll, the allocator will consolidate all of the memory
 79 |   // into the main bump-ptr allocator's block, which we then expect to be freed
 80 |   // in the destructor.
 81 |   //
 82 |   // There are no test assertions: we rely on a leak checker to fail this test
 83 |   // if the destructor does not free that block.
 84 | }
 85 | 
 86 | TEST(AllocatorTest, DestructorHandlesFallbackBlocks) {
 87 |   // This is a white-box test.
 88 |   Allocator allocator;
 89 |   // Since we just created the allocator, this will allocate a fallback block,
 90 |   // which we then expect to be freed in the destructor.
 91 |   //
 92 |   // There are no test assertions: we rely on a leak checker to fail this test
 93 |   // if the destructor does not free that block.
 94 |   allocator.AllocateBytes(1);
 95 | }
 96 | 
 97 | TEST(AllocatorTest, AvoidAliasing) {
 98 |   Allocator allocator;
 99 |   // Run twice with a FreeAll in between, just in case some future
100 |   // change of internal logic makes that bug-prone.
101 |   for (int repeat = 0; repeat < 2; repeat++) {
102 |     for (int i = 1; i < 100; i++) {
103 |       const void *to_avoid =
104 |           reinterpret_cast<const void *>(0x1234567890123ull + 123 * i);
105 |       void *ptr = allocator.AllocateBytesAvoidingAliasingWith(i * 10, to_avoid);
106 |       auto unsigned_low_bits = [](const void *p) {
107 |         return static_cast<std::uint32_t>(reinterpret_cast<std::uintptr_t>(p));
108 |       };
109 |       static constexpr int kMinPeriod = 1024;
110 |       std::uint32_t unsigned_diff =
111 |           (unsigned_low_bits(ptr) - unsigned_low_bits(to_avoid)) % kMinPeriod;
112 |       std::uint32_t unsigned_diff_mod = unsigned_diff % kMinPeriod;
113 |       ASSERT_TRUE(unsigned_diff_mod >= (kMinPeriod / 4) &&
114 |                   unsigned_diff_mod <= 3 * (kMinPeriod / 4));
115 |     }
116 |     allocator.FreeAll();
117 |   }
118 | }
119 | 
120 | }  // namespace
121 | }  // namespace ruy
122 | 
123 | int main(int argc, char **argv) {
124 |   ::testing::InitGoogleTest(&argc, argv);  // Parses googletest's own flags.
125 |   return RUN_ALL_TESTS();
126 | }
127 | 
--------------------------------------------------------------------------------
/ruy/apply_multiplier.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/apply_multiplier.h"
17 | 
18 | #include <cmath>
19 | #include <cstdint>
20 | #include <cstdlib>
21 | #include <limits>
22 | 
23 | namespace ruy {
24 | namespace detail {
25 | 
26 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
27 | // Warning: this code is not meant to be bit-exact-normative.
28 | // Please refer to the class comment of ruy::MulParams, in mul_params.h.
29 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
30 | // Simplified multiplier application function
31 | //
32 | // Double rounding and symmetric rounding are removed compared to reference.
33 | // Double rounding seems unnecessary and can complicate implementations.
34 | // Symmetric rounding also adds implementation complexity.
35 | //
36 | // Composed of a single rounding shift right and can lead to more HW
37 | // friendly implementations.
38 | //
39 | // On NEON this can be translated to a SQDMULH + rounding shift right sequence.
40 | // The use of SQDMULH rather than SQRDMULH gives a result that is
41 | // equivalent to a single rounded shift since the truncating shift of SQDMULH
42 | // can be combined with the rounding right shift via the formula (for k>=1):
43 | //  ((x>>31)+(1<<(k-1)))>>k = (x + (1<<(30+k)))>>(31+k)
44 | //
45 | // Preconditions:
46 | // - quantized_multiplier >= 0
47 | // - shift is -31 to +7 (negative for right shift)
48 | std::int32_t MultiplyByQuantizedMultiplier(std::int32_t x,
49 |                                            std::int32_t quantized_multiplier,
50 |                                            int shift) {
51 |   RUY_CHECK_GE(shift, -31);
52 | 
53 |   int total_shift = 31 - shift;
54 | 
55 |   std::int64_t x_64(x);
56 |   std::int64_t quantized_multiplier_64(quantized_multiplier);
57 |   std::int64_t round = std::int64_t{1} << (total_shift - 1);
58 |   std::int64_t result = x_64 * quantized_multiplier_64 + round;
59 |   result = result >> total_shift;  // Single rounding shift right.
60 | 
61 |   RUY_DCHECK_GE(result, std::numeric_limits<std::int32_t>::lowest());
62 |   RUY_DCHECK_LE(result, std::numeric_limits<std::int32_t>::max());
63 | 
64 |   return static_cast<std::int32_t>(result);
65 | }
66 | 
67 | }  // namespace detail
68 | 
69 | }  // namespace ruy
70 | 
--------------------------------------------------------------------------------
/ruy/apply_multiplier.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Provides a reference (portable, non-optimized) ApplyMultiplier function.
17 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
18 | // Warning: this code is not meant to be bit-exact-normative.
19 | // Please refer to the class comment of ruy::MulParams, in mul_params.h.
20 | // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
21 | 
22 | #ifndef RUY_RUY_APPLY_MULTIPLIER_H_
23 | #define RUY_RUY_APPLY_MULTIPLIER_H_
24 | 
25 | #include <cstdint>
26 | #include <type_traits>
27 | 
28 | #include "ruy/check_macros.h"
29 | #include "ruy/mul_params.h"
30 | 
31 | namespace ruy {
32 | 
33 | // Applies the quantized multiplier to the `*accum` accumulator value, if
34 | // applicable, that is, if AccumScalar==int32 and DstScalar!=int32. Otherwise,
35 | // does nothing.
36 | //
37 | // This is slow, portable, 'reference' code. It should only be used in
38 | // ReferenceMul and in Path::kStandardCpp. There isn't a point in optimizing it,
39 | // either. Fast paths have that multiplier work done as part of the kernel,
40 | // typically written in assembly anyway.
41 | template <typename AccumScalar, typename DstScalar>
42 | void ApplyMultiplier(const MulParams<AccumScalar, DstScalar>& mul_params,
43 |                      int channel, AccumScalar* accum);
44 | 
45 | namespace detail {
46 | 
47 | // Copied from TF Lite code.
48 | std::int32_t MultiplyByQuantizedMultiplier(std::int32_t x,
49 |                                            std::int32_t quantized_multiplier,
50 |                                            int shift);
51 | 
52 | // Helper to apply a fixed-point multiplier.  Only 'applicable' if AccumScalar
53 | // is int32 (i.e. in all cases except floating-point) and if the destination is
54 | // not int32 (i.e. unless the user wants to get raw accumulators).
55 | template <typename AccumScalar, typename DstScalar,
56 |           bool IsApplicable = std::is_same<AccumScalar, std::int32_t>::value &&
57 |                               !std::is_same<DstScalar, std::int32_t>::value>
58 | struct ApplyMultiplierImpl {};
59 | 
60 | // Specialization in non-applicable case: do nothing.
61 | template <typename AccumScalar, typename DstScalar>
62 | struct ApplyMultiplierImpl<AccumScalar, DstScalar, false> {
63 |   static void Run(const MulParams<AccumScalar, DstScalar>&, int, AccumScalar*) {
64 |   }
65 | };
66 | 
67 | template <typename AccumScalar, typename DstScalar>
68 | struct ApplyMultiplierImpl<AccumScalar, DstScalar, true> {
69 |   static void Run(const MulParams<AccumScalar, DstScalar>& mul_params,
70 |                   int channel, AccumScalar* accum) {
71 |     AccumScalar m = mul_params.multiplier_fixedpoint_perchannel()
72 |                         ? mul_params.multiplier_fixedpoint_perchannel()[channel]
73 |                         : mul_params.multiplier_fixedpoint();
74 |     int e = mul_params.multiplier_exponent_perchannel()
75 |                 ? mul_params.multiplier_exponent_perchannel()[channel]
76 |                 : mul_params.multiplier_exponent();
77 |     *accum = MultiplyByQuantizedMultiplier(*accum, m, e);
78 |   }
79 | };
80 | 
81 | }  // namespace detail
82 | 
83 | template <typename AccumScalar, typename DstScalar>
84 | void ApplyMultiplier(const MulParams<AccumScalar, DstScalar>& mul_params,
85 |                      int channel, AccumScalar* accum) {
86 |   detail::ApplyMultiplierImpl<AccumScalar, DstScalar>::Run(mul_params, channel,
87 |                                                            accum);
88 | }
89 | 
90 | }  // namespace ruy
91 | 
92 | #endif  // RUY_RUY_APPLY_MULTIPLIER_H_
93 | 
--------------------------------------------------------------------------------
/ruy/asm_helpers.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Some helpers to write inline asm.
17 | 
18 | #ifndef RUY_RUY_ASM_HELPERS_H_
19 | #define RUY_RUY_ASM_HELPERS_H_
20 | 
21 | #include "ruy/opt_set.h"
22 | 
23 | // Enclose load-prefetch instructions in RUY_PREFETCH_LOAD() so we can
24 | // conditionally enable them based on RUY_OPT_SET (else expands to nothing).
25 | #if RUY_OPT(PREFETCH_LOAD)
26 | #define RUY_PREFETCH_LOAD(X) X
27 | #else
28 | #define RUY_PREFETCH_LOAD(X)
29 | #endif
30 | 
31 | // Enclose store-prefetch instructions in RUY_PREFETCH_STORE() so we can
32 | // conditionally enable them based on RUY_OPT_SET (else expands to nothing).
33 | #if RUY_OPT(PREFETCH_STORE)
34 | #define RUY_PREFETCH_STORE(X) X
35 | #else
36 | #define RUY_PREFETCH_STORE(X)
37 | #endif
38 | 
39 | // Stringification macro; the extra level expands macro arguments first.
40 | #define RUY_STR(s) RUY_STR_UNEXPANDED(s)
41 | #define RUY_STR_UNEXPANDED(s) #s
42 | 
43 | #endif  // RUY_RUY_ASM_HELPERS_H_
44 | 
--------------------------------------------------------------------------------
/ruy/blocking_counter.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/blocking_counter.h"
17 | 
18 | #include "ruy/check_macros.h"
19 | #include "ruy/wait.h"
20 | 
21 | namespace ruy {
22 | 
23 | void BlockingCounter::Reset(int initial_count) {
24 |   int old_count_value = count_.load(std::memory_order_relaxed);
25 |   RUY_DCHECK_EQ(old_count_value, 0);  // Must not reset a counter still in use.
26 |   (void)old_count_value;  // Avoids an unused warning if the DCHECK is a no-op.
27 |   count_.store(initial_count, std::memory_order_release);
28 | }
29 | 
30 | bool BlockingCounter::DecrementCount() {
31 |   int old_count_value = count_.fetch_sub(1, std::memory_order_acq_rel);
32 |   RUY_DCHECK_GT(old_count_value, 0);
33 |   int count_value = old_count_value - 1;
34 |   bool hit_zero = (count_value == 0);
35 |   if (hit_zero) {  // Only the final decrement wakes up the waiters.
36 |     std::lock_guard<std::mutex> lock(count_mutex_);
37 |     count_cond_.notify_all();
38 |   }
39 |   return hit_zero;
40 | }
41 | 
42 | void BlockingCounter::Wait(const Duration spin_duration) {
43 |   const auto& condition = [this]() {  // True once count_ has reached zero.
44 |     return count_.load(std::memory_order_acquire) == 0;
45 |   };
46 |   ruy::Wait(condition, spin_duration, &count_cond_, &count_mutex_);
47 | }
48 | 
49 | }  // namespace ruy
50 | 
--------------------------------------------------------------------------------
/ruy/blocking_counter.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_BLOCKING_COUNTER_H_
17 | #define RUY_RUY_BLOCKING_COUNTER_H_
18 | 
19 | #include <atomic>
20 | #include <condition_variable>  // NOLINT(build/c++11) // IWYU pragma: keep
21 | #include <mutex>               // NOLINT(build/c++11) // IWYU pragma: keep
22 | 
23 | #include "ruy/time.h"
24 | 
25 | namespace ruy {
26 | 
27 | // A BlockingCounter lets one thread wait for N events to occur.
28 | // This is how the master thread waits for all the worker threads
29 | // to have finished working.
30 | // The waiting is done using a naive spinlock waiting for the atomic
31 | // count_ to hit the value 0. This is acceptable because in our usage
32 | // pattern, BlockingCounter is used only to synchronize threads after
33 | // short-lived tasks (performing parts of the same GEMM). It is not used
34 | // for synchronizing longer waits (resuming work on the next GEMM).
35 | class BlockingCounter {
36 |  public:
37 |   BlockingCounter() : count_(0) {}
38 | 
39 |   // Sets/resets the counter; initial_count is the number of
40 |   // decrementing events that the Wait() call will be waiting for.
41 |   void Reset(int initial_count);
42 | 
43 |   // Decrements the counter; if the counter hits zero, signals
44 |   // the threads that were waiting for that, and returns true.
45 |   // Otherwise (if the decremented count is still nonzero),
46 |   // returns false.
47 |   bool DecrementCount();
48 | 
49 |   // Waits for the N other threads (N having been set by Reset())
50 |   // to hit the BlockingCounter.
51 |   //
52 |   // Will first spin-wait for `spin_duration` before reverting to passive wait.
53 |   void Wait(const Duration spin_duration);
54 | 
55 |  private:
56 |   std::atomic<int> count_;  // Number of decrements still awaited.
57 | 
58 |   // The condition variable and mutex allowing to passively wait for count_
59 |   // to reach the value zero, in the case of longer waits.
60 |   std::condition_variable count_cond_;
61 |   std::mutex count_mutex_;
62 | };
63 | 
64 | }  // namespace ruy
65 | 
66 | #endif  // RUY_RUY_BLOCKING_COUNTER_H_
67 | 
--------------------------------------------------------------------------------
/ruy/build_defs.bzl:
--------------------------------------------------------------------------------
 1 | """Build definitions for Ruy."""
 2 | 
 3 | # Helper for ruy_copts().
 4 | # Returns the warning flags to use for all ruy code (none on Windows toolchains).
 5 | def ruy_copts_warnings():
 6 |     return select({
 7 |         "//tools/cc_target_os:windows": [
 8 |             # We run into trouble on Windows toolchains with warning flags,
 9 |             # as mentioned in the comments below on each flag.
10 |             # We could be more aggressive in enabling supported warnings on each
11 |             # Windows toolchain, but we compromise with keeping BUILD files simple
12 |             # by limiting the number of config_setting's.
13 |         ],
14 |         "//conditions:default": [
15 |             "-Wall",
16 |             # Some clang-based Windows toolchains have more warnings in -Wextra.
17 |             "-Wextra",
18 |             # Warn on preprocessor expansion of an undefined token, e.g. catching
19 |             # typos such as `#ifdef __linus__` instead of `#ifdef __linux__`.
20 |             # Not supported by MSVC.
21 |             "-Wundef",
22 |         ],
23 |     })
24 | 
25 | # Helper for ruy_copts().
26 | # Returns ["-mfpu=neon"] under //ruy:arm32_assuming_neon, otherwise no flags.
27 | def ruy_copts_neon():
28 |     return select({
29 |         # OK to crash old devices that lack full NEON support.
30 |         # No need to pass -mfloat-abi=softfp, that is already on.
31 |         "//ruy:arm32_assuming_neon": [
32 |             "-mfpu=neon",
33 |         ],
34 |         "//conditions:default": [],
35 |     })
36 | 
37 | # Helper for ruy_copts().
38 | # Returns ["-O3"] for all ruy code, unless //ruy:do_not_want_O3 matches.
39 | def ruy_copts_optimize():
40 |     return select({
41 |         # On some toolchains, typically mobile, "-c opt" is interpreted by
42 |         # default as "optimize for size, not for speed". For Ruy code,
43 |         # optimizing for speed is the better compromise, so we override that.
44 |         # Careful to keep debug builds debuggable, whence the select based
45 |         # on the compilation mode.
46 |         "//ruy:do_not_want_O3": [],
47 |         "//conditions:default": ["-O3"],
48 |     })
49 | 
50 | # Returns all ruy copts: warning + NEON + optimization flags, concatenated.
51 | def ruy_copts():
52 |     return ruy_copts_warnings() + ruy_copts_neon() + ruy_copts_optimize()
53 | 
54 | def ruy_copts_avx():  # ["-mavx"] under //ruy:x86_64_and_not_msvc, else [].
55 |     return select({
56 |         "//ruy:x86_64_and_not_msvc": ["-mavx"],
57 |         "//conditions:default": [],
58 |     })
59 | 
60 | def ruy_copts_avx2_fma():  # AVX2 + FMA flags under x86_64_and_not_msvc, else [].
61 |     return select({
62 |         "//ruy:x86_64_and_not_msvc": ["-mavx2", "-mfma"],
63 |         "//conditions:default": [],
64 |     })
65 | 
66 | def ruy_copts_avx512():
67 |     # In some clang-based toolchains, in the default compilation mode (not -c opt),
68 |     # heavy spillage in the AVX512 kernels results in stack frames > 50k. This issue does not exist
69 |     # in optimized builds (-c opt). Presumably why $(STACK_FRAME_UNLIMITED) is passed - TODO confirm.
70 |     return select({
71 |         "//ruy:x86_64_and_not_msvc": ["$(STACK_FRAME_UNLIMITED)", "-mavx512f", "-mavx512vl", "-mavx512cd", "-mavx512bw", "-mavx512dq"],
72 |         "//conditions:default": [],
73 |     })
74 | 
--------------------------------------------------------------------------------
/ruy/build_defs.oss.bzl:
--------------------------------------------------------------------------------
 1 | """Build definitions for Ruy that are specific to the open-source build."""
 2 | 
 3 | # Used for targets that #include <thread>
 4 | def ruy_linkopts_thread_standard_library():
 5 |     # In open source builds, GCC is a common occurrence. It requires "-pthread"
 6 |     # to use the C++11 <thread> standard library header. This breaks the
 7 |     # opensource build on Windows and probably some other platforms, so that
 8 |     # will need to be fixed as needed. Ideally we would like to do this based
 9 |     # on GCC being the compiler, but that does not seem to be easy to achieve
10 |     # with Bazel. Instead we do the following, which is copied from
11 |     # https://github.com/abseil/abseil-cpp/blob/1112609635037a32435de7aa70a9188dcb591458/absl/base/BUILD.bazel#L155
12 |     return select({
13 |         "//tools/cc_target_os:windows": [],
14 |         "//conditions:default": ["-pthread"],
15 |     })
16 | 
--------------------------------------------------------------------------------
/ruy/check_macros.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | // self-contained, minimal, CHECK/DCHECK macros similar to glog.
 17 | 
 18 | #ifndef RUY_RUY_CHECK_MACROS_H_
 19 | #define RUY_RUY_CHECK_MACROS_H_
 20 | 
 21 | #include <cstdio>
 22 | #include <cstdlib>
 23 | #include <functional>
 24 | #include <type_traits>
 25 | 
 26 | namespace ruy {
 27 | namespace check_macros {
 28 | 
 29 | constexpr int kValueBufSize = 32;
 30 | 
 31 | template <typename T, typename Enable = void>
 32 | struct ToString {  // Fallback for types without a dedicated formatter.
 33 |   static void Run(const T&, char* buf) { snprintf(buf, kValueBufSize, "(?)"); }
 34 | };
 35 | 
 36 | template <>
 37 | struct ToString<float, void> {
 38 |   static void Run(float value, char* buf) {
 39 |     snprintf(buf, kValueBufSize, "%.9g", static_cast<double>(value));
 40 |   }
 41 | };
 42 | 
 43 | template <>
 44 | struct ToString<double, void> {
 45 |   static void Run(double value, char* buf) {
 46 |     snprintf(buf, kValueBufSize, "%.16g", value);
 47 |   }
 48 | };
 49 | 
 50 | template <typename T>
 51 | struct ToString<T, typename std::enable_if<std::is_integral<T>::value>::type> {
 52 |   static void Run(const T& value, char* buf) {
 53 |     snprintf(buf, kValueBufSize, "%lld", static_cast<long long>(value));
 54 |   }
 55 | };
 56 | 
 57 | template <typename T>
 58 | struct ToString<T*, void> {
 59 |   static void Run(T* value, char* buf) {
 60 |     snprintf(buf, kValueBufSize, "%p", value);
 61 |   }
 62 | };
 63 | 
 64 | template <typename T>
 65 | struct ToString<T, typename std::enable_if<std::is_enum<T>::value>::type> {
 66 |   static void Run(const T& value, char* buf) {
 67 |     snprintf(buf, kValueBufSize, "(enum value %d)", static_cast<int>(value));
 68 |   }
 69 | };
 70 | 
 71 | inline void CheckImpl(bool condition, const char* file, int line,
 72 |                       const char* macro, const char* condition_str) {
 73 |   // Nothing to report when the checked condition holds.
 74 |   if (condition) return;
 75 |   fprintf(stderr, "%s:%d: %s condition not satisfied: %s\n", file, line,
 76 |           macro, condition_str);
 77 |   abort();
 78 | }
 79 | 
 80 | template <template <typename T> class Comparison, typename LhsType,
 81 |           typename RhsType>
 82 | inline void CheckImpl(const char* file, int line, const char* macro,
 83 |                       const char* lhs, const LhsType& lhs_value,
 84 |                       const char* op_symbol, const char* rhs,
 85 |                       const RhsType& rhs_value) {
 86 |   using CommonType = typename std::common_type<LhsType, RhsType>::type;
 87 |   if (!Comparison<CommonType>()(lhs_value, rhs_value)) {
 88 |     char lhs_value_buf[kValueBufSize];
 89 |     ToString<LhsType>::Run(lhs_value, lhs_value_buf);
 90 |     char rhs_value_buf[kValueBufSize];
 91 |     ToString<RhsType>::Run(rhs_value, rhs_value_buf);
 92 |     fprintf(
 93 |         stderr,
 94 |         "%s:%d: %s condition not satisfied:   [ %s %s %s ]   with values   [ "
 95 |         "%s %s %s ].\n",
 96 |         file, line, macro, lhs, op_symbol, rhs, lhs_value_buf, op_symbol,
 97 |         rhs_value_buf);
 98 |     abort();
 99 |   }
100 | }
101 | 
102 | #define RUY_CHECK_IMPL(macro, condition)                              \
103 |   ruy::check_macros::CheckImpl(condition, __FILE__, __LINE__, #macro, \
104 |                                #condition)
105 | 
106 | #define RUY_CHECK_OP_IMPL(macro, lhs, op_symbol, op_comparison, rhs) \
107 |   ruy::check_macros::CheckImpl<op_comparison>(                       \
108 |       __FILE__, __LINE__, #macro, #lhs, lhs, #op_symbol, #rhs, rhs)
109 | 
110 | #define RUY_CHECK(condition) RUY_CHECK_IMPL(RUY_CHECK, condition)
111 | #define RUY_CHECK_EQ(x, y) \
112 |   RUY_CHECK_OP_IMPL(RUY_CHECK_EQ, x, ==, std::equal_to, y)
113 | #define RUY_CHECK_NE(x, y) \
114 |   RUY_CHECK_OP_IMPL(RUY_CHECK_NE, x, !=, std::not_equal_to, y)
115 | #define RUY_CHECK_GE(x, y) \
116 |   RUY_CHECK_OP_IMPL(RUY_CHECK_GE, x, >=, std::greater_equal, y)
117 | #define RUY_CHECK_GT(x, y) \
118 |   RUY_CHECK_OP_IMPL(RUY_CHECK_GT, x, >, std::greater, y)
119 | #define RUY_CHECK_LE(x, y) \
120 |   RUY_CHECK_OP_IMPL(RUY_CHECK_LE, x, <=, std::less_equal, y)
121 | #define RUY_CHECK_LT(x, y) RUY_CHECK_OP_IMPL(RUY_CHECK_LT, x, <, std::less, y)
122 | 
123 | #ifdef NDEBUG
124 | #define RUY_DCHECK_IS_ENABLED false
125 | #else
126 | #define RUY_DCHECK_IS_ENABLED true
127 | #endif
128 | 
129 | #define RUY_DCHECK(condition) \
130 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK(condition)
131 | #define RUY_DCHECK_EQ(x, y) \
132 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK_EQ(x, y)
133 | #define RUY_DCHECK_NE(x, y) \
134 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK_NE(x, y)
135 | #define RUY_DCHECK_GE(x, y) \
136 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK_GE(x, y)
137 | #define RUY_DCHECK_GT(x, y) \
138 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK_GT(x, y)
139 | #define RUY_DCHECK_LE(x, y) \
140 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK_LE(x, y)
141 | #define RUY_DCHECK_LT(x, y) \
142 |   if (RUY_DCHECK_IS_ENABLED) RUY_CHECK_LT(x, y)
143 | 
144 | }  // end namespace check_macros
145 | }  // end namespace ruy
146 | 
147 | #endif  // RUY_RUY_CHECK_MACROS_H_
148 | 
--------------------------------------------------------------------------------
/ruy/context.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/context.h"
17 | 
18 | #include "ruy/ctx.h"
19 | #include "ruy/ctx_impl.h"
20 | #include "ruy/strategy_controls.h"
21 | #include "ruy/path.h"
22 | #include "ruy/performance_advisory.h"
23 | #include "ruy/prepacked_cache.h"
24 | #include "ruy/thread_pool.h"
25 | #include "ruy/tune.h"
26 | 
27 | namespace ruy {
28 | 
29 | Context::Context() : impl_(new CtxImpl) {}
30 | Context::~Context() { delete impl_; }
31 | 
32 | const Ctx& Context::ctx() const { return static_cast<const Ctx&>(*impl_); }
33 | Ctx* Context::mutable_ctx() { return static_cast<Ctx*>(impl_); }
34 | 
35 | Path Context::last_used_path() const { return ctx().last_used_path(); }
36 | Tuning Context::explicit_tuning() const { return ctx().explicit_tuning(); }
37 | void Context::set_explicit_tuning(Tuning value) {
38 |   mutable_ctx()->set_explicit_tuning(value);
39 | }
40 | const ThreadPool& Context::thread_pool() const { return ctx().thread_pool(); }
41 | ThreadPool* Context::mutable_thread_pool() {
42 |   return mutable_ctx()->mutable_thread_pool();
43 | }
44 | int Context::max_num_threads() const { return ctx().max_num_threads(); }
45 | void Context::set_max_num_threads(int value) {
46 |   mutable_ctx()->set_max_num_threads(value);
47 | }
48 | NumThreadsStrategy Context::num_threads_strategy() const {
49 |   return ctx().num_threads_strategy();
50 | }
51 | void Context::set_num_threads_strategy(NumThreadsStrategy strategy) {
52 |   mutable_ctx()->set_num_threads_strategy(strategy);
53 | }
54 | 
55 | void Context::ClearPrepackedCache() { mutable_ctx()->ClearPrepackedCache(); }
56 | 
57 | bool Context::performance_advisory(PerformanceAdvisory advisory) const {
58 |   return ctx().performance_advisory(advisory);
59 | }
60 | 
61 | void Context::set_runtime_enabled_paths(Path paths) {
62 |   mutable_ctx()->SetRuntimeEnabledPaths(paths);
63 | }
64 | 
65 | Path Context::get_runtime_enabled_paths() {
66 |   // The `& kAllPaths` hides internal test-only paths.
67 |   return mutable_ctx()->GetRuntimeEnabledPaths() & ruy::kAllPaths;
68 | }
69 | 
70 | }  // namespace ruy
71 | 
--------------------------------------------------------------------------------
/ruy/context.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | // Context is the user-facing context class.
 17 | 
 18 | #ifndef RUY_RUY_CONTEXT_H_
 19 | #define RUY_RUY_CONTEXT_H_
 20 | 
 21 | #include <cstdint>
 22 | 
 23 | namespace ruy {
 24 | 
 25 | class Ctx;
 26 | class CtxImpl;
 27 | class ThreadPool;
 28 | enum class Path : std::uint8_t;
 29 | enum class Tuning;
 30 | enum class PerformanceAdvisory;
 31 | enum class NumThreadsStrategy : std::uint8_t;
 32 | 
 33 | // A Context holds runtime information used by Ruy. It holds runtime resources
 34 | // such as the workers thread pool and the allocator (which holds buffers for
 35 | // temporary data), as well as runtime options controlling which Paths are
 36 | // enabled (typically based on which instruction sets are detected) and how
 37 | // many threads to use.
 38 | class Context final {
 39 |  public:
 40 |   Context();
 41 |   ~Context();
 42 | 
 43 |   // Returns the Path enum value that corresponds to the code path used by
 44 |   // the last ruy::Mul with this Context.
 45 |   Path last_used_path() const;
 46 | 
 47 |   // Control of whether to use kernels tuned for in-order or out-of-order CPU
 48 |   // cores. The default is auto-detection, so these methods should only be used
 49 |   // to override that auto-detection if it's not working as intended or for
 50 |   // testing.
 51 |   Tuning explicit_tuning() const;
 52 |   void set_explicit_tuning(Tuning value);
 53 | 
 54 |   // The thread pool held by this context to dispatch a ruy::Mul to worker
 55 |   // threads.
 56 |   //
 57 |   // By default, threads may spin-wait for a few milliseconds before reverting
 58 |   // to passive wait. This can be controlled by
 59 |   // `mutable_thread_pool()->set_spin_milliseconds(value)`.
 60 |   const ThreadPool& thread_pool() const;
 61 |   ThreadPool* mutable_thread_pool();
 62 | 
 63 |   // Controls the maximum number of threads to be used by ruy::Mul with this
 64 |   // Context. The number of threads in the pool will be that value minus one,
 65 |   // as the remaining portion of the work is done directly on the calling
 66 |   // thread.
 67 |   //
 68 |   // This defaults to 1. Multi-threading in ruy is always opt-in. There is
 69 |   // no auto-detection of hardware concurrency. That is on purpose, ruy focuses
 70 |   // on mobile applications where such concepts are difficult to define
 71 |   // (e.g. ARM big.LITTLE).
 72 |   int max_num_threads() const;
 73 |   void set_max_num_threads(int value);
 74 | 
 75 |   // Controls the logic to determine how many threads to use.
 76 |   NumThreadsStrategy num_threads_strategy() const;
 77 |   void set_num_threads_strategy(NumThreadsStrategy strategy);
 78 | 
 79 |   // Returns true if the last ruy::Mul using this Context flagged the specified
 80 |   // `advisory`. This is reset by each ruy::Mul call.
 81 |   bool performance_advisory(PerformanceAdvisory advisory) const;
 82 | 
 83 |   // When using Matrix::set_cache_policy(), this Context will keep a cache of
 84 |   // pre-packed matrix data. This function clears that cache.
 85 |   void ClearPrepackedCache();
 86 | 
 87 |   // Override auto-detection of supported code paths.
 88 |   //
 89 |   // Passing `paths == Path::kNone` means reverting to the default behavior.
 90 |   // This will trigger auto-detection on the next use.
 91 |   //
 92 |   // Other values will override auto-detection with the explicitly provided set
 93 |   // of paths.
 94 |   //
 95 |   // Paths in kNonArchPaths are always implicitly supported.
 96 |   void set_runtime_enabled_paths(Path paths);
 97 | 
 98 |   // Returns the set of Path's that are available.
 99 |   Path get_runtime_enabled_paths();
100 | 
101 |  private:
102 |   CtxImpl* const impl_;
103 | 
104 |   const Ctx& ctx() const;
105 |   Ctx* mutable_ctx();
106 | 
107 |   friend const Ctx* get_ctx(const Context*);
108 |   friend Ctx* get_ctx(Context*);
109 | 
110 |   // Disallow copy
111 |   Context(const Context&) = delete;
112 | };
113 | 
114 | }  // end namespace ruy
115 | 
116 | #endif  // RUY_RUY_CONTEXT_H_
117 | 
--------------------------------------------------------------------------------
/ruy/context_get_ctx.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/context_get_ctx.h"
17 | 
18 | #include "ruy/ctx_impl.h"
19 | 
20 | namespace ruy {
21 | 
22 | const Ctx* get_ctx(const Context* context) {
23 |   return static_cast<const Ctx*>(context->impl_);
24 | }
25 | Ctx* get_ctx(Context* context) { return static_cast<Ctx*>(context->impl_); }
26 | 
27 | }  // namespace ruy
28 | 
--------------------------------------------------------------------------------
/ruy/context_get_ctx.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Gateway to access the Ctx (internal context interface for ruy code) from
17 | // a Context (public-facing class). Befriended by Context.
18 | 
19 | #ifndef THIRD_PARTY_RUY_RUY_CONTEXT_GET_CTX_H_
20 | #define THIRD_PARTY_RUY_RUY_CONTEXT_GET_CTX_H_
21 | 
22 | #include "ruy/context.h"
23 | #include "ruy/ctx.h"
24 | 
25 | namespace ruy {
26 | 
27 | const Ctx* get_ctx(const Context* context);
28 | Ctx* get_ctx(Context*);
29 | 
30 | }  // namespace ruy
31 | 
32 | #endif  // THIRD_PARTY_RUY_RUY_CONTEXT_GET_CTX_H_
33 | 
--------------------------------------------------------------------------------
/ruy/context_test.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/context.h"
17 | 
18 | #include "ruy/gtest_wrapper.h"
19 | #include "ruy/strategy_controls.h"
20 | #include "ruy/path.h"
21 | #include "ruy/prepacked_cache.h"
22 | #include "ruy/tune.h"
23 | 
24 | namespace ruy {
25 | namespace {
26 | 
27 | TEST(ContextTest, ContextClassSanity) {
28 |   Context context;
29 |   EXPECT_EQ(context.last_used_path(), Path::kNone);
30 |   EXPECT_EQ(context.explicit_tuning(), Tuning::kAuto);
31 |   EXPECT_EQ(&context.thread_pool(), context.mutable_thread_pool());
32 |   EXPECT_NE(context.mutable_thread_pool(), nullptr);
33 |   EXPECT_EQ(context.max_num_threads(), 1);
34 |   EXPECT_EQ(context.num_threads_strategy(), NumThreadsStrategy::kDefault);
35 |   context.set_explicit_tuning(Tuning::kGeneric);
36 |   context.set_max_num_threads(2);
37 |   context.set_num_threads_strategy(NumThreadsStrategy::kForceMaxNumThreads);
38 |   EXPECT_EQ(context.explicit_tuning(), Tuning::kGeneric);
39 |   EXPECT_EQ(context.max_num_threads(), 2);
40 |   EXPECT_EQ(context.num_threads_strategy(),
41 |             NumThreadsStrategy::kForceMaxNumThreads);
42 | }
43 | 
44 | }  // namespace
45 | }  // namespace ruy
46 | 
47 | int main(int argc, char** argv) {
48 |   ::testing::InitGoogleTest(&argc, argv);
49 |   return RUN_ALL_TESTS();
50 | }
51 | 
--------------------------------------------------------------------------------
/ruy/cpu_cache_params.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_CPU_CACHE_PARAMS_H_
17 | #define RUY_RUY_CPU_CACHE_PARAMS_H_
18 | 
19 | namespace ruy {
20 | 
21 | // Holds some information about a CPU's data caches.
22 | //
23 | // Meaning of 'local': a 'local' cache means a cache that is used by only one
24 | // CPU core, not shared with other cores. It might still be used by multiple
25 | // 'processors' in case of SMT as in Intel HyperThreading. CPUs often have
26 | // multiple levels of local cache, e.g. L1 and L2. We typically return the
27 | // larger one, the assumption being that even the larger one has substantially
28 | // lower latency than any higher (non-local) cache, however as noted below (*)
29 | // the implementation may choose to ignore a cache level.
30 | //
31 | // Meaning of 'last level': this refers to some higher cache level, typically
32 | // shared among multiple CPU cores, so we considered using the terminology
33 | // 'shared' instead of 'last_level'. However that created some confusion of its
34 | // own, as the meaning of 'shared' varies between CPUs, with some CPUs not
35 | // having any level of cache shared among all cores. That is why we stick with
36 | // the 'last_level' terminology, however with the following caveats:
37 | //   1. As noted below (*) the implementation may choose to ignore a cache
38 | // level, which could cause the 'last level' cache according to ruy not to be
39 | // the actual last level.
40 | //   2. On some systems-on-chip there is a 'last level' cache outside of the
41 | // last level cache in the CPU complex. Ruy is not currently doing anything
42 | // specific regarding such caches.
43 | //   3. We haven't figured out how to amend our terminology to be meaningful
44 | // on NUMA architectures. NUMA hasn't been part of ruy's scope so far.
45 | //
46 | // (*) Note on ignoring certain cache levels:
47 | // The implementation may choose to ignore a cache if it's suspected not to
48 | // have compelling performance. This is true about all cache levels, but more
49 | // likely regarding the 'last level' cache. For example, a L4 cache may be
50 | // ignored if we believe that it's not the right latency/size compromise for us,
51 | // so on such a CPU, the L3 cache may be used as the 'last level' cache instead.
52 | //
53 | // (**) Note on CPUs with heterogeneous cores:
54 | // Some CPUs have multiple cores with different local caches. For example, some
55 | // ARM big.LITTLE CPUs have some CPU cores with L1=32k and L2=128k, and some
56 | // other CPU cores with L1=64k and L2=256k or even 512k. On such CPUs, the
57 | // fields in this struct refer to the minimum value over all cores. In other
58 | // words, we use conservative values that do not risk over-estimating local
59 | // cache sizes in case of a migration of our threads to smaller cores.
60 | //
61 | // Example:
62 | // On a Qualcomm S855 SoC, there are 8 CPU cores. Each core has L1 and L2 data
63 | // caches local to it:
64 | // - 4 cores have L1=32k, L2=128k.
65 | // - 3 cores have L1=64k, L2=256k.
66 | // - 1 core has   L1=64k, L2=512k.
67 | // All 8 cores share a L3 cache of size 2M, and there is beyond that a SoC-level
68 | // cache of size 3M.
69 | // On such a system, we should have:
70 | // - local_cache_size=128k, the smallest L2 size.
71 | // - last_level_cache_size=2M, the L3 cache size, ignoring the SoC-level cache.
72 | struct CpuCacheParams final {
73 |   // Minimum value (see (**)), over all cores, of the size in bytes of its local
74 |   // cache (see "Meaning of 'local'").
75 |   int local_cache_size = 0;
76 |   // Minimum value (see (**)), over all cores, of the size in bytes of its last
77 |   // level cache (see "Meaning of 'last level'").
78 |   int last_level_cache_size = 0;
79 | };
80 | 
81 | }  // namespace ruy
82 | 
83 | #endif  // RUY_RUY_CPU_CACHE_PARAMS_H_
84 | 
--------------------------------------------------------------------------------
/ruy/cpuinfo.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_CPUINFO_H_
17 | #define RUY_RUY_CPUINFO_H_
18 | 
19 | #include "ruy/cpu_cache_params.h"
20 | 
21 | namespace ruy {
22 | 
23 | // Wraps the functionality that ruy needs from the cpuinfo library.
24 | class CpuInfo final {
25 |  public:
26 |   CpuInfo() {}
27 |   ~CpuInfo();
28 | 
29 |   // ARM features
30 |   bool NeonDotprod();
31 | 
32 |   // X86 features
33 |   bool Sse42();
34 |   bool Avx();
35 |   bool Avx2Fma();
36 |   bool Avx512();
37 |   bool AvxVnni();
38 | 
39 |   // Common features
40 |   const CpuCacheParams& CacheParams();
41 |   bool CurrentCpuIsA55ish();
42 |   bool CurrentCpuIsX1();
43 | 
44 |  private:
45 |   enum class InitStatus {
46 |     kNotYetAttempted,
47 |     kInitialized,
48 |     kFailed,
49 |   };
50 | 
51 |   InitStatus init_status_ = InitStatus::kNotYetAttempted;
52 |   CpuCacheParams cache_params_;
53 | 
54 |   bool EnsureInitialized();
55 |   InitStatus Initialize();
56 | 
57 |   CpuInfo(const CpuInfo&) = delete;
58 | };
59 | 
60 | }  // namespace ruy
61 | 
62 | #endif  // RUY_RUY_CPUINFO_H_
63 | 
--------------------------------------------------------------------------------
/ruy/ctx.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Ctx is the internal context interface class used by most of ruy's own code.
17 | // It is subclassed by CtxImpl which provides the actual data members.
18 | 
19 | #ifndef RUY_RUY_CTX_H_
20 | #define RUY_RUY_CTX_H_
21 | 
22 | #include <cstdint>
23 | 
24 | namespace ruy {
25 | 
26 | class CtxImpl;
27 | class ThreadPool;
28 | class Allocator;
29 | class TuningResolver;
30 | class PrepackedCache;
31 | class CpuInfo;
32 | enum class Path : std::uint8_t;
33 | enum class Tuning;
34 | enum class PerformanceAdvisory;
35 | enum class NumThreadsStrategy : std::uint8_t;
36 | 
37 | // Ctx is the internal context class used throughout ruy code. Whereas Context
38 | // is exposed to users, Ctx is internal to ruy. As many of ruy's internal
39 | // headers, included by ruy public headers, need to use Ctx, it is important
40 | // that it does not include definition of all the actual data members. This is
41 | // solved by a variant of the 'pimpl' idiom, where instead of being implemented
42 | // in the usual way with a pointer member, it is implemented in a subclass,
43 | // CtxImpl.
44 | class Ctx /* not final, subclassed by CtxImpl */ {
45 |  public:
46 |   // Path selection and tuning.
47 |   Path last_used_path() const;
48 |   Path SelectPath(Path compiled_paths);
49 |   Tuning explicit_tuning() const;
50 |   void set_explicit_tuning(Tuning value);
51 |   Tuning GetMainThreadTuning();
52 | 
53 |   // Thread pool and threading policy.
54 |   const ThreadPool& thread_pool() const;
55 |   ThreadPool* mutable_thread_pool();
56 |   int max_num_threads() const;
57 |   void set_max_num_threads(int value);
58 |   NumThreadsStrategy num_threads_strategy() const;
59 |   void set_num_threads_strategy(NumThreadsStrategy strategy);
60 | 
61 |   // Performance advisories and CPU information.
62 |   void clear_performance_advisories();
63 |   void set_performance_advisory(PerformanceAdvisory advisory);
64 |   bool performance_advisory(PerformanceAdvisory advisory) const;
65 |   CpuInfo* mutable_cpuinfo();
66 | 
67 |   // The set of Paths currently available. Unless overridden through
68 |   // SetRuntimeEnabledPaths, it is based on runtime detection of CPU features
69 |   // and on which code paths were built. Detection results are stored on the
70 |   // context object so that subsequent calls are fast.
71 |   Path GetRuntimeEnabledPaths();
72 | 
73 |   // Overrides auto-detection with the explicitly provided set of `paths`.
74 |   // Path::kNone reverts to the default: auto-detection runs on the next use.
75 |   // Paths in kNonArchPaths are always implicitly supported.
76 |   void SetRuntimeEnabledPaths(Path paths);
77 | 
78 |   // Per-thread and main-thread resources.
79 |   void EnsureThreadSpecificResources(int thread_count);
80 |   TuningResolver* GetThreadSpecificTuningResolver(int thread_index) const;
81 |   Allocator* GetThreadSpecificAllocator(int thread_index) const;
82 |   Allocator* GetMainAllocator();
83 |   PrepackedCache* GetPrepackedCache();
84 |   void ClearPrepackedCache();
85 | 
86 |  private:
87 |   // Downcasts of this object to the CtxImpl subclass.
88 |   const CtxImpl& impl() const;
89 |   CtxImpl* mutable_impl();
90 | };
91 | 
92 | }  // namespace ruy
93 | 
94 | #endif  // RUY_RUY_CTX_H_
95 | 
--------------------------------------------------------------------------------
/ruy/ctx_impl.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Internal implementation details for Ctx. Drags in the entire world. Avoid
17 | // #including this, use "ctx.h" instead.
18 | 
19 | #ifndef RUY_RUY_CTX_IMPL_H_
20 | #define RUY_RUY_CTX_IMPL_H_
21 | 
22 | #include 
23 | #include 
24 | #include 
25 | 
26 | #include "ruy/allocator.h"
27 | #include "ruy/cpuinfo.h"
28 | #include "ruy/ctx.h"
29 | #include "ruy/path.h"
30 | #include "ruy/performance_advisory.h"
31 | #include "ruy/prepacked_cache.h"
32 | #include "ruy/strategy_controls.h"
33 | #include "ruy/thread_pool.h"
34 | #include "ruy/tune.h"
35 | 
36 | namespace ruy {
37 | 
38 | // The resources private to each Ruy thread.
39 | struct ThreadSpecificResource final {
40 |   // Tuning is resolved per thread because threads may run on different
41 |   // microarchitectures (for example, some on big cores and others on little
42 |   // cores), so it is best not to share one tuning decision across threads.
43 |   TuningResolver tuning_resolver;
44 |   // Allocator local to this thread; every thread gets its own.
45 |   Allocator allocator;
46 | };
47 | 
48 | // CtxImpl is what actually holds all the data members in a context.
49 | // It is a subclass of Ctx, which provides the interface that is what most
50 | // of ruy's code needs.
51 | //
52 | // A key requirement is that since many ruy files, including public headers,
53 | // need a definition of Ctx, the "ctx.h" header defining it must minimize how
54 | // many other ruy internal headers it includes. That is achieved by putting data
55 | // members in the CtxImpl subclass, and ensuring that only a few .cc files, not
56 | // header files, need a definition of CtxImpl.
57 | class CtxImpl final : public Ctx {
58 |  private:
59 |   friend class Ctx;  // Ctx's methods reach these members via downcast.
60 | 
61 |   // Single Path bit indicating which Path was used last.
62 |   Path last_used_path_ = Path::kNone;
63 |   PerformanceAdvisory performance_advisory_ = PerformanceAdvisory::kNone;
64 |   Tuning explicit_tuning_ = Tuning::kAuto;
65 |   ThreadPool thread_pool_;
66 |   int max_num_threads_ = 1;
67 |   NumThreadsStrategy num_threads_strategy_ = NumThreadsStrategy::kDefault;
68 |   // Allocator for main thread work before invoking the threadpool.
69 |   // Our simple Allocator does not allow reserving/allocating more blocks
70 |   // while it's already in committed state, so the main thread needs both
71 |   // this allocator, and its per-thread allocator.
72 |   std::unique_ptr<Allocator> main_allocator_;
73 |   std::unique_ptr<PrepackedCache> prepacked_cache_;
74 |   // Set of Paths enabled at runtime. By default, that is based on runtime
75 |   // detection, but may be overridden. The initial value kNone
76 |   // means that detection has not yet been performed.
77 |   Path runtime_enabled_paths_ = Path::kNone;
78 |   CpuInfo cpuinfo_;
79 |   // State for each thread in the thread pool. Entry 0 is the main thread.
80 |   std::vector<std::unique_ptr<ThreadSpecificResource>>
81 |       thread_specific_resources_;
82 | };
83 | 
84 | }  // namespace ruy
85 | 
86 | #endif  // RUY_RUY_CTX_IMPL_H_
87 | 
--------------------------------------------------------------------------------
/ruy/ctx_test.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/ctx_impl.h"
17 | #include "ruy/gtest_wrapper.h"
18 | #include "ruy/strategy_controls.h"
19 | #include "ruy/path.h"
20 | #include "ruy/platform.h"
21 | 
22 | namespace ruy {
23 | namespace {
24 | 
25 | TEST(ContextInternalTest, EnabledPathsGeneral) {
26 |   CtxImpl ctx;  // Fresh context, no paths set explicitly.
27 |   const Path first_query = ctx.GetRuntimeEnabledPaths();
28 |   const Path second_query = ctx.GetRuntimeEnabledPaths();
29 |   ASSERT_EQ(first_query, second_query);  // Repeated queries must agree.
30 |   EXPECT_NE(first_query, Path::kNone);
31 |   EXPECT_EQ(first_query & Path::kStandardCpp, Path::kStandardCpp);
32 | }
33 | 
34 | #if RUY_PLATFORM_X86
35 | TEST(ContextInternalTest, EnabledPathsX86Explicit) {
36 |   CtxImpl ctx;
37 |   ctx.SetRuntimeEnabledPaths(Path::kAvx2Fma);
38 |   // kStandardCpp is reported in addition to the explicitly requested path.
39 |   EXPECT_EQ(ctx.GetRuntimeEnabledPaths(), Path::kStandardCpp | Path::kAvx2Fma);
40 | }
41 | #endif  // RUY_PLATFORM_X86
42 | 
43 | #if RUY_PLATFORM_ARM
44 | TEST(ContextInternalTest, EnabledPathsArmExplicit) {
45 |   CtxImpl ctx;
46 |   ctx.SetRuntimeEnabledPaths(Path::kNeonDotprod);  // Explicit ARM override.
47 |   const auto ruy_paths = ctx.GetRuntimeEnabledPaths();
48 |   EXPECT_EQ(ruy_paths, Path::kStandardCpp | Path::kNeonDotprod);
49 | }
50 | 
51 | TEST(ContextInternalTest, EnabledPathsArmDefault) {
52 |   CtxImpl ctx;
53 |   const Path detected = ctx.GetRuntimeEnabledPaths();
54 |   EXPECT_EQ(detected & Path::kStandardCpp, Path::kStandardCpp);
55 |   // At the moment NEON support is assumed unconditionally.
56 |   EXPECT_EQ(detected & Path::kNeon, Path::kNeon);
57 | }
58 | #endif  // RUY_PLATFORM_ARM
59 | 
60 | TEST(ContextInternalTest, ThreadSpecificResources) {
61 |   CtxImpl ctx;
62 |   for (int thread_count = 1; thread_count <= 4; ++thread_count) {
63 |     ctx.EnsureThreadSpecificResources(thread_count);
64 |     for (int index = 0; index < thread_count; ++index) {
65 |       EXPECT_NE(ctx.GetThreadSpecificAllocator(index), nullptr);
66 |       EXPECT_NE(ctx.GetThreadSpecificTuningResolver(index), nullptr);
67 |     }
68 |   }
69 | }
70 | 
71 | TEST(ContextInternalTest, SetNumThreadsStrategy) {
72 |   CtxImpl ctx;
73 |   constexpr auto kForced = NumThreadsStrategy::kForceMaxNumThreads;
74 |   EXPECT_EQ(ctx.num_threads_strategy(), NumThreadsStrategy::kDefault);
75 |   ctx.set_num_threads_strategy(kForced);
76 |   EXPECT_EQ(ctx.num_threads_strategy(), kForced);
77 | }
78 | 
79 | }  // namespace
80 | }  // namespace ruy
81 | 
82 | int main(int argc, char** argv) {
83 |   ::testing::InitGoogleTest(&argc, argv);  // Hand the command line to gtest.
84 |   return RUN_ALL_TESTS();  // The test run's result is the exit code.
85 | }
86 | 
--------------------------------------------------------------------------------
/ruy/denormal.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/denormal.h"
 17 | 
 18 | // NOTE: this is simply a copy of pthreadpool/src/threadpool-utils.h that's not
 19 | // exposed by the pthreadpool library
 20 | // (https://github.com/Maratyszcza/pthreadpool), but with an additional C++
 21 | // helper class to suppress floating-point denormal values.
 22 | 
 23 | /* SSE-specific headers */
 24 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
 25 |     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
 26 | #include 
 27 | #endif
 28 | 
 29 | /* MSVC-specific headers */
 30 | #if defined(_MSC_VER)
 31 | #include 
 32 | #endif
 33 | 
 34 | namespace ruy {
 35 | namespace {
 36 | inline struct fpu_state get_fpu_state() {  // Reads the FP control register.
 37 |   struct fpu_state state = {};  // Value-initialized, then filled in below.
 38 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
 39 |     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
 40 |   state.mxcsr = static_cast<std::uint32_t>(_mm_getcsr());
 41 | #elif defined(_MSC_VER) && defined(_M_ARM)
 42 |   state.fpscr =
 43 |       static_cast<std::uint32_t>(_MoveFromCoprocessor(10, 7, 1, 0, 0));
 44 | #elif defined(_MSC_VER) && defined(_M_ARM64)
 45 |   state.fpcr = static_cast<std::uint64_t>(_ReadStatusReg(0x5A20));
 46 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \
 47 |     (__ARM_FP != 0)
 48 |   __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r"(state.fpscr));
 49 | #elif defined(__GNUC__) && defined(__aarch64__)
 50 |   __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r"(state.fpcr));
 51 | #endif
 52 |   return state;
 53 | }
 54 | 
 55 | inline void set_fpu_state(const struct fpu_state state) {  // Writes it back.
 56 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
 57 |     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
 58 |   _mm_setcsr(static_cast<unsigned int>(state.mxcsr));
 59 | #elif defined(_MSC_VER) && defined(_M_ARM)
 60 |   _MoveToCoprocessor(static_cast<int>(state.fpscr), 10, 7, 1, 0, 0);
 61 | #elif defined(_MSC_VER) && defined(_M_ARM64)
 62 |   _WriteStatusReg(0x5A20, static_cast<__int64>(state.fpcr));
 63 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \
 64 |     (__ARM_FP != 0)
 65 |   __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r"(state.fpscr));
 66 | #elif defined(__GNUC__) && defined(__aarch64__)
 67 |   __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r"(state.fpcr));
 68 | #else
 69 |   (void)state;  // No supported FP control register: nothing to restore.
 70 | #endif
 71 | }
 72 | 
 73 | inline void disable_fpu_denormals() {  // Turns on flush-to-zero modes.
 74 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
 75 |     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
 76 |   _mm_setcsr(_mm_getcsr() | 0x8040);  // MXCSR: FTZ (bit 15) | DAZ (bit 6).
 77 | #elif defined(_MSC_VER) && defined(_M_ARM)
 78 |   int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
 79 |   fpscr |= 0x1000000;  // FPSCR: FZ (bit 24).
 80 |   _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
 81 | #elif defined(_MSC_VER) && defined(_M_ARM64)
 82 |   __int64 fpcr = _ReadStatusReg(0x5A20);
 83 |   fpcr |= 0x1080000;  // FPCR: FZ (bit 24) | FZ16 (bit 19).
 84 |   _WriteStatusReg(0x5A20, fpcr);
 85 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \
 86 |     (__ARM_FP != 0)
 87 |   std::uint32_t fpscr;
 88 | #if defined(__thumb__) && !defined(__thumb2__)  // Thumb without Thumb-2.
 89 |   __asm__ __volatile__(
 90 |       "VMRS %[fpscr], fpscr\n"
 91 |       "ORRS %[fpscr], %[bitmask]\n"  // Mask comes in through a register.
 92 |       "VMSR fpscr, %[fpscr]\n"
 93 |       : [fpscr] "=l"(fpscr)
 94 |       : [bitmask] "l"(0x1000000)  // FPSCR: FZ (bit 24).
 95 |       : "cc");
 96 | #else
 97 |   __asm__ __volatile__(
 98 |       "VMRS %[fpscr], fpscr\n"
 99 |       "ORR %[fpscr], #0x1000000\n"  // FPSCR: FZ (bit 24).
100 |       "VMSR fpscr, %[fpscr]\n"
101 |       : [fpscr] "=r"(fpscr));
102 | #endif
103 | #elif defined(__GNUC__) && defined(__aarch64__)
104 |   std::uint64_t fpcr;
105 |   __asm__ __volatile__(
106 |       "MRS %[fpcr], fpcr\n"
107 |       "ORR %w[fpcr], %w[fpcr], 0x1000000\n"  // FPCR: FZ (bit 24).
108 |       "ORR %w[fpcr], %w[fpcr], 0x80000\n"  // FPCR: FZ16 (bit 19).
109 |       "MSR fpcr, %[fpcr]\n"
110 |       : [fpcr] "=r"(fpcr));
111 | #endif  // Any other target: no branch matched, nothing is changed.
112 | }
113 | }  // namespace
114 | 
115 | // Captures the current FPU state, then turns denormal suppression on.
116 | ScopedSuppressDenormals::ScopedSuppressDenormals() : restore_(get_fpu_state()) {
117 |   disable_fpu_denormals();
118 | }
119 | // Reinstates the FPU state captured at construction.
120 | ScopedSuppressDenormals::~ScopedSuppressDenormals() { set_fpu_state(restore_); }
121 | }  // namespace ruy
122 | 
--------------------------------------------------------------------------------
/ruy/denormal.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2021 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | #ifndef RUY_RUY_DENORMAL_H_
16 | #define RUY_RUY_DENORMAL_H_
17 | 
18 | #include 
19 | 
20 | namespace ruy {
21 | // NOTE: the following 'fpu_state' struct is copied from
22 | // pthreadpool/src/threadpool-utils.h that's not exposed by the pthreadpool
23 | // library (https://github.com/Maratyszcza/pthreadpool).
24 | struct fpu_state {
25 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
26 |     (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
27 |   std::uint32_t mxcsr;  // x86 SSE control/status register (MXCSR) value.
28 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \
29 |         (__ARM_FP != 0) ||                                          \
30 |     defined(_MSC_VER) && defined(_M_ARM)
31 |   std::uint32_t fpscr;  // 32-bit ARM FPSCR value.
32 | #elif defined(__GNUC__) && defined(__aarch64__) || \
33 |     defined(_MSC_VER) && defined(_M_ARM64)
34 |   std::uint64_t fpcr;  // 64-bit ARM FPCR value.
35 | #endif  // On any other target the struct has no members.
36 | };
37 | 
38 | // While this class is active, denormal floating point numbers are suppressed.
39 | // The destructor restores the original flags.
40 | class ScopedSuppressDenormals {
41 |  public:
42 |   ScopedSuppressDenormals();
43 |   ~ScopedSuppressDenormals();
44 | 
45 |  private:
46 |   // Copying is disabled: each instance restores the flags exactly once.
47 |   ScopedSuppressDenormals(const ScopedSuppressDenormals&) = delete;
48 |   void operator=(const ScopedSuppressDenormals&) = delete;
49 |   fpu_state restore_;  // Flags to put back in the destructor.
50 | };
51 | }  // namespace ruy
52 | 
53 | #endif  // RUY_RUY_DENORMAL_H_
54 | 
--------------------------------------------------------------------------------
/ruy/frontend.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/frontend.h"
17 | 
18 | #include "ruy/allocator.h"
19 | #include "ruy/prepare_packed_matrices.h"
20 | #include "ruy/trmul.h"
21 | #include "ruy/trmul_params.h"
22 | 
23 | namespace ruy {
24 | 
25 | void MulFrontEndFromTrMulParams(Ctx* ctx, TrMulParams* params) {
26 |   RUY_TRACE_SCOPE;
27 |   // Handle Matrix::cache_policy, possibly retrieving existing packed matrices
28 |   // or packing and caching now.
29 |   PreparePackedMatrices(ctx, params);
30 | 
31 |   // We're done with the front-end work, now enter the middle-end.
32 |   TrMul(ctx, params);
33 |   // TrMul has returned; release the main allocator's allocations (FreeAll).
34 |   ctx->GetMainAllocator()->FreeAll();
35 | }
36 | 
37 | }  // namespace ruy
38 | 
--------------------------------------------------------------------------------
/ruy/frontend.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | // Implementation of MulFrontEnd, the front-end part of ruy.
 17 | // This is what the ruy::Mul entry point calls, and this ends in a call to
 18 | // TrMul, at which point we enter the middle-end.
 19 | // The front-end work includes parameter validation (Validate), detemplatization
 20 | // and resolution of the specific code path to take (CreateTrMulParams), and
 21 | // any additional logic best done upfront before entering the middle-end
 22 | // (e.g. PreparePackedMatrices).
 23 | // The call to CreateTrMulParams is an important watershed in this code's
 24 | // structure: code before it needs to be templatized like the ruy::Mul entry
 25 | // point, code after it is un-templatized.
 26 | 
 27 | #ifndef RUY_RUY_FRONTEND_H_
 28 | #define RUY_RUY_FRONTEND_H_
 29 | 
 30 | #include "ruy/create_trmul_params.h"
 31 | #include "ruy/ctx.h"
 32 | #include "ruy/profiler/instrumentation.h"
 33 | #include "ruy/trace.h"
 34 | #include "ruy/trmul_params.h"
 35 | #include "ruy/validate.h"
 36 | 
 37 | namespace ruy {
 38 | 
 39 | // The first half of front-end work, up to the point where we have TrMulParams.
 40 | // In other words, this is the part of the front-end work that needs to be
 41 | // templatized like the entry point, and that performs the initial work that
 42 | // requires this templatization, and the de-templatization. The output of this
 43 | // function is the TrMulParams, which contain enough information to allow the
 44 | // un-templatized code to take over from there.
 45 | template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
 46 |           typename AccumScalar, typename DstScalar>
 47 | void MulFrontEndUpToCreateTrMulParams(
 48 |     const Mat<LhsScalar>& lhs, const Mat<RhsScalar>& rhs,
 49 |     const Mat<DstScalar>& dst,
 50 |     const MulParams<AccumScalar, DstScalar>& mul_params, Ctx* ctx,
 51 |     TrMulParams* params) {
 52 |   RUY_TRACE_SCOPE;
 53 |   static_assert(CompiledPaths != Path::kNone, "Must compile at least one Path");
 54 |   static_assert(
 55 |       (CompiledPaths & ~kAllPathsIncludingInternalVariants) == Path::kNone,
 56 |       "CompiledPaths must be a subset of "
 57 |       "ruy::kAllPathsIncludingInternalVariants");
 58 | 
 59 |   // Perform validation of parameters early so that failures are easier to map
 60 |   // to user errors. In particular, perform this validation before the
 61 |   // transposition.
 62 |   Validate(lhs, rhs, dst);
 63 | 
 64 |   // De-templatize this Mul call by creating a TrMulParams structure.
 65 |   // This is also where the specific kernel and pack code paths corresponding to
 66 |   // `the_path` are selected, among all the code paths in `CompiledPaths`, and
 67 |   // recorded as function pointers in the TrMulParams.
 68 |   // The Transpose(lhs) here is where we switch from 'Mul' to 'TrMul'.
 69 |   CreateTrMulParams<CompiledPaths>(Transpose(lhs), rhs, dst, mul_params, ctx,
 70 |                                    params);
 71 | }
 72 | 
 73 | // The second part of the front-end work, starting from where we have freshly
 74 | // created TrMulParams, performing any remaining front-end work and entering the
 75 | // middle-end.
 76 | void MulFrontEndFromTrMulParams(Ctx* ctx, TrMulParams* params);
 77 | 
 78 | // Top-level function orchestrating the two halves of front-end work:
 79 | // before and after we have detemplatized the call by creating TrMulParams.
 80 | template <Path CompiledPaths, typename LhsScalar, typename RhsScalar,
 81 |           typename AccumScalar, typename DstScalar>
 82 | void MulFrontEnd(const Mat<LhsScalar>& lhs, const Mat<RhsScalar>& rhs,
 83 |                  const MulParams<AccumScalar, DstScalar>& mul_params, Ctx* ctx,
 84 |                  Mat<DstScalar>* dst) {
 85 |   RUY_TRACE_SCOPE;
 86 |   profiler::ScopeLabel mul_label("Mul");
 87 |   profiler::ScopeLabel shape_specific_label("matmul shape: %dx%dx%d",
 88 |                                             lhs.layout.rows, lhs.layout.cols,
 89 |                                             rhs.layout.cols);
 90 |   ctx->clear_performance_advisories();  // Start this Mul with none set.
 91 |   TrMulParams params;  // Filled by the first half, consumed by the second.
 92 |   MulFrontEndUpToCreateTrMulParams<CompiledPaths>(lhs, rhs, *dst, mul_params,
 93 |                                                   ctx, &params);
 94 |   MulFrontEndFromTrMulParams(ctx, &params);
 95 | }
 96 | 
 97 | }  // namespace ruy
 98 | 
 99 | #endif  // RUY_RUY_FRONTEND_H_
100 | 
--------------------------------------------------------------------------------
/ruy/gtest_wrapper.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | // Wrapper around GTest that works around warnings and inconsistencies.
17 | 
18 | #ifndef THIRD_PARTY_RUY_RUY_GTEST_WRAPPER_H_
19 | #define THIRD_PARTY_RUY_RUY_GTEST_WRAPPER_H_
20 | 
21 | #pragma GCC diagnostic push
22 | #pragma GCC diagnostic ignored "-Wunused-parameter"
23 | #include "gtest/gtest.h"  // IWYU pragma: export
24 | #pragma GCC diagnostic pop
25 | 
26 | // ASSERT_DEATH is not defined in WAsm builds; there this expands to nothing.
27 | #ifdef ASSERT_DEATH
28 | #define RUY_ASSERT_DEATH(CONDITION, MESSAGE) ASSERT_DEATH(CONDITION, MESSAGE)
29 | #else
30 | #define RUY_ASSERT_DEATH(CONDITION, MESSAGE)
31 | #endif
32 | 
33 | #endif  // THIRD_PARTY_RUY_RUY_GTEST_WRAPPER_H_
34 | 
--------------------------------------------------------------------------------
/ruy/have_built_path_for.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_HAVE_BUILT_PATH_FOR_H_
17 | #define RUY_RUY_HAVE_BUILT_PATH_FOR_H_
18 | 
19 | #include "ruy/platform.h"
20 | 
21 | namespace ruy {
22 | 
23 | #if RUY_PLATFORM_X86
24 | bool HaveBuiltPathForAvx();      // True when the AVX path was compiled in.
25 | bool HaveBuiltPathForAvx2Fma();  // True when the AVX2+FMA path was compiled in.
26 | bool HaveBuiltPathForAvx512();   // True when the AVX-512 path was compiled in.
27 | #endif  // RUY_PLATFORM_X86
28 | 
29 | }  // namespace ruy
30 | 
31 | #endif  // RUY_RUY_HAVE_BUILT_PATH_FOR_H_
32 | 
--------------------------------------------------------------------------------
/ruy/have_built_path_for_avx.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/have_built_path_for.h"
17 | #include "ruy/opt_set.h"
18 | 
19 | namespace ruy {
20 | 
21 | #if RUY_PLATFORM_X86
22 | // IMPORTANT: keep the preprocessor conditions below identical to those
23 | // used in the corresponding pack and kernel cc files.
24 | #if !(RUY_PLATFORM_AVX && RUY_OPT(ASM))
25 | 
26 | bool HaveBuiltPathForAvx() { return false; }  // AVX path not compiled in.
27 | 
28 | #else  // RUY_PLATFORM_AVX && RUY_OPT(ASM)
29 | 
30 | bool HaveBuiltPathForAvx() { return true; }  // AVX path compiled in.
31 | 
32 | #endif  // RUY_PLATFORM_AVX && RUY_OPT(ASM)
33 | #endif  // RUY_PLATFORM_X86
34 | 
35 | }  // namespace ruy
36 | 
--------------------------------------------------------------------------------
/ruy/have_built_path_for_avx2_fma.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/have_built_path_for.h"
17 | #include "ruy/opt_set.h"
18 | 
19 | namespace ruy {
20 | 
21 | #if RUY_PLATFORM_X86
22 | // IMPORTANT: keep the preprocessor conditions below identical to those
23 | // used in the corresponding pack and kernel cc files.
24 | #if !(RUY_PLATFORM_AVX2_FMA && RUY_OPT(ASM))
25 | 
26 | bool HaveBuiltPathForAvx2Fma() { return false; }  // Path not compiled in.
27 | 
28 | #else  // RUY_PLATFORM_AVX2_FMA && RUY_OPT(ASM)
29 | 
30 | bool HaveBuiltPathForAvx2Fma() { return true; }  // Path compiled in.
31 | 
32 | #endif  // RUY_PLATFORM_AVX2_FMA && RUY_OPT(ASM)
33 | #endif  // RUY_PLATFORM_X86
34 | 
35 | }  // namespace ruy
36 | 
--------------------------------------------------------------------------------
/ruy/have_built_path_for_avx512.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/have_built_path_for.h"
17 | #include "ruy/opt_set.h"
18 | 
19 | namespace ruy {
20 | 
21 | #if RUY_PLATFORM_X86
22 | // IMPORTANT: keep the preprocessor conditions below identical to those
23 | // used in the corresponding pack and kernel cc files.
24 | #if !(RUY_PLATFORM_AVX512 && RUY_OPT(ASM))
25 | 
26 | bool HaveBuiltPathForAvx512() { return false; }  // Path not compiled in.
27 | 
28 | #else  // RUY_PLATFORM_AVX512 && RUY_OPT(ASM)
29 | 
30 | bool HaveBuiltPathForAvx512() { return true; }  // Path compiled in.
31 | 
32 | #endif  // RUY_PLATFORM_AVX512 && RUY_OPT(ASM)
33 | #endif  // RUY_PLATFORM_X86
34 | 
35 | }  // namespace ruy
36 | 
--------------------------------------------------------------------------------
/ruy/matrix_test.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/matrix.h"
 17 | 
 18 | #include "ruy/gtest_wrapper.h"
 19 | 
 20 | namespace ruy {
 21 | namespace {
 22 | 
 23 | TEST(MatrixTest, LayoutClassSanity) {
 24 |   Layout probe;  // Defaults: zero sizes and stride, column-major order.
 25 |   EXPECT_EQ(probe.rows(), 0);
 26 |   EXPECT_EQ(probe.cols(), 0);
 27 |   EXPECT_EQ(probe.stride(), 0);
 28 |   EXPECT_EQ(probe.order(), Order::kColMajor);
 29 |   probe.set_rows(123);  // Each setter must be mirrored by its getter.
 30 |   probe.set_cols(456);
 31 |   probe.set_stride(789);
 32 |   probe.set_order(Order::kRowMajor);
 33 |   EXPECT_EQ(probe.rows(), 123);
 34 |   EXPECT_EQ(probe.cols(), 456);
 35 |   EXPECT_EQ(probe.stride(), 789);
 36 |   EXPECT_EQ(probe.order(), Order::kRowMajor);
 37 | }
 38 | 
 39 | TEST(MatrixTest, MakeSimpleLayout) {
 40 |   Layout simple;
 41 |   MakeSimpleLayout(123, 456, Order::kColMajor, &simple);
 42 |   EXPECT_EQ(simple.rows(), 123);
 43 |   EXPECT_EQ(simple.cols(), 456);
 44 |   EXPECT_EQ(simple.stride(), 123);  // Column-major: stride equals rows.
 45 |   EXPECT_EQ(simple.order(), Order::kColMajor);
 46 |   MakeSimpleLayout(321, 654, Order::kRowMajor, &simple);
 47 |   EXPECT_EQ(simple.rows(), 321);
 48 |   EXPECT_EQ(simple.cols(), 654);
 49 |   EXPECT_EQ(simple.stride(), 654);  // Row-major: stride equals cols.
 50 |   EXPECT_EQ(simple.order(), Order::kRowMajor);
 51 | }
 52 | 
 53 | TEST(MatrixTest, ConstCheckingPtrSanity) {
 54 |   using PtrType = detail::ConstCheckingPtr;  // NOTE(review): template args look lost in this copy.
 55 |   PtrType ptr;
 56 |   int some_nonconst;
 57 |   const int some_const = 0;
 58 |   EXPECT_EQ(ptr.get(), nullptr);  // Starts out null.
 59 |   ptr.set(&some_nonconst);  // Non-const pointee: both accesses below work.
 60 |   EXPECT_EQ(static_cast(ptr).get(), &some_nonconst);
 61 |   EXPECT_EQ(ptr.get(), &some_nonconst);
 62 |   ptr.set(&some_const);  // Now holds the address of a const object.
 63 |   EXPECT_EQ(static_cast(ptr).get(), &some_const);
 64 | #ifndef NDEBUG  // The death check only applies when NDEBUG is not defined.
 65 |   RUY_ASSERT_DEATH(ptr.get(), "");  // Plain get() is expected to die here.
 66 | #endif
 67 | }
 68 | 
 69 | TEST(MatrixTest, MatrixClassSanity) {  // Matrix defaults and accessors.
 70 |   Matrix matrix;  // NOTE(review): template args look lost in this copy.
 71 |   EXPECT_EQ(matrix.data(), nullptr);
 72 |   EXPECT_EQ(matrix.zero_point(), 0);
 73 |   EXPECT_EQ(matrix.cache_policy(), CachePolicy::kNeverCache);
 74 |   EXPECT_EQ(matrix.layout().rows(), 0);
 75 |   EXPECT_EQ(matrix.layout().cols(), 0);
 76 |   EXPECT_EQ(matrix.layout().stride(), 0);
 77 |   EXPECT_EQ(matrix.layout().order(), Order::kColMajor);
 78 |   const int some_const = 0;
 79 |   matrix.set_data(&some_const);  // Store the address of a const object.
 80 |   matrix.set_zero_point(123);
 81 |   matrix.set_cache_policy(CachePolicy::kAlwaysCache);
 82 |   MakeSimpleLayout(12, 34, Order::kRowMajor, matrix.mutable_layout());
 83 |   EXPECT_EQ(static_cast&>(matrix).data(), &some_const);
 84 | #ifndef NDEBUG  // The death check only applies when NDEBUG is not defined.
 85 |   RUY_ASSERT_DEATH(matrix.data(), "");  // Plain data() is expected to die.
 86 | #endif
 87 |   EXPECT_EQ(matrix.zero_point(), 123);
 88 |   EXPECT_EQ(matrix.cache_policy(), CachePolicy::kAlwaysCache);
 89 |   EXPECT_EQ(matrix.layout().rows(), 12);
 90 |   EXPECT_EQ(matrix.layout().cols(), 34);
 91 |   EXPECT_EQ(matrix.layout().stride(), 34);  // Row-major: stride == cols.
 92 |   EXPECT_EQ(matrix.layout().order(), Order::kRowMajor);
 93 | }
 94 | 
 95 | }  // namespace
 96 | }  // namespace ruy
 97 | 
 98 | int main(int argc, char** argv) {  // Test binary entry point.
 99 |   ::testing::InitGoogleTest(&argc, argv);  // Let GoogleTest see the flags.
100 |   return RUN_ALL_TESTS();  // Its result becomes the process exit code.
101 | }
102 | 
--------------------------------------------------------------------------------
/ruy/mul_params_test.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/mul_params.h"
17 | 
18 | #include 
19 | #include 
20 | 
21 | #include "ruy/gtest_wrapper.h"
22 | 
23 | namespace ruy {
24 | namespace {
25 | 
26 | TEST(MulParamsTest, SpecClassSanity) {  // MulParams defaults and accessors.
27 |   using MulParamsType = MulParams;  // NOTE(review): template args look lost in this copy.
28 |   static_assert(std::is_same::value,
29 |                 "");
30 |   static_assert(std::is_same::value, "");
31 |   // Values expected from a freshly constructed MulParams.
32 |   MulParamsType mul_params;
33 |   EXPECT_EQ(mul_params.bias(), nullptr);
34 |   EXPECT_EQ(mul_params.multiplier_fixedpoint(), 0);
35 |   EXPECT_EQ(mul_params.multiplier_exponent(), 0);
36 |   EXPECT_EQ(mul_params.multiplier_fixedpoint_perchannel(), nullptr);
37 |   EXPECT_EQ(mul_params.multiplier_exponent_perchannel(), nullptr);
38 |   EXPECT_EQ(mul_params.clamp_min(), -128);
39 |   EXPECT_EQ(mul_params.clamp_max(), 127);
40 |   EXPECT_EQ(mul_params.channel_dimension(), ChannelDimension::kRow);
41 |   EXPECT_EQ(mul_params.perchannel_buffers_capacity_rounding(), 1);
42 |   std::int32_t bias_data[1];  // Never read here; only its address is compared.
43 |   mul_params.set_bias(bias_data);
44 |   mul_params.set_multiplier_fixedpoint(123);
45 |   mul_params.set_multiplier_exponent(4);
46 |   mul_params.set_channel_dimension(ChannelDimension::kCol);
47 |   mul_params.set_perchannel_buffers_capacity_rounding(8);
48 |   EXPECT_EQ(mul_params.bias(), bias_data);
49 |   EXPECT_EQ(mul_params.multiplier_fixedpoint(), 123);
50 |   EXPECT_EQ(mul_params.multiplier_exponent(), 4);
51 |   EXPECT_EQ(mul_params.channel_dimension(), ChannelDimension::kCol);
52 |   EXPECT_EQ(mul_params.perchannel_buffers_capacity_rounding(), 8);
53 |   mul_params.set_multiplier_fixedpoint(0);  // Zero the scalar multiplier...
54 |   mul_params.set_multiplier_exponent(0);  // ...before setting per-channel ones.
55 |   std::int32_t multiplier_fixedpoint_perchannel_data[1];
56 |   int multiplier_exponent_perchannel_data[1];
57 |   mul_params.set_multiplier_fixedpoint_perchannel(
58 |       multiplier_fixedpoint_perchannel_data);
59 |   mul_params.set_multiplier_exponent_perchannel(
60 |       multiplier_exponent_perchannel_data);
61 |   mul_params.set_clamp_min(-10);
62 |   mul_params.set_clamp_max(10);
63 |   EXPECT_EQ(mul_params.multiplier_fixedpoint(), 0);
64 |   EXPECT_EQ(mul_params.multiplier_exponent(), 0);
65 |   EXPECT_EQ(mul_params.multiplier_fixedpoint_perchannel(),
66 |             multiplier_fixedpoint_perchannel_data);
67 |   EXPECT_EQ(mul_params.multiplier_exponent_perchannel(),
68 |             multiplier_exponent_perchannel_data);
69 |   EXPECT_EQ(mul_params.clamp_min(), -10);
70 |   EXPECT_EQ(mul_params.clamp_max(), 10);
71 | }
72 | 
73 | }  // namespace
74 | }  // namespace ruy
75 | 
76 | int main(int argc, char** argv) {  // Test binary entry point.
77 |   ::testing::InitGoogleTest(&argc, argv);  // Let GoogleTest see the flags.
78 |   return RUN_ALL_TESTS();  // Its result becomes the process exit code.
79 | }
80 | 
--------------------------------------------------------------------------------
/ruy/opt_set.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_OPT_SET_H_
17 | #define RUY_RUY_OPT_SET_H_
18 | 
19 | // RUY_OPT_SET is a compile-time API that Ruy provides for enabling/disabling
20 | // certain optimizations. It should be used by defining that macro on the
21 | // compiler command line.
22 | // For instance, a value of 0x3 selects only the INTRINSICS and ASM bits.
23 | // Each bit in RUY_OPT_SET controls a particular optimization done in Ruy.
24 | #define RUY_OPT_BIT_INTRINSICS 0x1
25 | #define RUY_OPT_BIT_ASM 0x2
26 | #define RUY_OPT_BIT_TUNING 0x4
27 | #define RUY_OPT_BIT_FAT_KERNEL 0x8
28 | // 0x10 used to be RUY_OPT_BIT_NATIVE_ROUNDING; no bit below reuses it.
29 | #define RUY_OPT_BIT_AVOID_ALIASING 0x20
30 | #define RUY_OPT_BIT_MAX_STREAMING 0x40
31 | #define RUY_OPT_BIT_PACK_AHEAD 0x80
32 | #define RUY_OPT_BIT_PREFETCH_LOAD 0x100
33 | #define RUY_OPT_BIT_PREFETCH_STORE 0x200
34 | #define RUY_OPT_BIT_FRACTAL_Z 0x400
35 | #define RUY_OPT_BIT_FRACTAL_U 0x800
36 | #define RUY_OPT_BIT_FRACTAL_HILBERT 0x1000
37 | 
38 | #if !defined(RUY_OPT_SET)  // Only pick a default if the user gave none.
39 | #ifdef RUY_OPTIMIZE_FOR_MATMUL_BENCHMARK
40 | // Load prefetching is detrimental in matrix multiplication benchmarks.
41 | // Store prefetching is not.
42 | #define RUY_OPT_SET (~RUY_OPT_BIT_PREFETCH_LOAD)
43 | #else
44 | // Default to all optimizations.
45 | #define RUY_OPT_SET (~0)
46 | #endif
47 | #endif
48 | // RUY_OPT_SET may come from the command line, hence the extra parentheses.
49 | #define RUY_OPT(X) (((RUY_OPT_SET) & RUY_OPT_BIT_##X) != 0)
50 | 
51 | #endif  // RUY_RUY_OPT_SET_H_
52 | 
--------------------------------------------------------------------------------
/ruy/perchannel_buffers_reallocation_test.cc:
--------------------------------------------------------------------------------
  1 | #include "ruy/context.h"
  2 | #include "ruy/gtest_wrapper.h"
  3 | #include "ruy/kernel.h"
  4 | #include "ruy/matrix.h"
  5 | #include "ruy/path.h"
  6 | #include "ruy/performance_advisory.h"
  7 | #include "ruy/ruy.h"
  8 | 
  9 | namespace ruy {
 10 | namespace {
 11 | 
 12 | constexpr Path kPath = Path::kInternalStandardCppVariant3;  // presumably the path under test — template args not visible here
 13 | constexpr int kBufferSize = 64;  // Length of each static per-channel buffer.
 14 | 
 15 | template ::value &&  // NOTE(review): template header looks truncated in this copy.
 18 |               !std::is_same::value>
 19 | struct PopulatePerChannelBuffersImpl {  // Sets bias and both multiplier buffers.
 20 |   static void Run(MulParams* mul_params) {
 21 |     static const AccumScalar bias_buf[kBufferSize] = {0};  // Static, zero-filled.
 22 |     static const AccumScalar multiplier_fixedpoint_buf[kBufferSize] = {0};
 23 |     static const int multiplier_exponent_buf[kBufferSize] = {0};
 24 |     mul_params->set_bias(bias_buf);
 25 |     mul_params->set_multiplier_fixedpoint_perchannel(multiplier_fixedpoint_buf);
 26 |     mul_params->set_multiplier_exponent_perchannel(multiplier_exponent_buf);
 27 |   }
 28 | };
 29 | 
 30 | template 
 31 | struct PopulatePerChannelBuffersImpl {  // Variant that only sets the bias buffer.
 32 |   static void Run(MulParams* mul_params) {
 33 |     static const AccumScalar bias_buf[kBufferSize] = {0};  // Static, zero-filled.
 34 |     mul_params->set_bias(bias_buf);
 35 |   }
 36 | };
 37 | 
 38 | template 
 39 | void PopulatePerChannelBuffers(MulParams* mul_params) {
 40 |   PopulatePerChannelBuffersImpl::Run(mul_params);  // Delegates to the Impl struct.
 41 | }
 42 | 
 43 | template 
 45 | void TestPerChannelBuffersReallocation() {
 46 |   using KernelType = Kernel;  // NOTE(review): template args look lost in this copy.
 47 | 
 48 |   MulParams mul_params;
 49 |   PopulatePerChannelBuffers(&mul_params);
 50 |   // All three operands are 3x3 and zero-initialized.
 51 |   const int kMatrixSize = 3;
 52 |   ruy::Matrix lhs;
 53 |   ruy::MakeSimpleLayout(kMatrixSize, kMatrixSize, ruy::Order::kRowMajor,
 54 |                         lhs.mutable_layout());
 55 |   const LhsScalar lhs_data[kMatrixSize * kMatrixSize] = {0};
 56 |   lhs.set_data(lhs_data);
 57 |   ruy::Matrix rhs;
 58 |   ruy::MakeSimpleLayout(kMatrixSize, kMatrixSize, ruy::Order::kColMajor,
 59 |                         rhs.mutable_layout());
 60 |   const RhsScalar rhs_data[kMatrixSize * kMatrixSize] = {0};
 61 |   rhs.set_data(rhs_data);
 62 |   DstScalar dst_data[kMatrixSize * kMatrixSize] = {0};
 63 |   ruy::Matrix dst;
 64 |   ruy::MakeSimpleLayout(kMatrixSize, kMatrixSize, ruy::Order::kColMajor,
 65 |                         dst.mutable_layout());
 66 |   dst.set_data(dst_data);
 67 | 
 68 |   ruy::Context context;
 69 |   // Runs one Mul and checks whether the reallocation advisory was raised.
 70 |   auto test_advisory = [&](bool expect_advisory,
 71 |                            ChannelDimension channel_dimension,
 72 |                            int capacity_rounding) {
 73 |     mul_params.set_channel_dimension(channel_dimension);
 74 |     mul_params.set_perchannel_buffers_capacity_rounding(capacity_rounding);
 75 |     ruy::Mul(lhs, rhs, mul_params, &context, &dst);
 76 |     EXPECT_EQ(context.performance_advisory(
 77 |                   PerformanceAdvisory::kReallocatedPerChannelBuffer),
 78 |               expect_advisory);
 79 |   };
 80 |   // Row channels: advisory expected only for rounding below LhsLayout::kCols.
 81 |   static_assert(KernelType::LhsLayout::kCols == 16, "");
 82 |   test_advisory(true, ChannelDimension::kRow, 1);
 83 |   test_advisory(true, ChannelDimension::kRow, 2);
 84 |   test_advisory(true, ChannelDimension::kRow, 4);
 85 |   test_advisory(true, ChannelDimension::kRow, 8);
 86 |   test_advisory(false, ChannelDimension::kRow, 16);
 87 |   test_advisory(false, ChannelDimension::kRow, 32);
 88 |   test_advisory(false, ChannelDimension::kRow, 64);
 89 |   // Col channels: advisory expected only for rounding below RhsLayout::kCols.
 90 |   static_assert(KernelType::RhsLayout::kCols == 8, "");
 91 |   test_advisory(true, ChannelDimension::kCol, 1);
 92 |   test_advisory(true, ChannelDimension::kCol, 2);
 93 |   test_advisory(true, ChannelDimension::kCol, 4);
 94 |   test_advisory(false, ChannelDimension::kCol, 8);
 95 |   test_advisory(false, ChannelDimension::kCol, 16);
 96 |   test_advisory(false, ChannelDimension::kCol, 32);
 97 |   test_advisory(false, ChannelDimension::kCol, 64);
 98 | }
 99 | 
100 | TEST(PerChannelBuffersReallocationTest, Float) {
101 |   TestPerChannelBuffersReallocation();  // NOTE(review): template args look lost in this copy.
102 | }
103 | 
104 | TEST(PerChannelBuffersReallocationTest, Quantized) {
105 |   TestPerChannelBuffersReallocation();  // NOTE(review): template args (and line 106) look lost in this copy.
107 | }
108 | 
109 | TEST(PerChannelBuffersReallocationTest, RawInt32) {
110 |   TestPerChannelBuffersReallocation();  // NOTE(review): template args (and line 111) look lost in this copy.
112 | }
113 | 
114 | }  // namespace
115 | }  // namespace ruy
116 | 
117 | int main(int argc, char** argv) {  // Test binary entry point.
118 |   ::testing::InitGoogleTest(&argc, argv);  // Let GoogleTest see the flags.
119 |   return RUN_ALL_TESTS();  // Its result becomes the process exit code.
120 | }
121 | 
--------------------------------------------------------------------------------
/ruy/performance_advisory.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_PERFORMANCE_ADVISORY_H_
17 | #define RUY_RUY_PERFORMANCE_ADVISORY_H_
18 | 
19 | namespace ruy {
20 | // Advisory values; the operators below combine and mask them bitwise.
21 | enum class PerformanceAdvisory {
22 |   kNone = 0,
23 |   kReallocatedPerChannelBuffer = 0x1
24 | };
25 | // Bitwise OR. NOTE(review): cast target types look lost in this copy.
26 | inline constexpr PerformanceAdvisory operator|(PerformanceAdvisory p,
27 |                                                PerformanceAdvisory q) {
28 |   return static_cast(static_cast(p) |
29 |                                           static_cast(q));
30 | }
31 | // Bitwise AND. NOTE(review): cast target types look lost in this copy.
32 | inline constexpr PerformanceAdvisory operator&(PerformanceAdvisory p,
33 |                                                PerformanceAdvisory q) {
34 |   return static_cast(static_cast(p) &
35 |                                           static_cast(q));
36 | }
37 | 
38 | }  // namespace ruy
39 | 
40 | #endif  // RUY_RUY_PERFORMANCE_ADVISORY_H_
41 | 
--------------------------------------------------------------------------------
/ruy/pmu.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2019 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_PMU_H_
17 | #define RUY_RUY_PMU_H_
18 | 
19 | namespace ruy {
20 | // Implementation type; only forward-declared in this header.
21 | class PmuEventsPrivate;
22 | // Start/stop a recording, then read the counters. Presumably hardware PMU events — confirm in the implementation.
23 | class PmuEvents {
24 |  public:
25 |   PmuEvents();
26 |   ~PmuEvents();
27 |   void StartRecording();
28 |   void StopRecording();
29 |   float L1RefillCount() const;  // All counters are reported as float.
30 |   float L2RefillCount() const;
31 |   float L3RefillCount() const;
32 |   float BranchMispredictionCount() const;
33 |   float FrontendStallCount() const;
34 |   float BackendStallCount() const;
35 |   float L1TLBRefillCount() const;
36 |   float L2TLBRefillCount() const;
37 |   float L1WritebackCount() const;
38 |   float L2WritebackCount() const;
39 |   // NOTE(review): raw pointer plus user-declared destructor; copying may be unsafe if the destructor frees it — confirm.
40 |  private:
41 |   PmuEventsPrivate* priv = nullptr;
42 | };
43 | 
44 | }  // namespace ruy
45 | 
46 | #endif  // RUY_RUY_PMU_H_
47 | 
--------------------------------------------------------------------------------
/ruy/prepacked_cache.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2019 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/prepacked_cache.h"
 17 | 
 18 | #include "ruy/mat.h"
 19 | #include "ruy/profiler/instrumentation.h"
 20 | #include "ruy/system_aligned_alloc.h"
 21 | 
 22 | namespace ruy {
 23 | 
 24 | namespace {
 25 | 
 26 | // Allocates the `data` and `sums` buffers, and sets the corresponding
 27 | // pointer fields, in a PEMat whose other fields, particularly `layout`
 28 | // and the runtime data types, are already populated. Returns bytes allocated.
 29 | std::ptrdiff_t AllocateBuffers(PEMat* packed_matrix) {
 30 |   const std::ptrdiff_t data_bytes = DataBytes(*packed_matrix);
 31 |   packed_matrix->data = detail::SystemAlignedAlloc(data_bytes);
 32 |   std::ptrdiff_t sums_bytes = 0;  // Stays 0 when no `sums` buffer is made.
 33 |   if (!packed_matrix->sums_type.is_floating_point) {
 34 |     // Integer quantized matrices also need the `sums` buffer.
 35 |     sums_bytes = SumsBytes(*packed_matrix);
 36 |     packed_matrix->sums = detail::SystemAlignedAlloc(sums_bytes);
 37 |   }
 38 |   return data_bytes + sums_bytes;
 39 | }
 40 | 
 41 | // Frees the `data` and `sums` buffers held by a PEMat.
 42 | void FreeBuffers(const PEMat& packed_matrix) {
 43 |   detail::SystemAlignedFree(packed_matrix.data);
 44 |   detail::SystemAlignedFree(packed_matrix.sums);  // Not set by AllocateBuffers for float; assumes null is accepted — TODO confirm.
 45 | }
 46 | 
 47 | }  // end anonymous namespace
 48 | 
 49 | std::size_t PrepackedCache::KeyHash::operator()(
 50 |     const PrepackedCache::Key& key) const {
 51 |   std::size_t src_data_hash = reinterpret_cast(key.src_data);  // NOTE(review): cast target types look lost in this copy.
 52 |   // Naive hash of the layout. Based on some heuristic reasoning, not any
 53 |   // benchmarking.
 54 |   // A choice of hash function here is just an optimization matter
 55 |   // anyway, since a hash collision only results in some Key::operator== calls
 56 |   // to disambiguate, and even just returning src_data_hash, ignoring the layout
 57 |   // altogether, would probably be good enough, as the case of multiple entries
 58 |   // with the same data pointer will be uncommon.
 59 |   // Here we multiply-add the layout fields using some small constant prime
 60 |   // numbers as multipliers. The conventional approach of xor-ing bit-rotations
 61 |   // would result in some hash collisions because these values are typically
 62 |   // small positive integers, so bit-rotations are essentially bit-shifts,
 63 |   // and powers of two are common.
 64 |   std::size_t packed_layout_hash =
 65 |       static_cast(key.packed_layout.order) +
 66 |       static_cast(key.packed_layout.kernel.order) * 2 +
 67 |       key.packed_layout.stride * 3 + key.packed_layout.kernel.rows * 5 +
 68 |       key.packed_layout.kernel.cols * 7 + key.packed_layout.rows * 11 +
 69 |       key.packed_layout.cols * 13;
 70 |   return src_data_hash ^ packed_layout_hash;  // key.zero_point is not hashed.
 71 | }
 72 | 
 73 | PrepackedCache::~PrepackedCache() {  // Frees buffers of all remaining entries.
 74 |   for (const auto& pair : cache_) {
 75 |     FreeBuffers(pair.second.packed_matrix);
 76 |   }
 77 | }
 78 | 
 79 | PrepackedCache::Action PrepackedCache::Get(const void* src_data,
 80 |                                            PEMat* packed_matrix) {  // In: layout, zero_point. Out: buffers.
 81 |   // Construct a Key and look up the cache.
 82 |   Key key;
 83 |   key.src_data = src_data;
 84 |   key.packed_layout = packed_matrix->layout;
 85 |   key.zero_point = packed_matrix->zero_point;
 86 |   const auto& itr = cache_.find(key);
 87 | 
 88 |   if (itr != cache_.end()) {
 89 |     // Found existing entry. Update its timestamp and return it.
 90 |     itr->second.timestamp = timestamp_++;
 91 |     *packed_matrix = itr->second.packed_matrix;  // Overwrites the whole PEMat.
 92 |     return Action::kGotExistingEntry;
 93 |   }
 94 | 
 95 |   // No existing entry found. Allocate new buffers now and insert in the cache.
 96 |   const std::ptrdiff_t new_bytes = AllocateBuffers(packed_matrix);
 97 |   EjectUntilRoomFor(new_bytes);  // Runs after the new buffers already exist.
 98 |   Entry entry{*packed_matrix, timestamp_++};
 99 |   cache_.emplace(key, entry);
100 |   buffers_bytes_ += new_bytes;  // Account for the buffers just allocated.
101 |   return Action::kInsertedNewEntry;  // Buffers are allocated but not filled here.
102 | }
103 | 
104 | void PrepackedCache::EjectUntilRoomFor(std::ptrdiff_t new_bytes) {
105 |   profiler::ScopeLabel label("PrepackedCacheEjection");
106 |   // While over the byte budget and the cache is non-empty, eject one entry.
107 |   while (!cache_.empty() && buffers_bytes_ + new_bytes > max_buffers_bytes_) {
108 |     EjectOne();
109 |   }
110 | }
111 | 
112 | void PrepackedCache::EjectOne() {
113 |   // Evicts the entry with the smallest timestamp. Requires a non-empty cache_.
114 |   auto oldest = cache_.begin();
115 |   for (auto itr = cache_.begin(); itr != cache_.end(); ++itr) {
116 |     if (itr->second.timestamp < oldest->second.timestamp) oldest = itr;
117 |   }
118 |   const PEMat& packed_matrix = oldest->second.packed_matrix;
119 |   // As in AllocateBuffers, `sums` bytes only count for non-floating-point.
120 |   std::ptrdiff_t freed_bytes = DataBytes(packed_matrix);
121 |   if (!packed_matrix.sums_type.is_floating_point) {
122 |     freed_bytes += SumsBytes(packed_matrix);
123 |   }
124 |   buffers_bytes_ -= freed_bytes;
125 |   FreeBuffers(packed_matrix);
126 |   cache_.erase(oldest);
127 | }
128 | 
129 | }  // namespace ruy
130 | 
--------------------------------------------------------------------------------
/ruy/prepare_packed_matrices.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include "ruy/prepare_packed_matrices.h"
17 | 
18 | #include "ruy/allocator.h"
19 | #include "ruy/ctx.h"
20 | #include "ruy/matrix.h"
21 | #include "ruy/prepacked_cache.h"
22 | #include "ruy/side_pair.h"
23 | #include "ruy/trace.h"
24 | #include "ruy/trmul_params.h"
25 | 
26 | namespace ruy {
27 | namespace {
28 | 
29 | // Returns true if the operand on the given side should use caching of the
30 | // packed form. This may either be explicitly dictated by its cache_policy
31 | // (if it is kNeverCache, the default, or kAlwaysCache), or it may depend
32 | // on a heuristic decision based on the other operand's width. For example,
33 | // in a matrix*vector product, for the LHS matrix operand, the other side is
34 | // the RHS vector, with a width of 1, causing the packing of the LHS to be
35 | // a large fraction of the overall work, so a heuristic would typically
36 | // decide in favor of caching, if permitted at all by the cache_policy.
37 | bool ShouldCache(const TrMulParams& params, Side side) {
38 |   const CachePolicy cache_policy = params.src[side].cache_policy;
39 |   // The width that matters is that of the other side, it is what determines
40 |   // the amortization of the packing work done on the present side.
41 |   const Side other_side = OtherSide(side);
42 |   const int other_width = params.src[other_side].layout.cols;
43 |   const int other_kernel_width =
44 |       params.packed_matrix[other_side].layout.kernel.cols;
45 |   switch (cache_policy) {
46 |     case CachePolicy::kNeverCache:
47 |       return false;
48 |     case CachePolicy::kAlwaysCache:
49 |       return true;
50 |     case CachePolicy::kCacheIfLargeSpeedup:
51 |       // The condition (other_width <= other_kernel_width) means that the kernel
52 |       // will traverse each value of the present side only once, meaning that
53 |       // the overhead of the packing work will be maximal, hence maximally
54 |       // worth caching.
55 |       return (other_width <= other_kernel_width);
56 |     case CachePolicy::kCacheIfSignificantSpeedup:
57 |       // Variant of the heuristic used in the kCacheIfLargeSpeedup case. The
58 |       // kernel will run on each value of the present side only a few times,
59 |       // so packing overhead will be significant.
60 |       return (other_width <= 4 * other_kernel_width);
61 |     default:  // Unexpected policy value: RUY_DCHECK(false), then no caching.
62 |       RUY_DCHECK(false);
63 |       return false;
64 |   }
65 | }
66 | 
67 | }  // namespace
68 | 
69 | void PreparePackedMatrices(Ctx* ctx, TrMulParams* params) {
70 |   RUY_TRACE_SCOPE;
71 |   for (Side side : {Side::kLhs, Side::kRhs}) {  // Each side decided separately.
72 |     PEMat& packed_matrix = params->packed_matrix[side];
73 |     if (ShouldCache(*params, side)) {
74 |       // Use a cached packed matrix (possibly packing and caching now).
75 |       auto* cache = ctx->GetPrepackedCache();
76 |       auto action = cache->Get(params->src[side].data, &packed_matrix);
77 |       RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_SHOULD_CACHE);
78 |       if (action == PrepackedCache::Action::kInsertedNewEntry) {  // Cache miss.
79 |         params->RunPack(side, ctx->GetMainThreadTuning(), 0,
80 |                         packed_matrix.layout.cols);  // presumably packs columns [0, cols) — confirm
81 |       }
82 |       params->is_prepacked[side] = true;  // Set on cache hit and miss alike.
83 |     } else {
84 |       RUY_TRACE_INFO(PREPARE_PACKED_MATRICES_NO_CACHE);
85 |       // Do not use a cached packed matrix. Only need to allocate buffers now.
86 |       Allocator* allocator = ctx->GetMainAllocator();
87 |       packed_matrix.data = allocator->AllocateBytesAvoidingAliasingWith(
88 |           DataBytes(packed_matrix), params->src[side].data);
89 |       packed_matrix.sums = allocator->AllocateBytes(SumsBytes(packed_matrix));  // Allocated for every type here.
90 |     }
91 |   }
92 | }
93 | 
94 | }  // namespace ruy
95 | 
--------------------------------------------------------------------------------
/ruy/prepare_packed_matrices.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_PREPARE_PACKED_MATRICES_H_
17 | #define RUY_RUY_PREPARE_PACKED_MATRICES_H_
18 | 
19 | #include "ruy/ctx.h"
20 | #include "ruy/trmul_params.h"
21 | 
22 | namespace ruy {
23 | 
24 | // Ensures that the packed matrices are ready for TrMul's work. In the generic
25 | // case, this is merely allocating their buffers.
26 | //
27 | // In the non-default case where a matrix has a cache_policy allowing
28 | // caching, this is also where the caching feature is implemented:
29 | // determining whether to cache each matrix, performing the cache lookup,
30 | // and, if the matrix is not already cached, performing the packing and the
31 | // cache update.
32 | //
33 | // Assumes that the packed matrices have previously been created, with their
34 | // fields already set except for the buffer allocations, as part of
35 | // CreateTrMulParams. The reason for separating this preparation from the
36 | // creation is that the creation needs to be templatized and this preparation
37 | // does not.
38 | void PreparePackedMatrices(Ctx* ctx, TrMulParams* params);
39 | 
40 | }  // namespace ruy
41 | 
42 | #endif  // RUY_RUY_PREPARE_PACKED_MATRICES_H_
43 | 
--------------------------------------------------------------------------------
/ruy/profiler/BUILD:
--------------------------------------------------------------------------------
 1 | # A minimalistic profiler sampling pseudo-stacks
 2 | 
 3 | load("//ruy:build_defs.oss.bzl", "ruy_linkopts_thread_standard_library")
 4 | load("//third_party/bazel_rules/rules_cc/cc:cc_library.bzl", "cc_library")
 5 | load("//third_party/bazel_rules/rules_cc/cc:cc_test.bzl", "cc_test")
 6 | 
 7 | package(
 8 |     default_applicable_licenses = ["//third_party/ruy:license"],
 9 |     licenses = ["notice"],  # Apache 2.0
10 | )
11 | # Matches builds run with --define=ruy_profiler=true.
12 | config_setting(
13 |     name = "ruy_profiler",
14 |     define_values = {"ruy_profiler": "true"},
15 | )
16 | 
17 | # Used to build TFLite Micro RUY dependency for embedded targets outside of the
18 | # RUY source tree.
19 | filegroup(
20 |     name = "ruy_instrumentation_header",
21 |     srcs = ["instrumentation.h"],
22 |     visibility = ["//visibility:public"],
23 | )
24 | # Instrumentation library; RUY_PROFILER is defined only under :ruy_profiler.
25 | cc_library(
26 |     name = "instrumentation",
27 |     srcs = ["instrumentation.cc"],
28 |     hdrs = ["instrumentation.h"],
29 |     defines = select({
30 |         ":ruy_profiler": ["RUY_PROFILER"],
31 |         "//conditions:default": [],
32 |     }),
33 |     linkopts = ruy_linkopts_thread_standard_library(),
34 |     visibility = ["//visibility:public"],
35 | )
36 | # Profiler and tree-view sources, depending on :instrumentation.
37 | cc_library(
38 |     name = "profiler",
39 |     srcs = [
40 |         "profiler.cc",
41 |         "treeview.cc",
42 |     ],
43 |     hdrs = [
44 |         "profiler.h",
45 |         "treeview.h",
46 |     ],
47 |     linkopts = ruy_linkopts_thread_standard_library(),
48 |     visibility = ["//visibility:public"],
49 |     deps = [":instrumentation"],
50 | )
51 | # Test-only library that the :test target below links against.
52 | cc_library(
53 |     name = "test_instrumented_library",
54 |     testonly = True,
55 |     srcs = ["test_instrumented_library.cc"],
56 |     hdrs = ["test_instrumented_library.h"],
57 |     deps = [":instrumentation"],
58 | )
59 | 
60 | cc_test(
61 |     name = "test",
62 |     srcs = ["test.cc"],
63 |     linkopts = ruy_linkopts_thread_standard_library(),
64 |     deps = [
65 |         ":profiler",
66 |         ":test_instrumented_library",
67 |         "//ruy:gtest_wrapper",
68 |     ],
69 | )
70 | 
--------------------------------------------------------------------------------
/ruy/profiler/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | # This file is generated (whence no license header). Do not edit!
 2 | # To regenerate, run:
 3 | #   cmake/bazel_to_cmake.sh
 4 | 
 5 | if(${RUY_PROFILER})
 6 |   set(ruy_profiler_0_RUY_PROFILER "RUY_PROFILER")
 7 | else()
 8 |   set(ruy_profiler_0_RUY_PROFILER "")
 9 | endif()
10 | 
11 | if(CMAKE_SYSTEM_NAME STREQUAL Windows)
12 |   set(ruy_profiler_1_pthread "")
13 | else()
14 |   set(ruy_profiler_1_pthread "-pthread")
15 | endif()
16 | 
17 | ruy_cc_library(
18 |   NAME
19 |     ruy_profiler_instrumentation
20 |   SRCS
21 |     instrumentation.cc
22 |   HDRS
23 |     instrumentation.h
24 |   DEFINES
25 |     ${ruy_profiler_0_RUY_PROFILER}
26 |   LINKOPTS
27 |     ${ruy_profiler_1_pthread}
28 |   PUBLIC
29 | )
30 | 
31 | ruy_cc_library(  # Profiler library: profiler.cc plus treeview.cc.
32 |   NAME
33 |     ruy_profiler_profiler
34 |   SRCS
35 |     profiler.cc
36 |     treeview.cc
37 |   HDRS
38 |     profiler.h
39 |     treeview.h
40 |   LINKOPTS
41 |     ${ruy_profiler_1_pthread}  # "-pthread" except on Windows, set above.
42 |   PUBLIC  # Semantics defined by ruy_cc_library (not shown here).
43 |   DEPS
44 |     ruy_profiler_instrumentation
45 | )
46 | 
47 | ruy_cc_library(  # Instrumented sample code linked into ruy_profiler_test.
48 |   NAME
49 |     ruy_profiler_test_instrumented_library
50 |   TESTONLY  # Counterpart of "testonly = True" on the Bazel target.
51 |   SRCS
52 |     test_instrumented_library.cc
53 |   HDRS
54 |     test_instrumented_library.h
55 |   DEPS
56 |     ruy_profiler_instrumentation
57 | )
58 | 
59 | ruy_cc_test(  # Test target for the profiler, built from test.cc.
60 |   NAME
61 |     ruy_profiler_test
62 |   SRCS
63 |     test.cc
64 |   LINKOPTS
65 |     ${ruy_profiler_1_pthread}  # "-pthread" except on Windows, set above.
66 |   DEPS
67 |     ruy_profiler_profiler
68 |     ruy_profiler_test_instrumented_library
69 |     ruy_gtest_wrapper
70 | )
71 | 
72 | ruy_add_all_subdirs()  # Project helper (cmake/ruy_add_all_subdirs.cmake); presumably adds this directory's subdirectories - TODO confirm.
73 | 
--------------------------------------------------------------------------------
/ruy/profiler/instrumentation.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2020 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/profiler/instrumentation.h"
 17 | 
 18 | #ifdef RUY_PROFILER
 19 | 
 20 | #include 
 21 | 
 22 | namespace ruy {
 23 | namespace profiler {
 24 | 
 25 | void Label::operator=(const Label& other) {
 26 |   // Only the first args_count_ argument slots are in use, so copy just those.
 27 |   const int live_args = other.args_count_;
 28 |   for (int slot = 0; slot < live_args; ++slot) args_[slot] = other.args_[slot];
 29 |   args_count_ = live_args;
 30 |   format_ = other.format_;
 31 | }
 32 | 
 33 | bool Label::operator==(const Label& other) const {
 34 |   // Labels match when the format text, the argument count and every argument
 35 |   // in use agree. Formats are compared by content, not by pointer value.
 36 |   const bool same_format = std::string(format_) == std::string(other.format_);
 37 |   if (!same_format || args_count_ != other.args_count_) {
 38 |     return false;
 39 |   }
 40 |   int slot = 0;
 41 |   while (slot < args_count_ && args_[slot] == other.args_[slot]) {
 42 |     ++slot;
 43 |   }
 44 |   // Equal only if the scan ran past the last argument without a mismatch.
 45 |   return slot >= args_count_;
 46 | }
 47 | 
 48 | std::string Label::Formatted() const {
 49 |   // Renders format_ with its stored arguments. Counts 0 to 4 are handled;
 50 |   // any other count aborts. Output that does not fit the buffer is
 51 |   // truncated by snprintf.
 52 |   if (args_count_ == 0) {
 53 |     return format_;
 54 |   }
 55 |   static constexpr int kBufSize = 256;
 56 |   char buf[kBufSize];
 57 |   const auto& a = args_;
 58 |   // snprintf is variadic, so each supported count needs its own call.
 59 |   switch (args_count_) {
 60 |     case 1: snprintf(buf, kBufSize, format_, a[0]); break;
 61 |     case 2: snprintf(buf, kBufSize, format_, a[0], a[1]); break;
 62 |     case 3: snprintf(buf, kBufSize, format_, a[0], a[1], a[2]); break;
 63 |     case 4: snprintf(buf, kBufSize, format_, a[0], a[1], a[2], a[3]); break;
 64 |     default: abort();
 65 |   }
 66 |   return buf;
 67 | }
 67 | 
 68 | namespace detail {
 69 | 
 70 | std::mutex* GlobalsMutex() {  // Lock guarding the profiler's global state.
 71 |   static std::mutex globals_mutex;
 72 |   return &globals_mutex;
 73 | }
 74 | 
 75 | bool& GlobalIsProfilerRunning() {  // Flag set while a ScopeProfile is active.
 76 |   static bool is_running = false;
 77 |   return is_running;
 78 | }
 79 | 
 80 | std::vector<ThreadStack*>* GlobalAllThreadStacks() {
 81 |   static std::vector<ThreadStack*> all_stacks;  // Every live ThreadStack.
 82 |   return &all_stacks;
 83 | }
 84 | 
 85 | ThreadStack* ThreadLocalThreadStack() {  // One ThreadStack per calling thread.
 86 |   static thread_local ThreadStack this_thread_stack;
 87 |   return &this_thread_stack;
 88 | }
 89 | 
 90 | ThreadStack::ThreadStack() {  // Assigns a fresh id and registers this stack.
 91 |   static std::uint32_t next_stack_id = 0;
 92 |   std::lock_guard<std::mutex> lock(*GlobalsMutex());  // Guards id + registry.
 93 |   stack_.id = next_stack_id++;
 94 |   GlobalAllThreadStacks()->push_back(this);
 95 | }
 96 | 
 97 | ThreadStack::~ThreadStack() {  // Removes this stack from the global registry.
 98 |   std::lock_guard<std::mutex> lock(*GlobalsMutex());
 99 |   std::vector<ThreadStack*>* all_stacks = GlobalAllThreadStacks();
100 |   for (auto it = all_stacks->begin(); it != all_stacks->end(); ++it) {
101 |     if (*it == this) {
102 |       all_stacks->erase(it);  // Iterator is invalid now; leave at once.
103 |       return;
104 |     }
105 |   }
106 | }
107 | int GetBufferSize(const Stack& stack) {  // Bytes CopyToBuffer will write.
108 |   const auto header_bytes = sizeof(stack.id) + sizeof(stack.size);
109 |   return header_bytes + stack.size * sizeof(stack.labels[0]);
110 | }
111 | 
112 | void CopyToBuffer(const Stack& stack, char* dst) {
113 |   // Serialized layout: id | size | labels[0..size), packed back to back.
114 |   char* const labels_dst = dst + sizeof(stack.id) + sizeof(stack.size);
115 |   memcpy(dst, &stack.id, sizeof(stack.id));
116 |   memcpy(dst + sizeof(stack.id), &stack.size, sizeof(stack.size));
117 |   memcpy(labels_dst, stack.labels, stack.size * sizeof(stack.labels[0]));
118 | }
119 | 
120 | void ReadFromBuffer(const char* src, Stack* stack) {  // Undoes CopyToBuffer.
121 |   const char* const size_src = src + sizeof(stack->id);
122 |   const char* const labels_src = size_src + sizeof(stack->size);
123 |   memcpy(&stack->id, src, sizeof(stack->id));
124 |   memcpy(&stack->size, size_src, sizeof(stack->size));
125 |   memcpy(stack->labels, labels_src, stack->size * sizeof(stack->labels[0]));
126 | }
127 | 
128 | }  // namespace detail
129 | }  // namespace profiler
130 | }  // namespace ruy
131 | 
132 | #endif
133 | 
--------------------------------------------------------------------------------
/ruy/profiler/profiler.cc:
--------------------------------------------------------------------------------
  1 | /* Copyright 2020 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #include "ruy/profiler/profiler.h"
 17 | 
 18 | #ifdef RUY_PROFILER
 19 | #include 
 20 | #include   // NOLINT
 21 | #include 
 22 | #include 
 23 | #include   // NOLINT
 24 | #include 
 25 | #endif
 26 | 
 27 | #include "ruy/profiler/instrumentation.h"
 28 | #include "ruy/profiler/treeview.h"
 29 | 
 30 | namespace ruy {
 31 | namespace profiler {
 32 | 
 33 | #ifdef RUY_PROFILER
 34 | 
 35 | ScopeProfile::ScopeProfile() { Start(); }  // Always profiles.
 36 | ScopeProfile::ScopeProfile(bool enable) {
 37 |   // Opt-in at runtime: when disabled, thread_ stays null and the destructor
 38 |   // returns without reporting anything.
 39 |   if (enable) Start();
 40 | }
 41 | ScopeProfile::~ScopeProfile() {
 42 |   // A null thread_ means Start() never ran, so there is nothing to report.
 43 |   if (thread_) {
 44 |     finishing_.store(true);  // Ask ThreadFunc to leave its sampling loop.
 45 |     thread_->join();
 46 |     Finish();
 47 |   }
 48 | }
 49 | 
 50 | void ScopeProfile::Start() {  // Claims the running flag, then starts sampling.
 51 |   {
 52 |     std::lock_guard<std::mutex> lock(*detail::GlobalsMutex());
 53 |     if (detail::GlobalIsProfilerRunning()) {
 54 |       fprintf(stderr, "FATAL: profiler already running!\n");
 55 |       abort();
 56 |     }
 57 |     detail::GlobalIsProfilerRunning() = true;
 58 |   }
 59 |   finishing_.store(false);
 60 |   thread_.reset(new std::thread([this] { ThreadFunc(); }));
 61 | }
 62 | 
 63 | void ScopeProfile::ThreadFunc() {
 64 |   while (!finishing_.load()) {
 65 |     std::this_thread::sleep_for(std::chrono::milliseconds(1));
 66 |     // The globals lock keeps the registry of thread stacks stable meanwhile.
 67 |     std::lock_guard<std::mutex> lock(*detail::GlobalsMutex());
 68 |     for (detail::ThreadStack* registered : *detail::GlobalAllThreadStacks()) {
 69 |       Sample(*registered);
 70 |     }
 71 |   }
 72 | }
 73 | 
 74 | void ScopeProfile::Sample(const detail::ThreadStack& thread_stack) {
 75 |   std::lock_guard lock(thread_stack.Mutex());
 76 |   const auto& stack = thread_stack.stack();
 77 |   // Threads with nothing on their stack are skipped so that uninteresting
 78 |   // threads do not pollute the profile.
 79 |   if (stack.size == 0) {
 80 |     return;
 81 |   }
 82 |   // Append the serialized stack at the current end of the sample buffer.
 83 |   const int write_offset = samples_buf_.size();
 84 |   samples_buf_.resize(write_offset + detail::GetBufferSize(stack));
 85 |   detail::CopyToBuffer(stack, samples_buf_.data() + write_offset);
 86 | }
 87 | 
 88 | void ScopeProfile::Finish() {  // Releases the running flag, then reports.
 89 |   {
 90 |     std::lock_guard<std::mutex> lock(*detail::GlobalsMutex());
 91 |     if (!detail::GlobalIsProfilerRunning()) {
 92 |       fprintf(stderr, "FATAL: profiler is not running!\n");
 93 |       abort();
 94 |     }
 95 |     detail::GlobalIsProfilerRunning() = false;
 96 |   }
 97 |   if (!user_treeview_) {  // No user-supplied tree: print a temporary one.
 98 |     TreeView treeview;
 99 |     treeview.Populate(samples_buf_);
100 |     Print(treeview);
101 |     return;
102 |   }
103 |   user_treeview_->Populate(samples_buf_);
104 | }
105 | 
106 | #endif  // RUY_PROFILER
107 | 
108 | }  // namespace profiler
109 | }  // namespace ruy
110 | 
--------------------------------------------------------------------------------
/ruy/profiler/profiler.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2020 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #ifndef RUY_RUY_PROFILER_PROFILER_H_
 17 | #define RUY_RUY_PROFILER_PROFILER_H_
 18 | 
 19 | #include 
 20 | 
 21 | #ifdef RUY_PROFILER
 22 | #include 
 23 | #include 
 24 | #include 
 25 | #include 
 26 | #endif
 27 | 
 28 | #include "ruy/profiler/instrumentation.h"
 29 | #include "ruy/profiler/treeview.h"
 30 | 
 31 | namespace ruy {
 32 | namespace profiler {
 33 | 
 34 | #ifdef RUY_PROFILER
 35 | 
 36 | // RAII user-facing way to create a profiler and let it profile a code scope,
 37 | // and print out an ASCII/MarkDown treeview upon leaving the scope.
 38 | class ScopeProfile {
 39 |  public:
 40 |   // Default constructor, unconditionally profiling.
 41 |   ScopeProfile();
 42 | 
 43 |   // Constructor that lets the caller choose at runtime whether to profile.
 44 |   explicit ScopeProfile(bool enable);
 45 | 
 46 |   // Destructor. It's where the profile is reported.
 47 |   ~ScopeProfile();
 48 | 
 49 |   // See user_treeview_ member.
 50 |   void SetUserTreeView(TreeView* treeview) { user_treeview_ = treeview; }
 51 | 
 52 |  private:
 53 |   void Start();  // Begins profiling; aborts if one is already running.
 54 | 
 55 |   // Thread entry point function for the profiler thread. This thread is
 56 |   // created by Start(), i.e. only when profiling is enabled.
 57 |   void ThreadFunc();
 58 | 
 59 |   // Record a stack as a sample.
 60 |   void Sample(const detail::ThreadStack& stack);
 61 | 
 62 |   // Finalize the profile. Called on destruction.
 63 |   // If user_treeview_ is non-null, it will receive the treeview.
 64 |   // Otherwise the treeview will just be printed.
 65 |   void Finish();
 66 | 
 67 |   // Buffer where samples are recorded during profiling.
 68 |   std::vector<char> samples_buf_;
 69 | 
 70 |   // Used to synchronize thread termination.
 71 |   std::atomic<bool> finishing_;
 72 | 
 73 |   // Underlying profiler thread, which will perform the sampling.
 74 |   // This profiler approach relies on a thread rather than on signals.
 75 |   std::unique_ptr<std::thread> thread_;
 76 | 
 77 |   // TreeView to populate upon destruction. If left null (the default),
 78 |   // a temporary treeview will be used and dumped on stdout. The user
 79 |   // may override that by passing their own TreeView object for other
 80 |   // output options or to directly inspect the TreeView.
 81 |   TreeView* user_treeview_ = nullptr;
 82 | };
 83 | 
 84 | #else  // no RUY_PROFILER
 85 | 
 86 | struct ScopeProfile {  // No-op stand-in used when RUY_PROFILER is undefined.
 87 |   ScopeProfile() {
 88 | #ifdef GEMMLOWP_PROFILING  // Legacy macro: warn on stderr that it is obsolete.
 89 |     fprintf(
 90 |         stderr,
 91 |         "\n\n\n**********\n\nWARNING:\n\nLooks like you defined "
 92 |         "GEMMLOWP_PROFILING, but this code has been ported to the new ruy "
 93 |         "profiler replacing the old gemmlowp profiler. You should now be "
 94 |         "defining RUY_PROFILER and not GEMMLOWP_PROFILING. When building using "
 95 |         "Bazel, just pass --define=ruy_profiler=true.\n\n**********\n\n\n");
 96 | #endif
 97 |   }
 98 |   explicit ScopeProfile(bool) {}  // The enable flag is ignored.
 99 | };
100 | 
101 | #endif
102 | 
103 | }  // namespace profiler
104 | }  // namespace ruy
105 | 
106 | #endif  // RUY_RUY_PROFILER_PROFILER_H_
107 | 
--------------------------------------------------------------------------------
/ruy/profiler/test_instrumented_library.cc:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #include 
17 | 
18 | #include "ruy/profiler/instrumentation.h"
19 | 
20 | namespace {
21 | 
22 | void MergeSortRecurse(int level, int size, int* data, int* workspace) {
23 |   ruy::profiler::ScopeLabel function_label(
24 |       "MergeSortRecurse (level=%d, size=%d)", level, size);
25 |   if (size <= 1) {
26 |     return;
27 |   }
28 |   // Sort each half recursively; each call gets its own slice of workspace.
29 |   const int half_size = size / 2;
30 |   int* const right_half = data + half_size;
31 |   MergeSortRecurse(level + 1, half_size, data, workspace);
32 |   MergeSortRecurse(level + 1, size - half_size, right_half,
33 |                    workspace + half_size);
34 | 
35 |   ruy::profiler::ScopeLabel merging_sorted_halves_label(
36 |       "Merging sorted halves");
37 |   // Merge data[0, half_size) and data[half_size, size) into workspace.
38 |   int left = 0;
39 |   int right = half_size;
40 |   for (int out = 0; out < size; out++) {
41 |     // Take from the left run while it has elements and either the right run
42 |     // is exhausted or the left head is strictly smaller.
43 |     const bool take_left =
44 |         left < half_size && (right >= size || data[left] < data[right]);
45 |     workspace[out] = take_left ? data[left++] : data[right++];
46 |   }
47 |   // Copy the merged run back over the input.
48 |   for (int i = 0; i < size; i++) {
49 |     data[i] = workspace[i];
50 |   }
51 | }
52 | 
53 | }  // namespace
54 | 
55 | void MergeSort(int size, int* data) {  // Sorts data[0..size) in place.
56 |   ruy::profiler::ScopeLabel function_label("MergeSort (size=%d)", size);
57 |   std::vector<int> workspace(size);  // Scratch buffer used by the merges.
58 |   MergeSortRecurse(0, size, data, workspace.data());
59 | }
60 | 
--------------------------------------------------------------------------------
/ruy/profiler/test_instrumented_library.h:
--------------------------------------------------------------------------------
 1 | /* Copyright 2020 Google LLC. All Rights Reserved.
 2 | 
 3 | Licensed under the Apache License, Version 2.0 (the "License");
 4 | you may not use this file except in compliance with the License.
 5 | You may obtain a copy of the License at
 6 | 
 7 |     http://www.apache.org/licenses/LICENSE-2.0
 8 | 
 9 | Unless required by applicable law or agreed to in writing, software
10 | distributed under the License is distributed on an "AS IS" BASIS,
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | See the License for the specific language governing permissions and
13 | limitations under the License.
14 | ==============================================================================*/
15 | 
16 | #ifndef RUY_RUY_PROFILER_TEST_INSTRUMENTED_LIBRARY_H_
17 | #define RUY_RUY_PROFILER_TEST_INSTRUMENTED_LIBRARY_H_
18 | 
19 | #include "ruy/profiler/instrumentation.h"
20 | 
21 | void MergeSort(int size, int* data);  // Sorts data[0..size) in place, ascending.
22 | 
23 | #endif  // RUY_RUY_PROFILER_TEST_INSTRUMENTED_LIBRARY_H_
24 | 
--------------------------------------------------------------------------------
/ruy/profiler/treeview.h:
--------------------------------------------------------------------------------
  1 | /* Copyright 2020 Google LLC. All Rights Reserved.
  2 | 
  3 | Licensed under the Apache License, Version 2.0 (the "License");
  4 | you may not use this file except in compliance with the License.
  5 | You may obtain a copy of the License at
  6 | 
  7 |     http://www.apache.org/licenses/LICENSE-2.0
  8 | 
  9 | Unless required by applicable law or agreed to in writing, software
 10 | distributed under the License is distributed on an "AS IS" BASIS,
 11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | See the License for the specific language governing permissions and
 13 | limitations under the License.
 14 | ==============================================================================*/
 15 | 
 16 | #ifndef RUY_RUY_PROFILER_TREEVIEW_H_
 17 | #define RUY_RUY_PROFILER_TREEVIEW_H_
 18 | 
 19 | #ifdef RUY_PROFILER
 20 | 
 21 | #include 
 22 | #include