├── .gitignore ├── .travis.yml ├── AUTHORS ├── BUILD ├── CONTRIBUTING ├── CONTRIBUTORS ├── LICENSE ├── Makefile.travis ├── README.md ├── WORKSPACE ├── contrib └── CMakeLists.txt ├── doc ├── design.md ├── kernel.md ├── less-than-8-bit.md ├── low-precision.md ├── output.md ├── packing.md ├── public.md ├── quantization.md └── quantization_example.cc ├── eight_bit_int_gemm ├── eight_bit_int_gemm.cc └── eight_bit_int_gemm.h ├── fixedpoint ├── fixedpoint.h ├── fixedpoint_avx.h ├── fixedpoint_msa.h ├── fixedpoint_neon.h ├── fixedpoint_sse.h └── fixedpoint_wasmsimd.h ├── flags.bzl ├── internal ├── allocator.h ├── block_params.h ├── common.h ├── compute.h ├── detect_platform.h ├── dispatch_gemm_shape.h ├── kernel.h ├── kernel_avx.h ├── kernel_default.h ├── kernel_msa.h ├── kernel_neon.h ├── kernel_reference.h ├── kernel_sse.h ├── multi_thread_gemm.h ├── output.h ├── output_avx.h ├── output_msa.h ├── output_neon.h ├── output_sse.h ├── pack.h ├── pack_avx.h ├── pack_msa.h ├── pack_neon.h ├── pack_sse.h ├── platform.h ├── simd_wrappers.h ├── simd_wrappers_common_neon_sse.h ├── simd_wrappers_msa.h ├── simd_wrappers_neon.h ├── simd_wrappers_sse.h ├── single_thread_gemm.h └── unpack.h ├── jni ├── Android.mk └── Application.mk ├── meta ├── README ├── base.h ├── generators │ ├── cc_emitter.py │ ├── common.py │ ├── metagemm_generate_headers.sh │ ├── neon_emitter.py │ ├── neon_emitter_64.py │ ├── quantized_mul_kernels_arm_32.py │ ├── quantized_mul_kernels_arm_64.py │ ├── quantized_mul_kernels_common.py │ ├── streams_arm_32.py │ ├── streams_arm_64.py │ ├── streams_common.py │ ├── transform_kernels_arm_32.py │ ├── transform_kernels_arm_64.py │ └── transform_kernels_common.py ├── legacy_multi_thread_common.h ├── legacy_multi_thread_gemm.h ├── legacy_multi_thread_gemv.h ├── legacy_operations_common.h ├── legacy_single_thread_gemm.h ├── multi_thread_common.h ├── multi_thread_gemm.h ├── multi_thread_transform.h ├── quantized_mul_kernels.h ├── quantized_mul_kernels_arm_32.h ├── quantized_mul_kernels_arm_64.h ├── single_thread_gemm.h ├── single_thread_transform.h ├── streams.h ├── streams_arm_32.h ├── streams_arm_64.h ├── test_gemm_correctness.cc ├── test_streams_correctness.cc ├── test_transform_benchmark.cc ├── test_transform_correctness.cc ├── transform_kernels.h ├── transform_kernels_arm_32.h └── transform_kernels_arm_64.h ├── profiling ├── instrumentation.h ├── profiler.h └── pthread_everywhere.h ├── public ├── bit_depth.h ├── gemmlowp.h ├── map.h └── output_stages.h ├── scripts ├── ci-before.sh ├── ci-test.sh └── test-android.sh ├── standalone ├── cache_counters.cc ├── encode.py └── neon-gemm-kernel-benchmark.cc ├── test ├── benchmark.cc ├── benchmark_all_sizes.cc ├── benchmark_meta_gemm.cc ├── correctness_meta_gemm.cc ├── ios │ ├── gemmlowp_test.xcodeproj │ │ └── project.pbxproj │ └── gemmlowp_test │ │ ├── AppDelegate.h │ │ ├── AppDelegate.mm │ │ ├── Base.lproj │ │ ├── LaunchScreen.xib │ │ └── Main.storyboard │ │ ├── Images.xcassets │ │ └── AppIcon.appiconset │ │ │ └── Contents.json │ │ ├── Info.plist │ │ ├── ViewController.h │ │ ├── ViewController.m │ │ └── main.m ├── test.cc ├── test.h ├── test_allocator.cc ├── test_blocking_counter.cc ├── test_data.cc ├── test_data.h ├── test_fixedpoint.cc └── test_math_helpers.cc └── todo ├── armv8-64bit-kernel-for-less-than-8-bit.txt ├── error-diffusion-experiments.txt ├── fast-gemv.txt ├── less-than-8-bit-without-requantization.txt ├── multi-threading-experiments.txt ├── neon-depth-major-sources-packing.txt ├── remove-default-template-param-values.txt └── 
x86-kernels.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.ii 3 | *.s 4 | **/.DS_Store 5 | ? 6 | ?? 7 | *binary* 8 | /.idea/ 9 | CMakeLists.txt 10 | /bazel-* 11 | cmake_build/ 12 | cmake_install/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | sudo: false 3 | 4 | jobs: 5 | include: 6 | - stage: build 7 | name: Android NDK 8 | language: android 9 | compiler: clang 10 | os: 11 | - linux 12 | env: 13 | - NDK_VERSION=r14b TEST=arm 14 | - TEST=x86 15 | android: 16 | components: 17 | - build-tools-22.0.1 18 | - android-22 19 | - ndk-bundle 20 | - sys-img-armeabi-v7a-android-22 21 | before_script: 22 | - ./scripts/ci-before.sh 23 | script: 24 | - ./scripts/ci-test.sh 25 | 26 | - name: Linux CMake(clang) 27 | os: linux 28 | dist: bionic 29 | language: cpp 30 | compiler: clang 31 | script: 32 | - cmake -S contrib -B cmake_build -DCMAKE_INSTALL_PREFIX=cmake_install 33 | - cmake --build cmake_build 34 | - cmake --build cmake_build --target install 35 | - ctest --test-dir cmake_build --output-on-failure --output-junit TEST-${TRAVIS_COMMIT}.xml 36 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the official list of gemmlowp authors for copyright purposes. 2 | # This file is distinct from the CONTRIBUTORS.txt file. 3 | # See the latter for an explanation. 4 | 5 | # Names should be added to this file as: 6 | # Name or Organization 7 | # The email address is not required for organizations. 8 | 9 | Google Inc. 10 | Intel Corporation 11 | ARM Ltd. 12 | Silk Labs Inc. 13 | MIPS Tech LLC 14 | Wave Computing Inc. 15 | -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | # 2 | # Description: 3 | # gemmlowp is a small self-contained low-precision GEMM library. 
4 | # https://github.com/google/gemmlowp 5 | 6 | licenses(["notice"]) # Apache 2.0 7 | 8 | exports_files(["LICENSE"]) 9 | 10 | config_setting( 11 | name = "windows", 12 | values = { 13 | "cpu": "x64_windows", 14 | }, 15 | ) 16 | 17 | config_setting( 18 | name = "android", 19 | values = { 20 | "crosstool_top": "//external:android/crosstool", 21 | }, 22 | ) 23 | 24 | load(":flags.bzl", "LIB_COPTS", "LIB_LINKOPTS", "BIN_LINKOPTS") 25 | 26 | filegroup( 27 | name = "gemmlowp_private_headers", 28 | srcs = glob([ 29 | "fixedpoint/*.h", 30 | "internal/*.h", 31 | ]), 32 | visibility = ["//visibility:private"], 33 | ) 34 | 35 | filegroup( 36 | name = "gemmlowp_public_headers", 37 | srcs = glob([ 38 | "meta/*.h", 39 | "public/*.h", 40 | "profiling/*.h", 41 | ]), 42 | visibility = ["//visibility:public"], 43 | ) 44 | 45 | filegroup( 46 | name = "gemmlowp_headers", 47 | srcs = [ 48 | ":gemmlowp_private_headers", 49 | ":gemmlowp_public_headers", 50 | ], 51 | visibility = ["//visibility:private"], 52 | ) 53 | 54 | filegroup( 55 | name = "eight_bit_int_gemm_headers", 56 | srcs = glob(["eight_bit_int_gemm/*.h"]), 57 | visibility = ["//visibility:private"], 58 | ) 59 | 60 | filegroup( 61 | name = "eight_bit_int_gemm_public_headers", 62 | srcs = [ 63 | ":eight_bit_int_gemm_headers", 64 | ":gemmlowp_public_headers", 65 | ], 66 | visibility = ["//visibility:public"], 67 | ) 68 | 69 | filegroup( 70 | name = "eight_bit_int_gemm_sources_with_no_headers", 71 | srcs = glob(["eight_bit_int_gemm/*.cc"]), 72 | visibility = ["//visibility:private"], 73 | ) 74 | 75 | filegroup( 76 | name = "eight_bit_int_gemm_sources", 77 | srcs = [ 78 | ":eight_bit_int_gemm_headers", 79 | ":eight_bit_int_gemm_sources_with_no_headers", 80 | ":gemmlowp_headers", 81 | ], 82 | visibility = ["//visibility:public"], 83 | ) 84 | 85 | filegroup( 86 | name = "gemmlowp_test_headers", 87 | srcs = [":gemmlowp_headers"] + glob(["test/*.h"]), 88 | visibility = ["//visibility:private"], 89 | ) 90 | 91 | filegroup( 92 | name = "fixedpoint_private_headers", 93 | srcs = glob([ 94 | "fixedpoint/*.h", 95 | ]) + [ 96 | "internal/common.h", 97 | "internal/detect_platform.h", 98 | ], 99 | visibility = ["//visibility:private"], 100 | ) 101 | 102 | cc_library( 103 | name = "fixedpoint", 104 | srcs = [ 105 | ":fixedpoint_private_headers", 106 | ], 107 | hdrs = [ 108 | "fixedpoint/fixedpoint.h", 109 | ], 110 | # Blaze warning: 111 | # "setting 'linkstatic=1' is recommended if there are no object files." 112 | linkstatic = 1, 113 | visibility = ["//visibility:public"], 114 | ) 115 | 116 | cc_library( 117 | name = "gemmlowp", 118 | hdrs = [":gemmlowp_headers"], 119 | linkopts = LIB_LINKOPTS, 120 | # Blaze warning: 121 | # "setting 'linkstatic=1' is recommended if there are no object files." 
122 | linkstatic = 1, 123 | visibility = ["//visibility:public"], 124 | deps = [":fixedpoint"], 125 | ) 126 | 127 | cc_library( 128 | name = "eight_bit_int_gemm", 129 | srcs = [":eight_bit_int_gemm_sources_with_no_headers"], 130 | hdrs = [ 131 | ":eight_bit_int_gemm_headers", 132 | ":gemmlowp_private_headers", 133 | ":gemmlowp_public_headers", 134 | ], 135 | copts = LIB_COPTS, 136 | linkopts = LIB_LINKOPTS, 137 | visibility = ["//visibility:public"], 138 | deps = [":gemmlowp"], 139 | ) 140 | 141 | cc_library( 142 | name = "profiler", 143 | hdrs = [ 144 | "profiling/instrumentation.h", 145 | "profiling/profiler.h", 146 | "profiling/pthread_everywhere.h", 147 | ], 148 | visibility = ["//visibility:public"], 149 | ) 150 | 151 | # The main gemmlowp unit test 152 | cc_test( 153 | name = "test", 154 | size = "medium", 155 | srcs = [ 156 | "test/test.cc", 157 | "test/test_data.cc", 158 | ":gemmlowp_test_headers", 159 | ], 160 | copts = ["-O3"], 161 | deps = [":eight_bit_int_gemm"], 162 | ) 163 | 164 | # Math helpers test 165 | cc_test( 166 | name = "test_math_helpers", 167 | size = "small", 168 | srcs = [ 169 | "test/test_math_helpers.cc", 170 | ":gemmlowp_test_headers", 171 | ], 172 | ) 173 | 174 | # BlockingCounter test 175 | cc_test( 176 | name = "test_blocking_counter", 177 | size = "medium", 178 | srcs = [ 179 | "test/test_blocking_counter.cc", 180 | ":gemmlowp_test_headers", 181 | ], 182 | linkopts = BIN_LINKOPTS, 183 | ) 184 | 185 | # Allocator test 186 | cc_test( 187 | name = "test_allocator", 188 | size = "small", 189 | srcs = [ 190 | "test/test_allocator.cc", 191 | ":gemmlowp_test_headers", 192 | ], 193 | ) 194 | 195 | # FixedPoint test 196 | cc_test( 197 | name = "test_fixedpoint", 198 | size = "small", 199 | srcs = [ 200 | "test/test_fixedpoint.cc", 201 | ":gemmlowp_test_headers", 202 | ], 203 | ) 204 | 205 | # Benchmark 206 | cc_binary( 207 | name = "benchmark", 208 | srcs = [ 209 | "test/benchmark.cc", 210 | ":gemmlowp_test_headers", 211 | ], 212 | copts = [ 213 | "-O3", 214 | "-DNDEBUG", 215 | ], 216 | linkopts = BIN_LINKOPTS, 217 | ) 218 | 219 | # Benchmark 220 | cc_binary( 221 | name = "benchmark_profile", 222 | srcs = [ 223 | "test/benchmark.cc", 224 | ":gemmlowp_test_headers", 225 | ], 226 | copts = [ 227 | "-O3", 228 | "-DNDEBUG", 229 | "-DGEMMLOWP_TEST_PROFILE", 230 | ], 231 | linkopts = BIN_LINKOPTS, 232 | ) 233 | -------------------------------------------------------------------------------- /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at the end). 2 | 3 | 4 | Before you contribute 5 | ===================== 6 | 7 | Before we can use your code, you must sign the Google Individual Contributor 8 | License Agreement (CLA), 9 | 10 | https://developers.google.com/open-source/cla/individual?csw=1 11 | 12 | which you can do online. The CLA is necessary mainly because you own the 13 | copyright to your changes, even after your contribution becomes part of our 14 | codebase, so we need your permission to use and distribute your code. We also 15 | need to be sure of various other things—for instance that you'll tell us if you 16 | know that your code infringes on other people's patents. You don't have to sign 17 | the CLA until after you've submitted your code for review and a member has 18 | approved it, but you must do it before we can put your code into our codebase. 
19 | Before you start working on a larger contribution, you should get in touch with 20 | us first through the issue tracker with your idea so that we can help out and 21 | possibly guide you. Coordinating up front makes it much easier to avoid 22 | frustration later on. 23 | 24 | 25 | Getting in touch with the gemmlowp community 26 | ============================================ 27 | 28 | The central point of communication around gemmlowp is the mailing list, 29 | https://groups.google.com/forum/#!forum/gemmlowp 30 | 31 | 32 | TODO items and projects 33 | ======================= 34 | 35 | We try to keep a current list of TODO items in the todo/ directory. 36 | Please feel free to pick one to work on, and to ask current maintainers for 37 | guidance. The gemmlowp mailing list is a good place for that. 38 | 39 | 40 | Code reviews 41 | ============ 42 | 43 | All submissions, including submissions by project members, require review. 44 | For this purpose, we use Github pull requests against this repository: 45 | 46 | https://github.com/google/gemmlowp 47 | 48 | 49 | The small print 50 | =============== 51 | 52 | Contributions made by corporations are covered by a different agreement than 53 | the one above, the Software Grant and Corporate Contributor License Agreement. 54 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | # People who have agreed to one of the CLAs and can contribute patches. 2 | # The AUTHORS.txt file lists the copyright holders; this file 3 | # lists people. For example, Google employees are listed here 4 | # but not in AUTHORS.txt, because Google holds the copyright. 5 | # 6 | # https://developers.google.com/open-source/cla/individual 7 | # https://developers.google.com/open-source/cla/corporate 8 | # 9 | # Names should be added to this file as: 10 | # Name 11 | 12 | Google: 13 | Benoit Jacob 14 | Pete Warden 15 | Miao Wang 16 | David Andersen 17 | Maciek Chociej 18 | Justine Tunney 19 | Mark J. 
Matthews 20 | Marie White 21 | Suharsh Sivakumar 22 | 23 | Intel: 24 | Sagi Marcovich 25 | Murat Efe Guney 26 | Sarah Knepper 27 | Mourad Gouicem 28 | Richard Winterton 29 | 30 | ARM: 31 | David Mansell 32 | 33 | Silk Labs: 34 | Andreas Gal 35 | 36 | MIPS Tech LLC: 37 | Alexey Frunze 38 | 39 | Wave Computing Inc.: 40 | Alexey Frunze 41 | -------------------------------------------------------------------------------- /Makefile.travis: -------------------------------------------------------------------------------- 1 | UNITTESTS_COMMON=test.cc test_allocator.cc test_blocking_counter.cc test_fixedpoint.cc test_math_helpers.cc 2 | UNITTESTS_X86=$(UNITTESTS_COMMON) 3 | 4 | UNITTESTS_X86_BIN=$(addprefix ./test/, $(addsuffix .x86, $(basename $(UNITTESTS_X86)))) 5 | UNITTESTS_BIN=$(UNITTESTS_X86_BIN) 6 | 7 | VPATH=./test ./public 8 | 9 | space := 10 | space += 11 | join-with = $(subst $(space),$1,$(strip $2)) 12 | 13 | .PHONY: compile clean unittest 14 | 15 | CC_X86=clang++ 16 | CFLAGS_X86=-march=native -O3 -lpthread 17 | 18 | compile: $(UNITTESTS_BIN) 19 | 20 | clean: 21 | rm -f $(UNITTESTS_BIN) 22 | 23 | unittest: $(UNITTESTS_BIN) 24 | $(call join-with, && ,$(addprefix ./, $^)) 25 | 26 | %.x86: %.cc ./eight_bit_int_gemm/eight_bit_int_gemm.cc ./test/test_data.cc 27 | $(CC_X86) $(CFLAGS_X86) -std=c++11 -g -O3 -o $@ $^ 28 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/gemmlowp/16e8662c34917be0065110bfcd9cc27d30f52fdf/WORKSPACE -------------------------------------------------------------------------------- /contrib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Gemmlowp CMake file written for Debian. 2 | # Copyright © 2016 Zhou Mo 3 | # Licence Apache-2.0 4 | 5 | cmake_minimum_required(VERSION 3.7) 6 | 7 | # Project 8 | project(gemmlowp C CXX) 9 | 10 | include(CTest) # option(BUILD_TESTING). ON by default. 
11 | include(GNUInstallDirs) 12 | 13 | # Set C++11 as default standard 14 | set(CMAKE_CXX_STANDARD 11) 15 | 16 | set(THREADS_PREFER_PTHREAD_FLAG ON) 17 | find_package(Threads REQUIRED) 18 | 19 | get_filename_component(gemmlowp_src ${gemmlowp_SOURCE_DIR} PATH) 20 | 21 | if(WIN32) 22 | # one can enable simd from the cmake command line, e.g. -DCMAKE_CXX_FLAGS="/arch:AVX2" 23 | add_definitions(-DNOMINMAX -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI) 24 | add_definitions(/bigobj /nologo /EHsc /GF /MP /Gm- /wd4800 /wd4805 /wd4244) 25 | if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") 26 | # if we compile for windows with clang, allow inline asm 27 | add_definitions(-DGEMMLOWP_ALLOW_INLINE_ASM) 28 | endif() 29 | else() 30 | set(EXTERNAL_LIBRARIES Threads::Threads) 31 | endif() 32 | 33 | # Glob header files 34 | file(GLOB gemmlowp_private_headers "${gemmlowp_src}/fixedpoint/*.h" "${gemmlowp_src}/internal/*.h") 35 | file(GLOB gemmlowp_public_headers "${gemmlowp_src}/meta/*.h" "${gemmlowp_src}/public/*.h" "${gemmlowp_src}/profiling/*.h") 36 | list(APPEND gemmlowp_headers ${gemmlowp_private_headers} ${gemmlowp_public_headers}) 37 | 38 | file(GLOB eight_bit_int_gemm_headers "${gemmlowp_src}/eight_bit_int_gemm/*.h") 39 | list(APPEND eight_bit_int_gemm_public_headers ${eight_bit_int_gemm_headers} ${gemmlowp_public_headers}) 40 | file(GLOB eight_bit_int_gemm_sources_with_no_headers "${gemmlowp_src}/eight_bit_int_gemm/*.cc") 41 | 42 | list(APPEND eight_bit_int_gemm_sources 43 | ${eight_bit_int_gemm_headers} 44 | ${eight_bit_int_gemm_sources_with_no_headers} 45 | ${gemmlowp_headers}) 46 | 47 | file(GLOB gemmlowp_test_headers "${gemmlowp_src}/test/*.h") 48 | list(APPEND gemmlowp_test_headers ${gemmlowp_headers}) 49 | 50 | file(GLOB fixedpoint_private_headers "${gemmlowp_src}/fixedpoint/*.h") 51 | list(APPEND fixedpoint_private_headers "${gemmlowp_src}/internal/common.h") 52 | 53 | add_library(eight_bit_int_gemm ${eight_bit_int_gemm_sources_with_no_headers}) 54 | set_target_properties(eight_bit_int_gemm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) 55 | target_link_libraries(eight_bit_int_gemm ${EXTERNAL_LIBRARIES}) 56 | 57 | # INTERFACE target to help header include 58 | add_library(gemmlowp INTERFACE) 59 | target_include_directories(gemmlowp INTERFACE 60 | $<BUILD_INTERFACE:${gemmlowp_src}> 61 | $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) 62 | target_link_libraries(gemmlowp INTERFACE eight_bit_int_gemm) 63 | 64 | install(FILES ${eight_bit_int_gemm_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/eight_bit_int_gemm) 65 | file(GLOB meta_headers "${gemmlowp_src}/meta/*.h") 66 | install(FILES ${meta_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/meta) 67 | file(GLOB public_headers "${gemmlowp_src}/public/*.h") 68 | install(FILES ${public_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/public) 69 | file(GLOB profile_headers "${gemmlowp_src}/profiling/*.h") 70 | install(FILES ${profile_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/profiling) 71 | file(GLOB internal_headers "${gemmlowp_src}/internal/*.h") 72 | install(FILES ${internal_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/internal) 73 | file(GLOB fixedpoint_headers "${gemmlowp_src}/fixedpoint/*.h") 74 | install(FILES ${fixedpoint_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/fixedpoint) 75 | 76 | install(TARGETS gemmlowp eight_bit_int_gemm 77 | EXPORT gemmlowp-config # support find_package(gemmlowp CONFIG) 78 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 79 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 80 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) 81 | 82 | install(EXPORT
gemmlowp-config # export gemmlowp::gemmlowp 83 | NAMESPACE gemmlowp:: # gemmlowp::eight_bit_int_gemm 84 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/gemmlowp) 85 | 86 | if(BUILD_TESTING) 87 | # Benchmarks 88 | add_executable(benchmark 89 | "${gemmlowp_src}/test/benchmark.cc" ${gemmlowp_test_headers}) 90 | target_link_libraries(benchmark ${EXTERNAL_LIBRARIES}) 91 | 92 | add_executable(benchmark_all_sizes 93 | "${gemmlowp_src}/test/benchmark_all_sizes.cc" ${gemmlowp_test_headers}) 94 | target_compile_options(benchmark_all_sizes PRIVATE -DBENCHMARK_8bit -DBENCHMARK_QUICK) 95 | target_link_libraries(benchmark_all_sizes ${EXTERNAL_LIBRARIES}) 96 | 97 | # Gemmlowp test 98 | add_executable(test_gemmlowp 99 | "${gemmlowp_src}/test/test.cc" "${gemmlowp_src}/test/test_data.cc" ${gemmlowp_test_headers}) 100 | target_link_libraries(test_gemmlowp eight_bit_int_gemm) 101 | 102 | # Math helpers test 103 | add_executable(test_math_helpers 104 | "${gemmlowp_src}/test/test_math_helpers.cc" ${gemmlowp_test_headers}) 105 | 106 | # BlockingCounter test 107 | add_executable(test_blocking_counter 108 | "${gemmlowp_src}/test/test_blocking_counter.cc" ${gemmlowp_test_headers}) 109 | target_link_libraries(test_blocking_counter ${EXTERNAL_LIBRARIES}) 110 | 111 | # Allocator test 112 | add_executable(test_allocator 113 | "${gemmlowp_src}/test/test_allocator.cc" ${gemmlowp_test_headers}) 114 | 115 | # FixedPoint test 116 | add_executable(test_fixedpoint 117 | "${gemmlowp_src}/test/test_fixedpoint.cc" ${gemmlowp_test_headers}) 118 | 119 | # Add tests 120 | enable_testing() 121 | foreach(testname "test_math_helpers" "test_blocking_counter" "test_allocator" "test_fixedpoint" "test_gemmlowp") 122 | add_test(NAME ${testname} COMMAND "${testname}") 123 | endforeach(testname) 124 | endif() 125 | -------------------------------------------------------------------------------- /doc/design.md: -------------------------------------------------------------------------------- 1 | # Overview of gemmlowp design 2 | 3 | ## Primer on GEMM, kernels, and cache friendliness 4 | 5 | gemmlowp, like most GEMMs, implements the straightforward matrix multiplication 6 | algorithm, which takes n^3 multiply-accumulate instructions for n*n sized 7 | matrices. Because the arithmetic complexity grows quicker than the memory 8 | complexity (n^3 vs. n^2), memory accesses are redundant (each matrix entry is 9 | accessed n times). A large part of a GEMM's performance and design goes toward 10 | minimizing the inefficiency resulting from these redundant memory accesses. 11 | 12 | Ultimately, once values are loaded into CPU registers, they cost nothing to 13 | access, so as long as we can work within registers, this problem doesn't exist. 14 | Thus, in order to be efficient, a GEMM's inner loops must wisely use the 15 | available registers to do as much arithmetic work as possible before loading 16 | more data from memory into registers. This means that a GEMM implementation 17 | needs to have architecture-specific inner loops tailored for architecture 18 | details such as the number of registers, and typically written in assembly. This 19 | 'inner loops' architecture-specific component is referred to as the GEMM kernel. 20 | (More details about kernels are in [kernel.md](kernel.md)). 21 | 22 | However, only small blocks can fit at a given time in registers, so at larger 23 | scales one needs to repeatedly load blocks of matrices from memory, and these 24 | accesses are redundant for the reason outlined above. 
The way that one minimizes 25 | the resulting inefficiency is by organizing for cache locality, so that most of 26 | these accesses hit the L1 cache, and most of the remaining ones hit the L2 27 | cache, etc. 28 | 29 | This is achieved by subdividing the matrices into blocks sized to fit in L2 30 | cache, and subdividing these blocks into sub-blocks sized to fit in L1 cache, 31 | and performing the matrix multiplication one such block at a time. 32 | 33 | In practice, it tends to pay off to "pack" input blocks for optimally efficient 34 | traversal by the kernel, since they will be traversed multiple times. "packing" 35 | means at least reordering the data layout for 1) simple access patterns that fit 36 | the CPU's cache behavior (in particular, the cache line size), and 2) simple 37 | loading into SIMD vector registers by the kernel. 38 | 39 | So a typical GEMM, in pseudo-code, tends to look like this: 40 | 41 | ``` 42 | allocate(some_lhs_L2_block); 43 | allocate(some_rhs_L2_block); 44 | for (some_lhs_L2_block) { 45 | pack(some_lhs_L2_block); 46 | for (some_rhs_L2_block) { 47 | pack(some_rhs_L2_block); 48 | for (some_lhs_sub_block in some_lhs_L2_block) { 49 | for (some_rhs_sub_block in some_rhs_L2_block) { 50 | kernel(some_lhs_sub_block, some_rhs_sub_block); 51 | } 52 | } 53 | } 54 | } 55 | ``` 56 | 57 | ## Impact of low-precision computation on gemmlowp design 58 | 59 | Refer to [low-precision.md](low-precision.md) for specifics of the 60 | low-precision-computation paradigm and how it's implemented in gemmlowp. 61 | 62 | Inputs and outputs are matrices of uint8 values, but internally we are 63 | accumulating int32 values, only converting them back to uint8 at the end. This 64 | means that we need to store a block of int32 accumulators at a time. We compute 65 | a block of the result in int32 accumulators and then we "unpack" it into the 66 | destination matrix at once. In this way, we minimize the amount of memory used 67 | to store int32 values at a given time. 68 | 69 | Because of that, besides the "pack" and "kernel" stages outlined above, a third 70 | stage is needed in gemmlowp, which we call "unpack". Thus we arrive at the 71 | 3-stage computation scheme that gemmlowp uses: 72 | 73 | 1. Pack lhs/rhs blocks from the input matrices. 74 | 2. Compute the product of the packed blocks, using the kernel. 75 | 3. Unpack the result block into the output matrix.
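To make the "unpack" stage concrete, here is a minimal sketch of the per-entry arithmetic it performs under the legacy quantization scheme, using the `result_offset`, `result_mult_int` and `result_shift` parameters that appear in the code excerpt further below. This is only an illustration, not gemmlowp's actual implementation, which is generic over output pipelines; see [quantization.md](quantization.md) for the full treatment:

```
#include <algorithm>
#include <cstdint>

// Sketch: requantize one int32 accumulator down to a uint8 destination value.
std::uint8_t UnpackOneEntry(std::int32_t accum, std::int32_t result_offset,
                            std::int32_t result_mult_int,
                            std::int32_t result_shift) {
  // Round to nearest on the right shift.
  const std::int32_t rounding =
      (result_shift < 1) ? 0 : (1 << (result_shift - 1));
  const std::int32_t value =
      ((accum + result_offset) * result_mult_int + rounding) >> result_shift;
  // Saturate to the representable uint8 range.
  return static_cast<std::uint8_t>(std::min(255, std::max(0, value)));
}
```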
76 | 77 | The pseudo-code overview of gemmlowp now looks like: 78 | 79 | ``` 80 | allocate(some_lhs_L2_block); 81 | allocate(some_rhs_L2_block); 82 | // new: temp storage for int32 accums 83 | allocate(some_int32_accumulators_block); 84 | for (some_lhs_L2_block) { 85 | pack(some_lhs_L2_block); 86 | for (some_rhs_L2_block) { 87 | pack(some_rhs_L2_block); 88 | for (some_lhs_sub_block in some_lhs_L2_block) { 89 | for (some_rhs_sub_block in some_rhs_L2_block) { 90 | // new: pass int32 accums to kernel 91 | kernel(&some_int32_accumulators_block, 92 | some_lhs_sub_block, 93 | some_rhs_sub_block); 94 | } 95 | } 96 | // new: unpack int32 accums into destination matrix 97 | unpack(some_int32_accumulators_block); 98 | } 99 | } 100 | ``` 101 | 102 | ## Exploring gemmlowp code 103 | 104 | The design outlined above can be readily matched to gemmlowp source code, in 105 | particular in this file, which gives a simple GEMM implementation fitting in one 106 | rather small function: 107 | 108 | ``` 109 | internal/single_thread_gemm.h 110 | ``` 111 | 112 | The reader can compare the above pseudo-code to the actual code in this file: 113 | 114 | ``` 115 | for (int r = 0; r < rows; r += block_params.l2_rows) { 116 | int rs = std::min(block_params.l2_rows, rows - r); 117 | 118 | PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); 119 | 120 | for (int c = 0; c < cols; c += block_params.l2_cols) { 121 | int cs = std::min(block_params.l2_cols, cols - c); 122 | 123 | if (!pack_rhs_once) { 124 | PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); 125 | } 126 | 127 | Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs); 128 | 129 | auto result_block = result->block(r, c, rs, cs); 130 | UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth, 131 | result_offset, result_mult_int, result_shift); 132 | } 133 | } 134 | ``` 135 | 136 | The files in `internal/` fall into a few categories: 137 | 138 | There are two top-level GEMM implementations, 139 | 140 | * [internal/single_thread_gemm.h](../internal/single_thread_gemm.h) 141 | * [internal/multi_thread_gemm.h](../internal/multi_thread_gemm.h) 142 | 143 | They both call into pack/compute/unpack stages (see [kernel.md](kernel.md) and 144 | [packing.md](packing.md)) implemented in the following files: 145 | 146 | * [internal/pack.h](../internal/pack.h) 147 | * [internal/compute.h](../internal/compute.h) 148 | * [internal/unpack.h](../internal/unpack.h) 149 | * This in turn calls into [internal/output.h](../internal/output.h) for 150 | the output pipeline (see [output.md](output.md)) 151 | 152 | The pack.h and unpack.h files contain generic templated code that can be 153 | overridden by optimized code in template specializations; for example, see the 154 | NEON optimized code here: 155 | 156 | * [internal/pack_neon.h](../internal/pack_neon.h) 157 | * [internal/unpack_neon.h](../internal/unpack_neon.h) 158 | * This in turn calls into 159 | [internal/output_neon.h](../internal/output_neon.h) 160 | 161 | The compute stage contains generic code in compute.h that only calls into 162 | optimized code through the Kernel::Run() entry point. 
Each kernel is basically 163 | just a struct offering a Run() implementation; see the NEON kernels in: 164 | 165 | * [internal/kernel_neon.h](../internal/kernel_neon.h) 166 | -------------------------------------------------------------------------------- /doc/output.md: -------------------------------------------------------------------------------- 1 | # Output pipelines in gemmlowp 2 | 3 | In gemmlowp, the "output pipeline" is the process that takes a final `int32` 4 | accumulator value (the output of the compute/kernel stage), and processes it to 5 | obtain the final value (typically a `uint8` value) and write it to the 6 | destination matrix. 7 | 8 | Gemmlowp has some genericity in what arithmetic transformations take place in 9 | the output pipeline, so as to allow different users to implement different 10 | quantization paradigms. See [low-precision.md](low-precision.md) and 11 | [quantization.md](quantization.md). 12 | 13 | Besides implementing a quantization paradigm, the other thing that output 14 | pipelines are good for is implementing fused operations, where a matrix 15 | multiplication feeds into other operations applied to its result, without 16 | additional array traversals. For instance, when implementing neural network 17 | inference, one might have a Convolutional layer with a bias-addition and an 18 | activation. One then wants to feed the result of the matrix multiplication 19 | implementing the Convolutional operator itself directly into the bias-addition 20 | and activation function. gemmlowp's output pipelines allow implementing that: 21 | the bias-addition and activation function are just additional stages in the 22 | output pipeline. 23 | 24 | ## Usage 25 | 26 | The gemmlowp entry point that allows using an arbitrary output pipeline is 27 | `GemmWithOutputPipeline` in [public/gemmlowp.h](../public/gemmlowp.h). 28 | 29 | The output pipeline is specified as a `std::tuple` of "output stages", each of 30 | which defines an elementary arithmetic transformation. 31 | 32 | All available output stages are defined in 33 | [public/output_stages.h](../public/output_stages.h). 34 | 35 | ## Example usage 36 | 37 | The best place to see examples of using various output pipelines is in the unit 38 | test, 39 | 40 | ``` 41 | test/test.cc 42 | ``` 43 | 44 | specifically in this function: 45 | 46 | ``` 47 | TestOutputStages 48 | ``` 49 | 50 | Separately, a self-contained example showing how to use gemmlowp to compute a 51 | quantized matrix multiplication with a sound quantization paradigm is here: 52 | 53 | [doc/quantization_example.cc](quantization_example.cc) 54 | -------------------------------------------------------------------------------- /eight_bit_int_gemm/eight_bit_int_gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // eight_bit_int_gemm.h: exposes the standard EightBitIntGemm interface. 16 | 17 | #ifndef GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ 18 | #define GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ 19 | 20 | #ifndef GEMMLOWP_USE_STLPORT 21 | #include <cstdint> 22 | #else 23 | #include <stdint.h> 24 | namespace std { 25 | using ::uint8_t; 26 | using ::int32_t; 27 | } 28 | #endif 29 | 30 | namespace gemmlowp { 31 | 32 | namespace eight_bit_int_gemm { 33 | 34 | // Concurrency / reentrancy notice 35 | // =============================== 36 | // 37 | // This eight_bit_int_gemm has global singleton persistent state. 38 | // A global lock ensures serialization of calls, so this library 39 | // is fully reentrant but only one calling thread gets to actually run 40 | // at a time, while other calling threads would wait. So it is safe 41 | // albeit potentially slow to call the functions exposed here on 42 | // multiple threads concurrently. 43 | // 44 | // Users who prefer a state-less, singleton-less interface 45 | // should use the main gemmlowp interface (public/gemmlowp.h) instead. 46 | 47 | // The BitDepthSetting enum lists supported a/b bit-depth combinations. 48 | enum class BitDepthSetting { 49 | A8B8, // 8-bit a, 8-bit b 50 | A5B7 // 5-bit a, 7-bit b 51 | }; 52 | 53 | // The main entry point to compute a Gemm. This is the standard 54 | // EightBitIntGemm interface. 55 | void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, 56 | int m, int n, int k, const std::uint8_t *a, 57 | std::int32_t a_offset, int lda, const std::uint8_t *b, 58 | std::int32_t b_offset, int ldb, std::uint8_t *c, 59 | std::int32_t c_offset, std::int32_t c_mult_int, 60 | std::int32_t c_shift, int ldc, BitDepthSetting bit_depth); 61 | 62 | void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, 63 | int m, int n, int k, const std::uint8_t *a, 64 | std::int32_t a_offset, int lda, const std::uint8_t *b, 65 | std::int32_t b_offset, int ldb, float *c, float c_offset, 66 | int ldc, BitDepthSetting bit_depth); 67 | 68 | // Frees any persistent resources 69 | // (threads, thread pools, allocators, buffers, ...) 70 | // that gemmlowp might hold. This is called automatically 71 | // on thread exit, but one may also call it earlier, at any time. 72 | void FreePersistentResources(); 73 | 74 | // Allows specifying the number of hardware threads, as a hint as to 75 | // how many worker threads to use for sufficiently large Gemm's. 76 | // We will never use more threads than that, but may use fewer, 77 | // for instance on Gemm's that are too small to benefit from all 78 | // available threads. The value 0 lets the implementation query 79 | // the system to determine the number of hardware threads. 80 | // Default value: 0. 81 | void SetMaxNumThreads(int n); 82 | 83 | } // namespace eight_bit_int_gemm 84 | 85 | } // namespace gemmlowp 86 | 87 | #endif // GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ 88 | -------------------------------------------------------------------------------- /flags.bzl: -------------------------------------------------------------------------------- 1 | # Android builds do not need to link in a separate pthread library.
2 | LIB_COPTS = [] 3 | 4 | LIB_LINKOPTS = select({ 5 | ":android": [], 6 | ":windows": [], 7 | "//conditions:default": ["-lpthread"], 8 | }) 9 | 10 | BIN_LINKOPTS = LIB_LINKOPTS 11 | 12 | -------------------------------------------------------------------------------- /internal/allocator.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // allocator.h: a buffer allocator that allows avoiding most of the 16 | // malloc/free overhead, by: 17 | // 1. Requiring all N allocations to be reserved in advance, and 18 | // then committed at once, turning N allocations into 1. 19 | // 2. Being persistent, the allocated storage is reused across commits, 20 | // and only reallocated as needed when the commit size gets larger. 21 | // 22 | // This is driven by Android-specific needs: 23 | // 1. On Android, the default (Bionic) allocator tends to aggressively 24 | // unmap pages, which means that malloc/free can be surprisingly expensive. 25 | // 2. On Android, stack allocations with alloca() can't be as large as on 26 | // desktop platforms. 27 | // 28 | // General usage: 29 | // 1. Reserve blocks by calling Reserve(), which returns a Handle. 30 | // 2. Call Commit() once. 31 | // 3. Now it is possible to get pointers to allocated buffers by calling 32 | // GetPointer(). 33 | // 4. Call Decommit() once. 34 | // 5. The allocator is now reverted to its original state, except that 35 | // it retained its allocated storage, so the next Commit() will be faster. 36 | // The allocated storage is only freed when the Allocator object is 37 | // destroyed.
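//
// For illustration, a sketch of the usage pattern described above (the
// buffer names here are hypothetical, not taken from an actual call site):
//
//   Allocator allocator;
//   Allocator::Handle lhs = allocator.Reserve<std::uint8_t>(packed_lhs_size);
//   Allocator::Handle acc = allocator.Reserve<std::int32_t>(accum_count);
//   allocator.Commit();  // one actual allocation covering both blocks
//   std::uint8_t* lhs_buf = allocator.GetPointer<std::uint8_t>(lhs);
//   std::int32_t* acc_buf = allocator.GetPointer<std::int32_t>(acc);
//   // ... use the buffers ...
//   allocator.Decommit();  // storage is retained for the next Commit()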
38 | 39 | #ifndef GEMMLOWP_INTERNAL_ALLOCATOR_H_ 40 | #define GEMMLOWP_INTERNAL_ALLOCATOR_H_ 41 | 42 | #include "common.h" 43 | 44 | namespace gemmlowp { 45 | 46 | enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 }; 47 | 48 | template <typename T> 49 | struct GetTypeIdImpl {}; 50 | 51 | template <typename T> 52 | inline TypeId GetTypeId() { 53 | return GetTypeIdImpl<T>::Value; 54 | } 55 | 56 | template <typename T> 57 | struct GetTypeIdImpl<const T> : GetTypeIdImpl<T> {}; 58 | 59 | #define GEMMLOWP_REGISTER_TYPEID(type_, id) \ 60 | template <> \ 61 | struct GetTypeIdImpl<type_> { \ 62 | static const TypeId Value = TypeId::id; \ 63 | }; 64 | 65 | GEMMLOWP_REGISTER_TYPEID(std::uint8_t, Uint8) 66 | GEMMLOWP_REGISTER_TYPEID(std::int8_t, Int8) 67 | GEMMLOWP_REGISTER_TYPEID(std::uint16_t, Uint16) 68 | GEMMLOWP_REGISTER_TYPEID(std::int16_t, Int16) 69 | GEMMLOWP_REGISTER_TYPEID(std::uint32_t, Uint32) 70 | GEMMLOWP_REGISTER_TYPEID(std::int32_t, Int32) 71 | 72 | class Allocator { 73 | public: 74 | Allocator() 75 | : committed_(false), 76 | storage_size_(0), 77 | storage_(nullptr), 78 | reserved_blocks_(0), 79 | reserved_bytes_(0), 80 | generation_(0) {} 81 | 82 | ~Allocator() { 83 | assert(!committed_); 84 | assert(!reserved_blocks_); 85 | DeallocateStorage(); 86 | } 87 | 88 | // Alignment of allocated blocks. 89 | static constexpr std::size_t kAlignment = kDefaultCacheLineSize; 90 | 91 | // This is all we need so far, and since the usage pattern is fixed, 92 | // there is no point in allowing more until we need to. 93 | static constexpr std::size_t kMaxBlocks = 5; 94 | 95 | void Commit() { 96 | assert(!committed_); 97 | 98 | if (reserved_bytes_ > storage_size_) { 99 | DeallocateStorage(); 100 | storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_); 101 | storage_ = aligned_alloc(kAlignment, storage_size_); 102 | } 103 | 104 | ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure"); 105 | committed_ = true; 106 | } 107 | 108 | void Decommit() { 109 | assert(committed_); 110 | committed_ = false; 111 | generation_++; 112 | 113 | reserved_blocks_ = 0; 114 | reserved_bytes_ = 0; 115 | } 116 | 117 | // See generation_ 118 | typedef std::size_t generation_t; 119 | 120 | // A handle on a reserved block. The user obtains 121 | // one by calling Reserve() and, after committing, 122 | // passes it to GetPointer(). 123 | class Handle { 124 | std::uint8_t index_; 125 | generation_t generation_; 126 | TypeId type_; 127 | 128 | friend class Allocator; 129 | }; 130 | 131 | // Reserves a block sized for n elements of type T, and 132 | // returns a handle to it. Must be called before committing. 133 | template <typename T> 134 | Handle Reserve(std::size_t n) { 135 | assert(!committed_ && "can't reserve blocks while committed"); 136 | assert(reserved_blocks_ < kMaxBlocks && 137 | "didn't expect to allocate this many blocks"); 138 | const std::size_t bytes = RoundUp<kAlignment>(n * sizeof(T)); 139 | const std::size_t offset = reserved_bytes_; 140 | const std::size_t index = reserved_blocks_; 141 | 142 | reserved_blocks_offsets_[index] = offset; 143 | Handle h; 144 | h.index_ = index; 145 | h.generation_ = generation_; 146 | h.type_ = GetTypeId<T>(); 147 | 148 | reserved_blocks_++; 149 | reserved_bytes_ += bytes; 150 | 151 | return h; 152 | } 153 | 154 | // Returns the pointer to the allocated buffer for the given handle. 155 | // Must be called after committing.
156 | template <typename T> 157 | T* GetPointer(const Handle& h) const { 158 | assert(committed_ && "can't get block pointers unless committed"); 159 | assert(h.index_ < reserved_blocks_ && 160 | "bad handle, points to nonexistent block"); 161 | assert(h.generation_ == generation_ && 162 | "handle from earlier generation, have decommitted since"); 163 | assert(h.type_ == GetTypeId<T>() && "type mismatch"); 164 | std::size_t offset = reserved_blocks_offsets_[h.index_]; 165 | std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(storage_) + offset; 166 | return reinterpret_cast<T*>(addr); 167 | } 168 | 169 | private: 170 | void DeallocateStorage() { 171 | assert(!committed_); 172 | aligned_free(storage_); 173 | storage_size_ = 0; 174 | } 175 | 176 | // Set to true by Commit() and to false by Decommit(). Initially false. 177 | bool committed_; 178 | 179 | // The actually allocated storage size and buffer pointer. 180 | std::size_t storage_size_; 181 | mutable void* storage_; 182 | 183 | // The number of blocks that have been reserved by Reserve(). 184 | std::size_t reserved_blocks_; 185 | // The number of bytes that have been reserved by Reserve(). 186 | std::size_t reserved_bytes_; 187 | // The offsets of reserved blocks into the storage buffer. 188 | std::size_t reserved_blocks_offsets_[kMaxBlocks]; 189 | 190 | // The 'generation' is incremented on Decommit() and allows catching 191 | // bad GetPointer() calls still referring to a previous commit. 192 | generation_t generation_; 193 | }; 194 | 195 | } // namespace gemmlowp 196 | 197 | #endif // GEMMLOWP_INTERNAL_ALLOCATOR_H_ 198 | -------------------------------------------------------------------------------- /internal/compute.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // compute.h: the central stage of the Gemm computation, operates 16 | // on already-packed LHS and RHS blocks and calls the Gemm kernel 17 | // to compute a block of the product.
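//
// As a reading aid, the loop structure implemented below is, in outline
// (using the BlockParams field names from block_params.h):
//
//   for (d over l2_depth, in steps of l1_depth)            // Compute()
//     for (r over l2_rows, in steps of l1_rows)            // Compute()
//       for (c over l2_cols, in steps of Format::kCols)    // ComputeL1()
//         for (r2 over l1 rows, in steps of Format::kRows) // ComputeL1()
//           kernel.Run(...)                                // ComputeRun()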
18 | 19 | #ifndef GEMMLOWP_INTERNAL_COMPUTE_H_ 20 | #define GEMMLOWP_INTERNAL_COMPUTE_H_ 21 | 22 | #include "block_params.h" 23 | #include "kernel.h" 24 | #include "pack.h" 25 | 26 | namespace gemmlowp { 27 | 28 | template <typename PackedLhs, typename PackedRhs, typename PackedResult> 29 | class ComputeImpl { 30 | typedef typename PackedLhs::KernelSideFormat KernelLhsFormat; 31 | typedef typename PackedRhs::KernelSideFormat KernelRhsFormat; 32 | typedef KernelFormat<KernelLhsFormat, KernelRhsFormat> Format; 33 | 34 | const KernelBase& kernel_; 35 | const BlockParams& block_params_; 36 | 37 | PackedResult* const packed_result_; 38 | const PackedLhs& packed_lhs_; 39 | const PackedRhs& packed_rhs_; 40 | 41 | public: 42 | ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params, 43 | PackedResult* _packed_result, const PackedLhs& _packed_lhs, 44 | const PackedRhs& _packed_rhs) 45 | : kernel_(_kernel), 46 | block_params_(_block_params), 47 | packed_result_(_packed_result), 48 | packed_lhs_(_packed_lhs), 49 | packed_rhs_(_packed_rhs) {} 50 | 51 | void Compute(int depth) { 52 | depth = RoundUp<Format::kDepth>(depth); 53 | assert(depth <= block_params_.l2_depth); 54 | for (int d = 0; d < depth; d += block_params_.l1_depth) { 55 | int ds = std::min(block_params_.l1_depth, depth - d); 56 | 57 | for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) { 58 | int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r); 59 | 60 | ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds); 61 | } 62 | } 63 | } 64 | 65 | private: 66 | static void MarkPackedResultBlockAsInitialized( 67 | const MatrixMap<std::int32_t, MapOrder::ColMajor>& packed_result_block) { 68 | #ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED 69 | for (int col = 0; col < packed_result_block.cols(); col++) { 70 | MarkMemoryAsInitialized( 71 | packed_result_block.data() + col * packed_result_block.cols_stride(), 72 | packed_result_block.rows()); 73 | } 74 | #else 75 | (void)packed_result_block; 76 | #endif 77 | } 78 | 79 | void ComputeRun(int start_row, int start_col, int start_depth, 80 | int depth) GEMMLOWP_NOINLINE { 81 | packed_lhs_.seek_run(start_row, start_depth); 82 | packed_rhs_.seek_run(start_col, start_depth); 83 | auto packed_result_block = packed_result_->Map().block( 84 | start_row, start_col, Format::kRows, Format::kCols); 85 | kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), 86 | packed_result_block.cols_stride(), packed_lhs_.current_data(), 87 | packed_rhs_.current_data(), start_depth, depth); 88 | MarkPackedResultBlockAsInitialized(packed_result_block); 89 | } 90 | 91 | void ComputeL1(int start_row, int rows, int start_col, int cols, 92 | int start_depth, int depth) { 93 | assert(rows % Format::kRows == 0); 94 | assert(cols % Format::kCols == 0); 95 | assert(depth % Format::kDepth == 0); 96 | 97 | for (int c = 0; c < cols; c += Format::kCols) { 98 | for (int r = 0; r < rows; r += Format::kRows) { 99 | ComputeRun(start_row + r, start_col + c, start_depth, depth); 100 | } 101 | } 102 | } 103 | }; 104 | 105 | template <typename PackedLhs, typename PackedRhs, typename PackedResult> 106 | void Compute(const KernelBase& kernel, const BlockParams& block_params, 107 | PackedResult* packed_result, const PackedLhs& packed_lhs, 108 | const PackedRhs& packed_rhs, int depth) { 109 | ScopedProfilingLabel label("compute"); 110 | ComputeImpl<PackedLhs, PackedRhs, PackedResult> impl( 111 | kernel, block_params, packed_result, packed_lhs, packed_rhs); 112 | 113 | impl.Compute(depth); 114 | } 115 | 116 | } // namespace gemmlowp 117 | 118 | #endif // GEMMLOWP_INTERNAL_COMPUTE_H_ 119 | -------------------------------------------------------------------------------- /internal/detect_platform.h:
-------------------------------------------------------------------------------- 1 | // Copyright 2018 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // detect_platform.h: Sets up macros that control architecture-specific 16 | // features of gemmlowp's implementation. 17 | 18 | #ifndef GEMMLOWP_INTERNAL_DETECT_PLATFORM_H_ 19 | #define GEMMLOWP_INTERNAL_DETECT_PLATFORM_H_ 20 | 21 | // Our inline assembly paths assume GCC/Clang syntax. 22 | // Native Client doesn't seem to support inline assembly(?). 23 | #if (defined(__GNUC__) || defined(__clang__)) && !defined(__native_client__) 24 | #define GEMMLOWP_ALLOW_INLINE_ASM 25 | #endif 26 | 27 | // Define macro statement that avoids inlining for GCC. 28 | // For non-GCC, define as empty macro. 29 | #if defined(__GNUC__) 30 | #define GEMMLOWP_NOINLINE __attribute__((noinline)) 31 | #else 32 | #define GEMMLOWP_NOINLINE 33 | #endif 34 | 35 | // Detect ARM, 32-bit or 64-bit 36 | #ifdef __arm__ 37 | #define GEMMLOWP_ARM_32 38 | #endif 39 | 40 | #ifdef __aarch64__ 41 | #define GEMMLOWP_ARM_64 42 | #endif 43 | 44 | #if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64) 45 | #define GEMMLOWP_ARM 46 | #endif 47 | 48 | // Detect MIPS, 32-bit or 64-bit 49 | #if defined(__mips) && !defined(__LP64__) 50 | #define GEMMLOWP_MIPS_32 51 | #endif 52 | 53 | #if defined(__mips) && defined(__LP64__) 54 | #define GEMMLOWP_MIPS_64 55 | #endif 56 | 57 | #if defined(GEMMLOWP_MIPS_32) || defined(GEMMLOWP_MIPS_64) 58 | #define GEMMLOWP_MIPS 59 | #endif 60 | 61 | // Detect x86, 32-bit or 64-bit 62 | #if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386) 63 | #define GEMMLOWP_X86_32 64 | #endif 65 | 66 | #if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) 67 | #define GEMMLOWP_X86_64 68 | #endif 69 | 70 | #if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64) 71 | #define GEMMLOWP_X86 72 | #endif 73 | 74 | // Detect WebAssembly SIMD. 75 | #if defined(__wasm_simd128__) 76 | #define GEMMLOWP_WASMSIMD 77 | #endif 78 | 79 | // Some of our optimized paths use inline assembly and for 80 | // now we don't bother enabling some other optimized paths using intrinsics 81 | // where we can't use inline assembly paths. 82 | #ifdef GEMMLOWP_ALLOW_INLINE_ASM 83 | 84 | // Detect NEON. It's important to check for both tokens. 85 | #if (defined __ARM_NEON) || (defined __ARM_NEON__) 86 | #define GEMMLOWP_NEON 87 | #endif 88 | 89 | // Convenience NEON tokens for 32-bit or 64-bit 90 | #if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32) 91 | #define GEMMLOWP_NEON_32 92 | #endif 93 | 94 | #if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64) 95 | #define GEMMLOWP_NEON_64 96 | #endif 97 | 98 | // Detect MIPS MSA. 99 | // Limit MSA optimizations to little-endian CPUs for now. 100 | // TODO: Perhaps eventually support MSA optimizations on big-endian CPUs?
101 | #if defined(GEMMLOWP_MIPS) && (__mips_isa_rev >= 5) && defined(__mips_msa) && \ 102 | defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 103 | #define GEMMLOWP_MSA 104 | #endif 105 | 106 | // Convenience MIPS MSA tokens for 32-bit or 64-bit. 107 | #if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_32) 108 | #define GEMMLOWP_MSA_32 109 | #endif 110 | 111 | #if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_64) 112 | #define GEMMLOWP_MSA_64 113 | #endif 114 | 115 | // compiler define for AVX2 -D GEMMLOWP_ENABLE_AVX2 116 | // Detect AVX2 117 | #if defined(__AVX2__) && defined(GEMMLOWP_ENABLE_AVX2) 118 | #define GEMMLOWP_AVX2 119 | // Detect SSE4. 120 | // MSVC does not have __SSE4_1__ macro, but will enable SSE4 121 | // when AVX is turned on. 122 | #elif defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__)) 123 | #define GEMMLOWP_SSE4 124 | // Detect SSE3. 125 | #elif defined(__SSE3__) 126 | #define GEMMLOWP_SSE3 127 | #endif 128 | 129 | // Convenience SSE4 tokens for 32-bit or 64-bit 130 | #if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) && \ 131 | !defined(GEMMLOWP_DISABLE_SSE4) 132 | #define GEMMLOWP_SSE4_32 133 | #endif 134 | 135 | #if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_32) 136 | #define GEMMLOWP_SSE3_32 137 | #endif 138 | 139 | #if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) && \ 140 | !defined(GEMMLOWP_DISABLE_SSE4) 141 | #define GEMMLOWP_SSE4_64 142 | #endif 143 | 144 | #if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_64) 145 | #define GEMMLOWP_SSE3_64 146 | #endif 147 | 148 | #if defined(GEMMLOWP_AVX2) && defined(GEMMLOWP_X86_64) 149 | #define GEMMLOWP_AVX2_64 150 | #endif 151 | 152 | #if defined(__has_feature) 153 | #if __has_feature(memory_sanitizer) 154 | #include <sanitizer/msan_interface.h> 155 | #define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison 156 | #elif __has_feature(address_sanitizer) 157 | #include <sanitizer/asan_interface.h> 158 | #define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region 159 | #endif 160 | #endif 161 | 162 | #endif // GEMMLOWP_ALLOW_INLINE_ASM 163 | 164 | // Detect Android. Don't conflate with ARM - we care about tuning 165 | // for non-ARM Android devices too. This can be used in conjunction 166 | // with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs. 167 | #if defined(__ANDROID__) || defined(ANDROID) 168 | #define GEMMLOWP_ANDROID 169 | #endif 170 | 171 | #endif // GEMMLOWP_INTERNAL_DETECT_PLATFORM_H_ 172 | -------------------------------------------------------------------------------- /internal/kernel_default.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // kernel_default.h: Chooses default GEMM and GEMV kernels for the 16 | // host platform.
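//
// Worked example of the selection logic below (an illustration, assuming the
// default 8-bit unsigned bit-depth parameters where Lhs/Rhs values span
// 0..255): MaxProductIsLessThan4096 is (255 * 255 < 4096) == false,
// IsUnsigned is true, and LhsAlwaysNonZero is false, so DefaultKernel
// resolves to whichever kernel this platform registered via
// GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, ...).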
17 | 18 | #ifndef GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ 19 | #define GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ 20 | 21 | #include "../public/bit_depth.h" 22 | #include "common.h" 23 | #include "kernel.h" 24 | #include "kernel_reference.h" 25 | 26 | namespace gemmlowp { 27 | 28 | template <bool MaxProductIsLessThan4096, bool IsUnsigned, bool LhsNonZero> 29 | struct DefaultKernelImpl {}; 30 | 31 | // Partial specialization implementing the logic that if we want to use 32 | // a kernel for MaxProductIsLessThan4096 but do not have such a kernel, then we 33 | // fall back to a generic kernel not taking advantage of 34 | // MaxProductIsLessThan4096. 35 | template <bool LhsNonZero> 36 | struct DefaultKernelImpl<true, true, LhsNonZero> 37 | : DefaultKernelImpl<false, true, LhsNonZero> {}; 38 | 39 | // Partial specialization implementing the logic that if we want to use 40 | // a kernel for LhsNonZero but do not have such a kernel, then we fall 41 | // back to a generic kernel not taking advantage of LhsNonZero. 42 | template <bool MaxProductIsLessThan4096> 43 | struct DefaultKernelImpl<MaxProductIsLessThan4096, true, true> 44 | : DefaultKernelImpl<MaxProductIsLessThan4096, true, false> {}; 45 | 46 | template <typename BitDepthParams> 47 | struct DefaultKernel 48 | : DefaultKernelImpl<(BitDepthParams::LhsRange::kMaxValue * 49 | BitDepthParams::RhsRange::kMaxValue < 50 | 4096), 51 | (BitDepthParams::LhsRange::kMinValue >= 0), 52 | (BitDepthParams::LhsRange::kMinValue > 0 || 53 | (BitDepthParams::LhsRange::kMaxValue <= 127 && 54 | BitDepthParams::LhsRange::kMinValue > -128))> {}; 55 | 56 | } // end namespace gemmlowp 57 | 58 | #define GEMMLOWP_SET_DEFAULT_KERNEL(MaxProductIsLessThan4096, IsUnsigned, \ 59 | LhsAlwaysNonZero, Kernel) \ 60 | namespace gemmlowp { \ 61 | template <> \ 62 | struct DefaultKernelImpl<MaxProductIsLessThan4096, IsUnsigned, \ 63 | LhsAlwaysNonZero> : Kernel {}; \ 64 | } 65 | 66 | // User-provided int8 inputs are only supported in the NEON path currently. 67 | #if defined GEMMLOWP_NEON_32 68 | #include "kernel_neon.h" 69 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, NEON_32_Kernel12x4Depth2) 70 | GEMMLOWP_SET_DEFAULT_KERNEL(true, true, false, 71 | NEON_32_Kernel12x4Depth2Assuming12BitProducts) 72 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, 73 | NEON_32bit_GEMM_Int8Operands_LhsNonzero) 74 | GEMMLOWP_SET_DEFAULT_KERNEL(false, false, true, 75 | NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs) 76 | #elif defined GEMMLOWP_NEON_64 77 | #include "kernel_neon.h" 78 | #if defined GEMMLOWP_DOTPROD_KERNEL 79 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, 80 | NEON_64_Kernel12x8Depth4_dotprod) 81 | #else 82 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, NEON_64_Kernel12x8Depth2) 83 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, 84 | NEON_64bit_GEMM_Int8Operands_LhsNonzero) 85 | #endif 86 | GEMMLOWP_SET_DEFAULT_KERNEL(false, false, true, 87 | NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs) 88 | #elif defined(GEMMLOWP_MSA) 89 | #include "kernel_msa.h" 90 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, MSA_Kernel12x8Depth2) 91 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, MSA_GEMM_Int8Operands_LhsNonzero) 92 | #elif defined GEMMLOWP_SSE4_32 93 | #include "kernel_sse.h" 94 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, SSE4_32_Kernel4x4Depth2) 95 | #elif defined GEMMLOWP_SSE4_64 96 | #include "kernel_sse.h" 97 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, SSE4_64_Kernel12x4Depth2) 98 | #elif defined GEMMLOWP_AVX2_64 99 | #include "kernel_avx.h" 100 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, AVX2_64_Kernel24x8Depth2) 101 | #else 102 | #include "kernel_reference.h" 103 | namespace gemmlowp { 104 | typedef ReferenceKernel<KernelFormat< 105 | KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 106 | KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > > 107 | DefaultReferenceKernel; 108 | } 109 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, DefaultReferenceKernel) 110 | #endif 111 |
112 | #endif  // GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_
113 |
--------------------------------------------------------------------------------
/internal/kernel_reference.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // kernel_reference.h: a reference kernel for CPU architectures where we don't
16 | // have optimized kernels yet. Also useful for testing, as it's templatized
17 | // to have any arbitrary format, allowing tests to cover all sorts of corner
18 | // cases.
19 |
20 | #ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
21 | #define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
22 |
23 | #include "kernel.h"
24 |
25 | #include <cstdio>
26 | #include <cstring>
27 |
28 | namespace gemmlowp {
29 |
30 | // This kernel is templatized in an arbitrary Format template parameter,
31 | // allowing it to have any arbitrary format.
32 | template <typename tFormat>
33 | struct ReferenceKernel : KernelBase {
34 |   typedef tFormat Format;
35 |
36 |   const char* Name() const override {
37 |     static char buf[256];
38 |     snprintf(buf, sizeof(buf),
39 |              "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)",
40 |              Format::Lhs::kCells, Format::Lhs::Cell::kWidth,
41 |              Format::Lhs::Cell::kDepth,
42 |              CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells,
43 |              Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth,
44 |              CellOrderName(Format::Rhs::Cell::kOrder));
45 |     return buf;
46 |   }
47 |
48 |   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
49 |            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
50 |            const std::uint8_t* rhs_ptr, std::size_t start_depth,
51 |            std::size_t run_depth) const override {
52 |     std::int32_t accumulator[Format::kRows * Format::kCols];
53 |     memset(accumulator, 0, sizeof(accumulator));
54 |
55 |     const int run_depth_cells = static_cast<int>(run_depth / Format::kDepth);
56 |
57 |     // The outer loop is over the depth dimension.
58 |     for (int dc = 0; dc < run_depth_cells; dc++) {
59 |       // The next two loops are over cells of the Lhs (stacked vertically),
60 |       // and over cells of the Rhs (stacked horizontally).
61 |       for (int rc = 0; rc < Format::Lhs::kCells; rc++) {
62 |         const std::uint8_t* lhs_cell_ptr =
63 |             lhs_ptr + (dc * Format::Lhs::kCells + rc) *
64 |                           Format::Lhs::Cell::kWidth * Format::kDepth;
65 |         for (int cc = 0; cc < Format::Rhs::kCells; cc++) {
66 |           const std::uint8_t* rhs_cell_ptr =
67 |               rhs_ptr + (dc * Format::Rhs::kCells + cc) *
68 |                             Format::Rhs::Cell::kWidth * Format::kDepth;
69 |
70 |           // Now we are inside one cell of the Lhs and inside one cell
71 |           // of the Rhs, so the remaining inner loops are just
72 |           // traditional three loops of matrix multiplication.
73 |           for (int di = 0; di < Format::kDepth; di++) {
74 |             for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) {
75 |               for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) {
76 |                 const std::uint8_t* lhs_coeff_ptr =
77 |                     lhs_cell_ptr +
78 |                     OffsetIntoCell<typename Format::Lhs::Cell>(ri, di);
79 |                 const std::uint8_t* rhs_coeff_ptr =
80 |                     rhs_cell_ptr +
81 |                     OffsetIntoCell<typename Format::Rhs::Cell>(ci, di);
82 |                 std::int32_t* accumulator_coeff_ptr =
83 |                     accumulator + (ri + rc * Format::Lhs::Cell::kWidth) +
84 |                     (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows;
85 |                 *accumulator_coeff_ptr +=
86 |                     std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr);
87 |               }
88 |             }
89 |           }
90 |         }
91 |       }
92 |     }
93 |
94 |     if (start_depth == 0) {
95 |       // start_depth == 0 means we haven't accumulated anything yet, so we need
96 |       // to overwrite the accumulator, as it hasn't been initialized to zero.
97 |       for (int r = 0; r < Format::kRows; r++) {
98 |         for (int c = 0; c < Format::kCols; c++) {
99 |           dst_ptr[r * dst_row_stride + c * dst_col_stride] =
100 |               accumulator[r + c * Format::kRows];
101 |         }
102 |       }
103 |     } else {
104 |       // We have already accumulated stuff, so we need to continue accumulating
105 |       // instead of just overwriting.
106 |       for (int r = 0; r < Format::kRows; r++) {
107 |         for (int c = 0; c < Format::kCols; c++) {
108 |           dst_ptr[r * dst_row_stride + c * dst_col_stride] +=
109 |               accumulator[r + c * Format::kRows];
110 |         }
111 |       }
112 |     }
113 |   }
114 | };
115 |
116 | }  // namespace gemmlowp
117 |
118 | #endif  // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
119 |
--------------------------------------------------------------------------------
/internal/output_avx.h:
--------------------------------------------------------------------------------
1 | //
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // you may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | //     http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | // output_avx.h: optimized AVX2 specializations of the templates in output.h.
15 |
16 | #ifndef GEMMLOWP_INTERNAL_OUTPUT_AVX_H_
17 | #define GEMMLOWP_INTERNAL_OUTPUT_AVX_H_
18 |
19 | #endif  // GEMMLOWP_INTERNAL_OUTPUT_AVX_H_
20 |
--------------------------------------------------------------------------------
/internal/pack_sse.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // pack_sse.h: optimized SSE specializations of the templates in pack.h.
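Before the SIMD packing code below, it may help to see a scalar model of what a gemmlowp packing routine computes: besides reordering a width x depth block into contiguous storage, it accumulates the sum of each width-slice, which the unpacking stage later uses to apply the lhs_offset/rhs_offset terms of the quantized GEMM. A simplified sketch (hypothetical function name, and ignoring the interleaved cell layout that the real code produces):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of packing: copy a width x depth block into a contiguous
// buffer and accumulate the sum of each width-slice. The real SIMD code
// below does the same thing 8 depth-elements at a time, additionally
// interleaving the entries into the kernel's cell layout.
void PackWidthMajorScalar(const std::uint8_t* src, int width_stride, int width,
                          int depth, std::vector<std::uint8_t>* packed,
                          std::vector<std::int32_t>* sums_of_each_slice) {
  sums_of_each_slice->assign(width, 0);
  for (int d = 0; d < depth; d++) {
    for (int w = 0; w < width; w++) {
      const std::uint8_t value = src[w * width_stride + d];
      packed->push_back(value);
      (*sums_of_each_slice)[w] += value;
    }
  }
}

int main() {
  // A 2 (width) x 4 (depth) width-major block.
  const std::uint8_t src[2 * 4] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<std::uint8_t> packed;
  std::vector<std::int32_t> sums;
  PackWidthMajorScalar(src, /*width_stride=*/4, /*width=*/2, /*depth=*/4,
                       &packed, &sums);
  std::printf("slice sums: %d %d\n", sums[0], sums[1]);  // prints 10 and 26
  return 0;
}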
16 |
17 | #ifndef GEMMLOWP_INTERNAL_PACK_SSE_H_
18 | #define GEMMLOWP_INTERNAL_PACK_SSE_H_
19 |
20 | #include <smmintrin.h>
21 | #include "pack.h"
22 |
23 | namespace gemmlowp {
24 |
25 | // TODO: Add DepthMajorUint8SideMap
26 |
27 | typedef SideMap<const std::uint8_t, SideMapOrder::WidthMajor>
28 |     WidthMajorUint8SideMap;
29 |
30 | template <int Cells>
31 | using WidthMajorSideFormatNCells4x2 =
32 |     KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, Cells>;
33 |
34 | template <int Cells>
35 | class PackingRegisterBlock<
36 |     WidthMajorUint8SideMap,
37 |     PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > >
38 |     : public PackingRegisterBlockBase<
39 |           WidthMajorUint8SideMap,
40 |           PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > > {
41 |  public:
42 |   typedef WidthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
43 |   typedef typename KernelSideFormat::Cell CellFormat;
44 |   static constexpr int kCells = KernelSideFormat::kCells;
45 |   static constexpr int kCellWidth = CellFormat::kWidth;
46 |   static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
47 |   static constexpr int kCellDepth = CellFormat::kDepth;
48 |   static constexpr int kCellSize = CellFormat::kSize;
49 |
50 |   void Pack(PackedSideBlock<KernelSideFormat>* dst, int start_width) {
51 |     std::uint8_t* dst_ptr = dst->current_data();
52 |     const int width_stride = this->complete_src_.width_stride();
53 |     int depth_step = 8;
54 |
55 |     __m128i one = _mm_set1_epi16(1);
56 |     for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
57 |          cell_start_depth += depth_step) {
58 |       for (int cell_start_width = 0; cell_start_width < kKernelWidth;
59 |            cell_start_width += kCellWidth) {
60 |         std::int32_t* cell_sums_of_each_slice_ptr =
61 |             dst->sums_of_each_slice() + start_width + cell_start_width;
62 |         const std::uint8_t* src_data =
63 |             this->complete_src_.data(cell_start_width, cell_start_depth);
64 |
65 |         __m128i xmm1 =
66 |             _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src_data[0]));
67 |         __m128i xmm2 = _mm_loadl_epi64(
68 |             reinterpret_cast<const __m128i*>(&src_data[1 * width_stride]));
69 |         __m128i xmm3 = _mm_loadl_epi64(
70 |             reinterpret_cast<const __m128i*>(&src_data[2 * width_stride]));
71 |         __m128i xmm4 = _mm_loadl_epi64(
72 |             reinterpret_cast<const __m128i*>(&src_data[3 * width_stride]));
73 |
74 |         __m128i xmm5 = _mm_unpacklo_epi16(xmm1, xmm2);
75 |         __m128i xmm8 = _mm_shuffle_epi32(xmm5, 0x31);
76 |
77 |         __m128i xmm6 = _mm_unpacklo_epi16(xmm3, xmm4);
78 |         __m128i xmm7 = _mm_shuffle_epi32(xmm6, 0x80);
79 |
80 |         __m128i xmm9 = _mm_blend_epi16(xmm5, xmm7, 0xcc);
81 |         __m128i xmm10 = _mm_blend_epi16(xmm8, xmm6, 0xcc);
82 |
83 |         _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst_ptr[0]), xmm9);
84 |         _mm_storel_epi64(
85 |             reinterpret_cast<__m128i*>(&dst_ptr[kCellSize * kCells]), xmm10);
86 |
87 |         __m128i xmm11 = _mm_shuffle_epi32(xmm9, 0xee);
88 |         __m128i xmm12 = _mm_shuffle_epi32(xmm10, 0xee);
89 |
90 |         _mm_storel_epi64(
91 |             reinterpret_cast<__m128i*>(&dst_ptr[2 * kCellSize * kCells]),
92 |             xmm11);
93 |         _mm_storel_epi64(
94 |             reinterpret_cast<__m128i*>(&dst_ptr[3 * kCellSize * kCells]),
95 |             xmm12);
96 |
97 |         xmm1 = _mm_cvtepu8_epi16(xmm9);
98 |         xmm2 = _mm_madd_epi16(xmm1, one);
99 |         __m128i sums_of_each_slice_xmm = _mm_loadu_si128(
100 |             reinterpret_cast<const __m128i*>(&cell_sums_of_each_slice_ptr[0]));
101 |         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
102 |
103 |         xmm1 = _mm_cvtepu8_epi16(xmm10);
104 |         xmm2 = _mm_madd_epi16(xmm1, one);
105 |         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
106 |
107 |         xmm1 = _mm_cvtepu8_epi16(xmm11);
108 |         xmm2 = _mm_madd_epi16(xmm1, one);
109 |         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
110 |
111 |         xmm1 = _mm_cvtepu8_epi16(xmm12);
112 |         xmm2 = _mm_madd_epi16(xmm1, one);
113 |         sums_of_each_slice_xmm =
    _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
114 |
115 |         _mm_storeu_si128(
116 |             reinterpret_cast<__m128i*>(&cell_sums_of_each_slice_ptr[0]),
117 |             sums_of_each_slice_xmm);
118 |         dst_ptr += kCellSize;
119 |       }
120 |       dst_ptr += 3 * kCellSize * kCells;
121 |     }
122 |     dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
123 |   }
124 | };
125 |
126 | }  // namespace gemmlowp
127 |
128 | #endif  // GEMMLOWP_INTERNAL_PACK_SSE_H_
129 |
--------------------------------------------------------------------------------
/internal/platform.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // internal/platform.h: a place to put platform-specific code
16 |
17 | #ifndef GEMMLOWP_INTERNAL_PLATFORM_H_
18 | #define GEMMLOWP_INTERNAL_PLATFORM_H_
19 |
20 | #ifdef _WIN32
21 | #include <malloc.h>
22 | #include <windows.h>
23 | #else
24 | #include <stdlib.h>
25 | #include <time.h>
26 | #include <unistd.h>
27 | #endif
28 |
29 | #ifdef __APPLE__
30 | #include <sys/time.h>
31 | #endif
32 |
33 | #if defined ANDROID || defined __ANDROID__
34 | #include <android/api-level.h>
35 | #include <malloc.h>
36 | // The 18 here should be 16, but has to be 18 for now due
37 | // to a Google-internal issue.
38 | #if __ANDROID_API__ < 18
39 | #define GEMMLOWP_USE_MEMALIGN
40 | #endif
41 | // posix_memalign is missing on some 4.1 x86 devices
42 | #if __ANDROID_API__ == 18
43 | #ifdef GEMMLOWP_X86_32
44 | #define GEMMLOWP_USE_MEMALIGN
45 | #endif
46 | #endif
47 | #endif
48 |
49 | // Needed by chrome native builds
50 | #ifndef _SC_NPROCESSORS_CONF
51 | #define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_ONLN
52 | #endif
53 |
54 | namespace gemmlowp {
55 |
56 | #ifdef _WIN32
57 | inline void *aligned_alloc(size_t alignment, size_t size) {
58 |   return _aligned_malloc(size, alignment);
59 | }
60 |
61 | inline void aligned_free(void *memptr) { _aligned_free(memptr); }
62 |
63 | inline int GetHardwareConcurrency(int max_threads) {
64 |   if (max_threads == 0) {
65 |     SYSTEM_INFO sysinfo;
66 |     GetSystemInfo(&sysinfo);
67 |     return sysinfo.dwNumberOfProcessors;
68 |   }
69 |   return max_threads;
70 | }
71 |
72 | inline double real_time_in_seconds() {
73 |   __int64 wintime;
74 |   GetSystemTimeAsFileTime((FILETIME *)&wintime);
75 |   wintime -= 116444736000000000LL;  // 1jan1601 to 1jan1970
76 |   return wintime / 10000000LL + wintime % 10000000LL * 100 * 1e-9;
77 | }
78 |
79 | #else
80 | inline void *aligned_alloc(size_t alignment, size_t size) {
81 | #ifdef GEMMLOWP_USE_MEMALIGN
82 |   return memalign(alignment, size);
83 | #else
84 |   void *memptr;
85 |   if (posix_memalign(&memptr, alignment, size)) {
86 |     memptr = nullptr;
87 |   }
88 |   return memptr;
89 | #endif
90 | }
91 |
92 | inline int GetHardwareConcurrency(int max_threads) {
93 |   if (max_threads == 0) {
94 |     static const int hardware_threads_count =
95 |         static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));
96 |     return hardware_threads_count;
97 |   }
98 |   return max_threads;
99 | }
100 |
101 | inline void aligned_free(void *memptr) { free(memptr); }
102 |
103 | inline double real_time_in_seconds() {
104 | #ifdef __APPLE__
105 |   timeval t;
106 |   gettimeofday(&t, nullptr);
107 |   return t.tv_sec + 1e-6 * t.tv_usec;
108 | #else
109 |   timespec t;
110 |   clock_gettime(CLOCK_REALTIME, &t);
111 |   return t.tv_sec + 1e-9 * t.tv_nsec;
112 | #endif
113 | }
114 |
115 | #endif
116 | }  // namespace gemmlowp
117 | #endif  // GEMMLOWP_INTERNAL_PLATFORM_H_
118 |
--------------------------------------------------------------------------------
/internal/simd_wrappers_msa.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
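A short usage sketch of the portable helpers defined in internal/platform.h above; the include path and the surrounding main() are illustrative assumptions. Since gemmlowp::aligned_alloc dispatches to _aligned_malloc, memalign, or posix_memalign depending on the platform, allocations must be released with gemmlowp::aligned_free rather than plain free:

#include "internal/platform.h"

#include <cstdio>

int main() {
  // 4 KiB of scratch storage aligned to a 64-byte boundary.
  void* scratch = gemmlowp::aligned_alloc(64, 4096);
  if (scratch != nullptr) {
    // ... use as packing scratch space ...
    gemmlowp::aligned_free(scratch);
  }
  // 0 means "use all hardware threads"; other values pass through unchanged.
  const int threads = gemmlowp::GetHardwareConcurrency(0);
  std::printf("worker threads: %d\n", threads);
  std::printf("now: %f seconds\n", gemmlowp::real_time_in_seconds());
  return 0;
}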
14 |
15 | // simd_wrappers_msa.h: MSA specialization of simd_wrappers.h
16 |
17 | #ifndef GEMMLOWP_INTERNAL_SIMD_WRAPPERS_MSA_H_
18 | #define GEMMLOWP_INTERNAL_SIMD_WRAPPERS_MSA_H_
19 |
20 | #include <msa.h>
21 |
22 | namespace gemmlowp {
23 |
24 | using Int32x4 = v4i32;
25 | using Int16x8 = v8i16;
26 | using Uint8x16 = v16i8;
27 |
28 | template <int ScalarCount>
29 | struct RegisterType<std::int32_t, ScalarCount> {
30 |   using Type =
31 |       typename std::conditional<ScalarCount >= 4, Int32x4, std::int32_t>::type;
32 | };
33 |
34 | template <int ScalarCount>
35 | struct RegisterType<std::int16_t, ScalarCount> {
36 |   using Type = typename std::conditional<ScalarCount >= 8, Int16x8, std::int16_t>::type;
37 | };
38 |
39 | template <int ScalarCount>
40 | struct RegisterType<std::uint8_t, ScalarCount> {
41 |   using Type = typename std::conditional<
42 |       ScalarCount >= 16, Uint8x16,
43 |       typename std::conditional<ScalarCount >= 4, std::uint32_t,
44 |                                 std::uint8_t>::type>::type;
45 | };
46 |
47 | inline Int32x4 LoadInt32x4(const std::int32_t* src) {
48 |   return __builtin_msa_ld_w(const_cast<std::int32_t*>(src), 0);
49 | }
50 |
51 | inline Int32x4 LoadInt32x4(const Int32x4* src) {
52 |   return __builtin_msa_ld_w(const_cast<Int32x4*>(src), 0);
53 | }
54 |
55 | inline void StoreInt32x4(std::int32_t* dst, Int32x4 value) {
56 |   __builtin_msa_st_w(value, dst, 0);
57 | }
58 |
59 | inline void StoreInt32x4(Int32x4* dst, Int32x4 value) {
60 |   __builtin_msa_st_w(value, dst, 0);
61 | }
62 |
63 | inline Int16x8 LoadInt16x8(const std::int16_t* src) {
64 |   return __builtin_msa_ld_h(const_cast<std::int16_t*>(src), 0);
65 | }
66 |
67 | inline Int16x8 LoadInt16x8(const Int16x8* src) {
68 |   return __builtin_msa_ld_h(const_cast<Int16x8*>(src), 0);
69 | }
70 |
71 | inline void StoreInt16x8(std::int16_t* dst, Int16x8 value) { __builtin_msa_st_h(value, dst, 0); }
72 |
73 | inline void StoreInt16x8(Int16x8* dst, Int16x8 value) { __builtin_msa_st_h(value, dst, 0); }
74 |
75 | inline Uint8x16 LoadUint8x16(const std::uint8_t* src) {
76 |   return __builtin_msa_ld_b(const_cast<std::uint8_t*>(src), 0);
77 | }
78 |
79 | inline Uint8x16 LoadUint8x16(const Uint8x16* src) {
80 |   return __builtin_msa_ld_b(const_cast<Uint8x16*>(src), 0);
81 | }
82 |
83 | inline void StoreUint8x16(std::uint8_t* dst, Uint8x16 value) {
84 |   __builtin_msa_st_b(value, dst, 0);
85 | }
86 |
87 | inline void StoreUint8x16(Uint8x16* dst, Uint8x16 value) {
88 |   __builtin_msa_st_b(value, dst, 0);
89 | }
90 |
91 | template <int Lane>
92 | std::int32_t GetLane(Int32x4 value) {
93 |   return __builtin_msa_copy_s_w(value, Lane);
94 | }
95 |
96 | template <int Lane>
97 | Int32x4 DupLane(Int32x4 value) {
98 |   static_assert(Lane >= 0 && Lane <= 3, "");
99 |   return __builtin_msa_splati_w(value, Lane);
100 | }
101 |
102 | inline Int32x4 Mul(Int32x4 a, std::int32_t b) {
103 |   return __builtin_msa_mulv_w(a, __builtin_msa_fill_w(b));
104 | }
105 |
106 | inline Int32x4 Min(Int32x4 a, Int32x4 b) { return __builtin_msa_min_s_w(a, b); }
107 |
108 | inline Int32x4 Max(Int32x4 a, Int32x4 b) { return __builtin_msa_max_s_w(a, b); }
109 |
110 | inline Int32x4 SaturatingRoundingDoublingHighMul(Int32x4 a, std::int32_t b) {
111 |   return __builtin_msa_mulr_q_w(a, __builtin_msa_fill_w(b));
112 | }
113 |
114 | template <int Lane>
115 | Int32x4 MulByRhsLane(Int32x4 a, Int32x4 b) {
116 |   static_assert(Lane >= 0 && Lane <= 3, "");
117 |   return __builtin_msa_mulv_w(a, __builtin_msa_splati_w(b, Lane));
118 | }
119 |
120 | static inline v4i32 workaround_msa_maddv_w(v4i32 a, v4i32 b, v4i32 c) {
121 |   // Workaround for incorrect encoding of maddv.df in gcc (a exchanged with c).
122 | #if 0
123 |   return __builtin_msa_maddv_w(a, b, c);
124 | #else
125 |   asm volatile("maddv.w %w[a], %w[b], %w[c]\n"
126 |                // Outputs
127 |                : [a] "+f"(a)
128 |                // Inputs
129 |                : [b] "f"(b), [c] "f"(c));
130 |   return a;
131 | #endif
132 | }
133 |
134 | inline void MulAdd(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
135 |   Int32x4 tmp = LoadInt32x4(acc);
136 |   tmp = workaround_msa_maddv_w(tmp, lhs, rhs);
137 |   StoreInt32x4(acc, tmp);
138 | }
139 |
140 | inline void MulAdd(Int32x4 lhs, std::int32_t rhs, Int32x4* acc) {
141 |   Int32x4 tmp = LoadInt32x4(acc);
142 |   tmp = workaround_msa_maddv_w(tmp, lhs, __builtin_msa_fill_w(rhs));
143 |   StoreInt32x4(acc, tmp);
144 | }
145 |
146 | template <int Lane>
147 | inline void MulAddByRhsLane(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
148 |   static_assert(Lane >= 0 && Lane <= 3, "");
149 |   Int32x4 tmp = LoadInt32x4(acc);
150 |   tmp = workaround_msa_maddv_w(tmp, lhs, __builtin_msa_splati_w(rhs, Lane));
151 |   StoreInt32x4(acc, tmp);
152 | }
153 |
154 | template <>
155 | struct LoadContiguousImpl<RegBlockUint8<8, 8>> {
156 |   static RegBlockUint8<8, 8> Run(const std::uint8_t* src) {
157 |     RegBlockUint8<8, 8> result;
158 |     for (int i = 0; i < 4; i++) {
159 |       result.buf.reg[i] = LoadUint8x16(src + 16 * i);
160 |     }
161 |     return result;
162 |   }
163 | };
164 |
165 | template <>
166 | struct LoadContiguousImpl<RegBlockInt32<8, 8>> {
167 |   static RegBlockInt32<8, 8> Run(const std::int32_t* src) {
168 |     RegBlockInt32<8, 8> result;
169 |     for (int i = 0; i < 16; i++) {
170 |       result.buf.reg[i] = LoadInt32x4(src + 4 * i);
171 |     }
172 |     return result;
173 |   }
174 | };
175 |
176 | template <>
177 | struct LoadContiguousImpl<RegBlockInt16<8, 8>> {
178 |   static RegBlockInt16<8, 8> Run(const std::int16_t* src) {
179 |     RegBlockInt16<8, 8> result;
180 |     for (int i = 0; i < 8; i++) {
181 |       result.buf.reg[i] = LoadInt16x8(src + 8 * i);
182 |     }
183 |     return result;
184 |   }
185 | };
186 |
187 | }  // end namespace gemmlowp
188 |
189 | #include "simd_wrappers_common_neon_sse.h"
190 |
191 | #endif  // GEMMLOWP_INTERNAL_SIMD_WRAPPERS_MSA_H_
192 |
--------------------------------------------------------------------------------
/internal/simd_wrappers_sse.h:
--------------------------------------------------------------------------------
1 | // Copyright 2017 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
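The RegisterType traits above (and their SSE twins below) choose a storage type from the block's scalar count at compile time. A self-contained sketch of the same std::conditional trick, with a stand-in vector type replacing v4i32/__m128i:

#include <cstdint>
#include <type_traits>

struct FakeInt32x4 {};  // stand-in for the real SIMD register type

template <typename ScalarType, int ScalarCount>
struct RegisterType;

template <int ScalarCount>
struct RegisterType<std::int32_t, ScalarCount> {
  using Type = typename std::conditional<ScalarCount >= 4, FakeInt32x4,
                                         std::int32_t>::type;
};

// Blocks of at least 4 int32 lanes are backed by a SIMD register...
static_assert(
    std::is_same<RegisterType<std::int32_t, 4>::Type, FakeInt32x4>::value, "");
// ...while smaller blocks degrade to plain scalars.
static_assert(
    std::is_same<RegisterType<std::int32_t, 1>::Type, std::int32_t>::value,
    "");

int main() { return 0; }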
14 |
15 | // simd_wrappers_sse.h: SSE SIMD wrappers
16 |
17 | #ifndef GEMMLOWP_INTERNAL_SIMD_WRAPPERS_SSE_H_
18 | #define GEMMLOWP_INTERNAL_SIMD_WRAPPERS_SSE_H_
19 |
20 | #include <smmintrin.h>
21 |
22 | namespace gemmlowp {
23 |
24 | using Int32x4 = __m128i;
25 | using Int16x8 = __m128i;
26 | using Uint8x16 = __m128i;
27 |
28 | template <int ScalarCount>
29 | struct RegisterType<std::int32_t, ScalarCount> {
30 |   using Type =
31 |       typename std::conditional<ScalarCount >= 4, Int32x4, std::int32_t>::type;
32 | };
33 |
34 | template <int ScalarCount>
35 | struct RegisterType<std::int16_t, ScalarCount> {
36 |   using Type =
37 |       typename std::conditional<ScalarCount >= 8, Int16x8, std::int16_t>::type;
38 | };
39 |
40 | template <int ScalarCount>
41 | struct RegisterType<std::uint8_t, ScalarCount> {
42 |   using Type = typename std::conditional<
43 |       ScalarCount >= 16, Uint8x16,
44 |       typename std::conditional<ScalarCount >= 4, std::uint32_t,
45 |                                 std::uint8_t>::type>::type;
46 | };
47 |
48 | inline Int32x4 LoadInt32x4(const std::int32_t* src) {
49 |   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
50 | }
51 |
52 | inline Int16x8 LoadInt16x8(const std::int16_t* src) {
53 |   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
54 | }
55 |
56 | inline void StoreInt32x4(std::int32_t* dst, Int32x4 value) {
57 |   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), value);
58 | }
59 |
60 | inline void StoreInt16x8(std::int16_t* dst, Int16x8 value) {
61 |   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), value);
62 | }
63 |
64 | inline Uint8x16 LoadUint8x16(const std::uint8_t* src) {
65 |   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
66 | }
67 |
68 | inline void StoreUint8x16(std::uint8_t* dst, Uint8x16 value) {
69 |   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), value);
70 | }
71 |
72 | template <int Lane>
73 | std::int32_t GetLane(Int32x4 value) {
74 |   return _mm_extract_epi32(value, Lane);
75 | }
76 |
77 | template <int Lane>
78 | Int32x4 DupLane(Int32x4 value) {
79 |   return _mm_shuffle_epi32(value, _MM_SHUFFLE(Lane, Lane, Lane, Lane));
80 | }
81 |
82 | inline Int32x4 Mul(Int32x4 a, std::int32_t b) {
83 |   return Mul(a, Dup<Int32x4>(b));
84 | }
85 |
86 | inline Int32x4 Min(Int32x4 a, Int32x4 b) { return _mm_min_epi32(a, b); }
87 |
88 | inline Int32x4 Max(Int32x4 a, Int32x4 b) { return _mm_max_epi32(a, b); }
89 |
90 | inline Int32x4 SaturatingRoundingDoublingHighMul(Int32x4 a, std::int32_t b) {
91 |   return SaturatingRoundingDoublingHighMul(a, Dup<Int32x4>(b));
92 | }
93 |
94 | template <int Lane>
95 | Int32x4 MulByRhsLane(Int32x4 a, Int32x4 b) {
96 |   return Mul(a, DupLane<Lane>(b));
97 | }
98 |
99 | inline void MulAdd(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
100 |   *acc = Add(*acc, Mul(lhs, rhs));
101 | }
102 |
103 | inline void MulAdd(Int32x4 lhs, std::int32_t rhs, Int32x4* acc) {
104 |   *acc = Add(*acc, Mul(lhs, rhs));
105 | }
106 |
107 | template <int Lane>
108 | inline void MulAddByRhsLane(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
109 |   *acc = Add(*acc, MulByRhsLane<Lane>(lhs, rhs));
110 | }
111 |
112 | template <>
113 | struct LoadContiguousImpl<RegBlockUint8<8, 8>> {
114 |   static RegBlockUint8<8, 8> Run(const std::uint8_t* src) {
115 |     RegBlockUint8<8, 8> result;
116 |     for (int i = 0; i < 4; i++) {
117 |       result.buf.reg[i] = LoadUint8x16(src + 16 * i);
118 |     }
119 |     return result;
120 |   }
121 | };
122 |
123 | template <>
124 | struct LoadContiguousImpl<RegBlockInt32<8, 8>> {
125 |   static RegBlockInt32<8, 8> Run(const std::int32_t* src) {
126 |     RegBlockInt32<8, 8> result;
127 |     for (int i = 0; i < 16; i++) {
128 |       result.buf.reg[i] = LoadInt32x4(src + 4 * i);
129 |     }
130 |     return result;
131 |   }
132 | };
133 |
134 | template <>
135 | struct LoadContiguousImpl<RegBlockInt16<8, 8>> {
136 |   static RegBlockInt16<8, 8> Run(const std::int16_t* src) {
137 |     RegBlockInt16<8, 8> result;
138 |     for (int i = 0; i < 8; i++) {
139 |
      result.buf.reg[i] = LoadInt16x8(src + 8 * i);
140 |     }
141 |     return result;
142 |   }
143 | };
144 |
145 | }  // end namespace gemmlowp
146 |
147 | #include "simd_wrappers_common_neon_sse.h"
148 |
149 | #endif  // GEMMLOWP_INTERNAL_SIMD_WRAPPERS_SSE_H_
150 |
--------------------------------------------------------------------------------
/internal/single_thread_gemm.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // single_thread_gemm.h: Single-threaded GEMM implementation.
16 | // This is a good place to start reading code, as it shows the overall
17 | // structure of a GEMM and is much simpler than multi_thread_gemm.h.
18 |
19 | #ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
20 | #define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
21 |
22 | #include <cassert>
23 |
24 | #include "../public/map.h"
25 | #include "allocator.h"
26 | #include "compute.h"
27 | #include "kernel.h"
28 | #include "pack.h"
29 | #include "unpack.h"
30 |
31 | #ifdef GEMMLOWP_PROFILING_SIZES
32 | #ifndef GEMMLOWP_PROFILING
33 | #error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
34 | #endif
35 | #include <string>
36 | #include <unordered_map>
37 | #endif
38 |
39 | namespace gemmlowp {
40 |
41 | class SingleThreadGemmContext {
42 |  public:
43 |   Allocator* allocator() { return &allocator_; }
44 |
45 |   void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
46 |   void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
47 |   void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }
48 |
49 |   int l1_bytes_to_use() const { return l1_bytes_to_use_; }
50 |   int l2_bytes_to_use() const { return l2_bytes_to_use_; }
51 |   float l2_rhs_factor() const { return l2_rhs_factor_; }
52 |
53 |  protected:
54 |   Allocator allocator_;
55 |
56 |   // The cache configuration to use.
57 |   int l1_bytes_to_use_ = kDefaultL1CacheSize;
58 |   int l2_bytes_to_use_ = kDefaultL2CacheSize;
59 |   float l2_rhs_factor_ = kDefaultL2RhsFactor;
60 | };
61 |
62 | template <typename KernelFormat, typename InputScalar, typename OutputScalar,
63 |           typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
64 |           MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
65 |           typename OutputPipelineType>
66 | void SingleThreadGemm(SingleThreadGemmContext* context,
67 |                       const KernelBase& kernel,
68 |                       const MatrixMap<const InputScalar, LhsOrder>& lhs,
69 |                       const MatrixMap<const InputScalar, RhsOrder>& rhs,
70 |                       MatrixMap<OutputScalar, ResultOrder>* result,
71 |                       const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
72 |                       const OutputPipelineType& output_pipeline) {
73 |   ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");
74 |
75 |   assert(lhs.cols() == rhs.rows());
76 |
77 |   int rows = result->rows();
78 |   int cols = result->cols();
79 |   int depth = lhs.cols();
80 |
81 |   // zero sizes should have been caught earlier and early-returned.
82 |   assert(rows > 0);
83 |   assert(cols > 0);
84 |   assert(depth > 0);
85 |
86 |   // The case of rows<cols should have been caught earlier and transposed.
87 |   assert(rows >= cols);
88 |
89 |   Allocator* allocator = context->allocator();
90 |
91 |   BlockParams block_params;
92 |   block_params.Init(
93 |       rows, cols, depth, 1, context->l1_bytes_to_use(),
94 |       context->l2_bytes_to_use(), context->l2_rhs_factor());
95 |
96 | #ifdef GEMMLOWP_PROFILING_SIZES
97 |   // Using a static map of label strings. Not reentrant at all!
98 |   static std::unordered_map<std::uint64_t, std::string> labels_map;
99 |   std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
100 |                              (static_cast<std::uint64_t>(depth) << 16) ^
101 |                              (static_cast<std::uint64_t>(cols) << 32);
102 |   if (!labels_map.count(sizes_hash)) {
103 |     char label[256];
104 |     snprintf(label, sizeof(label),
105 |              "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
106 |              "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
107 |              rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
108 |              block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
109 |              block_params.l1_cols);
110 |     labels_map[sizes_hash] = label;
111 |   }
112 |   ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
113 | #endif
114 |
115 |   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
116 |                                                          block_params);
117 |   PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
118 |                                                          block_params);
119 |
120 |   PackedResult packed_result(allocator, block_params);
121 |
122 |   allocator->Commit();
123 |
124 |   const bool pack_rhs_once = block_params.l2_cols >= cols;
125 |
126 |   if (pack_rhs_once) {
127 |     PackRhs(&packed_rhs, rhs);
128 |   }
129 |
130 |   for (int r = 0; r < rows; r += block_params.l2_rows) {
131 |     int rs = std::min(block_params.l2_rows, rows - r);
132 |
133 |     PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
134 |
135 |     for (int c = 0; c < cols; c += block_params.l2_cols) {
136 |       int cs = std::min(block_params.l2_cols, cols - c);
137 |
138 |       if (!pack_rhs_once) {
139 |         PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
140 |       }
141 |
142 |       Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
143 |               depth);
144 |
145 |       UnpackResult<KernelFormat>(
146 |           result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
147 |           packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
148 |           lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
149 |     }
150 |   }
151 |
152 |   allocator->Decommit();
153 | }
154 |
155 | }  // namespace gemmlowp
156 |
157 | #endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
158 |
--------------------------------------------------------------------------------
/jni/Android.mk:
--------------------------------------------------------------------------------
1 | LOCAL_PATH := $(call my-dir)
2 |
3 | include $(CLEAR_VARS)
4 |
5 | LOCAL_ARM_NEON := true
6 | LOCAL_MODULE := correctness_meta_gemm
7 | LOCAL_SRC_FILES := ../test/correctness_meta_gemm.cc
8 |
9 | include $(BUILD_EXECUTABLE)
10 |
11 | include $(CLEAR_VARS)
12 |
13 | LOCAL_ARM_NEON := true
14 | LOCAL_MODULE := benchmark_meta_gemm
15 | LOCAL_CFLAGS := -DNDEBUG -DGEMMLOWP_USE_META_FASTPATH
16 | LOCAL_SRC_FILES := ../test/benchmark_meta_gemm.cc ../eight_bit_int_gemm/eight_bit_int_gemm.cc
17 |
18 | include $(BUILD_EXECUTABLE)
19 |
20 | include $(CLEAR_VARS)
21 |
22 | LOCAL_ARM_NEON := true
23 | LOCAL_MODULE := benchmark
24 | LOCAL_SRC_FILES := ../test/benchmark.cc
25 |
26 | include $(BUILD_EXECUTABLE)
27 |
--------------------------------------------------------------------------------
/jni/Application.mk:
--------------------------------------------------------------------------------
1 |
NDK_TOOLCHAIN_VERSION := clang
2 | APP_STL := gnustl_static
3 | APP_ABI := armeabi-v7a
4 | APP_CPPFLAGS := -std=c++11 -Wall -Wextra -pedantic -Wno-unused-variable -Wno-unused-parameter
5 | APP_LDFLAGS := -L$(SYSROOT)/usr/lib -lstdc++ -latomic
6 | APP_PIE := true
7 |
--------------------------------------------------------------------------------
/meta/README:
--------------------------------------------------------------------------------
1 | METAPROGRAMMED GEMM
2 | ===================
3 |
4 | The two main goals of this library are:
5 | - providing a new matrix multiplication kernel.
6 | - providing optimized codepaths for as many user scenarios as possible, without
7 |   enforcing additional input data constraints (padding, sizes, strides, layout).
8 |
9 | To enable this code, add -DGEMMLOWP_USE_META_FASTPATH to your build setup.
10 |
11 | The new kernel
12 | --------------
13 |
14 | The multiplication kernel - the innermost loop of the matrix multiplication,
15 | responsible for the row/column products - was rewritten. The new code
16 | produces a 3x3 result patch and processes the row/column arrays in 8-element
17 | packs (the kernel 'shape' is 3x3x8, compared to the previous 12x4x2). By using
18 | specialized 8-bit multiplication, accumulating into vector aggregators, and then
19 | reducing with parallel horizontal addition, we devised code that achieves
20 | higher arithmetical density (arithmetical operations per assembly instruction).
21 | The arithmetical performance of the new kernel exceeds 18 GOps/s on a vanilla
22 | Nexus 5 phone (which is practically peak for this device).
23 |
24 | In order to feed the kernel with input data and minimize the number of
25 | instructions other than the arithmetical operations, a different packing
26 | scheme was used. Three rows (columns) are interleaved every 8 elements, so that
27 | they can be read from contiguous memory in one op inside the kernel. Additional
28 | memory preload hints are inserted into the kernel to hide memory
29 | latency behind arithmetical operations.
30 |
31 | Generated code
32 | --------------
33 |
34 | The basic kernel used in this approach is of shape 3x3x8. Obviously this
35 | kernel can be easily applied to multiplications where the matrix sizes are
36 | M x K and K x N, with M and N multiples of 3 and K a multiple of 8.
37 |
38 | We rejected two obvious solutions: padding the matrix sizes to appropriate
39 | values, or using the reference implementation for the leftovers. Neither did
40 | we consider enforcing extra constraints on the caller.
41 |
42 | In order to allow all matrix sizes, kernels processing all combinations of
43 | 1, 2 or 3 rows and 1, 2 or 3 columns are required. Similarly, to allow all
44 | possible depths, the leftover values (up to 7 elements) need to be handled.
45 |
46 | Instead of writing those kernels by hand we decided to generate them with
47 | some Python scripts. Nine versions of the multiplication kernel were prepared.
48 | Additionally, packing and unpacking code for different row/column counts and
49 | depth leftovers was generated. Finally, different code was generated for
50 | aligned and unaligned memory reads/writes.
51 |
52 | Using those multiplication and packing/unpacking primitives, 144 gemm function
53 | versions were prepared, along with one high-level gemm function that switches
54 | between those preoptimized versions at runtime (see the sketch below).
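The figure of 144 follows from the combinatorics above: 3 row-leftover cases x 3 column-leftover cases (the nine kernels) x 8 depth-leftover cases x 2 for aligned versus unaligned access = 144. A minimal sketch of the dispatch idea, with hypothetical names (the real generated dispatcher enumerates every combination):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for one of the 144 generated functions
// (3 row leftovers x 3 column leftovers x 8 depth leftovers x 2 alignments).
template <int kRowLeftovers, int kColLeftovers, int kDepthLeftovers>
void GemmSpecialized(int m, int n, int k) {
  std::printf("specialized gemm for leftovers (%d, %d, %d), sizes %dx%dx%d\n",
              kRowLeftovers, kColLeftovers, kDepthLeftovers, m, n, k);
}

// The high-level entry point computes the leftover sizes once and hands off
// to a preoptimized version, so no leftover branching remains in the inner
// loops. A real dispatcher covers all 9 * 8 (* 2 for alignment) cases.
void GemmDispatchSketch(int m, int n, int k) {
  switch (((m % 3) * 3 + (n % 3)) * 8 + (k % 8)) {
    case ((1 * 3) + 2) * 8 + 3:
      GemmSpecialized<1, 2, 3>(m, n, k);
      break;
    // ... remaining combinations ...
    default:
      GemmSpecialized<0, 0, 0>(m, n, k);
      break;
  }
}

int main() {
  GemmDispatchSketch(10, 11, 19);  // leftovers (1, 2, 3)
  return 0;
}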
55 |
56 | This approach allowed moving all unnecessary branching and conditional execution
57 | outside of the inner loops. It also allowed removing all short loops required
58 | for leftover handling. Finally, aligned memory reads/writes are used wherever
59 | the provided input data allows.
60 |
61 | Results
62 | -------
63 |
64 | The library shows up to 35% faster gemm execution in some cases (e.g. the
65 | ImageNet benchmark).
66 |
67 | Files
68 | -----
69 |
70 | single_thread_gemm.h
71 | -- generated ARM/NEON 8bit x 8bit gemm implementation. Contains all the
72 |    optimized, unrolled and curried pack/unpack and multiply procedures, and
73 |    a single gemm function that switches between the optimized versions based
74 |    on the runtime parameters.
75 |
76 | multi_thread_gemm.h
77 | -- a simple parallelization scheme for the gemm function.
78 |
79 | generators/gemm_NxMxK_neon.py
80 | -- script that generates the single_thread_gemm.h header library.
81 |    Usage: python gemm_NxMxK_neon.py > single_thread_gemm.h
82 |
--------------------------------------------------------------------------------
/meta/base.h:
--------------------------------------------------------------------------------
1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef GEMMLOWP_META_BASE_H_
16 | #define GEMMLOWP_META_BASE_H_
17 |
18 | #include <cassert>
19 | #include <cstdint>
20 |
21 | #include "../internal/common.h"
22 |
23 | namespace gemmlowp {
24 | namespace meta {
25 |
26 | template <int align>
27 | inline int AlignTo(int value) {
28 |   return ((value + align - 1) / align) * align;
29 | }
30 |
31 | inline int AlignTo(int align, int value) {
32 |   return ((value + align - 1) / align) * align;
33 | }
34 |
35 | template <typename Kernel_, typename OutputStream_>
36 | struct FusedKernelParams {
37 |  public:
38 |   typedef Kernel_ Kernel;
39 |   typedef OutputStream_ OutputStream;
40 |
41 |   Kernel kernel;
42 |   OutputStream output_stream;
43 | };
44 |
45 | template <typename InType_, typename OutType_, typename LeftStream_,
46 |           typename RightStream_, typename Kernel_, typename OutputStream_>
47 | struct GemmParams {
48 |  public:
49 |   typedef InType_ InType;
50 |   typedef OutType_ OutType;
51 |   typedef LeftStream_ LeftStream;
52 |   typedef RightStream_ RightStream;
53 |   typedef Kernel_ Kernel;
54 |   typedef OutputStream_ OutputStream;
55 |
56 |   typedef FusedKernelParams<Kernel, OutputStream> FusedKernel;
57 |
58 |   // Common parameters.
59 |
60 |   int m;
61 |   int n;
62 |   int k;
63 |
64 |   const InType* lhs;
65 |   const InType* rhs;
66 |   OutType* result;
67 |   std::uint8_t* scratch;
68 |
69 |   // Specialized parameters.
70 |
71 |   LeftStream left_stream;
72 |   RightStream right_stream;
73 |   FusedKernel fused_kernel;
74 | };
75 |
76 | template <typename InType, int lanes_count, int pack_size, int leftovers,
77 |           typename StreamParams>
78 | class Stream {
79 |  public:
80 |   static void Pack(const InType* in, const StreamParams& params, InType* out);
81 |
82 |   static int UnpackedAdvance(const StreamParams& params);
83 |
84 |   static int PackedAdvance(const StreamParams& params);
85 |
86 |   static int UnpackedStride(const StreamParams& params);
87 |
88 |   static int PackedStride(const StreamParams& params);
89 | };
90 |
91 | template <typename InType, typename StreamType>
92 | class StreamUtil {
93 |  public:
94 |   static const InType* Offset(const StreamType& params, const InType* source,
95 |                               int offset_stride, int offset_advance);
96 |
97 |   static int Scratch(const StreamType& params, int lanes);
98 | };
99 |
100 | template <typename InType, typename OutType, typename Kernel,
101 |           typename OutputStream, int kernel_m, int kernel_n, int pack_size>
102 | class MulKernel {
103 |  public:
104 |   static void Multiply(const InType* lhs, const InType* rhs,
105 |                        const FusedKernelParams<Kernel, OutputStream>& params,
106 |                        OutType* result);
107 | };
108 |
109 | template <typename InType_, typename OutType_, typename Kernel_>
110 | struct Transform1DParams {
111 |   typedef InType_ InType;
112 |   typedef OutType_ OutType;
113 |   typedef Kernel_ Kernel;
114 |
115 |   const InType* input;
116 |   OutType* output;
117 |   std::uint8_t* scratch;
118 |
119 |   Kernel kernel;
120 | };
121 |
122 | template <typename InType, typename OutType, typename Kernel,
123 |           int kernel_size, int leftovers>
124 | class Transform1DKernel {
125 |  public:
126 |   static void Transform(const InType* input, const Kernel& params,
127 |                         OutType* output);
128 | };
129 |
130 | template <typename InType, typename OutType, typename Transform>
131 | class Transform1DUtil {
132 |  public:
133 |   static int EstimateComputeCost(const Transform& params);
134 |
135 |   static const InType* OffsetInput(const Transform& params, const InType* input,
136 |                                    int offset);
137 |
138 |   static OutType* OffsetOutput(const Transform& params, OutType* output,
139 |                                int offset);
140 | };
141 |
142 | }  // namespace meta
143 | }  // namespace gemmlowp
144 |
145 | #endif  // GEMMLOWP_META_BASE_H_
146 |
--------------------------------------------------------------------------------
/meta/generators/common.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """."""
15 | import collections
16 |
17 | _HEADER_COPYRIGHT = (
18 |     '''// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
19 | //
20 | // Licensed under the Apache License, Version 2.0 (the "License");
21 | // you may not use this file except in compliance with the License.
22 | // You may obtain a copy of the License at
23 | //
24 | //     http://www.apache.org/licenses/LICENSE-2.0
25 | //
26 | // Unless required by applicable law or agreed to in writing, software
27 | // distributed under the License is distributed on an "AS IS" BASIS,
28 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29 | // See the License for the specific language governing permissions and
30 | // limitations under the License.
31 | ''')
32 |
33 |
34 | def GenerateHeader(cc, header_name, preprocessor_directive):
35 |   cc.EmitCodeNoSemicolon(_HEADER_COPYRIGHT)
36 |   cc.EmitHeaderBegin(header_name)
37 |
38 |   cc.EmitPreprocessor1('ifdef', preprocessor_directive)
39 |   cc.EmitNewline()
40 |
41 |   cc.EmitInclude('<cassert>')
42 |   cc.EmitInclude('<cstdint>')
43 |   cc.EmitNewline()
44 |
45 |
46 | def GenerateFooter(cc, message):
47 |   cc.EmitPreprocessor('else')
48 |   cc.EmitPreprocessor1('warning', '"%s"' % message)
49 |   cc.EmitPreprocessor('endif')
50 |   cc.EmitNewline()
51 |   cc.EmitHeaderEnd()
52 |
53 |
54 | def GenerateDebugLog(cc, message):
55 |   cc.EmitPreprocessor1('ifdef', 'DEBUG')
56 |   cc.EmitPreprocessor1('ifdef', 'DEBUG_METAGEMM_VERBOSE')
57 |   cc.EmitCode('std::cout << __FILE__ << \"(\" << __LINE__ << \") %s\" '
58 |               '<< std::endl << std::flush' % message)
59 |   cc.EmitPreprocessor('endif')
60 |   cc.EmitPreprocessor('endif')
61 |
62 |
63 | def _TemplateName(base, params):
64 |   return '%s<%s>' % (base, ', '.join(map(str, params)))
65 |
66 |
67 | class StreamGenerator(object):
68 |   """."""
69 |
70 |   def __init__(self, emitter, name):
71 |     self.name = name
72 |     self.emitter = emitter
73 |
74 |   def SpecializeStream(self, in_type, lanes_count, pack_size, leftovers):
75 |     if isinstance(getattr(self, 'EmitPack', None), collections.Callable):
76 |       template_params = [in_type, lanes_count, pack_size, leftovers, self.name]
77 |       self.emitter.EmitMemberFunctionBegin(
78 |           'Stream', [], template_params, 'Pack',
79 |           [['const %s*' % in_type, 'in'], ['const %s&' % self.name, 'params'],
80 |            ['%s*' % in_type, 'out']], 'inline void')
81 |       GenerateDebugLog(self.emitter,
82 |                        '%s::Pack()' % _TemplateName(self.name, template_params))
83 |       self.EmitPack(in_type, lanes_count, pack_size, leftovers)
84 |       self.emitter.EmitFunctionEnd()
85 |
86 |
87 | class MulKernelGenerator(object):
88 |   """."""
89 |
90 |   def __init__(self, emitter, kernel_name, output_stream_name):
91 |     self.kernel_name = kernel_name
92 |     self.output_stream_name = output_stream_name
93 |     self.emitter = emitter
94 |
95 |   def SpecializeMulKernel(self, in_type, out_type, kernel_m, kernel_n,
96 |                           pack_size):
97 |     """Generates the kernel wrapped in a MulKernel template specialization."""
98 |     template_params = [
99 |         in_type, out_type, self.kernel_name, self.output_stream_name, kernel_m,
100 |         kernel_n, pack_size
101 |     ]
102 |     self.emitter.EmitMemberFunctionBegin(
103 |         'MulKernel', [], template_params, 'Multiply',
104 |         [['const %s*' % in_type, 'lhs'], ['const %s*' % in_type, 'rhs'], [
105 |             'const FusedKernelParams<%s, %s>&' % (self.kernel_name,
106 |                                                   self.output_stream_name),
107 |             'params'
108 |         ], ['%s*' % out_type, 'result']], 'inline void')
109 |     GenerateDebugLog(self.emitter, '%s::Multiply()' %
110 |                      _TemplateName(self.kernel_name + self.output_stream_name,
111 |                                    template_params))
112 |     self.EmitMultiply(in_type, out_type, kernel_m, kernel_n, pack_size)
113 |     self.emitter.EmitFunctionEnd()
114 |
115 |
116 | class Transform1DKernelGenerator(object):
117 |   """."""
118 |
119 |   def __init__(self, emitter, kernel_name):
120 |     self.kernel_name = kernel_name
121 |     self.emitter = emitter
122 |
123 |   def SpecializeTransform1DKernel(self, in_type, out_type, kernel_size,
124 |                                   leftovers):
125 |     """Generates the kernel wrapped in a Transform1DKernel specialization."""
126 |     template_params = [
127 |         in_type, out_type, self.kernel_name, kernel_size, leftovers
128 |     ]
129 |     self.emitter.EmitMemberFunctionBegin(
130 |         'Transform1DKernel', [], template_params, 'Transform',
131 |         [['const %s*' % in_type,
'input'], 132 | ['const %s&' % self.kernel_name, 'params'], 133 | ['%s*' % out_type, 'output']], 'inline void') 134 | GenerateDebugLog(self.emitter, '%s::Transform()' % 135 | _TemplateName(self.kernel_name, template_params)) 136 | self.EmitTransform(in_type, out_type, kernel_size, leftovers) 137 | self.emitter.EmitFunctionEnd() 138 | -------------------------------------------------------------------------------- /meta/generators/metagemm_generate_headers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python streams_arm_32.py > ../streams_arm_32.h 3 | python streams_arm_64.py > ../streams_arm_64.h 4 | python quantized_mul_kernels_arm_32.py > ../quantized_mul_kernels_arm_32.h 5 | python quantized_mul_kernels_arm_64.py > ../quantized_mul_kernels_arm_64.h 6 | python transform_kernels_arm_32.py > ../transform_kernels_arm_32.h 7 | python transform_kernels_arm_64.py > ../transform_kernels_arm_64.h 8 | 9 | -------------------------------------------------------------------------------- /meta/generators/quantized_mul_kernels_arm_32.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter 19 | import quantized_mul_kernels_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_quantized_mul_kernels_arm_32', 26 | 'GEMMLOWP_NEON_32') 27 | 28 | cc.EmitNamespaceBegin('gemmlowp') 29 | cc.EmitNamespaceBegin('meta') 30 | cc.EmitNewline() 31 | 32 | shapes = [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), 33 | (2, 1), (2, 2), (2, 3), (2, 4), (3, 1), (3, 2), (3, 3)] 34 | 35 | quantized_mul_kernels_common.GenerateKernels(cc, 36 | neon_emitter.NeonEmitter(), 37 | shapes) 38 | 39 | cc.EmitNamespaceEnd() 40 | cc.EmitNamespaceEnd() 41 | cc.EmitNewline() 42 | 43 | common.GenerateFooter(cc, 'Meta gemm for arm32 requires: GEMMLOWP_NEON_32!') 44 | 45 | 46 | if __name__ == '__main__': 47 | Main() 48 | -------------------------------------------------------------------------------- /meta/generators/quantized_mul_kernels_arm_64.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Generates the arm64 headers used by the gemm/gemv lib."""
15 |
16 | import cc_emitter
17 | import common
18 | import neon_emitter_64
19 | import quantized_mul_kernels_common
20 |
21 |
22 | def Main():
23 |   """."""
24 |   cc = cc_emitter.CCEmitter()
25 |   common.GenerateHeader(cc, 'gemmlowp_meta_quantized_mul_kernels_arm_64',
26 |                         'GEMMLOWP_NEON_64')
27 |
28 |   cc.EmitNamespaceBegin('gemmlowp')
29 |   cc.EmitNamespaceBegin('meta')
30 |   cc.EmitNewline()
31 |
32 |   shapes = [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8),
33 |             (2, 1), (2, 2), (2, 3), (2, 4), (3, 1), (3, 2), (3, 3)]
34 |
35 |   quantized_mul_kernels_common.GenerateKernels(cc,
36 |                                                neon_emitter_64.NeonEmitter64(),
37 |                                                shapes)
38 |
39 |   cc.EmitNamespaceEnd()
40 |   cc.EmitNamespaceEnd()
41 |   cc.EmitNewline()
42 |
43 |   common.GenerateFooter(cc, 'Meta gemm for arm64 requires: GEMMLOWP_NEON_64!')
44 |
45 |
46 | if __name__ == '__main__':
47 |   Main()
48 |
--------------------------------------------------------------------------------
/meta/generators/streams_arm_32.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Generates the arm32 headers used by the gemm/gemv lib."""
15 |
16 | import cc_emitter
17 | import common
18 | import neon_emitter
19 | import streams_common
20 |
21 |
22 | def Main():
23 |   """."""
24 |   cc = cc_emitter.CCEmitter()
25 |   common.GenerateHeader(cc, 'gemmlowp_meta_streams_arm_32', 'GEMMLOWP_NEON_32')
26 |
27 |   cc.EmitNamespaceBegin('gemmlowp')
28 |   cc.EmitNamespaceBegin('meta')
29 |   cc.EmitNewline()
30 |
31 |   streams_common.GenerateUInt8x8Streams(cc, neon_emitter.NeonEmitter(), 8)
32 |
33 |   cc.EmitNamespaceEnd()
34 |   cc.EmitNamespaceEnd()
35 |   cc.EmitNewline()
36 |
37 |   common.GenerateFooter(cc, 'Meta gemm for arm32 requires: GEMMLOWP_NEON_32!')
38 |
39 |
40 | if __name__ == '__main__':
41 |   Main()
42 |
--------------------------------------------------------------------------------
/meta/generators/streams_arm_64.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
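For orientation, every generated header shares the skeleton that GenerateHeader() and GenerateFooter() in common.py emit: a header guard, the architecture #ifdef, and a fallback #warning. Sketched here for streams_arm_32.h, with the generated body elided and the two emitted includes assumed to be <cassert> and <cstdint>:

// Sketch of a generated header's structure, not verbatim generator output.
#ifndef GEMMLOWP_META_STREAMS_ARM_32_H_
#define GEMMLOWP_META_STREAMS_ARM_32_H_

#ifdef GEMMLOWP_NEON_32

#include <cassert>
#include <cstdint>

namespace gemmlowp {
namespace meta {

// ... generated Stream<...>::Pack() specializations ...

}  // namespace meta
}  // namespace gemmlowp

#else
#warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!"
#endif

#endif  // GEMMLOWP_META_STREAMS_ARM_32_H_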
14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter_64 19 | import streams_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_streams_arm_64', 'GEMMLOWP_NEON_64') 26 | 27 | cc.EmitNamespaceBegin('gemmlowp') 28 | cc.EmitNamespaceBegin('meta') 29 | cc.EmitNewline() 30 | 31 | streams_common.GenerateUInt8x8Streams(cc, neon_emitter_64.NeonEmitter64(), 8) 32 | 33 | cc.EmitNamespaceEnd() 34 | cc.EmitNamespaceEnd() 35 | cc.EmitNewline() 36 | 37 | common.GenerateFooter(cc, 'Meta gemm for arm64 requires: GEMMLOWP_NEON_64!') 38 | 39 | 40 | if __name__ == '__main__': 41 | Main() 42 | -------------------------------------------------------------------------------- /meta/generators/transform_kernels_arm_32.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter 19 | import transform_kernels_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_transform_kernels_arm_32', 26 | 'GEMMLOWP_NEON_32') 27 | 28 | cc.EmitNamespaceBegin('gemmlowp') 29 | cc.EmitNamespaceBegin('meta') 30 | cc.EmitNewline() 31 | 32 | transform_kernels_common.GenerateKernels(cc, 33 | neon_emitter.NeonEmitter(), 34 | [(16, x) for x in range(16)]) 35 | 36 | cc.EmitNamespaceEnd() 37 | cc.EmitNamespaceEnd() 38 | cc.EmitNewline() 39 | 40 | common.GenerateFooter(cc, 'Meta gemm for arm32 requires: GEMMLOWP_NEON_32!') 41 | 42 | 43 | if __name__ == '__main__': 44 | Main() 45 | -------------------------------------------------------------------------------- /meta/generators/transform_kernels_arm_64.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter_64 19 | import transform_kernels_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_transform_kernels_arm_64', 26 | 'GEMMLOWP_NEON_64') 27 | 28 | cc.EmitNamespaceBegin('gemmlowp') 29 | cc.EmitNamespaceBegin('meta') 30 | cc.EmitNewline() 31 | 32 | transform_kernels_common.GenerateKernels(cc, 33 | neon_emitter_64.NeonEmitter64(), 34 | [(16, x) for x in range(16)]) 35 | 36 | cc.EmitNamespaceEnd() 37 | cc.EmitNamespaceEnd() 38 | cc.EmitNewline() 39 | 40 | common.GenerateFooter(cc, 'Meta gemm for arm64 requires: GEMMLOWP_NEON_64!') 41 | 42 | 43 | if __name__ == '__main__': 44 | Main() 45 | -------------------------------------------------------------------------------- /meta/legacy_multi_thread_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // multi_thread_common.h: Multithreading code shared by different meta gemm 16 | // versions. 17 | 18 | #ifndef GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 19 | #define GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 20 | 21 | #include "../internal/multi_thread_gemm.h" 22 | 23 | namespace gemmlowp { 24 | namespace meta { 25 | namespace internal { 26 | 27 | const std::int32_t kMinTaskSize = 16000; 28 | const std::int32_t kMinTaskDimension = 4; 29 | 30 | struct TaskRect { 31 | std::int32_t m_offset; 32 | std::int32_t m; 33 | std::int32_t n_offset; 34 | std::int32_t n; 35 | 36 | TaskRect(std::int32_t m_offset, std::int32_t m, std::int32_t n_offset, 37 | std::int32_t n) 38 | : m_offset(m_offset), m(m), n_offset(n_offset), n(n) {} 39 | }; 40 | 41 | template 42 | struct MetaTask : gemmlowp::Task { 43 | std::uint8_t* scratch; 44 | const IN_TYPE* lhs; 45 | const IN_TYPE* rhs; 46 | TaskRect task_rect; 47 | std::int32_t k; 48 | OUT_TYPE* result; 49 | std::int32_t result_stride; 50 | const F& operation; 51 | 52 | MetaTask(std::uint8_t* scratch, const IN_TYPE* lhs, const IN_TYPE* rhs, 53 | const TaskRect& task_rect, std::int32_t k, OUT_TYPE* result, 54 | std::int32_t result_stride, const F& operation) 55 | : scratch(scratch), 56 | lhs(lhs), 57 | rhs(rhs), 58 | task_rect(task_rect), 59 | k(k), 60 | result(result), 61 | result_stride(result_stride), 62 | operation(operation) {} 63 | 64 | void Run() override { 65 | const IN_TYPE* task_lhs = lhs + task_rect.m_offset * k; 66 | const IN_TYPE* task_rhs = rhs + task_rect.n_offset * k; 67 | OUT_TYPE* task_result = 68 | result + task_rect.m_offset * result_stride + task_rect.n_offset; 69 | operation.ExecuteMatrixMatrix(scratch, task_lhs, task_rhs, task_rect.m, 70 | task_rect.n, k, task_result, result_stride); 71 | } 72 | }; 73 | 74 | std::int32_t ResolveMaxThreads(std::int32_t max_threads) { 75 | if (max_threads == 0) { 76 | 
static const int hardware_threads_count = 77 | static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); 78 | return hardware_threads_count; 79 | } 80 | return max_threads; 81 | } 82 | 83 | inline void PrepareTasks(std::int32_t max_tasks, std::int32_t m, std::int32_t n, 84 | std::int32_t k, std::vector<TaskRect>* tasks) { 85 | const std::int32_t max_tasks_by_size = (m * n * k) / kMinTaskSize; 86 | const std::int32_t max_tasks_m = m / kMinTaskDimension; 87 | const std::int32_t max_tasks_n = n / kMinTaskDimension; 88 | const std::int32_t max_tasks_dimension = std::max(max_tasks_m, max_tasks_n); 89 | 90 | std::int32_t real_tasks = std::max( 91 | 1, std::min(max_tasks, std::min(max_tasks_by_size, max_tasks_dimension))); 92 | 93 | if (real_tasks == 1) { 94 | tasks->push_back(TaskRect(0, m, 0, n)); 95 | return; 96 | } 97 | 98 | if (max_tasks_m > max_tasks_n) { 99 | const std::int32_t m_chunk = m / real_tasks; 100 | for (int i = 0; i < real_tasks - 1; ++i) { 101 | tasks->push_back(TaskRect(i * m_chunk, m_chunk, 0, n)); 102 | } 103 | const std::int32_t last_m_offset = (real_tasks - 1) * m_chunk; 104 | tasks->push_back(TaskRect(last_m_offset, m - last_m_offset, 0, n)); 105 | } else { 106 | const std::int32_t n_chunk = n / real_tasks; 107 | for (int i = 0; i < real_tasks - 1; ++i) { 108 | tasks->push_back(TaskRect(0, m, i * n_chunk, n_chunk)); 109 | } 110 | const std::int32_t last_n_offset = (real_tasks - 1) * n_chunk; 111 | tasks->push_back(TaskRect(0, m, last_n_offset, n - last_n_offset)); 112 | } 113 | } 114 | 115 | template <typename IN_TYPE, typename OUT_TYPE, typename F> 116 | void MultiThreadedMatrixMatrix(gemmlowp::WorkersPool* pool, 117 | std::int32_t max_threads, std::uint8_t* scratch, 118 | const IN_TYPE* lhs, const IN_TYPE* rhs, 119 | std::int32_t m, std::int32_t n, std::int32_t k, 120 | OUT_TYPE* result, std::int32_t result_stride, 121 | const F& operation) { 122 | max_threads = internal::ResolveMaxThreads(max_threads); 123 | 124 | std::vector<internal::TaskRect> task_rects; 125 | internal::PrepareTasks(max_threads, m, n, k, &task_rects); 126 | 127 | if (task_rects.size() == 1) { 128 | operation.ExecuteMatrixMatrix(scratch, lhs, rhs, m, n, k, result, 129 | result_stride); 130 | return; 131 | } 132 | 133 | std::uint8_t* task_scratch = scratch; 134 | std::int32_t scratch_per_thread = operation.ScratchPerThread(m, n, k); 135 | std::vector<gemmlowp::Task*> tasks; 136 | std::for_each( 137 | task_rects.begin(), task_rects.end(), 138 | [&tasks, &task_scratch, lhs, rhs, k, result, result_stride, &operation, 139 | scratch_per_thread](internal::TaskRect& rect) { 140 | tasks.push_back(new internal::MetaTask<IN_TYPE, OUT_TYPE, F>( 141 | task_scratch, lhs, rhs, rect, k, result, result_stride, operation)); 142 | task_scratch += scratch_per_thread; 143 | }); 144 | pool->Execute(tasks); 145 | } 146 | 147 | } // namespace internal 148 | } // namespace meta 149 | } // namespace gemmlowp 150 | 151 | #endif // GEMMLOWP_META_LEGACY_MULTI_THREAD_COMMON_H_ 152 | -------------------------------------------------------------------------------- /meta/legacy_operations_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_OPERATIONS_COMMON_H_ 16 | #define GEMMLOWP_META_OPERATIONS_COMMON_H_ 17 | 18 | class Quantized8BitOperation { 19 | public: 20 | Quantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, 21 | std::int32_t sum_offset, std::int32_t multiplier, 22 | std::int32_t shift) 23 | : lhs_offset(lhs_offset), 24 | rhs_offset(rhs_offset), 25 | sum_offset(sum_offset), 26 | multiplier(multiplier), 27 | shift(shift) {} 28 | 29 | protected: 30 | std::int32_t lhs_offset; 31 | std::int32_t rhs_offset; 32 | std::int32_t sum_offset; 33 | std::int32_t multiplier; 34 | std::int32_t shift; 35 | }; 36 | 37 | class FloatOperation { 38 | public: 39 | FloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, 40 | float result_offset) 41 | : lhs_offset(lhs_offset), 42 | rhs_offset(rhs_offset), 43 | result_offset(result_offset) {} 44 | 45 | protected: 46 | std::int32_t lhs_offset; 47 | std::int32_t rhs_offset; 48 | float result_offset; 49 | }; 50 | 51 | class Int32Operation { 52 | public: 53 | Int32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset) 54 | : lhs_offset(lhs_offset), rhs_offset(rhs_offset) {} 55 | 56 | protected: 57 | std::int32_t lhs_offset; 58 | std::int32_t rhs_offset; 59 | }; 60 | 61 | #endif // GEMMLOWP_META_OPERATIONS_COMMON_H_ 62 | -------------------------------------------------------------------------------- /meta/multi_thread_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
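// multi_thread_common.h: Thread-count resolution and a minimal threading
// context shared by the multithreaded meta gemm and transform entry points.
//
// A usage sketch (values are hypothetical): passing max_threads == 0 asks
// ResolveMaxThreads to query the hardware concurrency.
//
//   gemmlowp::WorkersPool pool;
//   gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> context(
//       gemmlowp::meta::ResolveMaxThreads(0), &pool);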
14 | 15 | #ifndef GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 16 | #define GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 17 | 18 | #include "../internal/multi_thread_gemm.h" 19 | 20 | namespace gemmlowp { 21 | namespace meta { 22 | 23 | inline int ResolveMaxThreads(int max_threads) { 24 | if (max_threads == 0) { 25 | #ifdef _WIN32 26 | SYSTEM_INFO sysinfo; 27 | GetSystemInfo(&sysinfo); 28 | return sysinfo.dwNumberOfProcessors; 29 | #else 30 | static const int hardware_threads_count = 31 | static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); 32 | return hardware_threads_count; 33 | #endif 34 | } 35 | return max_threads; 36 | } 37 | 38 | template <typename WorkersPool> 39 | class SimpleContext { 40 | public: 41 | SimpleContext(int max_num_threads, WorkersPool* pool) 42 | : max_num_threads_(max_num_threads), pool_(pool) {} 43 | 44 | WorkersPool* workers_pool() { return pool_; } 45 | 46 | int max_num_threads() { return max_num_threads_; } 47 | 48 | private: 49 | int max_num_threads_; 50 | WorkersPool* pool_; 51 | }; 52 | 53 | } // namespace meta 54 | } // namespace gemmlowp 55 | 56 | #endif // GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 57 | -------------------------------------------------------------------------------- /meta/multi_thread_gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
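// multi_thread_gemm.h: Entry point for the multithreaded meta gemm. It splits
// the result matrix into per-thread tasks along the larger of the m and n
// dimensions, hands them to the context's worker pool, and falls back to the
// single-threaded Gemm when the problem is too small to split profitably.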
14 | 15 | #ifndef GEMMLOWP_META_MULTI_THREAD_GEMM_H_ 16 | #define GEMMLOWP_META_MULTI_THREAD_GEMM_H_ 17 | 18 | #include "multi_thread_common.h" 19 | #include "single_thread_gemm.h" 20 | 21 | namespace gemmlowp { 22 | namespace meta { 23 | namespace internal { 24 | 25 | const std::int32_t kMinGemmTaskSize = 16000; 26 | const std::int32_t kMinGemmTaskDimension = 4; 27 | 28 | template 29 | std::uint8_t* PrepareGemmTask(const Params& params, int kernel_m, int kernel_n, 30 | int kernel_k, std::uint8_t* scratch, int m_start, 31 | int m, int n_start, int n, 32 | std::vector* tasks) { 33 | tasks->push_back(params); 34 | Params& task = tasks->back(); 35 | task.scratch = scratch; 36 | 37 | task.m = m; 38 | task.lhs = 39 | StreamUtil::Offset( 40 | params.left_stream, params.lhs, m_start, 0); 41 | 42 | task.n = n; 43 | task.rhs = 44 | StreamUtil::Offset( 45 | params.right_stream, params.rhs, n_start, 0); 46 | 47 | task.result = 48 | StreamUtil:: 49 | Offset(params.fused_kernel.output_stream, params.result, m_start, 50 | n_start); 51 | 52 | return scratch + Executor::template EstimateScratchSize( 53 | task, kernel_m, kernel_n, kernel_k); 54 | } 55 | 56 | template 57 | bool PrepareGemmTasks(MultiThreadingContext* context, const Params& params, 58 | int kernel_m, int kernel_n, int kernel_k, 59 | std::vector* task_params) { 60 | const int max_threads = ResolveMaxThreads(context->max_num_threads()); 61 | const int max_tasks_by_size = 62 | (params.m * params.n * params.k) / kMinGemmTaskSize; 63 | const int max_tasks_m = params.m / kMinGemmTaskDimension; 64 | const int max_tasks_n = params.n / kMinGemmTaskDimension; 65 | const int max_tasks_dimension = std::max(max_tasks_m, max_tasks_n); 66 | 67 | const int real_tasks = std::max( 68 | 1, 69 | std::min(max_threads, std::min(max_tasks_by_size, max_tasks_dimension))); 70 | 71 | if (real_tasks == 1) { 72 | return false; 73 | } 74 | 75 | std::uint8_t* scratch = params.scratch; 76 | 77 | if (max_tasks_m > max_tasks_n) { 78 | const int m_chunk = params.m / real_tasks; 79 | for (int i = 0; i < real_tasks - 1; ++i) { 80 | scratch = PrepareGemmTask( 81 | params, kernel_m, kernel_n, kernel_k, scratch, i * m_chunk, m_chunk, 82 | 0, params.n, task_params); 83 | } 84 | const int sum_m = (real_tasks - 1) * m_chunk; 85 | PrepareGemmTask(params, kernel_m, kernel_n, kernel_k, 86 | scratch, sum_m, params.m - sum_m, 0, 87 | params.n, task_params); 88 | } else { 89 | const int n_chunk = params.n / real_tasks; 90 | for (int i = 0; i < real_tasks - 1; ++i) { 91 | scratch = PrepareGemmTask( 92 | params, kernel_m, kernel_n, kernel_k, scratch, 0, params.m, 93 | i * n_chunk, n_chunk, task_params); 94 | } 95 | int sum_n = (real_tasks - 1) * n_chunk; 96 | PrepareGemmTask(params, kernel_m, kernel_n, kernel_k, 97 | scratch, 0, params.m, sum_n, 98 | params.n - sum_n, task_params); 99 | } 100 | 101 | return true; 102 | } 103 | 104 | template 106 | struct GemmTaskRunner : gemmlowp::Task { 107 | GemmTaskRunner(const Params& params) : params(params) {} 108 | 109 | void Run() override { 110 | Gemm(params); 111 | } 112 | 113 | Params params; 114 | }; 115 | 116 | } // namespace internal 117 | 118 | template 120 | inline void MultiThreadGemm(MultiThreadingContext* context, 121 | const Params& params) { 122 | typedef internal::GemmTaskRunner 124 | TaskRunnerType; 125 | 126 | std::vector task_params; 127 | if (!internal::PrepareGemmTasks( 128 | context, params, kernel_m, kernel_n, kernel_k, &task_params)) { 129 | Gemm(params); 130 | return; 131 | } 132 | 133 | auto workers_pool = 
context->workers_pool(); 134 | std::vector tasks; 135 | for (auto& task_param : task_params) { 136 | tasks.push_back(new TaskRunnerType(task_param)); 137 | }; 138 | workers_pool->Execute(tasks); 139 | } 140 | 141 | } // namespace meta 142 | } // namespace gemmlowp 143 | 144 | #endif // GEMMLOWP_META_MULTI_THREAD_GEMM_H_ 145 | -------------------------------------------------------------------------------- /meta/multi_thread_transform.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 16 | #define GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 17 | 18 | #include "multi_thread_common.h" 19 | #include "single_thread_transform.h" 20 | 21 | namespace gemmlowp { 22 | namespace meta { 23 | namespace internal { 24 | 25 | const int kTransformTaskOverhead = 128000; 26 | const int kMinTransformTaskSize = 32000; 27 | 28 | template 29 | inline bool PrepareTransform1DTasks(MultiThreadingContext* context, 30 | const Params& params, int kernel_size, 31 | std::vector* task_params) { 32 | typedef Transform1DUtil 34 | Util; 35 | 36 | const int max_threads = ResolveMaxThreads(context->max_num_threads()); 37 | const int task_size = Util::EstimateComputeCost(params.kernel); 38 | const int max_tasks_by_size = 39 | (task_size - kTransformTaskOverhead) / kMinTransformTaskSize; 40 | 41 | const int real_tasks = std::max(1, std::min(max_threads, max_tasks_by_size)); 42 | 43 | if (real_tasks == 1) { 44 | return false; 45 | } 46 | 47 | const int chunk = params.kernel.count / real_tasks; 48 | for (int i = 0; i < real_tasks - 1; ++i) { 49 | task_params->push_back(params); 50 | Params& task = task_params->back(); 51 | task.kernel.count = chunk; 52 | task.input = Util::OffsetInput(params.kernel, params.input, i * chunk); 53 | task.output = Util::OffsetOutput(params.kernel, params.output, i * chunk); 54 | } 55 | task_params->push_back(params); 56 | Params& task = task_params->back(); 57 | const int sum_chunk = (real_tasks - 1) * chunk; 58 | task.kernel.count = params.kernel.count - sum_chunk; 59 | task.input = Util::OffsetInput(params.kernel, params.input, sum_chunk); 60 | task.output = Util::OffsetOutput(params.kernel, params.output, sum_chunk); 61 | return true; 62 | } 63 | 64 | template 65 | struct Transform1DTaskRunner : gemmlowp::Task { 66 | Transform1DTaskRunner(const Params& params) : params(params) {} 67 | 68 | void Run() override { Transform1D(params); } 69 | 70 | Params params; 71 | }; 72 | 73 | } // namespace internal 74 | 75 | template 76 | inline void MultiThreadTransform1D(MultiThreadingContext* context, 77 | const Params& params) { 78 | typedef internal::Transform1DTaskRunner TaskRunnerType; 79 | 80 | std::vector task_params; 81 | if (!internal::PrepareTransform1DTasks( 82 | context, params, kernel_size, &task_params)) { 83 | Transform1D(params); 84 | return; 85 | } 86 
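// More than one task was prepared: wrap each per-task Params copy in a
// runner and execute the whole batch on the shared worker pool.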
| 87 | auto workers_pool = context->workers_pool(); 88 | std::vector tasks; 89 | for (auto& task_param : task_params) { 90 | tasks.push_back(new TaskRunnerType(task_param)); 91 | } 92 | workers_pool->Execute(tasks); 93 | } 94 | 95 | } // namespace meta 96 | } // namespace gemmlowp 97 | 98 | #endif // GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 99 | -------------------------------------------------------------------------------- /meta/quantized_mul_kernels.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_H_ 16 | #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include "base.h" 22 | #include "streams.h" 23 | 24 | namespace gemmlowp { 25 | namespace meta { 26 | 27 | struct QuantizedStaticPreprocessed { 28 | public: 29 | int multiplicative_offset; 30 | int rounding_offset; 31 | int shift; 32 | int count; 33 | }; 34 | 35 | template 36 | class MulKernel { 38 | public: 39 | typedef FusedKernelParams FusedKernel; 40 | 41 | static void Multiply(const InType* lhs, const InType*, 42 | const FusedKernel& params, OutType* result) { 43 | #ifdef DEBUG 44 | #ifdef DEBUG_METAGEMM_VERBOSE 45 | std::cout << "MulQSPR(" << typeid(InType).name() << ", " 46 | << typeid(OutType).name() << ")::Multiply() -- " << m << "x" << n 47 | << "x" << k << std::endl; 48 | #endif 49 | #else 50 | if (m != 0 && n != 0) { 51 | std::cerr << "FATAL: QuantizedStaticPreprocessed_RowMajor::Multiply not " 52 | << "implemented." 
<< std::endl; 53 | std::exit(1); 54 | } 55 | #endif 56 | } 57 | 58 | #ifdef DEBUG 59 | #ifdef DEBUG_METAGEMM_VERBOSE 60 | static void Debug(const FusedKernel& params) { 61 | std::cout << "MulQSPR(" << typeid(InType).name() << ", " 62 | << typeid(OutType).name() << ") -- " << m << "x" << n << "x" << k 63 | << std::endl; 64 | std::cout << " params:" << std::endl; 65 | std::cout << " kernel.multiplicative_offset: " 66 | << params.kernel.multiplicative_offset << std::endl; 67 | std::cout << " kernel.rounding_offset: " << params.kernel.rounding_offset 68 | << std::endl; 69 | std::cout << " kernel.shift: " << params.kernel.shift << std::endl; 70 | std::cout << " kernel.count: " << params.kernel.count << std::endl; 71 | std::cout << " output_stream.stride: " << params.output_stream.stride 72 | << std::endl; 73 | } 74 | #endif 75 | #endif 76 | }; 77 | 78 | struct QuantizedStaticPreprocessedAsInt32 { 79 | public: 80 | int count; 81 | }; 82 | 83 | template 84 | class MulKernel { 86 | public: 87 | typedef FusedKernelParams 88 | FusedKernel; 89 | 90 | static void Multiply(const InType* lhs, const InType*, 91 | const FusedKernel& params, OutType* result) { 92 | #ifdef DEBUG 93 | #ifdef DEBUG_METAGEMM_VERBOSE 94 | std::cout << "MulQSPI32R(" << typeid(InType).name() << ", " 95 | << typeid(OutType).name() << ")::Multiply() -- " << m << "x" << n 96 | << "x" << k << std::endl; 97 | #endif 98 | #else 99 | if (m != 0 && n != 0) { 100 | std::cerr << "FATAL: QuantizedStaticPreprocessedAsInt32_RowMajor::" 101 | << "Multiply not implemented." << std::endl; 102 | std::exit(1); 103 | } 104 | #endif 105 | } 106 | 107 | #ifdef DEBUG 108 | #ifdef DEBUG_METAGEMM_VERBOSE 109 | static void Debug(const FusedKernel& params) { 110 | std::cout << "MulQSPI32R(" << typeid(InType).name() << ", " 111 | << typeid(OutType).name() << ") -- " << m << "x" << n << "x" << k 112 | << std::endl; 113 | std::cout << " params:" << std::endl; 114 | std::cout << " kernel.count: " << params.kernel.count << std::endl; 115 | std::cout << " output_stream.stride: " << params.output_stream.stride 116 | << std::endl; 117 | } 118 | #endif 119 | #endif 120 | }; 121 | 122 | struct QuantizedStaticPreprocessedAsFloat { 123 | public: 124 | int count; 125 | float scale; 126 | }; 127 | 128 | template 129 | class MulKernel { 131 | public: 132 | typedef FusedKernelParams 133 | FusedKernel; 134 | 135 | static void Multiply(const InType* lhs, const InType*, 136 | const FusedKernel& params, OutType* result) { 137 | #ifdef DEBUG 138 | #ifdef DEBUG_METAGEMM_VERBOSE 139 | std::cout << "MulQSPFR(" << typeid(InType).name() << ", " 140 | << typeid(OutType).name() << ")::Multiply() -- " << m << "x" << n 141 | << "x" << k << std::endl; 142 | #endif 143 | #else 144 | if (m != 0 && n != 0) { 145 | std::cerr << "FATAL: QuantizedStaticPreprocessedAsFloat_RowMajor::" 146 | << "Multiply not implemented." 
<< std::endl; 147 | std::exit(1); 148 | } 149 | #endif 150 | } 151 | 152 | #ifdef DEBUG 153 | #ifdef DEBUG_METAGEMM_VERBOSE 154 | static void Debug(const FusedKernel& params) { 155 | std::cout << "MulQSPFR(" << typeid(InType).name() << ", " 156 | << typeid(OutType).name() << ") -- " << m << "x" << n << "x" << k 157 | << std::endl; 158 | std::cout << " params:" << std::endl; 159 | std::cout << " kernel.count: " << params.kernel.count << std::endl; 160 | std::cout << " kernel.scale: " << params.kernel.scale << std::endl; 161 | std::cout << " output_stream.stride: " << params.output_stream.stride 162 | << std::endl; 163 | } 164 | #endif 165 | #endif 166 | }; 167 | 168 | } // namespace meta 169 | } // namespace gemmlowp 170 | 171 | #ifdef GEMMLOWP_NEON_32 172 | #include "quantized_mul_kernels_arm_32.h" 173 | #elif defined(GEMMLOWP_NEON_64) 174 | #include "quantized_mul_kernels_arm_64.h" 175 | #endif 176 | 177 | #endif // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_H_ 178 | -------------------------------------------------------------------------------- /meta/single_thread_transform.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_SINGLE_THREAD_TRANSFORM_H_ 16 | #define GEMMLOWP_META_SINGLE_THREAD_TRANSFORM_H_ 17 | 18 | #include 19 | #include "base.h" 20 | 21 | namespace gemmlowp { 22 | namespace meta { 23 | 24 | template 25 | void Transform1D(const Params& params); 26 | 27 | namespace internal { 28 | 29 | class Transform1DExecutor { 30 | public: 31 | template 32 | static void ExecuteDispatch1D(const P& params) { 33 | Transform1DKernel::Transform(params.input, params.kernel, 36 | params.output); 37 | } 38 | }; 39 | 40 | template 41 | struct Dispatch1D { 42 | static void Execute(const P& params, int leftovers) { 43 | #ifdef DEBUG 44 | #ifdef DEBUG_METAGEMM_VERBOSE 45 | std::cout << "Dispatch(1): " << kernel_size << ":" << variable_leftovers 46 | << std::endl 47 | << std::flush; 48 | #endif 49 | #endif 50 | if (leftovers == variable_leftovers) { 51 | E::template ExecuteDispatch1D(params); 52 | } else { 53 | Dispatch1D::Execute(params, 54 | leftovers); 55 | } 56 | } 57 | }; 58 | 59 | template 60 | struct Dispatch1D { 61 | static void Execute(const P& params, int leftovers) { 62 | #ifdef DEBUG 63 | #ifdef DEBUG_METAGEMM_VERBOSE 64 | std::cout << "Dispatch(1): " << kernel_size << ": 0" << std::endl 65 | << std::flush; 66 | #endif 67 | #endif 68 | if (leftovers == 0) { 69 | E::template ExecuteDispatch1D(params); 70 | } else { 71 | std::cerr << "FATAL: dispatch1D failed: ran out of cases." 
<< std::endl 72 | << std::flush; 73 | std::exit(1); 74 | } 75 | } 76 | }; 77 | 78 | } // namespace internal 79 | 80 | template 81 | inline void Transform1D(const Params& params) { 82 | internal::Dispatch1D::Execute(params, params.kernel.count % 84 | kernel_size); 85 | } 86 | 87 | } // namespace meta 88 | } // namespace gemmlowp 89 | 90 | #endif // GEMMLOWP_META_SINGLE_THREAD_TRANSFORM_H_ 91 | -------------------------------------------------------------------------------- /meta/test_streams_correctness.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #ifdef __APPLE__ 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "streams.h" 30 | 31 | #define MUL_OFFSET (3) 32 | #define ADD_OFFSET (100) 33 | 34 | using namespace gemmlowp::meta; 35 | 36 | void prepare_row_major_data(int rows, int elements, int stride, std::uint8_t* data) { 37 | for (int i = 0; i < rows * stride; ++i) { 38 | data[i] = 255; 39 | } 40 | for (int i = 0; i < rows; ++i) { 41 | for (int j = 0; j < elements; ++j) { 42 | data[i * stride + j] = j % 256; 43 | } 44 | } 45 | } 46 | 47 | void prepare_column_major_data(int columns, int elements, int stride, 48 | std::uint8_t* data) { 49 | for (int i = 0; i < elements * stride; ++i) { 50 | data[i] = 255; 51 | } 52 | for (int i = 0; i < elements; ++i) { 53 | for (int j = 0; j < columns; ++j) { 54 | data[i * stride + j] = i % 256; 55 | } 56 | } 57 | } 58 | 59 | void print_out(std::uint8_t* result, int rows, int elements) { 60 | int size = rows * ((elements + 7) / 8) * 8; 61 | for (int i = 0; i < size; ++i) { 62 | std::cout << static_cast(result[i]) << " "; 63 | } 64 | std::cout << std::endl << std::flush; 65 | } 66 | 67 | bool check(std::uint8_t* result, int rows, int elements) { 68 | int chunks = elements / 8; 69 | int leftover = elements % 8; 70 | for (int i = 0; i < chunks; ++i) { 71 | int chunk_index = i * rows * 8; 72 | int chunk_start_value = i * 8; 73 | for (int j = 0; j < rows; ++j) { 74 | for (int k = 0; k < 8; ++k) { 75 | if (result[chunk_index + j * 8 + k] != chunk_start_value + k) { 76 | return false; 77 | } 78 | } 79 | } 80 | } 81 | 82 | int leftover_index = chunks * rows * 8; 83 | int leftover_start_value = chunks * 8; 84 | for (int i = 0; i < rows; ++i) { 85 | for (int j = 0; j < leftover; ++j) { 86 | if (result[leftover_index + i * 8 + j] != leftover_start_value + j) { 87 | return false; 88 | } 89 | } 90 | } 91 | 92 | int expected_sum = 93 | ((elements * (elements - 1)) / 2) * MUL_OFFSET + ADD_OFFSET; 94 | int sums_offset = rows * ((elements + 7) / 8) * 8; 95 | std::int32_t* sums = reinterpret_cast(result + sums_offset); 96 | for (int i = 0; i < rows; ++i) { 97 | if (sums[i] != expected_sum) { 98 | return false; 99 | } 100 | } 101 | 102 | return 
true; 103 | } 104 | 105 | template 106 | void test_2(std::uint8_t* in, std::uint8_t* out) { 107 | for (int elements = 8; elements < 64; elements += 8) { 108 | int all_elements = elements + leftover; 109 | for (int stride = all_elements; stride < all_elements + 4; ++stride) { 110 | RowMajorWithSum params; 111 | params.count = all_elements; 112 | params.stride = stride; 113 | params.multiplicative_sum_offset = MUL_OFFSET; 114 | params.additive_sum_offset = ADD_OFFSET; 115 | 116 | prepare_row_major_data(lanes, all_elements, stride, in); 117 | Stream::Pack(in, params, 118 | out); 119 | if (check(out, lanes, all_elements)) { 120 | // std::cout << "Row: " << lanes << "x8x" << leftover << " : " 121 | // << all_elements << "@" << stride << " -- OK" << 122 | // std::endl; 123 | } else { 124 | std::cout << "Row: " << lanes << "x8x" << leftover << " : " 125 | << all_elements << "@" << stride << " -- ERROR" << std::endl; 126 | std::cout << "Exiting." << std::endl; 127 | std::exit(1); 128 | } 129 | } 130 | 131 | for (int stride = lanes; stride < lanes + 4; ++stride) { 132 | ColumnMajorWithSum params; 133 | params.count = all_elements; 134 | params.stride = stride; 135 | params.multiplicative_sum_offset = MUL_OFFSET; 136 | params.additive_sum_offset = ADD_OFFSET; 137 | 138 | prepare_column_major_data(lanes, all_elements, stride, in); 139 | Stream::Pack(in, params, 140 | out); 141 | if (check(out, lanes, all_elements)) { 142 | // std::cout << "Column: " << lanes << "x8x" << leftover << " : " 143 | // << all_elements << "@" << stride << " -- OK" << 144 | // std::endl; 145 | } else { 146 | std::cout << "Column: " << lanes << "x8x" << leftover << " : " 147 | << all_elements << "@" << stride << " -- ERROR" << std::endl; 148 | std::cout << "Exiting." << std::endl; 149 | std::exit(1); 150 | } 151 | } 152 | } 153 | } 154 | 155 | template 156 | void test(std::uint8_t* in, std::uint8_t* out) { 157 | test_2(in, out); 158 | test_2(in, out); 159 | test_2(in, out); 160 | test_2(in, out); 161 | test_2(in, out); 162 | test_2(in, out); 163 | test_2(in, out); 164 | test_2(in, out); 165 | } 166 | 167 | int main() { 168 | std::unique_ptr in(new std::uint8_t[128 * 1024]); 169 | std::unique_ptr out(new std::uint8_t[128 * 1024]); 170 | 171 | test<1>(in.get(), out.get()); 172 | test<2>(in.get(), out.get()); 173 | test<3>(in.get(), out.get()); 174 | test<4>(in.get(), out.get()); 175 | test<5>(in.get(), out.get()); 176 | test<6>(in.get(), out.get()); 177 | test<7>(in.get(), out.get()); 178 | test<8>(in.get(), out.get()); 179 | 180 | std::cout << "Ok." << std::endl; 181 | return 0; 182 | } 183 | -------------------------------------------------------------------------------- /meta/test_transform_benchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
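// test_transform_benchmark.cc: Benchmarks the single-threaded and
// multithreaded 1D transforms (requantize, dequantize, quantize) over a
// 4M-element buffer, reporting average wall time and elements/s for each.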
14 | 15 | #include 16 | #ifdef __APPLE__ 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "multi_thread_transform.h" 30 | #include "transform_kernels.h" 31 | 32 | using namespace gemmlowp::meta; 33 | 34 | double time() { 35 | #ifdef __APPLE__ 36 | timeval t; 37 | gettimeofday(&t, nullptr); 38 | return t.tv_sec + 1e-6 * t.tv_usec; 39 | #else 40 | timespec t; 41 | clock_gettime(CLOCK_REALTIME, &t); 42 | return t.tv_sec + 1e-9 * t.tv_nsec; 43 | #endif 44 | } 45 | 46 | #define kernel_size (16) 47 | 48 | template 49 | void run_benchmark(const std::string& name, int repetitions, int elements, 50 | Context* context, const Params& params) { 51 | std::cout << "Benchmark: " << name << std::endl; 52 | std::cout << "Warmup single." << std::endl; 53 | 54 | for (int i = 0; i < 10; ++i) { 55 | Transform1D(params); 56 | } 57 | 58 | std::cout << "Benchmark single." << std::endl; 59 | 60 | double start = time(); 61 | 62 | for (int i = 0; i < repetitions; ++i) { 63 | Transform1D(params); 64 | } 65 | 66 | double wall_time = time() - start; 67 | double ops = static_cast(elements) * repetitions; 68 | std::cout << "Avg: " << (wall_time / repetitions) << std::endl; 69 | std::cout << "Perf: " << static_cast(ops / wall_time) << "/s." 70 | << std::endl; 71 | 72 | std::cout << "Warmup single." << std::endl; 73 | 74 | for (int i = 0; i < 10; ++i) { 75 | MultiThreadTransform1D(context, params); 76 | } 77 | 78 | std::cout << "Benchmark multi." << std::endl; 79 | 80 | start = time(); 81 | 82 | for (int i = 0; i < repetitions; ++i) { 83 | MultiThreadTransform1D(context, params); 84 | } 85 | 86 | wall_time = time() - start; 87 | ops = static_cast(elements) * repetitions; 88 | std::cout << "Avg: " << (wall_time / repetitions) << std::endl; 89 | std::cout << "Perf: " << static_cast(ops / wall_time) << "/s." 
90 | << std::endl; 91 | } 92 | 93 | int main() { 94 | const int repetitions = 500; 95 | const int elements = 4 * 1024 * 1024; 96 | 97 | std::unique_ptr int32_array(new std::int32_t[elements]); 98 | std::unique_ptr uint8_array(new std::uint8_t[elements]); 99 | std::unique_ptr float_array(new float[elements]); 100 | 101 | typedef SimpleContext Context; 102 | Context context(4, new gemmlowp::WorkersPool()); 103 | 104 | typedef Transform1DParams RequantizeParams; 105 | RequantizeParams requantize_params; 106 | requantize_params.input = int32_array.get(); 107 | requantize_params.output = uint8_array.get(); 108 | requantize_params.kernel.count = elements; 109 | requantize_params.kernel.input_range_min = -100.0f; 110 | requantize_params.kernel.input_range_scale = 111 | 200.0f / ((static_cast(1) << 32) - 1); 112 | requantize_params.kernel.input_range_offset = 113 | static_cast(std::numeric_limits::lowest()); 114 | requantize_params.kernel.output_range_min = -200.0f; 115 | requantize_params.kernel.one_over_output_range_scale = 116 | static_cast((static_cast(1) << 8) - 1) / 500.0f; 117 | requantize_params.kernel.output_range_offset = 118 | static_cast(std::numeric_limits::lowest()); 119 | 120 | run_benchmark("Requantize", repetitions, elements, &context, 121 | requantize_params); 122 | 123 | typedef Transform1DParams DequantizeParams; 124 | DequantizeParams dequantize_params; 125 | dequantize_params.input = uint8_array.get(); 126 | dequantize_params.output = float_array.get(); 127 | dequantize_params.kernel.count = elements; 128 | dequantize_params.kernel.range_min = -100.0f; 129 | dequantize_params.kernel.range_scale = 130 | static_cast((static_cast(1) << 8) - 1) / 200.0f; 131 | dequantize_params.kernel.range_offset = 132 | static_cast(std::numeric_limits::lowest()); 133 | 134 | run_benchmark("Dequantize", repetitions, elements, &context, 135 | dequantize_params); 136 | 137 | typedef Transform1DParams QuantizeParams; 138 | QuantizeParams quantize_params; 139 | quantize_params.input = float_array.get(); 140 | quantize_params.output = uint8_array.get(); 141 | quantize_params.kernel.count = elements; 142 | quantize_params.kernel.range_min = -100.0f; 143 | quantize_params.kernel.range_scale = 144 | 200.0f / ((static_cast(1) << 8) - 1); 145 | quantize_params.kernel.range_offset = 146 | static_cast(std::numeric_limits::lowest()); 147 | 148 | run_benchmark("Quantize", repetitions, elements, &context, quantize_params); 149 | 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /profiling/pthread_everywhere.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // pthread_everywhere.h: Either includes or implements a 16 | // subset of pthread functionality on top of C++11 for portability. 
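//
// A minimal usage sketch (the worker function is hypothetical): code inside
// namespace gemmlowp makes the same unqualified calls whether they resolve
// to the real pthread functions or to the C++11-backed polyfill below.
//
//   void *Worker(void *arg) { return arg; }
//   ...
//   pthread_t thread;
//   pthread_create(&thread, nullptr, Worker, nullptr);
//   pthread_join(thread, nullptr);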
17 | 18 | #ifndef GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ 19 | #define GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ 20 | 21 | #ifndef _WIN32 22 | #define GEMMLOWP_USE_PTHREAD 23 | #endif 24 | 25 | #if defined GEMMLOWP_USE_PTHREAD 26 | #include <pthread.h> 27 | #else 28 | // Implement a small subset of pthread on top of C++11 threads. 29 | // The function signatures differ from true pthread functions in two ways: 30 | // - True pthread functions return int error codes, ours return void. 31 | //   Rationale: the c++11 equivalent functions return void 32 | //   and use exceptions to report errors; we don't want to deal with 33 | //   exceptions in this code, so we couldn't meaningfully return errors 34 | //   in the polyfill. Also, the gemmlowp code using these pthread functions 35 | //   never checks their return values anyway. 36 | // - True pthread *_create/*_init functions take pointers to 'attribute' 37 | //   structs; ours take nullptr_t. That is because gemmlowp always passes 38 | //   nullptr at the moment, so any support we would code for non-null 39 | //   attribs would be unused. 40 | #include <condition_variable> 41 | #include <cstddef> 42 | #include <mutex> 43 | #include <thread> 44 | namespace gemmlowp { 45 | using pthread_t = std::thread *; 46 | using pthread_mutex_t = std::mutex *; 47 | using pthread_cond_t = std::condition_variable *; 48 | inline void pthread_create(pthread_t *thread, std::nullptr_t, 49 | void *(*start_routine)(void *), void *arg) { 50 | *thread = new std::thread(start_routine, arg); 51 | } 52 | inline void pthread_join(pthread_t thread, std::nullptr_t) { thread->join(); } 53 | inline void pthread_mutex_init(pthread_mutex_t *mutex, std::nullptr_t) { 54 | *mutex = new std::mutex; 55 | } 56 | inline void pthread_mutex_lock(pthread_mutex_t *mutex) { (*mutex)->lock(); } 57 | inline void pthread_mutex_unlock(pthread_mutex_t *mutex) { (*mutex)->unlock(); } 58 | inline void pthread_mutex_destroy(pthread_mutex_t *mutex) { delete *mutex; } 59 | inline void pthread_cond_init(pthread_cond_t *cond, std::nullptr_t) { 60 | *cond = new std::condition_variable; 61 | } 62 | inline void pthread_cond_signal(pthread_cond_t *cond) { (*cond)->notify_one(); } 63 | inline void pthread_cond_broadcast(pthread_cond_t *cond) { 64 | (*cond)->notify_all(); 65 | } 66 | inline void pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { 67 | std::unique_lock<std::mutex> lock(**mutex, std::adopt_lock); 68 | (*cond)->wait(lock); 69 | // detach lock from mutex so when we leave this context 70 | // the lock is not released 71 | lock.release(); 72 | } 73 | inline void pthread_cond_destroy(pthread_cond_t *cond) { delete *cond; } 74 | } // end namespace gemmlowp 75 | #endif 76 | 77 | #endif // GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ 78 | -------------------------------------------------------------------------------- /public/bit_depth.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // bit_depth.h: defines the settings controlling LHS/RHS bit depth 16 | 17 | #ifndef GEMMLOWP_PUBLIC_BIT_DEPTH_H_ 18 | #define GEMMLOWP_PUBLIC_BIT_DEPTH_H_ 19 | 20 | namespace gemmlowp { 21 | 22 | // The range of allowed values for an operand. 23 | template <int tMinValue, int tMaxValue> 24 | struct OperandRange { 25 | static constexpr int kMinValue = tMinValue; 26 | static constexpr int kMaxValue = tMaxValue; 27 | static_assert(kMinValue < kMaxValue, ""); 28 | }; 29 | 30 | using Uint8Range = OperandRange<0, 255>; 31 | using Uint8RangeExcludingZero = OperandRange<1, 255>; 32 | 33 | using Int8Range = OperandRange<-128, 127>; 34 | using Int8RangeExcludingLow = OperandRange<-127, 127>; 35 | 36 | template <typename tLhsRange, typename tRhsRange> 37 | struct BitDepthParams { 38 | using LhsRange = tLhsRange; 39 | using RhsRange = tRhsRange; 40 | }; 41 | 42 | // Default: LHS and RHS are 8bit. 43 | using DefaultL8R8BitDepthParams = BitDepthParams<Uint8Range, Uint8Range>; 44 | 45 | // Variant: LHS may not take the value 0. This allows using 46 | // faster kernels using signed arithmetic, see 47 | // NEON_64bit_GEMM_Int8Operands_Int32Accumulators_AccumTwoWithin16Bits 48 | using L8R8WithLhsNonzeroBitDepthParams = 49 | BitDepthParams<Uint8RangeExcludingZero, Uint8Range>; 50 | 51 | // Signed Variant: This allows using faster kernels using signed arithmetic, see 52 | // NEON_64bit_GEMM_Int8Operands_Int32Accumulators_AccumTwoWithin16Bits 53 | using SignedL8R8WithLhsNonzeroBitDepthParams = 54 | BitDepthParams<Int8RangeExcludingLow, Int8Range>; 55 | 56 | // Deprecated: when gemmlowp used to allow requantizing 8bit 57 | // inputs to less-than-8-bit depths, the public setting allowing 58 | // that was DefaultL7R5BitDepthParams. That requantization 59 | // feature has been removed, but as the whole point of that 60 | // requantization was to make less-than-8-bit an internal 61 | // optimization without any impact on the API (other than lowering 62 | // accuracy), we can temporarily support users who were using it 63 | // by mapping it to the default 8bit behavior. 64 | using DefaultL7R5BitDepthParams = DefaultL8R8BitDepthParams; 65 | 66 | } // namespace gemmlowp 67 | 68 | #endif // GEMMLOWP_PUBLIC_BIT_DEPTH_H_ 69 | -------------------------------------------------------------------------------- /public/gemmlowp.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // gemmlowp.h: the main public interface header of gemmlowp. 16 | 17 | #ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_ 18 | #define GEMMLOWP_PUBLIC_GEMMLOWP_H_ 19 | #include "../internal/dispatch_gemm_shape.h" 20 | #include "bit_depth.h" 21 | #include "map.h" 22 | #include "output_stages.h" 23 | 24 | namespace gemmlowp { 25 | 26 | class GemmContext : public MultiThreadGemmContext {}; 27 | 28 | // Computes a general matrix product ("GEMM"). 29 | // This is a version that supports per channel quantization.
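// "Per channel" means that lhs_offset and rhs_offset are vector-like objects
// (anything offering operator()(int) and block(), such as VectorMap or
// VectorDup from public/map.h), so each row of the LHS and each column of the
// RHS can carry its own zero-point offset; the non-PC variant below wraps its
// scalar offsets in VectorDup before dispatching.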
30 | template <typename InputScalar, typename OutputScalar, typename BitDepthParams, 31 | MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, 32 | typename LhsOffset, typename RhsOffset, typename OutputPipelineType, 33 | typename GemmContextType> 34 | void GemmWithOutputPipelinePC(GemmContextType* context, 35 | const MatrixMap<const InputScalar, LhsOrder>& lhs, 36 | const MatrixMap<const InputScalar, RhsOrder>& rhs, 37 | MatrixMap<OutputScalar, ResultOrder>* result, 38 | const LhsOffset& lhs_offset, 39 | const RhsOffset& rhs_offset, 40 | const OutputPipelineType& output_pipeline) { 41 | DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>( 42 | context, lhs, rhs, result, lhs_offset, rhs_offset, output_pipeline); 43 | } 44 | 45 | // Computes a general matrix product ("GEMM"). 46 | // This is the legacy version that does not support per channel quantization. 47 | // The meaning of the offsets, result_mult_int and result_shift 48 | // parameters is the same as in the standard EightBitIntGemm interface 49 | // (which is also implemented in the eight_bit_int_gemm directory). 50 | template <typename InputScalar, typename OutputScalar, typename BitDepthParams, 51 | MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, 52 | typename OutputPipelineType, typename GemmContextType> 53 | void GemmWithOutputPipeline(GemmContextType* context, 54 | const MatrixMap<const InputScalar, LhsOrder>& lhs, 55 | const MatrixMap<const InputScalar, RhsOrder>& rhs, 56 | MatrixMap<OutputScalar, ResultOrder>* result, 57 | int lhs_offset, int rhs_offset, 58 | const OutputPipelineType& output_pipeline) { 59 | typedef VectorDup<const std::int32_t, VectorShape::Col> OffsetColDup; 60 | typedef VectorDup<const std::int32_t, VectorShape::Row> OffsetRowDup; 61 | const OffsetColDup lhs_offset_vector(lhs_offset, lhs.rows()); 62 | const OffsetRowDup rhs_offset_vector(rhs_offset, rhs.cols()); 63 | DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>( 64 | context, lhs, rhs, result, lhs_offset_vector, rhs_offset_vector, 65 | output_pipeline); 66 | } 67 | 68 | // Computes a general matrix product ("GEMM"). 69 | // The meaning of the offsets, result_mult_int and result_shift 70 | // parameters is the same as in the standard EightBitIntGemm interface 71 | // (which is also implemented in the eight_bit_int_gemm directory). 72 | template <typename Scalar, typename BitDepthParams, MapOrder LhsOrder, 73 | MapOrder RhsOrder, MapOrder ResultOrder, typename GemmContextType> 74 | void Gemm(GemmContextType* context, 75 | const MatrixMap<const Scalar, LhsOrder>& lhs, 76 | const MatrixMap<const Scalar, RhsOrder>& rhs, 77 | MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, 78 | int rhs_offset, int result_offset, int result_mult_int, 79 | int result_shift) { 80 | GemmWithOutputPipeline<Scalar, Scalar, BitDepthParams>( 81 | context, lhs, rhs, result, lhs_offset, rhs_offset, 82 | MakeStandardOutputPipeline(result_offset, result_mult_int, result_shift)); 83 | } 84 | 85 | } // namespace gemmlowp 86 | 87 | #endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_ 88 | -------------------------------------------------------------------------------- /public/map.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // map.h: a minimalist view-existing-buffer-as-a-matrix class, 16 | // which is how gemmlowp interfaces with external matrix data. 17 | 18 | #ifndef GEMMLOWP_PUBLIC_MAP_H_ 19 | #define GEMMLOWP_PUBLIC_MAP_H_ 20 | 21 | #include "../internal/common.h" 22 | 23 | namespace gemmlowp { 24 | 25 | // The two storage orders allowed to map buffers as matrices: ColMajor 26 | // means column-major, RowMajor means row-major. 27 | enum class MapOrder { ColMajor, RowMajor }; 28 | 29 | // A MatrixMap is a view of an existing buffer as a matrix. It does not own 30 | // the buffer.
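//
// A minimal usage sketch (buffer and sizes are hypothetical):
//
//   std::uint8_t storage[8 * 8];
//   MatrixMap<std::uint8_t, MapOrder::RowMajor> m(storage, 8, 8);
//   m(2, 3) = 42;                       // element access through the view
//   MatrixMap<std::uint8_t, MapOrder::RowMajor> sub =
//       m.block(4, 4, 4, 4);            // sub-view sharing the same storage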
31 | template <typename tScalar, MapOrder tOrder> 32 | class MatrixMap { 33 | public: 34 | typedef tScalar Scalar; 35 | static constexpr MapOrder kOrder = tOrder; 36 | 37 | protected: 38 | Scalar* data_;  // not owned. 39 | int rows_, cols_, stride_; 40 | 41 | public: 42 | MatrixMap() : data_(nullptr), rows_(0), cols_(0), stride_(0) {} 43 | MatrixMap(Scalar* data, int rows, int cols) 44 | : data_(data), 45 | rows_(rows), 46 | cols_(cols), 47 | stride_(kOrder == MapOrder::ColMajor ? rows : cols) {} 48 | MatrixMap(Scalar* data, int rows, int cols, int stride) 49 | : data_(data), rows_(rows), cols_(cols), stride_(stride) {} 50 | MatrixMap(const MatrixMap& other) 51 | : data_(other.data_), 52 | rows_(other.rows_), 53 | cols_(other.cols_), 54 | stride_(other.stride_) {} 55 | 56 | int rows() const { return rows_; } 57 | int cols() const { return cols_; } 58 | int stride() const { return stride_; } 59 | int rows_stride() const { return kOrder == MapOrder::ColMajor ? 1 : stride_; } 60 | int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; } 61 | Scalar* data() const { return data_; } 62 | Scalar* data(int row, int col) const { 63 | return data_ + row * rows_stride() + col * cols_stride(); 64 | } 65 | Scalar& operator()(int row, int col) const { return *data(row, col); } 66 | 67 | MatrixMap block(int start_row, int start_col, int block_rows, 68 | int block_cols) const { 69 | assert(start_row >= 0); 70 | assert(start_row + block_rows <= rows_); 71 | assert(start_col >= 0); 72 | assert(start_col + block_cols <= cols_); 73 | 74 | return MatrixMap(data(start_row, start_col), block_rows, block_cols, 75 | stride_); 76 | } 77 | }; 78 | 79 | enum class VectorShape { Col, Row }; 80 | 81 | // A VectorMap is a view of an existing buffer as a vector. It does not own 82 | // the buffer. 83 | template <typename tScalar, VectorShape tShape> 84 | class VectorMap { 85 | public: 86 | typedef tScalar Scalar; 87 | static constexpr VectorShape kShape = tShape; 88 | 89 | protected: 90 | Scalar* data_;  // not owned. 91 | int size_; 92 | 93 | public: 94 | VectorMap() : data_(nullptr), size_(0) {} 95 | VectorMap(Scalar* data, int size) : data_(data), size_(size) {} 96 | VectorMap(const VectorMap& other) = default; 97 | VectorMap& operator=(const VectorMap& other) = default; 98 | 99 | int size() const { return size_; } 100 | Scalar* data() const { return data_; } 101 | Scalar* data(int index) const { return data_ + index; } 102 | Scalar& operator()(int index) const { return *data(index); } 103 | 104 | VectorMap block(int start, int len) const { 105 | assert(start >= 0); 106 | assert(start + len <= size_); 107 | 108 | return VectorMap(data(start), len); 109 | } 110 | }; 111 | 112 | // A VectorDup is a (duplicated value) vector where all components are the same.
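//
// It stores one scalar plus a length, so e.g.
// VectorDup<const std::int32_t, VectorShape::Col> v(-128, 100) behaves as a
// 100-entry column whose every component reads as -128. GemmWithOutputPipeline
// above uses exactly this to route scalar offsets through the vector-shaped
// per-channel interface.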
113 | template <typename tScalar, VectorShape tShape> 114 | class VectorDup { 115 | public: 116 | typedef tScalar Scalar; 117 | static constexpr VectorShape kShape = tShape; 118 | 119 | protected: 120 | Scalar data_; 121 | int size_; 122 | 123 | public: 124 | VectorDup() : data_(0), size_(0) {} 125 | VectorDup(Scalar data, int size) : data_(data), size_(size) {} 126 | VectorDup(const VectorDup& other) : data_(other.data_), size_(other.size_) {} 127 | 128 | int size() const { return size_; } 129 | Scalar& operator()(int) const { return data_; } 130 | 131 | VectorDup block(int start, int len) const { 132 | assert(start >= 0); 133 | assert(start + len <= size_); 134 | 135 | (void)start; 136 | return VectorDup(data_, len); 137 | } 138 | }; 139 | 140 | } // namespace gemmlowp 141 | 142 | #endif // GEMMLOWP_PUBLIC_MAP_H_ 143 | -------------------------------------------------------------------------------- /scripts/ci-before.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ $TEST == "arm" ]; then 3 | curl -L https://dl.google.com/android/repository/android-ndk-${NDK_VERSION}-linux-x86_64.zip -O 4 | unzip android-ndk-${NDK_VERSION}-linux-x86_64.zip 2> /dev/null > /dev/null 5 | echo no | android create avd --force -n test -t android-22 --abi armeabi-v7a 6 | emulator -avd test -no-audio -no-window & 7 | fi 8 | -------------------------------------------------------------------------------- /scripts/ci-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ $TEST == "arm" ]; then 3 | ./android-ndk-${NDK_VERSION}/ndk-build 4 | android-wait-for-emulator 5 | # adb shell input keyevent 82 & 6 | adb push ./libs/* /data/local/tmp 7 | adb shell /data/local/tmp/benchmark 8 | adb shell /data/local/tmp/correctness_meta_gemm 9 | # too slow 10 | # adb shell /data/local/tmp/benchmark_meta_gemm 11 | fi 12 | if [ $TEST == "x86" ]; then 13 | make -f Makefile.travis unittest 14 | fi 15 | -------------------------------------------------------------------------------- /scripts/test-android.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | if [ -z "$CXX" ] 17 | then 18 | echo "please set the CXX environment variable to point to your native Android toolchain C++ compiler" 19 | exit 1 20 | fi 21 | 22 | default_cflags="-O3" 23 | 24 | if [ "$#" -eq 0 ] 25 | then 26 | echo "Usage: $0 files... [cflags...]" 27 | echo "All command-line parameters are passed along to the C++ compiler, so they can \ 28 | be either source files, or compiler flags." 29 | echo "Default cflags: $default_cflags" 30 | echo "Relies on the CXX environment variable to point to an Android C++ toolchain compiler."
31 | exit 1 32 | fi 33 | 34 | EXE=gemmlowp-android-binary 35 | 36 | if [[ $CXX =~ .*aarch64.* ]] 37 | then 38 | NEON_FLAGS= 39 | else 40 | NEON_FLAGS="-mfpu=neon -mfloat-abi=softfp" 41 | fi 42 | 43 | $CXX \ 44 | --std=c++11 \ 45 | -Wall -Wextra -pedantic \ 46 | -fPIE -pie $NEON_FLAGS \ 47 | -lstdc++ -latomic \ 48 | -I . -I .. \ 49 | -o $EXE \ 50 | -Wno-unused-variable -Wno-unused-parameter \ 51 | $default_cflags \ 52 | $* 53 | 54 | if [ $? != 0 ]; then 55 | echo "build failed" 56 | exit 1 57 | fi 58 | 59 | adb root 60 | 61 | if [ $? != 0 ]; then 62 | echo "$0: adb root failed" 63 | exit 1 64 | fi 65 | 66 | adb shell mkdir -p /data/local/tmp 67 | 68 | if [ $? != 0 ]; then 69 | echo "$0: adb shell failed to mkdir /data/local/tmp" 70 | exit 1 71 | fi 72 | 73 | adb push $EXE /data/local/tmp 74 | 75 | if [ $? != 0 ]; then 76 | echo "$0: adb push failed to write to /data/local/tmp" 77 | exit 1 78 | fi 79 | 80 | echo adb shell "/data/local/tmp/$EXE $TESTARGS" 81 | 82 | adb shell "/data/local/tmp/$EXE $TESTARGS" | tee "log-$EXE" 83 | 84 | if [ $? != 0 ]; then 85 | echo "$0: adb shell failed to run binary on device" 86 | exit 1 87 | fi 88 | -------------------------------------------------------------------------------- /standalone/encode.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The gemmlowp Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Encodes ARM asm code for certain instructions into the corresponding machine code encoding, as a .word directive in the asm code, preserving the original code in a comment. 16 | 17 | Reads from stdin, writes to stdout. 18 | 19 | Example diff: 20 | - "udot v16.4s, v4.16b, v0.16b\n" 21 | + ".word 0x6e809490 // udot v16.4s, v4.16b, v0.16b\n" 22 | 23 | The intended use case is to make asm code easier to compile on toolchains that 24 | do not support certain new instructions. 
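Typical invocations (file names are hypothetical):

  python encode.py < kernel.S > kernel_encoded.S
  python encode.py --fix < kernel_encoded.S > kernel_fixed.S

Without --fix, an existing .word encoding that disagrees with the asm
instruction in its trailing comment is reported as an error; with --fix it is
rewritten in place.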
25 | """ 26 | 27 | import sys 28 | import re 29 | import argparse 30 | 31 | 32 | def encode_udot_sdot_vector(line): 33 | m = re.search( 34 | r'\b([us])dot[ ]+v([0-9]+)[ ]*\.[ ]*4s[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*16b[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*16b', 35 | line) 36 | if not m: 37 | return 0, line 38 | 39 | match = m.group(0) 40 | unsigned = 1 if m.group(1) == 'u' else 0 41 | accum = int(m.group(2)) 42 | lhs = int(m.group(3)) 43 | rhs = int(m.group(4)) 44 | assert accum >= 0 and accum <= 31 45 | assert lhs >= 0 and lhs <= 31 46 | assert rhs >= 0 and rhs <= 31 47 | mcode = 0x4e809400 | (accum << 0) | (lhs << 5) | (rhs << 16) | ( 48 | unsigned << 29) 49 | return mcode, match 50 | 51 | 52 | def encode_udot_sdot_element(line): 53 | m = re.search( 54 | r'\b([us])dot[ ]+v([0-9]+)[ ]*\.[ ]*4s[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*16b[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*4b[ ]*\[([0-9])\]', 55 | line) 56 | if not m: 57 | return 0, line 58 | 59 | match = m.group(0) 60 | unsigned = 1 if m.group(1) == 'u' else 0 61 | accum = int(m.group(2)) 62 | lhs = int(m.group(3)) 63 | rhs = int(m.group(4)) 64 | lanegroup = int(m.group(5)) 65 | assert accum >= 0 and accum <= 31 66 | assert lhs >= 0 and lhs <= 31 67 | assert rhs >= 0 and rhs <= 31 68 | assert lanegroup >= 0 and lanegroup <= 3 69 | l = 1 if lanegroup & 1 else 0 70 | h = 1 if lanegroup & 2 else 0 71 | mcode = 0x4f80e000 | (accum << 0) | (lhs << 5) | (rhs << 16) | (l << 21) | ( 72 | h << 11) | ( 73 | unsigned << 29) 74 | return mcode, match 75 | 76 | 77 | def encode(line): 78 | for encode_func in [encode_udot_sdot_vector, encode_udot_sdot_element]: 79 | mcode, match = encode_func(line) 80 | if mcode: 81 | return mcode, match 82 | return 0, line 83 | 84 | 85 | def read_existing_encoding(line): 86 | m = re.search(r'\.word\ (0x[0-9a-f]+)', line) 87 | if m: 88 | return int(m.group(1), 16) 89 | return 0 90 | 91 | 92 | parser = argparse.ArgumentParser(description='Encode some A64 instructions.') 93 | parser.add_argument( 94 | '-f', 95 | '--fix', 96 | help='fix existing wrong encodings in-place and continue', 97 | action='store_true') 98 | args = parser.parse_args() 99 | 100 | lineno = 0 101 | found_existing_encodings = False 102 | found_error = False 103 | found_fixes = False 104 | for line in sys.stdin: 105 | lineno = lineno + 1 106 | mcode, match = encode(line) 107 | if mcode: 108 | existing_encoding = read_existing_encoding(line) 109 | if existing_encoding: 110 | found_existing_encodings = True 111 | if mcode != existing_encoding: 112 | if args.fix: 113 | line = line.replace('.word 0x%x // %s' % (existing_encoding, match), 114 | '.word 0x%x // %s' % (mcode, match)) 115 | found_fixes = True 116 | else: 117 | sys.stderr.write( 118 | "Error at line %d: existing encoding 0x%x differs from encoding 0x%x for instruction '%s':\n\n%s\n\n" 119 | % (lineno, existing_encoding, mcode, match, line)) 120 | found_error = True 121 | else: 122 | line = line.replace(match, '.word 0x%x // %s' % (mcode, match)) 123 | sys.stdout.write(line) 124 | if found_error: 125 | sys.exit(1) 126 | if found_existing_encodings: 127 | if found_fixes: 128 | sys.stderr.write( 129 | 'Note: some instructions that this program is able to encode, were already encoded and their existing encodings didn\'t match the specified asm instructions. Since --fix was passed, these were fixed in-place.\n' 130 | ) 131 | else: 132 | sys.stderr.write( 133 | 'Note: some instructions that this program is able to encode, were already encoded. 
134 |     )
135 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/AppDelegate.h: --------------------------------------------------------------------------------
1 | //
2 | // AppDelegate.h
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import <UIKit/UIKit.h>
10 | 
11 | @interface AppDelegate : UIResponder <UIApplicationDelegate>
12 | 
13 | @property(strong, nonatomic) UIWindow *window;
14 | 
15 | @end
16 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/AppDelegate.mm: --------------------------------------------------------------------------------
1 | //
2 | // AppDelegate.m
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import "AppDelegate.h"
10 | 
11 | namespace gemmlowp {
12 | extern void benchmark_all();
13 | extern void test();
14 | }
15 | 
16 | @interface AppDelegate ()
17 | 
18 | @end
19 | 
20 | @implementation AppDelegate
21 | 
22 | 
23 | - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
24 |   // Override point for customization after application launch.
25 | 
26 |   gemmlowp::benchmark_all();
27 |   gemmlowp::test();
28 | 
29 |   return YES;
30 | }
31 | 
32 | - (void)applicationWillResignActive:(UIApplication *)application {
33 |   // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
34 |   // Use this method to pause ongoing tasks, disable timers, and throttle down OpenGL ES frame rates. Games should use this method to pause the game.
35 | }
36 | 
37 | - (void)applicationDidEnterBackground:(UIApplication *)application {
38 |   // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
39 |   // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
40 | }
41 | 
42 | - (void)applicationWillEnterForeground:(UIApplication *)application {
43 |   // Called as part of the transition from the background to the inactive state; here you can undo many of the changes made on entering the background.
44 | }
45 | 
46 | - (void)applicationDidBecomeActive:(UIApplication *)application {
47 |   // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
48 | }
49 | 
50 | - (void)applicationWillTerminate:(UIApplication *)application {
51 |   // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
52 | }
53 | 
54 | @end
55 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Base.lproj/LaunchScreen.xib: --------------------------------------------------------------------------------
(Interface Builder XML document; its markup did not survive extraction and is not reproduced here.)
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Base.lproj/Main.storyboard: --------------------------------------------------------------------------------
(Interface Builder XML document; its markup did not survive extraction and is not reproduced here.)
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Images.xcassets/AppIcon.appiconset/Contents.json: --------------------------------------------------------------------------------
1 | {
2 |   "images" : [
3 |     {
4 |       "idiom" : "iphone",
5 |       "size" : "29x29",
6 |       "scale" : "2x"
7 |     },
8 |     {
9 |       "idiom" : "iphone",
10 |       "size" : "29x29",
11 |       "scale" : "3x"
12 |     },
13 |     {
14 |       "idiom" : "iphone",
15 |       "size" : "40x40",
16 |       "scale" : "2x"
17 |     },
18 |     {
19 |       "idiom" : "iphone",
20 |       "size" : "40x40",
21 |       "scale" : "3x"
22 |     },
23 |     {
24 |       "idiom" : "iphone",
25 |       "size" : "60x60",
26 |       "scale" : "2x"
27 |     },
28 |     {
29 |       "idiom" : "iphone",
30 |       "size" : "60x60",
31 |       "scale" : "3x"
32 |     },
33 |     {
34 |       "idiom" : "ipad",
35 |       "size" : "29x29",
36 |       "scale" : "1x"
37 |     },
38 |     {
39 |       "idiom" : "ipad",
40 |       "size" : "29x29",
41 |       "scale" : "2x"
42 |     },
43 |     {
44 |       "idiom" : "ipad",
45 |       "size" : "40x40",
46 |       "scale" : "1x"
47 |     },
48 |     {
49 |       "idiom" : "ipad",
50 |       "size" : "40x40",
51 |       "scale" : "2x"
52 |     },
53 |     {
54 |       "idiom" : "ipad",
55 |       "size" : "76x76",
56 |       "scale" : "1x"
57 |     },
58 |     {
59 |       "idiom" : "ipad",
60 |       "size" : "76x76",
61 |       "scale" : "2x"
62 |     }
63 |   ],
64 |   "info" : {
65 |     "version" : 1,
66 |     "author" : "xcode"
67 |   }
68 | }
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Info.plist: --------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 |   <key>CFBundleDevelopmentRegion</key>
6 |   <string>en</string>
7 |   <key>CFBundleExecutable</key>
8 |   <string>$(EXECUTABLE_NAME)</string>
9 |   <key>CFBundleIdentifier</key>
10 |   <string>com.google.$(PRODUCT_NAME:rfc1034identifier)</string>
11 |   <key>CFBundleInfoDictionaryVersion</key>
12 |   <string>6.0</string>
13 |   <key>CFBundleName</key>
14 |   <string>$(PRODUCT_NAME)</string>
15 |   <key>CFBundlePackageType</key>
16 |   <string>APPL</string>
17 |   <key>CFBundleShortVersionString</key>
18 |   <string>1.0</string>
19 |   <key>CFBundleSignature</key>
20 |   <string>????</string>
21 |   <key>CFBundleVersion</key>
22 |   <string>1</string>
23 |   <key>LSRequiresIPhoneOS</key>
24 |   <true/>
25 |   <key>UILaunchStoryboardName</key>
26 |   <string>LaunchScreen</string>
27 |   <key>UIMainStoryboardFile</key>
28 |   <string>Main</string>
29 |   <key>UIRequiredDeviceCapabilities</key>
30 |   <array>
31 |     <string>armv7</string>
32 |   </array>
33 |   <key>UISupportedInterfaceOrientations</key>
34 |   <array>
35 |     <string>UIInterfaceOrientationPortrait</string>
36 |     <string>UIInterfaceOrientationLandscapeLeft</string>
37 |     <string>UIInterfaceOrientationLandscapeRight</string>
38 |   </array>
39 |   <key>UISupportedInterfaceOrientations~ipad</key>
40 |   <array>
41 |     <string>UIInterfaceOrientationPortrait</string>
42 |     <string>UIInterfaceOrientationPortraitUpsideDown</string>
43 |     <string>UIInterfaceOrientationLandscapeLeft</string>
44 |     <string>UIInterfaceOrientationLandscapeRight</string>
45 |   </array>
46 | </dict>
47 | </plist>
48 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/ViewController.h: --------------------------------------------------------------------------------
1 | //
2 | // ViewController.h
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import <UIKit/UIKit.h>
10 | 
11 | @interface ViewController : UIViewController
12 | 
13 | @end
14 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/ViewController.m: --------------------------------------------------------------------------------
1 | //
2 | // ViewController.m
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import "ViewController.h"
10 | 
11 | @interface ViewController ()
12 | 
13 | @end
14 | 
15 | @implementation ViewController
16 | 
17 | - (void)viewDidLoad {
18 |   [super viewDidLoad];
19 |   // Do any additional setup after loading the view, typically from a nib.
20 | }
21 | 
22 | - (void)didReceiveMemoryWarning {
23 |   [super didReceiveMemoryWarning];
24 |   // Dispose of any resources that can be recreated.
25 | }
26 | 
27 | @end
28 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/main.m: --------------------------------------------------------------------------------
1 | //
2 | // main.m
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import <UIKit/UIKit.h>
10 | #import "AppDelegate.h"
11 | 
12 | int main(int argc, char * argv[]) {
13 |   @autoreleasepool {
14 |     return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
15 |   }
16 | }
17 | 
-------------------------------------------------------------------------------- /test/test.h: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // test.h: shared testing helpers.
16 | 
17 | #ifndef GEMMLOWP_TEST_TEST_H_
18 | #define GEMMLOWP_TEST_TEST_H_
19 | 
20 | #ifdef GEMMLOWP_TEST_PROFILE
21 | #define GEMMLOWP_PROFILING
22 | #include "../profiling/profiler.h"
23 | #endif
24 | 
25 | #include <cstring>
26 | #include <limits>
27 | #include <random>
28 | #include <vector>
29 | 
30 | #include "../public/gemmlowp.h"
31 | 
32 | namespace gemmlowp {
33 | 
34 | #define GEMMLOWP_STRINGIFY2(x) #x
35 | #define GEMMLOWP_STRINGIFY(x) GEMMLOWP_STRINGIFY2(x)
36 | 
37 | #define Check(b)                                                         \
38 |   do {                                                                   \
39 |     ReleaseBuildAssertion(                                               \
40 |         b, "test failed at " __FILE__ ":" GEMMLOWP_STRINGIFY(__LINE__)); \
41 |   } while (false)
42 | 
43 | // gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
44 | // since it only maps existing data. In tests though, we need to
45 | // create our own matrices.
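// Example:
//   Matrix<std::uint8_t, MapOrder::ColMajor> m(3, 4);
//   MakeZero(&m);  // helper defined below
// m.map() and m.const_map() then yield the MatrixMap views that gemmlowp's
// public entry points consume.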
46 | template <typename tScalar, MapOrder tOrder>
47 | class Matrix : public MatrixMap<tScalar, tOrder> {
48 |  public:
49 |   typedef MatrixMap<tScalar, tOrder> Map;
50 |   typedef MatrixMap<const tScalar, tOrder> ConstMap;
51 |   typedef typename Map::Scalar Scalar;
52 |   static constexpr MapOrder Order = tOrder;
53 |   using Map::kOrder;
54 |   using Map::rows_;
55 |   using Map::cols_;
56 |   using Map::stride_;
57 |   using Map::data_;
58 | 
59 |  public:
60 |   Matrix() : Map(nullptr, 0, 0, 0) {}
61 | 
62 |   Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
63 | 
64 |   Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }
65 | 
66 |   Matrix& operator=(const Matrix& other) {
67 |     Resize(other.rows_, other.cols_);
68 |     std::memcpy(data_, other.data_, size() * sizeof(Scalar));
69 |     return *this;
70 |   }
71 | 
72 |   friend bool operator==(const Matrix& a, const Matrix& b) {
73 |     return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
74 |            !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
75 |   }
76 | 
77 |   void Resize(int rows, int cols) {
78 |     rows_ = rows;
79 |     cols_ = cols;
80 |     stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
81 |     storage.resize(size());
82 |     data_ = storage.data();
83 |   }
84 | 
85 |   int size() const { return rows_ * cols_; }
86 | 
87 |   Map& map() { return *static_cast<Map*>(this); }
88 | 
89 |   ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
90 | 
91 |  protected:
92 |   std::vector<Scalar> storage;
93 | };
94 | 
95 | inline std::mt19937& RandomEngine() {
96 |   static std::mt19937 engine;
97 |   return engine;
98 | }
99 | 
100 | inline int Random() {
101 |   std::uniform_int_distribution<int> dist(0, std::numeric_limits<int>::max());
102 |   return dist(RandomEngine());
103 | }
104 | 
105 | #ifdef _MSC_VER
106 | // msvc does not support 8bit types in uniform_int_distribution<>.
107 | // Take 32 bit uniform_int_distribution<> and only use the lower 8 bits.
108 | template <typename OperandRange, typename MatrixType>
109 | void MakeRandom(MatrixType* m) {
110 |   ScopedProfilingLabel label("MakeRandom(matrix)");
111 |   for (int c = 0; c < m->cols(); c++) {
112 |     for (int r = 0; r < m->rows(); r++) {
113 |       (*m)(r, c) = Random() % OperandRange::kMaxValue;
114 |     }
115 |   }
116 | }
117 | #else
118 | template <typename OperandRange, typename MatrixType>
119 | void MakeRandom(MatrixType* m) {
120 |   ScopedProfilingLabel label("MakeRandom(matrix)");
121 |   typedef typename MatrixType::Scalar Scalar;
122 |   std::uniform_int_distribution<Scalar> dist(OperandRange::kMinValue,
123 |                                              OperandRange::kMaxValue);
124 |   for (int c = 0; c < m->cols(); c++) {
125 |     for (int r = 0; r < m->rows(); r++) {
126 |       (*m)(r, c) = dist(RandomEngine());
127 |     }
128 |   }
129 | }
130 | #endif
131 | 
132 | template <typename MatrixType>
133 | void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) {
134 |   ScopedProfilingLabel label("MakeConstant(matrix)");
135 |   for (int c = 0; c < m->cols(); c++) {
136 |     for (int r = 0; r < m->rows(); r++) {
137 |       (*m)(r, c) = val;
138 |     }
139 |   }
140 | }
141 | 
142 | template <typename MatrixType>
143 | void MakeZero(MatrixType* m) {
144 |   ScopedProfilingLabel label("MakeZero(matrix)");
145 |   MakeConstant(m, 0);
146 | }
147 | 
148 | }  // namespace gemmlowp
149 | 
150 | #endif  // GEMMLOWP_TEST_TEST_H_
151 | 
-------------------------------------------------------------------------------- /test/test_allocator.cc: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "test.h"
16 | #include "../internal/allocator.h"
17 | 
18 | namespace gemmlowp {
19 | 
20 | void test_allocator(Allocator* a, int max_array_size) {
21 |   const std::size_t int32_array_size = Random() % max_array_size;
22 |   auto handle_to_int32_array = a->Reserve<std::int32_t>(int32_array_size);
23 |   const std::size_t int8_array_size = Random() % max_array_size;
24 |   auto handle_to_int8_array = a->Reserve<std::int8_t>(int8_array_size);
25 |   a->Commit();
26 |   std::int32_t* int32_array =
27 |       a->GetPointer<std::int32_t>(handle_to_int32_array);
28 |   std::int8_t* int8_array = a->GetPointer<std::int8_t>(handle_to_int8_array);
29 |   Check(int32_array == a->GetPointer<std::int32_t>(handle_to_int32_array));
30 |   Check(int8_array == a->GetPointer<std::int8_t>(handle_to_int8_array));
31 |   Check(
32 |       !(reinterpret_cast<std::uintptr_t>(int32_array) % Allocator::kAlignment));
33 |   Check(
34 |       !(reinterpret_cast<std::uintptr_t>(int8_array) % Allocator::kAlignment));
35 |   Check(reinterpret_cast<std::uintptr_t>(int8_array) >=
36 |         reinterpret_cast<std::uintptr_t>(int32_array + int32_array_size));
37 |   memset(int32_array, 0, sizeof(*int32_array) * int32_array_size);
38 |   memset(int8_array, 0, sizeof(*int8_array) * int8_array_size);
39 |   a->Decommit();
40 | }
41 | 
42 | void test_allocator() {
43 |   Allocator allocator;
44 | 
45 |   // Test allocating increasingly large sizes on the same allocator,
46 |   // starting with size 0.
47 |   for (int i = 1; i < 1000; i += 10) {
48 |     test_allocator(&allocator, i);
49 |   }
50 | }
51 | 
52 | }  // namespace gemmlowp
53 | 
54 | int main() { gemmlowp::test_allocator(); }
55 | 
-------------------------------------------------------------------------------- /test/test_blocking_counter.cc: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <atomic>  // NOLINT
16 | #include <cstdlib>
17 | #include <iostream>
18 | #include <vector>
19 | 
20 | #include "../internal/multi_thread_gemm.h"
21 | #include "../profiling/pthread_everywhere.h"
22 | #include "test.h"
23 | 
24 | namespace gemmlowp {
25 | 
26 | class Thread {
27 |  public:
28 |   Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement)
29 |       : blocking_counter_(blocking_counter),
30 |         number_of_times_to_decrement_(number_of_times_to_decrement),
31 |         made_the_last_decrement_(false),
32 |         finished_(false) {
33 | #if defined GEMMLOWP_USE_PTHREAD
34 |     // Limit the stack size so as not to deplete memory when creating
35 |     // many threads.
36 |     pthread_attr_t attr;
37 |     int err = pthread_attr_init(&attr);
38 |     if (!err) {
39 |       size_t stack_size;
40 |       err = pthread_attr_getstacksize(&attr, &stack_size);
41 |       if (!err && stack_size > max_stack_size_) {
42 |         err = pthread_attr_setstacksize(&attr, max_stack_size_);
43 |       }
44 |       if (!err) {
45 |         err = pthread_create(&thread_, &attr, ThreadFunc, this);
46 |       }
47 |     }
48 |     if (err) {
49 |       std::cerr << "Failed to create a thread.\n";
50 |       std::abort();
51 |     }
52 | #else
53 |     pthread_create(&thread_, nullptr, ThreadFunc, this);
54 | #endif
55 |   }
56 | 
57 |   ~Thread() { Join(); }
58 | 
59 |   bool Join() {
60 |     while (!finished_.load()) {
61 |     }
62 |     return made_the_last_decrement_;
63 |   }
64 | 
65 |  private:
66 |   Thread(const Thread& other) = delete;
67 | 
68 |   void ThreadFunc() {
69 |     for (int i = 0; i < number_of_times_to_decrement_; i++) {
70 |       Check(!made_the_last_decrement_);
71 |       made_the_last_decrement_ = blocking_counter_->DecrementCount();
72 |     }
73 |     finished_.store(true);
74 |   }
75 | 
76 |   static void* ThreadFunc(void* ptr) {
77 |     static_cast<Thread*>(ptr)->ThreadFunc();
78 |     return nullptr;
79 |   }
80 | 
81 |   static constexpr size_t max_stack_size_ = 256 * 1024;
82 |   BlockingCounter* const blocking_counter_;
83 |   const int number_of_times_to_decrement_;
84 |   pthread_t thread_;
85 |   bool made_the_last_decrement_;
86 |   // finished_ is used to manually implement Join() by busy-waiting.
87 |   // I wanted to use pthread_join / std::thread::join, but the behavior
88 |   // observed on Android was that pthread_join aborts when the thread has
89 |   // already joined before calling pthread_join, making that hard to use.
90 |   // It appeared simplest to just implement this simple spinlock, and that
91 |   // is good enough as this is just a test.
92 |   std::atomic<bool> finished_;
93 | };
94 | 
95 | void test_blocking_counter(BlockingCounter* blocking_counter, int num_threads,
96 |                            int num_decrements_per_thread,
97 |                            int num_decrements_to_wait_for) {
98 |   std::vector<Thread*> threads;
99 |   blocking_counter->Reset(num_decrements_to_wait_for);
100 |   for (int i = 0; i < num_threads; i++) {
101 |     threads.push_back(new Thread(blocking_counter, num_decrements_per_thread));
102 |   }
103 |   blocking_counter->Wait();
104 | 
105 |   int num_threads_that_made_the_last_decrement = 0;
106 |   for (int i = 0; i < num_threads; i++) {
107 |     if (threads[i]->Join()) {
108 |       num_threads_that_made_the_last_decrement++;
109 |     }
110 |     delete threads[i];
111 |   }
112 |   Check(num_threads_that_made_the_last_decrement == 1);
113 | }
114 | 
115 | void test_blocking_counter() {
116 |   BlockingCounter* blocking_counter = new BlockingCounter;
117 | 
118 |   // Repeating the entire test sequence ensures that we test
119 |   // non-monotonic changes.
120 |   for (int repeat = 1; repeat <= 2; repeat++) {
121 |     for (int num_threads = 1; num_threads <= 5; num_threads++) {
122 |       for (int num_decrements_per_thread = 1;
123 |            num_decrements_per_thread <= 4 * 1024;
124 |            num_decrements_per_thread *= 16) {
125 |         test_blocking_counter(blocking_counter, num_threads,
126 |                               num_decrements_per_thread,
127 |                               num_threads * num_decrements_per_thread);
128 |       }
129 |     }
130 |   }
131 |   delete blocking_counter;
132 | }
133 | 
134 | }  // end namespace gemmlowp
135 | 
136 | int main() { gemmlowp::test_blocking_counter(); }
137 | 
-------------------------------------------------------------------------------- /test/test_data.h: --------------------------------------------------------------------------------
1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef GEMMLOWP_TEST_TEST_DATA_H_
16 | #define GEMMLOWP_TEST_TEST_DATA_H_
17 | 
18 | namespace test_data {
19 | 
20 | extern const bool is_a_transposed;
21 | extern const bool is_b_transposed;
22 | extern const bool is_c_transposed;
23 | extern const int m;
24 | extern const int n;
25 | extern const int k;
26 | extern const int a_offset;
27 | extern const int b_offset;
28 | extern const int c_shift;
29 | extern const int c_mult_int;
30 | extern const int c_offset;
31 | 
32 | extern const int a_count;
33 | extern const int b_count;
34 | extern const int c_count;
35 | 
36 | extern unsigned char a_data[];
37 | extern unsigned char b_data[];
38 | extern unsigned char expected_c_data[];
39 | 
40 | }  // namespace test_data
41 | 
42 | #endif  // GEMMLOWP_TEST_TEST_DATA_H_
43 | 
-------------------------------------------------------------------------------- /test/test_math_helpers.cc: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "test.h"
16 | 
17 | #include <limits>
18 | 
19 | #include "../internal/common.h"
20 | 
21 | namespace gemmlowp {
22 | 
23 | // Our math helpers don't intend to be reliable all the way to the
24 | // limit of representable range, wrt overflow.
25 | // We don't care for 2G sized matrices.
26 | // This test stops at half of the representable range.
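// For example, with Integer = std::int8_t, the cutoff below is 127 / 2 = 63.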
27 | template <typename Integer>
28 | Integer ValueRangeCutoff() {
29 |   return std::numeric_limits<Integer>::max() / 2;
30 | }
31 | 
32 | int RandomNonnegativeFarAwayFromOverflow() { return Random() % (1 << 24); }
33 | 
34 | template <int Modulus>
35 | void test_round_up_down(int x) {
36 |   Check(x >= RoundDown<Modulus>(x));
37 |   Check(x < RoundDown<Modulus>(x) + Modulus);
38 |   Check(RoundDown<Modulus>(x) % Modulus == 0);
39 | 
40 |   Check(x <= RoundUp<Modulus>(x));
41 |   Check(x > RoundUp<Modulus>(x) - Modulus);
42 |   Check(RoundUp<Modulus>(x) % Modulus == 0);
43 | }
44 | 
45 | template <int Modulus>
46 | void test_round_up_down() {
47 |   for (int i = 0; i < 100; i++) {
48 |     test_round_up_down<Modulus>(i);
49 |     const int N = ValueRangeCutoff<int>();
50 |     test_round_up_down<Modulus>(Random() % N);
51 |   }
52 | }
53 | 
54 | template <typename Integer>
55 | void test_ceil_quotient(Integer x, Integer y) {
56 |   Check(CeilQuotient(x, y) * y >= x);
57 |   Check(CeilQuotient(x, y) * y < x + y);
58 | }
59 | 
60 | template <typename Integer>
61 | void test_ceil_quotient() {
62 |   const Integer N = ValueRangeCutoff<Integer>();
63 |   const Integer K = std::min(N, Integer(100));
64 |   for (Integer x = 0; x < K; x++) {
65 |     for (Integer y = 1; y < K; y++) {
66 |       test_ceil_quotient(x, y);
67 |       test_ceil_quotient(x, Integer(1 + (Random() % (N - 1))));
68 |       test_ceil_quotient(Integer(Random() % N), y);
69 |       test_ceil_quotient(Integer(Random() % N),
70 |                          Integer(1 + (Random() % (N - 1))));
71 |     }
72 |   }
73 | }
74 | 
75 | template <typename Integer>
76 | void test_round_up_to_next_power_of_two(Integer x) {
77 |   Check(RoundUpToPowerOfTwo(RoundUpToPowerOfTwo(x)) == RoundUpToPowerOfTwo(x));
78 |   Check(RoundUpToPowerOfTwo(x) >= x);
79 |   Check(x == 0 || RoundUpToPowerOfTwo(x) < 2 * x);
80 |   Check((RoundUpToPowerOfTwo(x) & (RoundUpToPowerOfTwo(x) - 1)) == 0);
81 | }
82 | 
83 | template <typename Integer>
84 | void test_round_up_to_next_power_of_two() {
85 |   const Integer N = ValueRangeCutoff<Integer>();
86 |   const Integer K = std::min(N, Integer(100));
87 |   for (Integer x = 0; x < K; x++) {
88 |     test_round_up_to_next_power_of_two(x);
89 |     test_round_up_to_next_power_of_two(Integer(Random() % N));
90 |   }
91 | }
92 | 
93 | void test_math_helpers() {
94 |   test_round_up_down<1>();
95 |   test_round_up_down<2>();
96 |   test_round_up_down<3>();
97 |   test_round_up_down<4>();
98 |   test_round_up_down<5>();
99 |   test_round_up_down<6>();
100 |   test_round_up_down<7>();
101 |   test_round_up_down<8>();
102 |   test_round_up_down<9>();
103 |   test_round_up_down<10>();
104 |   test_round_up_down<11>();
105 |   test_round_up_down<12>();
106 |   test_round_up_down<13>();
107 |   test_round_up_down<14>();
108 |   test_round_up_down<15>();
109 |   test_round_up_down<16>();
110 | 
111 |   test_round_up_down<50>();
112 |   test_round_up_down<51>();
113 | 
114 |   test_round_up_down<500>();
115 |   test_round_up_down<501>();
116 | 
117 |   test_ceil_quotient<std::int8_t>();
118 |   test_ceil_quotient<std::uint8_t>();
119 |   test_ceil_quotient<std::int16_t>();
120 |   test_ceil_quotient<std::uint16_t>();
121 |   test_ceil_quotient<std::int32_t>();
122 |   test_ceil_quotient<std::uint32_t>();
123 | 
124 |   test_round_up_to_next_power_of_two<std::int8_t>();
125 |   test_round_up_to_next_power_of_two<std::uint8_t>();
126 |   test_round_up_to_next_power_of_two<std::int16_t>();
127 |   test_round_up_to_next_power_of_two<std::uint16_t>();
128 |   test_round_up_to_next_power_of_two<std::int32_t>();
129 |   test_round_up_to_next_power_of_two<std::uint32_t>();
130 | }
131 | 
132 | }  // end namespace gemmlowp
133 | 
134 | int main() { gemmlowp::test_math_helpers(); }
135 | 
-------------------------------------------------------------------------------- /todo/armv8-64bit-kernel-for-less-than-8-bit.txt: --------------------------------------------------------------------------------
1 | TODO: Port the ARMv7 (32bit) less-than-8-bit GEMM kernel to ARMv8 (64bit)
2 | 
3 | Platforms: ARM NEON
4 | 
5 | Coding time: M
6 | Experimentation time: M
7 | Skill required: M
8 | 
9 | Prerequisite reading:
10 |   doc/kernel.md
11 |   doc/packing.md
12 | 
13 | Model to follow/adapt:
14 |   internal/kernel_neon.h
15 | 
16 | In internal/kernel_neon.h, for ARMv7 (32bit), we have a kernel
17 | specifically designed to take advantage of smaller operand ranges
18 | to use 16-bit local accumulators to achieve higher arithmetic throughput:
19 | 
20 |   NEON_32_Kernel12x4Depth2Assuming12BitProducts
21 | 
22 | This is the kernel used with BitDepthSetting::L7R5 and is what allows
23 | this bit depth setting to outperform L8R8.
24 | 
25 | This TODO item is about porting it to ARMv8 (64bit) assembly. It can
26 | be approached in two parts:
27 | 
28 | 1. Make a trivial port of the existing ARMv7 assembly code in
29 |    NEON_32_Kernel12x4Depth2Assuming12BitProducts to ARMv8 assembly.
30 | 
31 | 2. Consider ways to make use of the larger register space available on ARMv8:
32 |    there are 32 128-bit vector registers, instead of 16 on ARMv7.
33 |    A simple way, and quite possibly the best, would be to take the same
34 |    approach already implemented in NEON_64_Kernel12x8Depth2:
35 |    When porting a 12x4 kernel from ARMv7 to ARMv8, the extra register
36 |    space can be put to good use by doubling the RHS kernel width, from
37 |    4 to 8, thus changing the 12x4 kernel size to 12x8. Since cells
38 |    are of width 4, this means switching from 1 RHS cell to 2 RHS cells.
39 |    Since everything else remains unchanged, this should be a rather
40 |    simple change to implement. Compare NEON_64_Kernel12x8Depth2
41 |    to NEON_32_Kernel12x4Depth2.
42 | 
-------------------------------------------------------------------------------- /todo/error-diffusion-experiments.txt: --------------------------------------------------------------------------------
1 | TODO: Error diffusion experiments
2 | 
3 | Platforms: all
4 | 
5 | Coding time: M
6 | Experimentation time: XL
7 | Skill required: XL
8 | 
9 | Prerequisite reading:
10 |   doc/less-than-8-bit.md
11 | 
12 | 
13 | Overview
14 | ========
15 | 
16 | In internal/pack.h, the Requantize function takes care of requantizing
17 | input 8 bit values to less than 8 bit. This is currently done either by
18 | rounding-to-nearest, or by probabilistic rounding.
19 | 
20 | People have suggested trying error diffusion instead.
21 |   https://en.wikipedia.org/wiki/Error_diffusion
22 | This technique, originally from graphics, might be adaptable to GEMM; however,
23 | that is far from trivial.
24 | 
25 | Still, it may be worth experimenting with it, as the reward of higher accuracy
26 | could be very worthwhile, especially if it allows us to explore even smaller
27 | bit depths.
28 | 
29 | 
30 | Why getting error diffusion to work is nontrivial
31 | =================================================
32 | 
33 | In graphics, there is only one array to
34 | apply error diffusion to, and the criteria are mostly aesthetic. Here in GEMM,
35 | there are two arrays involved, allowing for unwanted interaction between the
36 | error diffusion terms added on either side separately; and we have stringent
37 | accuracy criteria.
38 | 
39 | Here is a toy example showing how naive approaches to error diffusion may
40 | suffer from unwanted interactions between the LHS and RHS separate error
41 | diffusion terms:
42 | 
43 | Say that we're working on 1-dimensional data (as opposed to 2-D matrices) to
44 | simplify the discussion.
45 | 
46 | Say that our input values are real numbers in [0, 1] and that we're quantizing
47 | them to either 0 or 1.
48 | 
49 | Say that the left-hand-side is filled with the constant value 0.9 and that our
50 | error-diffusion filter results in the following sequence of quantized values:
51 |   1 (repeated 9 times), 0, ... (repeat).
52 | 
53 | Say that the right-hand-side is filled with the constant value 0.1 and that our
54 | error-diffusion filter results in the following sequence of quantized values:
55 |   0 (repeated 9 times), 1, ... (repeat).
56 | 
57 | So if we compute the dot product (which is what we really do in a GEMM) of
58 | these quantized vectors, we're computing
59 |   1*0 + ... (repeated 9 times) + 0*1 + ... (repeat)
60 | 
61 | So we get exactly 0! This shows how a naive approach to error diffusion may
62 | suffer from bias issues similar to round-to-nearest.
63 | 
64 | 
65 | Some avenues to explore to make error diffusion work
66 | ====================================================
67 | 
68 | 1. Maybe some fixed error diffusion kernels just happen to avoid that issue?
69 | 
70 | 2. Maybe it's just a matter of doing error diffusion for a different vector
71 |    error metric, e.g. l^2 instead of l^1?
72 | 
73 | 3. Maybe some randomization (adding some random term to the error term being
74 |    diffused) would be acceptable? It seems like it would allow us to avoid the
75 |    interference problem discussed above.
76 | 
77 | 
78 | Performance considerations
79 | ==========================
80 | 
81 | Error diffusion is going to be relatively expensive compared to the current
82 | requantization methods. It may be acceptable for large enough GEMM depth,
83 | since it only needs to be applied once for the n^2 input matrix entries, thus
84 | becoming negligible compared to the n^3 arithmetic cost of GEMM for large
85 | enough n.
86 | 
87 | Alternatively, we may consider doing requantization of some matrices once and
88 | for all, but that would likely be the case only for one of LHS or RHS;
89 | otherwise one might as well precompute the whole GEMM.
90 | 
-------------------------------------------------------------------------------- /todo/less-than-8-bit-without-requantization.txt: --------------------------------------------------------------------------------
1 | TODO: Discard the old requantization stuff, keep less-than-8-bit kernels, expose
2 | them as a different contract whereby the user specifies that operands use less
3 | than 8 bits.
4 | 
5 | Read: doc/less-than-8-bit.md
6 | 
7 | This is about going from "the present" to "the future" as described there.
8 | 
9 | Discard all requantization stuff.
10 | 
11 | Probably no need to worry about compatibility; this was little used.
12 | 
13 | Instead, add a new option, in the form of new "bit depth params", whereby the user can specify that operands use less than 8 bits (even though they are represented as std::uint8_t). For example, specifying 6 bits would mean that the contract is that the user guarantees that LHS matrix entries are in the [0, 63] interval.
14 | 
15 | Then make use of that to select kernels that take advantage of the lower bit depth. The existing less-than-8-bit kernels would work as-is, only now no requantization would be needed anymore in the packing stage, and no rescaling in the unpacking stage.
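As a purely illustrative sketch of what such "bit depth params" could look
like (the struct and member names below are hypothetical, not an existing
gemmlowp API):

  // Hypothetical params: the user guarantees that LHS and RHS entries,
  // while still stored as std::uint8_t, fit in 6 bits, i.e. in [0, 63].
  struct Custom6BitDepthParams {
    static const int kLhsBits = 6;
    static const int kRhsBits = 6;
  };

Kernel selection would then key off these constants, e.g. via something like
GemmWithOutputPipeline<std::uint8_t, std::uint8_t, Custom6BitDepthParams>(...).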
16 | 
-------------------------------------------------------------------------------- /todo/multi-threading-experiments.txt: --------------------------------------------------------------------------------
1 | TODO: Multi-threading experiments for better performance on smaller GEMM sizes
2 | 
3 | Platforms: all, but special focus should be put on mobile OSes (Android...)
4 | where thread scheduling seems to be unfavorable to throughput.
5 | 
6 | Coding time: Unknown
7 | Experimentation time: XL
8 | Skill required: XL
9 | 
10 | Relevant file:
11 |   internal/multi_thread_gemm.h
12 | 
13 | 
14 | The problem, and what we have done about it so far
15 | ==================================================
16 | 
17 | It's easy to get a multi-threaded GEMM implementation to perform well
18 | for large enough GEMM sizes, because then the parallel workloads are large
19 | enough compared to the synchronization overhead. In gemmlowp however,
20 | we are specifically interested in "medium" GEMM sizes, of the order of 100,
21 | which are small enough to make synchronization overhead dominant in many
22 | situations.
23 | 
24 | We have already implemented some changes that were very effective at getting
25 | good multi-threading benefits for smaller GEMM sizes:
26 | https://github.com/google/gemmlowp/commit/210ac891d6d2d0749f7856103c928d9be70ded94
27 | Let us paste the commit message:
28 |   1. Use only N-1 worker threads while the master plays the role
29 |      of the Nth worker, where N is the number of cores.
30 |      This 1:1 mapping of threads to cores gives much better perf
31 |      esp. for not-very-large GEMMs and esp. on Android.
32 |   2. Implement waiting by actually busy-waiting for a little while
33 |      before eventually falling back to passive waiting. That
34 |      ensures that we wake up quickly from short naps, which helps
35 |      with not-very-large GEMMs esp. on Android.
36 | 
37 | These changes revolved around the idea that when the GEMM size is too small to
38 | be efficiently supported by the OS's threading primitives, we can instead
39 | present the OS with a very simple workload: exactly as many threads as there
40 | are CPU cores, and these threads being always busy, never waiting. This makes
41 | it easy for the OS to decide to bring all CPU cores online and give each of our
42 | threads its own CPU core, and occupy it nearly 100% of the time, thus avoiding
43 | having to wait to get scheduled.
44 | 
45 | The cost of waiting (or in particular, of locking) is not just the time it
46 | takes; especially on mobile platforms, it is also the side effects of getting
47 | our threads de-scheduled by the OS, of getting CPUs spun down, etc. With that
48 | in mind, anything that can help us avoid waiting/locking in an OS-visible way
49 | is worth experimenting with.
50 | 
51 | 
52 | Other things that would be worth experimenting with
53 | ===================================================
54 | 
55 | 
56 | Busy-waiting in mutex-locking too
57 | ---------------------------------
58 | 
59 | While we have replaced most of the pthread_cond_wait waiting with busy-waiting
60 | in WaitForVariableChange, on the other hand we are still calling
61 | pthread_mutex_lock in a couple of places outside of WaitForVariableChange.
62 | It might be interesting to avoid that too, by having a mutex-locking
63 | implementation that first spends some time busy-waiting before actually
64 | resorting to calling pthread_mutex_lock.
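For illustration, here is a minimal sketch of such a wrapper (not existing
gemmlowp code; the spin count is a made-up tuning parameter that would have to
be chosen empirically):

  // Spin on pthread_mutex_trylock for a while before falling back to a
  // regular, OS-visible blocking pthread_mutex_lock.
  inline void SpinThenLock(pthread_mutex_t* mutex) {
    const int kMaxSpins = 10 * 1000;  // hypothetical tuning parameter
    for (int i = 0; i < kMaxSpins; i++) {
      if (pthread_mutex_trylock(mutex) == 0) {
        return;  // acquired the lock without blocking
      }
    }
    pthread_mutex_lock(mutex);  // fall back to passive waiting
  }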
65 | 
66 | 
67 | Minimizing locking
68 | ------------------
69 | 
70 | The inherent synchronization points of the GEMM, which we essentially can't
71 | avoid, are already implemented using WaitForVariableChange, so they are
72 | already using busy-waiting over short periods of time, which is the
73 | best that we can do. On the other hand, we are also using mutex locking
74 | in a couple of places: around updates to the State of worker threads, and
75 | around updates to the counter value in BlockingCounter. The locking done
76 | there is unnecessary: it could be replaced by atomic operations, and in
77 | fact, because our thread structure is so simple and rigid, even atomic
78 | operations might not be needed at all, as long as we ensure basic
79 | memory ordering. A precise understanding of the CPU's memory model
80 | is needed here, and the outcome could depend on the CPU architecture.
81 | 
82 | 
83 | Restructuring the GEMM to remove synchronization points
84 | -------------------------------------------------------
85 | 
86 | Compared to the above ideas, this one is a much bigger departure from
87 | what we are currently doing.
88 | 
89 | The current structure of our multi-threaded GEMM is:
90 |   for_each(slice_of_RHS) {
91 |     pack(slice_of_RHS);
92 |     for_each(slice_of_LHS) {
93 |       do_gemm_on_some_thread(slice_of_LHS, packed_slice_of_RHS)
94 |     }
95 |     wait_for_all_threads();  // synchronization point
96 |   }
97 | 
98 | Thus we have a synchronization point at the end of each slice of RHS.
99 | 
100 | The motivation for this design is to have all threads work on a single
101 | large slice of RHS, occupying top-level (shared among cores) CPU cache.
102 | 
103 | Thus the current approach is optimized for cache-friendliness at the
104 | expense of parallelization. Maybe we should consider amending it
105 | to strike a better balance of cache-friendliness vs. parallelization.
106 | 
107 | For instance, we could have a "pipeline" where at a given time we have
108 | *two* slices of RHS packed into top-level CPU cache. We would normally
109 | schedule thread tasks to work with the first of these two RHS slices;
110 | whenever a thread task is done, we would immediately give the thread
111 | a new task, and if we are already done with the first RHS slice, we
112 | could then immediately start a task against the second RHS slice.
113 | 
114 | There could still be some necessary waiting, if one thread is lagging
115 | behind another by more than one full RHS slice; but that should be a
116 | lot better than the current situation, where we wait at the end of
117 | each slice.
-------------------------------------------------------------------------------- /todo/neon-depth-major-sources-packing.txt: --------------------------------------------------------------------------------
1 | TODO: Implement depth-major-sources packing paths for NEON
2 | 
3 | Platforms: ARM NEON
4 | 
5 | Coding time: M
6 | Experimentation time: M
7 | Skill required: M
8 | 
9 | Prerequisite reading:
10 |   doc/kernel.md
11 |   doc/packing.md
12 | 
13 | Model to follow/adapt:
14 |   internal/pack_neon.h
15 | 
16 | At the moment we have NEON optimized packing paths for WidthMajor sources.
17 | We also need paths for DepthMajor sources.
18 | 
19 | This is harder because for DepthMajor sources, the size of each slice that
20 | we have to load is the kernel's width, which is typically 12 (for the LHS)
21 | or 4 (for the RHS). That's not very friendly to NEON vector-load instructions
22 | which would allow us to load 8 or 16 entries, but not 4 or 12.
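For concreteness, here is a rough sketch of the lane-load approach discussed
just below (src0..src3 are hypothetical pointers into four depth-major source
slices; assumes #include <arm_neon.h>, and beware that vld1q_lane_u32 expects
4-byte-aligned pointers):

  // Gather four 4-byte groups (4 uint8 entries each) into one 128-bit
  // register, one lane at a time.
  uint32x4_t q = vdupq_n_u32(0);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src0), q, 0);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src1), q, 1);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src2), q, 2);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src3), q, 3);
  uint8x16_t bytes = vreinterpretq_u8_u32(q);  // back to 16 uint8 entries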
23 | 
24 | So, as the sketch above illustrates, you will have to load 4 entries at a
25 | time only. For that, the vld1q_lane_u32 intrinsic seems to be as good as
26 | you'll get. The other possible approach would be to load (with plain scalar
27 | C++) four uint32 values into a temporary local buffer, and use vld1q_u8 on
28 | that. Some experimentation will be useful here. For that, you can generate
29 | assembly with -save-temps and make assembly easier to inspect by inserting
30 | inline assembly comments such as
31 |   asm volatile("#hello");
32 | 
-------------------------------------------------------------------------------- /todo/remove-default-template-param-values.txt: --------------------------------------------------------------------------------
1 | TODO: Remove default template parameter values
2 | 
3 | Platforms: all
4 | 
5 | Coding time: S
6 | Experimentation time: S
7 | Skill required: S
8 | 
9 | We should generally not have default values for template parameters: this makes
10 | code harder to read, and is discouraged by the Google C++ style guide for good
11 | reason.
12 | 
13 | Specifically, I'm concerned about CellFormat having the CellOrder parameter
14 | defaulting to WidthMajor. This specific case has been causing confusion.
15 | 
16 | There might be other instances to fix. Part of this TODO item is to audit that.
17 | 
18 | One exception in which default template parameters are OK is for locally
19 | contained metaprogramming helpers that are a local implementation detail.
20 | However, I don't know if we have any such case in gemmlowp. We're generally
21 | conservative with template metaprogramming around here.
22 | 
-------------------------------------------------------------------------------- /todo/x86-kernels.txt: --------------------------------------------------------------------------------
1 | TODO: Implement a full set of kernels for x86
2 | 
3 | Platforms: x86, different variants: 32/64bit, SSE*/AVX* etc.
4 | 
5 | Coding time: XL
6 | Experimentation time: XL
7 | Skill required: XL
8 | 
9 | Prerequisite reading:
10 |   doc/kernel.md
11 | 
12 | Model to follow/adapt:
13 |   internal/kernel_neon.h
14 | 
15 | We need a full set of kernels for x86 architectures.
16 | By "a full set" we mean: covering all the variants of x86 instruction sets,
17 | and covering our different cases, by decreasing order of importance:
18 |   1. GEMM, BitDepthSetting::L8R8
19 |   2. GEMM, BitDepthSetting::L7R5
20 |   3. GEMV, BitDepthSetting::L8R8 (that one may be deprecated when we eventually
21 |      implement GEMV more efficiently as a fully specialized operation)
22 | 
23 | This generally has to be done separately for 32bit vs 64bit because an
24 | efficient GEMM kernel needs to use all the register space that it
25 | can get, and:
26 |   - That register space is generally different on 32bit vs 64bit;
27 |   - C++ compilers have a hard time doing good register allocation for
28 |     intrinsics-using code that's very tight on vector registers, so in
29 |     practice we generally prefer to implement kernels in (inline) assembly.
30 | 
31 | At the moment we have a couple of kernels targeting SSE4 for the
32 | (GEMM, BitDepthSetting::L8R8) case, contributed by Intel.
33 | We need to cover the other x86 instruction set variants and to cover the
34 | other cases, at least the L7R5 case.
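For orientation, whatever the instruction set, the computation such a kernel
must implement is just the reference accumulation (see
internal/kernel_reference.h), sketched here with hypothetical array names:

  // Reference semantics: int32 accumulators over uint8 operand products.
  for (int d = 0; d < depth; d++) {
    acc[r][c] += static_cast<std::int32_t>(lhs[r][d]) *
                 static_cast<std::int32_t>(rhs[d][c]);
  }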
35 | 
36 | We label this TODO item as XL because, unless one knows the CPU inside out
37 | (only CPU vendors do), it generally takes a lot of trial-and-error to arrive
38 | at an optimally performing solution, and in any case, given the number of
39 | different x86 instruction set flavors, the end result will be a large body
40 | of assembly code.
41 | 
--------------------------------------------------------------------------------