├── .gitignore ├── .travis.yml ├── AUTHORS ├── BUILD ├── CONTRIBUTING ├── CONTRIBUTORS ├── LICENSE ├── Makefile.travis ├── README.md ├── WORKSPACE ├── contrib └── CMakeLists.txt ├── doc ├── design.md ├── kernel.md ├── less-than-8-bit.md ├── low-precision.md ├── output.md ├── packing.md ├── public.md ├── quantization.md └── quantization_example.cc ├── eight_bit_int_gemm ├── eight_bit_int_gemm.cc └── eight_bit_int_gemm.h ├── fixedpoint ├── fixedpoint.h ├── fixedpoint_avx.h ├── fixedpoint_msa.h ├── fixedpoint_neon.h ├── fixedpoint_sse.h └── fixedpoint_wasmsimd.h ├── flags.bzl ├── internal ├── allocator.h ├── block_params.h ├── common.h ├── compute.h ├── detect_platform.h ├── dispatch_gemm_shape.h ├── kernel.h ├── kernel_avx.h ├── kernel_default.h ├── kernel_msa.h ├── kernel_neon.h ├── kernel_reference.h ├── kernel_sse.h ├── multi_thread_gemm.h ├── output.h ├── output_avx.h ├── output_msa.h ├── output_neon.h ├── output_sse.h ├── pack.h ├── pack_avx.h ├── pack_msa.h ├── pack_neon.h ├── pack_sse.h ├── platform.h ├── simd_wrappers.h ├── simd_wrappers_common_neon_sse.h ├── simd_wrappers_msa.h ├── simd_wrappers_neon.h ├── simd_wrappers_sse.h ├── single_thread_gemm.h └── unpack.h ├── jni ├── Android.mk └── Application.mk ├── meta ├── README ├── base.h ├── generators │ ├── cc_emitter.py │ ├── common.py │ ├── metagemm_generate_headers.sh │ ├── neon_emitter.py │ ├── neon_emitter_64.py │ ├── quantized_mul_kernels_arm_32.py │ ├── quantized_mul_kernels_arm_64.py │ ├── quantized_mul_kernels_common.py │ ├── streams_arm_32.py │ ├── streams_arm_64.py │ ├── streams_common.py │ ├── transform_kernels_arm_32.py │ ├── transform_kernels_arm_64.py │ └── transform_kernels_common.py ├── legacy_multi_thread_common.h ├── legacy_multi_thread_gemm.h ├── legacy_multi_thread_gemv.h ├── legacy_operations_common.h ├── legacy_single_thread_gemm.h ├── multi_thread_common.h ├── multi_thread_gemm.h ├── multi_thread_transform.h ├── quantized_mul_kernels.h ├── quantized_mul_kernels_arm_32.h ├── quantized_mul_kernels_arm_64.h ├── single_thread_gemm.h ├── single_thread_transform.h ├── streams.h ├── streams_arm_32.h ├── streams_arm_64.h ├── test_gemm_correctness.cc ├── test_streams_correctness.cc ├── test_transform_benchmark.cc ├── test_transform_correctness.cc ├── transform_kernels.h ├── transform_kernels_arm_32.h └── transform_kernels_arm_64.h ├── profiling ├── instrumentation.h ├── profiler.h └── pthread_everywhere.h ├── public ├── bit_depth.h ├── gemmlowp.h ├── map.h └── output_stages.h ├── scripts ├── ci-before.sh ├── ci-test.sh └── test-android.sh ├── standalone ├── cache_counters.cc ├── encode.py └── neon-gemm-kernel-benchmark.cc ├── test ├── benchmark.cc ├── benchmark_all_sizes.cc ├── benchmark_meta_gemm.cc ├── correctness_meta_gemm.cc ├── ios │ ├── gemmlowp_test.xcodeproj │ │ └── project.pbxproj │ └── gemmlowp_test │ │ ├── AppDelegate.h │ │ ├── AppDelegate.mm │ │ ├── Base.lproj │ │ ├── LaunchScreen.xib │ │ └── Main.storyboard │ │ ├── Images.xcassets │ │ └── AppIcon.appiconset │ │ │ └── Contents.json │ │ ├── Info.plist │ │ ├── ViewController.h │ │ ├── ViewController.m │ │ └── main.m ├── test.cc ├── test.h ├── test_allocator.cc ├── test_blocking_counter.cc ├── test_data.cc ├── test_data.h ├── test_fixedpoint.cc └── test_math_helpers.cc └── todo ├── armv8-64bit-kernel-for-less-than-8-bit.txt ├── error-diffusion-experiments.txt ├── fast-gemv.txt ├── less-than-8-bit-without-requantization.txt ├── multi-threading-experiments.txt ├── neon-depth-major-sources-packing.txt ├── remove-default-template-param-values.txt └── 
x86-kernels.txt /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.ii 3 | *.s 4 | **/.DS_Store 5 | ? 6 | ?? 7 | *binary* 8 | /.idea/ 9 | CMakeLists.txt 10 | /bazel-* 11 | cmake_build/ 12 | cmake_install/ -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | 2 | sudo: false 3 | 4 | jobs: 5 | include: 6 | - stage: build 7 | name: Android NDK 8 | language: android 9 | compiler: clang 10 | os: 11 | - linux 12 | env: 13 | - NDK_VERSION=r14b TEST=arm 14 | - TEST=x86 15 | android: 16 | components: 17 | - build-tools-22.0.1 18 | - android-22 19 | - ndk-bundle 20 | - sys-img-armeabi-v7a-android-22 21 | before_script: 22 | - ./scripts/ci-before.sh 23 | script: 24 | - ./scripts/ci-test.sh 25 | 26 | - name: Linux CMake(clang) 27 | os: linux 28 | dist: bionic 29 | language: cpp 30 | compiler: clang 31 | script: 32 | - cmake -S contrib -B cmake_build -DCMAKE_INSTALL_PREFIX=cmake_install 33 | - cmake --build cmake_build 34 | - cmake --build cmake_build --target install 35 | - ctest --test-dir cmake_build --output-on-failure --output-junit TEST-${TRAVIS_COMMIT}.xml 36 | -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the official list of gemmlowp authors for copyright purposes. 2 | # This file is distinct from the CONTRIBUTORS.txt file. 3 | # See the latter for an explanation. 4 | 5 | # Names should be added to this file as: 6 | # Name or Organization 7 | # The email address is not required for organizations. 8 | 9 | Google Inc. 10 | Intel Corporation 11 | ARM Ltd. 12 | Silk Labs Inc. 13 | MIPS Tech LLC 14 | Wave Computing Inc. 15 | -------------------------------------------------------------------------------- /BUILD: -------------------------------------------------------------------------------- 1 | # 2 | # Description: 3 | # gemmlowp is a small self-contained low-precision GEMM library. 
4 | # https://github.com/google/gemmlowp 5 | 6 | licenses(["notice"]) # Apache 2.0 7 | 8 | exports_files(["LICENSE"]) 9 | 10 | config_setting( 11 | name = "windows", 12 | values = { 13 | "cpu": "x64_windows", 14 | }, 15 | ) 16 | 17 | config_setting( 18 | name = "android", 19 | values = { 20 | "crosstool_top": "//external:android/crosstool", 21 | }, 22 | ) 23 | 24 | load(":flags.bzl", "LIB_COPTS", "LIB_LINKOPTS", "BIN_LINKOPTS") 25 | 26 | filegroup( 27 | name = "gemmlowp_private_headers", 28 | srcs = glob([ 29 | "fixedpoint/*.h", 30 | "internal/*.h", 31 | ]), 32 | visibility = ["//visibility:private"], 33 | ) 34 | 35 | filegroup( 36 | name = "gemmlowp_public_headers", 37 | srcs = glob([ 38 | "meta/*.h", 39 | "public/*.h", 40 | "profiling/*.h", 41 | ]), 42 | visibility = ["//visibility:public"], 43 | ) 44 | 45 | filegroup( 46 | name = "gemmlowp_headers", 47 | srcs = [ 48 | ":gemmlowp_private_headers", 49 | ":gemmlowp_public_headers", 50 | ], 51 | visibility = ["//visibility:private"], 52 | ) 53 | 54 | filegroup( 55 | name = "eight_bit_int_gemm_headers", 56 | srcs = glob(["eight_bit_int_gemm/*.h"]), 57 | visibility = ["//visibility:private"], 58 | ) 59 | 60 | filegroup( 61 | name = "eight_bit_int_gemm_public_headers", 62 | srcs = [ 63 | ":eight_bit_int_gemm_headers", 64 | ":gemmlowp_public_headers", 65 | ], 66 | visibility = ["//visibility:public"], 67 | ) 68 | 69 | filegroup( 70 | name = "eight_bit_int_gemm_sources_with_no_headers", 71 | srcs = glob(["eight_bit_int_gemm/*.cc"]), 72 | visibility = ["//visibility:private"], 73 | ) 74 | 75 | filegroup( 76 | name = "eight_bit_int_gemm_sources", 77 | srcs = [ 78 | ":eight_bit_int_gemm_headers", 79 | ":eight_bit_int_gemm_sources_with_no_headers", 80 | ":gemmlowp_headers", 81 | ], 82 | visibility = ["//visibility:public"], 83 | ) 84 | 85 | filegroup( 86 | name = "gemmlowp_test_headers", 87 | srcs = [":gemmlowp_headers"] + glob(["test/*.h"]), 88 | visibility = ["//visibility:private"], 89 | ) 90 | 91 | filegroup( 92 | name = "fixedpoint_private_headers", 93 | srcs = glob([ 94 | "fixedpoint/*.h", 95 | ]) + [ 96 | "internal/common.h", 97 | "internal/detect_platform.h", 98 | ], 99 | visibility = ["//visibility:private"], 100 | ) 101 | 102 | cc_library( 103 | name = "fixedpoint", 104 | srcs = [ 105 | ":fixedpoint_private_headers", 106 | ], 107 | hdrs = [ 108 | "fixedpoint/fixedpoint.h", 109 | ], 110 | # Blaze warning: 111 | # "setting 'linkstatic=1' is recommended if there are no object files." 112 | linkstatic = 1, 113 | visibility = ["//visibility:public"], 114 | ) 115 | 116 | cc_library( 117 | name = "gemmlowp", 118 | hdrs = [":gemmlowp_headers"], 119 | linkopts = LIB_LINKOPTS, 120 | # Blaze warning: 121 | # "setting 'linkstatic=1' is recommended if there are no object files." 
122 | linkstatic = 1, 123 | visibility = ["//visibility:public"], 124 | deps = [":fixedpoint"], 125 | ) 126 | 127 | cc_library( 128 | name = "eight_bit_int_gemm", 129 | srcs = [":eight_bit_int_gemm_sources_with_no_headers"], 130 | hdrs = [ 131 | ":eight_bit_int_gemm_headers", 132 | ":gemmlowp_private_headers", 133 | ":gemmlowp_public_headers", 134 | ], 135 | copts = LIB_COPTS, 136 | linkopts = LIB_LINKOPTS, 137 | visibility = ["//visibility:public"], 138 | deps = [":gemmlowp"], 139 | ) 140 | 141 | cc_library( 142 | name = "profiler", 143 | hdrs = [ 144 | "profiling/instrumentation.h", 145 | "profiling/profiler.h", 146 | "profiling/pthread_everywhere.h", 147 | ], 148 | visibility = ["//visibility:public"], 149 | ) 150 | 151 | # The main gemmlowp unit test 152 | cc_test( 153 | name = "test", 154 | size = "medium", 155 | srcs = [ 156 | "test/test.cc", 157 | "test/test_data.cc", 158 | ":gemmlowp_test_headers", 159 | ], 160 | copts = ["-O3"], 161 | deps = [":eight_bit_int_gemm"], 162 | ) 163 | 164 | # Math helpers test 165 | cc_test( 166 | name = "test_math_helpers", 167 | size = "small", 168 | srcs = [ 169 | "test/test_math_helpers.cc", 170 | ":gemmlowp_test_headers", 171 | ], 172 | ) 173 | 174 | # BlockingCounter test 175 | cc_test( 176 | name = "test_blocking_counter", 177 | size = "medium", 178 | srcs = [ 179 | "test/test_blocking_counter.cc", 180 | ":gemmlowp_test_headers", 181 | ], 182 | linkopts = BIN_LINKOPTS, 183 | ) 184 | 185 | # Allocator test 186 | cc_test( 187 | name = "test_allocator", 188 | size = "small", 189 | srcs = [ 190 | "test/test_allocator.cc", 191 | ":gemmlowp_test_headers", 192 | ], 193 | ) 194 | 195 | # FixedPoint test 196 | cc_test( 197 | name = "test_fixedpoint", 198 | size = "small", 199 | srcs = [ 200 | "test/test_fixedpoint.cc", 201 | ":gemmlowp_test_headers", 202 | ], 203 | ) 204 | 205 | # Benchmark 206 | cc_binary( 207 | name = "benchmark", 208 | srcs = [ 209 | "test/benchmark.cc", 210 | ":gemmlowp_test_headers", 211 | ], 212 | copts = [ 213 | "-O3", 214 | "-DNDEBUG", 215 | ], 216 | linkopts = BIN_LINKOPTS, 217 | ) 218 | 219 | # Benchmark 220 | cc_binary( 221 | name = "benchmark_profile", 222 | srcs = [ 223 | "test/benchmark.cc", 224 | ":gemmlowp_test_headers", 225 | ], 226 | copts = [ 227 | "-O3", 228 | "-DNDEBUG", 229 | "-DGEMMLOWP_TEST_PROFILE", 230 | ], 231 | linkopts = BIN_LINKOPTS, 232 | ) 233 | -------------------------------------------------------------------------------- /CONTRIBUTING: -------------------------------------------------------------------------------- 1 | Want to contribute? Great! First, read this page (including the small print at the end). 2 | 3 | 4 | Before you contribute 5 | ===================== 6 | 7 | Before we can use your code, you must sign the Google Individual Contributor 8 | License Agreement (CLA), 9 | 10 | https://developers.google.com/open-source/cla/individual?csw=1 11 | 12 | which you can do online. The CLA is necessary mainly because you own the 13 | copyright to your changes, even after your contribution becomes part of our 14 | codebase, so we need your permission to use and distribute your code. We also 15 | need to be sure of various other things—for instance that you'll tell us if you 16 | know that your code infringes on other people's patents. You don't have to sign 17 | the CLA until after you've submitted your code for review and a member has 18 | approved it, but you must do it before we can put your code into our codebase. 
19 | Before you start working on a larger contribution, you should get in touch with 20 | us first through the issue tracker with your idea so that we can help out and 21 | possibly guide you. Coordinating up front makes it much easier to avoid 22 | frustration later on. 23 | 24 | 25 | Getting in touch with the gemmlowp community 26 | ============================================ 27 | 28 | The central point of communication around gemmlowp is the mailing list, 29 | https://groups.google.com/forum/#!forum/gemmlowp 30 | 31 | 32 | TODO items and projects 33 | ======================= 34 | 35 | We try to keep a current list of TODO items in the todo/ directory. 36 | Please feel free to pick one to work on, and to ask current maintainers for 37 | guidance. The gemmlowp mailing list is a good place for that. 38 | 39 | 40 | Code reviews 41 | ============ 42 | 43 | All submissions, including submissions by project members, require review. 44 | For this purpose, we use Github pull requests against this repository: 45 | 46 | https://github.com/google/gemmlowp 47 | 48 | 49 | The small print 50 | =============== 51 | 52 | Contributions made by corporations are covered by a different agreement than 53 | the one above, the Software Grant and Corporate Contributor License Agreement. 54 | -------------------------------------------------------------------------------- /CONTRIBUTORS: -------------------------------------------------------------------------------- 1 | # People who have agreed to one of the CLAs and can contribute patches. 2 | # The AUTHORS.txt file lists the copyright holders; this file 3 | # lists people. For example, Google employees are listed here 4 | # but not in AUTHORS.txt, because Google holds the copyright. 5 | # 6 | # https://developers.google.com/open-source/cla/individual 7 | # https://developers.google.com/open-source/cla/corporate 8 | # 9 | # Names should be added to this file as: 10 | # Name 11 | 12 | Google: 13 | Benoit Jacob 14 | Pete Warden 15 | Miao Wang 16 | David Andersen 17 | Maciek Chociej 18 | Justine Tunney 19 | Mark J. 
Matthews 20 | Marie White 21 | Suharsh Sivakumar 22 | 23 | Intel: 24 | Sagi Marcovich 25 | Murat Efe Guney 26 | Sarah Knepper 27 | Mourad Gouicem 28 | Richard Winterton 29 | 30 | ARM: 31 | David Mansell 32 | 33 | Silk Labs: 34 | Andreas Gal 35 | 36 | MIPS Tech LLC: 37 | Alexey Frunze 38 | 39 | Wave Computing Inc.: 40 | Alexey Frunze 41 | -------------------------------------------------------------------------------- /Makefile.travis: -------------------------------------------------------------------------------- 1 | UNITTESTS_COMMON=test.cc test_allocator.cc test_blocking_counter.cc test_fixedpoint.cc test_math_helpers.cc 2 | UNITTESTS_X86=$(UNITTESTS_COMMON) 3 | 4 | UNITTESTS_X86_BIN=$(addprefix ./test/, $(addsuffix .x86, $(basename $(UNITTESTS_X86)))) 5 | UNITTESTS_BIN=$(UNITTESTS_X86_BIN) 6 | 7 | VPATH=./test ./public 8 | 9 | space := 10 | space += 11 | join-with = $(subst $(space),$1,$(strip $2)) 12 | 13 | .PHONY: compile clean unittest 14 | 15 | CC_X86=clang++ 16 | CFLAGS_X86=-march=native -O3 -lpthread 17 | 18 | compile: $(UNITTESTS_BIN) 19 | 20 | clean: 21 | rm -f $(UNITTESTS_BIN) 22 | 23 | unittest: $(UNITTESTS_BIN) 24 | $(call join-with, && ,$(addprefix ./, $^)) 25 | 26 | %.x86: %.cc ./eight_bit_int_gemm/eight_bit_int_gemm.cc ./test/test_data.cc 27 | $(CC_X86) $(CFLAGS_X86) -std=c++11 -g -O3 -o $@ $^ 28 | -------------------------------------------------------------------------------- /WORKSPACE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/google/gemmlowp/16e8662c34917be0065110bfcd9cc27d30f52fdf/WORKSPACE -------------------------------------------------------------------------------- /contrib/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Gemmlowp CMake file written for Debian. 2 | # Copyright © 2016 Zhou Mo 3 | # Licence Apache-2.0 4 | 5 | cmake_minimum_required(VERSION 3.7) 6 | 7 | # Project 8 | project(gemmlowp C CXX) 9 | 10 | include(CTest) # option(BUILD_TESTING). ON by default. 
11 | include(GNUInstallDirs) 12 | 13 | # Set C++11 as default standard 14 | set(CMAKE_CXX_STANDARD 11) 15 | 16 | set(THREADS_PREFER_PTHREAD_FLAG ON) 17 | find_package(Threads REQUIRED) 18 | 19 | get_filename_component(gemmlowp_src ${gemmlowp_SOURCE_DIR} PATH) 20 | 21 | if(WIN32) 22 | # one can enable simd from the cmake command line, e.g. -DCMAKE_CXX_FLAGS="/arch:AVX2" 23 | add_definitions(-DNOMINMAX -DWIN64 -DWIN32_LEAN_AND_MEAN -DNOGDI) 24 | add_definitions(/bigobj /nologo /EHsc /GF /MP /Gm- /wd4800 /wd4805 /wd4244) 25 | if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") 26 | # if we compile for windows with clang, allow inline asm 27 | add_definitions(-DGEMMLOWP_ALLOW_INLINE_ASM) 28 | endif() 29 | else() 30 | set(EXTERNAL_LIBRARIES Threads::Threads) 31 | endif() 32 | 33 | # Glob header files 34 | file(GLOB gemmlowp_private_headers "${gemmlowp_src}/fixedpoint/*.h" "${gemmlowp_src}/internal/*.h") 35 | file(GLOB gemmlowp_public_headers "${gemmlowp_src}/meta/*.h" "${gemmlowp_src}/public/*.h" "${gemmlowp_src}/profiling/*.h") 36 | list(APPEND gemmlowp_headers ${gemmlowp_private_headers} ${gemmlowp_public_headers}) 37 | 38 | file(GLOB eight_bit_int_gemm_headers "${gemmlowp_src}/eight_bit_int_gemm/*.h") 39 | list(APPEND eight_bit_int_gemm_public_headers ${eight_bit_int_gemm_headers} ${gemmlowp_public_headers}) 40 | file(GLOB eight_bit_int_gemm_sources_with_no_headers "${gemmlowp_src}/eight_bit_int_gemm/*.cc") 41 | 42 | list(APPEND eight_bit_int_gemm_sources 43 | ${eight_bit_int_gemm_headers} 44 | ${eight_bit_int_gemm_sources_with_no_headers} 45 | ${gemmlowp_headers}) 46 | 47 | file(GLOB gemmlowp_test_headers "${gemmlowp_src}/test/*.h") 48 | list(APPEND gemmlowp_test_headers ${gemmlowp_headers}) 49 | 50 | file(GLOB fixedpoint_private_headers "${gemmlowp_src}/fixedpoint/*.h") 51 | list(APPEND fixedpoint_private_headers "${gemmlowp_src}/internal/common.h") 52 | 53 | add_library(eight_bit_int_gemm ${eight_bit_int_gemm_sources_with_no_headers}) 54 | set_target_properties(eight_bit_int_gemm PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS ON) 55 | target_link_libraries(eight_bit_int_gemm ${EXTERNAL_LIBRARIES}) 56 | 57 | # INTERFACE target to help header include 58 | add_library(gemmlowp INTERFACE) 59 | target_include_directories(gemmlowp INTERFACE 60 | $<BUILD_INTERFACE:${gemmlowp_src}> 61 | $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>) 62 | target_link_libraries(gemmlowp INTERFACE eight_bit_int_gemm) 63 | 64 | install(FILES ${eight_bit_int_gemm_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/eight_bit_int_gemm) 65 | file(GLOB meta_headers "${gemmlowp_src}/meta/*.h") 66 | install(FILES ${meta_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/meta) 67 | file(GLOB public_headers "${gemmlowp_src}/public/*.h") 68 | install(FILES ${public_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/public) 69 | file(GLOB profile_headers "${gemmlowp_src}/profiling/*.h") 70 | install(FILES ${profile_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/profiling) 71 | file(GLOB internal_headers "${gemmlowp_src}/internal/*.h") 72 | install(FILES ${internal_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/internal) 73 | file(GLOB fixedpoint_headers "${gemmlowp_src}/fixedpoint/*.h") 74 | install(FILES ${fixedpoint_headers} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gemmlowp/fixedpoint) 75 | 76 | install(TARGETS gemmlowp eight_bit_int_gemm 77 | EXPORT gemmlowp-config # support find_package(gemmlowp CONFIG) 78 | RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} 79 | LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} 80 | ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) 81 | 82 | install(EXPORT
gemmlowp-config # export gemmlowp::gemmlowp 83 | NAMESPACE gemmlowp:: # gemmlowp::eight_bit_int_gemm 84 | DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/gemmlowp) 85 | 86 | if(BUILD_TESTING) 87 | # Benchmarks 88 | add_executable(benchmark 89 | "${gemmlowp_src}/test/benchmark.cc" ${gemmlowp_test_headers}) 90 | target_link_libraries(benchmark ${EXTERNAL_LIBRARIES}) 91 | 92 | add_executable(benchmark_all_sizes 93 | "${gemmlowp_src}/test/benchmark_all_sizes.cc" ${gemmlowp_test_headers}) 94 | target_compile_options(benchmark_all_sizes PRIVATE -DBENCHMARK_8bit -DBENCHMARK_QUICK) 95 | target_link_libraries(benchmark_all_sizes ${EXTERNAL_LIBRARIES}) 96 | 97 | # Gemmlowp test 98 | add_executable(test_gemmlowp 99 | "${gemmlowp_src}/test/test.cc" "${gemmlowp_src}/test/test_data.cc" ${gemmlowp_test_headers}) 100 | target_link_libraries(test_gemmlowp eight_bit_int_gemm) 101 | 102 | # Math helpers test 103 | add_executable(test_math_helpers 104 | "${gemmlowp_src}/test/test_math_helpers.cc" ${gemmlowp_test_headers}) 105 | 106 | # BlockingCounter test 107 | add_executable(test_blocking_counter 108 | "${gemmlowp_src}/test/test_blocking_counter.cc" ${gemmlowp_test_headers}) 109 | target_link_libraries(test_blocking_counter ${EXTERNAL_LIBRARIES}) 110 | 111 | # Allocator test 112 | add_executable(test_allocator 113 | "${gemmlowp_src}/test/test_allocator.cc" ${gemmlowp_test_headers}) 114 | 115 | # FixedPoint test 116 | add_executable(test_fixedpoint 117 | "${gemmlowp_src}/test/test_fixedpoint.cc" ${gemmlowp_test_headers}) 118 | 119 | # Add tests 120 | enable_testing() 121 | foreach(testname "test_math_helpers" "test_blocking_counter" "test_allocator" "test_fixedpoint" "test_gemmlowp") 122 | add_test(NAME ${testname} COMMAND "${testname}") 123 | endforeach(testname) 124 | endif() 125 | -------------------------------------------------------------------------------- /doc/design.md: -------------------------------------------------------------------------------- 1 | # Overview of gemmlowp design 2 | 3 | ## Primer on GEMM, kernels, and cache friendliness 4 | 5 | gemmlowp, like most GEMMs, implements the straightforward matrix multiplication 6 | algorithm, which takes n^3 multiply-accumulate instructions for n*n sized 7 | matrices. Because the arithmetic complexity grows quicker than the memory 8 | complexity (n^3 vs. n^2), memory accesses are redundant (each matrix entry is 9 | accessed n times). A large part of a GEMM's performance and design goes toward 10 | minimizing the inefficiency resulting from these redundant memory accesses. 11 | 12 | Ultimately, once values are loaded into CPU registers, they cost nothing to 13 | access, so as long as we can work within registers, this problem doesn't exist. 14 | Thus, in order to be efficient, a GEMM's inner loops must wisely use the 15 | available registers to do as much arithmetic work as possible before loading 16 | more data from memory into registers. This means that a GEMM implementation 17 | needs to have architecture-specific inner loops tailored for architecture 18 | details such as the number of registers, and typically written in assembly. This 19 | 'inner loops' architecture-specific component is referred to as the GEMM kernel. 20 | (More details about kernels are in [kernel.md](kernel.md)). 21 | 22 | However, only small blocks can fit at a given time in registers, so at larger 23 | scales one needs to repeatedly load blocks of matrices from memory, and these 24 | accesses are redundant for the reason outlined above. 
The way that one minimizes 25 | the resulting inefficiency is by organizing for cache locality, so that most of 26 | these accesses hit the L1 cache, and most of the remaining ones hit the L2 27 | cache, etc. 28 | 29 | This is achieved by subdividing the matrices into blocks sized to fit in L2 30 | cache, and subdividing these blocks into sub-blocks sized to fit in L1 cache, 31 | and performing the matrix multiplication one such block at a time. 32 | 33 | In practice, it tends to pay off to "pack" input blocks for optimally efficient 34 | traversal by the kernel, since they will be traversed multiple times. "packing" 35 | means at least reordering the data layout for 1) simple access patterns that fit 36 | the CPU's cache behavior (in particular, the cache line size), and 2) simple 37 | loading into SIMD vector registers by the kernel. 38 | 39 | So a typical GEMM, in pseudo-code, tends to look like this: 40 | 41 | ``` 42 | allocate(some_lhs_L2_block); 43 | allocate(some_rhs_L2_block); 44 | for (some_lhs_L2_block) { 45 | pack(some_lhs_L2_block); 46 | for (some_rhs_L2_block) { 47 | pack(some_rhs_L2_block); 48 | for (some_lhs_sub_block in some_lhs_L2_block) { 49 | for (some_rhs_sub_block in some_rhs_L2_block) { 50 | kernel(some_lhs_sub_block, some_rhs_sub_block); 51 | } 52 | } 53 | } 54 | } 55 | ``` 56 | 57 | ## Impact of low-precision computation on gemmlowp design 58 | 59 | Refer to [low-precision.md](low-precision.md) for specifics of the 60 | low-precision-computation paradigm and how it's implemented in gemmlowp. 61 | 62 | Inputs and outputs are matrices of uint8 values, but internally we are 63 | accumulating int32 values, only converting them back to uint8 at the end. This 64 | means that we need to store a block of int32 accumulators at a time. We compute 65 | a block of the result in int32 accumulators and then we "unpack" it into the 66 | destination matrix at once. In this way, we minimize the amount of memory used 67 | to store int32 values at a given time. 68 | 69 | Because of that, besides the "pack" and "kernel" stages outlined above, a third 70 | stage is needed in gemmlowp, which we call "unpack". Thus we arrive at the 71 | 3-stage computation scheme that gemmlowp uses: 72 | 73 | 1. Pack lhs/rhs blocks from the input matrices. 74 | 2. Compute the product of the packed blocks, using the kernel. 75 | 3. Unpack the result block into the output matrix.
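To make the "unpack" stage concrete, here is a minimal sketch of the per-entry arithmetic it performs under the legacy quantization scheme, using the `result_offset`, `result_mult_int` and `result_shift` parameters that appear in the code excerpt further below. This is only an illustration, not gemmlowp's actual implementation, which is generic over output pipelines; see [quantization.md](quantization.md) for the full treatment:

```
#include <algorithm>
#include <cstdint>

// Sketch: requantize one int32 accumulator down to a uint8 destination value.
std::uint8_t UnpackOneEntry(std::int32_t accum, std::int32_t result_offset,
                            std::int32_t result_mult_int,
                            std::int32_t result_shift) {
  // Round to nearest on the right shift.
  const std::int32_t rounding =
      (result_shift < 1) ? 0 : (1 << (result_shift - 1));
  const std::int32_t value =
      ((accum + result_offset) * result_mult_int + rounding) >> result_shift;
  // Saturate to the representable uint8 range.
  return static_cast<std::uint8_t>(std::min(255, std::max(0, value)));
}
```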
76 | 77 | The pseudo-code overview of gemmlowp now looks like: 78 | 79 | ``` 80 | allocate(some_lhs_L2_block); 81 | allocate(some_rhs_L2_block); 82 | // new: temp storage for int32 accums 83 | allocate(some_int32_accumulators_block); 84 | for (some_lhs_L2_block) { 85 | pack(some_lhs_L2_block); 86 | for (some_rhs_L2_block) { 87 | pack(some_rhs_L2_block); 88 | for (some_lhs_sub_block in some_lhs_L2_block) { 89 | for (some_rhs_sub_block in some_rhs_L2_block) { 90 | // new: pass int32 accums to kernel 91 | kernel(&some_int32_accumulators_block, 92 | some_lhs_sub_block, 93 | some_rhs_sub_block); 94 | } 95 | } 96 | // new: unpack int32 accums into destination matrix 97 | unpack(some_int32_accumulators_block); 98 | } 99 | } 100 | ``` 101 | 102 | ## Exploring gemmlowp code 103 | 104 | The design outlined above can be readily matched to gemmlowp source code, in 105 | particular in this file, which gives a simple GEMM implementation fitting in one 106 | rather small function: 107 | 108 | ``` 109 | internal/single_thread_gemm.h 110 | ``` 111 | 112 | The reader can compare the above pseudo-code to the actual code in this file: 113 | 114 | ``` 115 | for (int r = 0; r < rows; r += block_params.l2_rows) { 116 | int rs = std::min(block_params.l2_rows, rows - r); 117 | 118 | PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth)); 119 | 120 | for (int c = 0; c < cols; c += block_params.l2_cols) { 121 | int cs = std::min(block_params.l2_cols, cols - c); 122 | 123 | if (!pack_rhs_once) { 124 | PackRhs(&packed_rhs, rhs.block(0, c, depth, cs)); 125 | } 126 | 127 | Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs); 128 | 129 | auto result_block = result->block(r, c, rs, cs); 130 | UnpackResult(&result_block, packed_result, packed_lhs, packed_rhs, depth, 131 | result_offset, result_mult_int, result_shift); 132 | } 133 | } 134 | ``` 135 | 136 | The files in `internal/` fall into a few categories: 137 | 138 | There are two top-level GEMM implementations, 139 | 140 | * [internal/single_thread_gemm.h](../internal/single_thread_gemm.h) 141 | * [internal/multi_thread_gemm.h](../internal/multi_thread_gemm.h) 142 | 143 | They both call into pack/compute/unpack stages (see [kernel.md](kernel.md) and 144 | [packing.md](packing.md)) implemented in the following files: 145 | 146 | * [internal/pack.h](../internal/pack.h) 147 | * [internal/compute.h](../internal/compute.h) 148 | * [internal/unpack.h](../internal/unpack.h) 149 | * This in turn calls into [internal/output.h](../internal/output.h) for 150 | the output pipeline (see [output.md](output.md)) 151 | 152 | The pack.h and unpack.h files contain generic templated code that can be 153 | overridden by optimized code in template specializations; for example, see the 154 | NEON optimized code here: 155 | 156 | * [internal/pack_neon.h](../internal/pack_neon.h) 157 | * [internal/unpack_neon.h](../internal/unpack_neon.h) 158 | * This in turn calls into 159 | [internal/output_neon.h](../internal/output_neon.h) 160 | 161 | The compute stage contains generic code in compute.h that only calls into 162 | optimized code through the Kernel::Run() entry point. 
Each kernel is basically 163 | just a struct offering a Run() implementation; see the NEON kernels in: 164 | 165 | * [internal/kernel_neon.h](../internal/kernel_neon.h) 166 | -------------------------------------------------------------------------------- /doc/output.md: -------------------------------------------------------------------------------- 1 | # Output pipelines in gemmlowp 2 | 3 | In gemmlowp, the "output pipeline" is the process that takes a final `int32` 4 | accumulator value (the output of the compute/kernel stage), and processes it to 5 | obtain the final value (typically a `uint8` value) and write it to the 6 | destination matrix. 7 | 8 | Gemmlowp has some genericity in what arithmetic transformations take place in 9 | the output pipeline, so as to allow different users to implement different 10 | quantization paradigms. See [low-precision.md](low-precision.md) and 11 | [quantization.md](quantization.md). 12 | 13 | Besides implementing a quantization paradigm, the other thing that output 14 | pipelines are good for is implementing fused operations, where a matrix 15 | multiplication feeds into other operations applied to its result, without 16 | additional array traversals. For instance, when implementing neural network 17 | inference, one might have a Convolutional layer with a bias-addition and an 18 | activation. One then wants to feed the result of the matrix multiplication 19 | implementing the Convolutional operator itself directly into the bias-addition 20 | and activation function. gemmlowp's output pipelines allow implementing that: 21 | the bias-addition and activation function are just additional stages in the 22 | output pipeline. 23 | 24 | ## Usage 25 | 26 | The gemmlowp entry point that allows using an arbitrary output pipeline is 27 | `GemmWithOutputPipeline` in [public/gemmlowp.h](../public/gemmlowp.h). 28 | 29 | The output pipeline is specified as a `std::tuple` of "output stages", each of 30 | which defines an elementary arithmetic transformation. 31 | 32 | All available output stages are defined in 33 | [public/output_stages.h](../public/output_stages.h). 34 | 35 | ## Example usage 36 | 37 | The best place to see examples of using various output pipelines is in the unit 38 | test, 39 | 40 | ``` 41 | test/test.cc 42 | ``` 43 | 44 | specifically in this function: 45 | 46 | ``` 47 | TestOutputStages 48 | ``` 49 | 50 | Separately, a self-contained example showing how to use gemmlowp to compute a 51 | quantized matrix multiplication with a sound quantization paradigm is here: 52 | 53 | [doc/quantization_example.cc](quantization_example.cc) 54 | -------------------------------------------------------------------------------- /eight_bit_int_gemm/eight_bit_int_gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 Google Inc. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // eight_bit_int_gemm.h: exposes the standard EightBitIntGemm interface. 16 | 17 | #ifndef GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ 18 | #define GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ 19 | 20 | #ifndef GEMMLOWP_USE_STLPORT 21 | #include <cstdint> 22 | #else 23 | #include <stdint.h> 24 | namespace std { 25 | using ::uint8_t; 26 | using ::int32_t; 27 | } 28 | #endif 29 | 30 | namespace gemmlowp { 31 | 32 | namespace eight_bit_int_gemm { 33 | 34 | // Concurrency / reentrancy notice 35 | // =============================== 36 | // 37 | // This eight_bit_int_gemm has global singleton persistent state. 38 | // A global lock ensures serialization of calls, so this library 39 | // is fully reentrant but only one calling thread gets to actually run 40 | // at a time, while other calling threads would wait. So it is safe 41 | // albeit potentially slow to call the functions exposed here on 42 | // multiple threads concurrently. 43 | // 44 | // Users who prefer a state-less, singleton-less interface 45 | // should use the main gemmlowp interface (public/gemmlowp.h) instead. 46 | 47 | // The BitDepthSetting enum lists supported a/b bit-depth combinations. 48 | enum class BitDepthSetting { 49 | A8B8, // 8-bit a, 8-bit b 50 | A5B7 // 5-bit a, 7-bit b 51 | }; 52 | 53 | // The main entry point to compute a Gemm. This is the standard 54 | // EightBitIntGemm interface. 55 | void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, 56 | int m, int n, int k, const std::uint8_t *a, 57 | std::int32_t a_offset, int lda, const std::uint8_t *b, 58 | std::int32_t b_offset, int ldb, std::uint8_t *c, 59 | std::int32_t c_offset, std::int32_t c_mult_int, 60 | std::int32_t c_shift, int ldc, BitDepthSetting bit_depth); 61 | 62 | void EightBitIntGemm(bool transpose_a, bool transpose_b, bool transpose_c, 63 | int m, int n, int k, const std::uint8_t *a, 64 | std::int32_t a_offset, int lda, const std::uint8_t *b, 65 | std::int32_t b_offset, int ldb, float *c, float c_offset, 66 | int ldc, BitDepthSetting bit_depth); 67 | 68 | // Frees any persistent resources 69 | // (threads, thread pools, allocators, buffers, ...) 70 | // that gemmlowp might hold. This is called automatically 71 | // on thread exit, but one may also call it earlier, at any time. 72 | void FreePersistentResources(); 73 | 74 | // Allows specifying the number of hardware threads, as a hint as to 75 | // how many worker threads to use for sufficiently large Gemm's. 76 | // We will never use more threads than that, but may use fewer, 77 | // for instance on Gemm's that are too small to benefit from all 78 | // available threads. The value 0 lets the implementation query 79 | // the system to determine the number of hardware threads. 80 | // Default value: 0. 81 | void SetMaxNumThreads(int n); 82 | 83 | } // namespace eight_bit_int_gemm 84 | 85 | } // namespace gemmlowp 86 | 87 | #endif // GEMMLOWP_EIGHT_BIT_INT_GEMM_EIGHT_BIT_INT_GEMM_H_ 88 | -------------------------------------------------------------------------------- /flags.bzl: -------------------------------------------------------------------------------- 1 | # Android builds do not need to link in a separate pthread library.
2 | LIB_COPTS = [] 3 | 4 | LIB_LINKOPTS = select({ 5 | ":android": [], 6 | ":windows": [], 7 | "//conditions:default": ["-lpthread"], 8 | }) 9 | 10 | BIN_LINKOPTS = LIB_LINKOPTS 11 | 12 | -------------------------------------------------------------------------------- /internal/allocator.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // allocator.h: a buffer allocator that allows avoiding most of the 16 | // malloc/free overhead, by: 17 | // 1. Requiring all N allocations to be reserved in advance, and 18 | // then committed at once, turning N allocations into 1. 19 | // 2. Being persistent, the allocated storage is reused across commits, 20 | // and only reallocated as needed when the commit size gets larger. 21 | // 22 | // This is driven by Android-specific needs: 23 | // 1. On Android, the default (Bionic) allocator tends to aggressively 24 | // unmap pages, which means that malloc/free can be surprisingly expensive. 25 | // 2. On Android, stack allocations with alloca() can't be as large as on 26 | // desktop platforms. 27 | // 28 | // General usage: 29 | // 1. Reserve blocks by calling Reserve(), which returns a Handle. 30 | // 2. Call Commit() once. 31 | // 3. Now it is possible to get pointers to allocated buffers by calling 32 | // GetPointer(). 33 | // 4. Call Decommit() once. 34 | // 5. The allocator is now reverted to its original state, except that 35 | // it retained its allocated storage, so the next Commit() will be faster. 36 | // The allocated storage is only freed when the Allocator object is 37 | // destroyed.
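//
// For illustration, a sketch of the usage pattern described above (the
// buffer names here are hypothetical, not taken from an actual call site):
//
//   Allocator allocator;
//   Allocator::Handle lhs = allocator.Reserve<std::uint8_t>(packed_lhs_size);
//   Allocator::Handle acc = allocator.Reserve<std::int32_t>(accum_count);
//   allocator.Commit();  // one actual allocation covering both blocks
//   std::uint8_t* lhs_buf = allocator.GetPointer<std::uint8_t>(lhs);
//   std::int32_t* acc_buf = allocator.GetPointer<std::int32_t>(acc);
//   // ... use the buffers ...
//   allocator.Decommit();  // storage is retained for the next Commit()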
38 | 39 | #ifndef GEMMLOWP_INTERNAL_ALLOCATOR_H_ 40 | #define GEMMLOWP_INTERNAL_ALLOCATOR_H_ 41 | 42 | #include "common.h" 43 | 44 | namespace gemmlowp { 45 | 46 | enum class TypeId : std::uint8_t { Uint8, Int8, Uint16, Int16, Uint32, Int32 }; 47 | 48 | template <typename T> 49 | struct GetTypeIdImpl {}; 50 | 51 | template <typename T> 52 | inline TypeId GetTypeId() { 53 | return GetTypeIdImpl<T>::Value; 54 | } 55 | 56 | template <typename T> 57 | struct GetTypeIdImpl<const T> : GetTypeIdImpl<T> {}; 58 | 59 | #define GEMMLOWP_REGISTER_TYPEID(type_, id) \ 60 | template <> \ 61 | struct GetTypeIdImpl<type_> { \ 62 | static const TypeId Value = TypeId::id; \ 63 | }; 64 | 65 | GEMMLOWP_REGISTER_TYPEID(std::uint8_t, Uint8) 66 | GEMMLOWP_REGISTER_TYPEID(std::int8_t, Int8) 67 | GEMMLOWP_REGISTER_TYPEID(std::uint16_t, Uint16) 68 | GEMMLOWP_REGISTER_TYPEID(std::int16_t, Int16) 69 | GEMMLOWP_REGISTER_TYPEID(std::uint32_t, Uint32) 70 | GEMMLOWP_REGISTER_TYPEID(std::int32_t, Int32) 71 | 72 | class Allocator { 73 | public: 74 | Allocator() 75 | : committed_(false), 76 | storage_size_(0), 77 | storage_(nullptr), 78 | reserved_blocks_(0), 79 | reserved_bytes_(0), 80 | generation_(0) {} 81 | 82 | ~Allocator() { 83 | assert(!committed_); 84 | assert(!reserved_blocks_); 85 | DeallocateStorage(); 86 | } 87 | 88 | // Alignment of allocated blocks. 89 | static constexpr std::size_t kAlignment = kDefaultCacheLineSize; 90 | 91 | // This is all we need so far, and since the usage pattern is fixed, 92 | // there is no point in allowing more until we need to. 93 | static constexpr std::size_t kMaxBlocks = 5; 94 | 95 | void Commit() { 96 | assert(!committed_); 97 | 98 | if (reserved_bytes_ > storage_size_) { 99 | DeallocateStorage(); 100 | storage_size_ = RoundUpToPowerOfTwo(reserved_bytes_); 101 | storage_ = aligned_alloc(kAlignment, storage_size_); 102 | } 103 | 104 | ReleaseBuildAssertion(!storage_size_ || storage_, "allocation failure"); 105 | committed_ = true; 106 | } 107 | 108 | void Decommit() { 109 | assert(committed_); 110 | committed_ = false; 111 | generation_++; 112 | 113 | reserved_blocks_ = 0; 114 | reserved_bytes_ = 0; 115 | } 116 | 117 | // See generation_ 118 | typedef std::size_t generation_t; 119 | 120 | // A handle on a reserved block. The user obtains 121 | // one by calling Reserve() and, after committing, 122 | // passes it to GetPointer(). 123 | class Handle { 124 | std::uint8_t index_; 125 | generation_t generation_; 126 | TypeId type_; 127 | 128 | friend class Allocator; 129 | }; 130 | 131 | // Reserves a block sized for n elements of type T, and 132 | // returns a handle to it. Must be called before committing. 133 | template <typename T> 134 | Handle Reserve(std::size_t n) { 135 | assert(!committed_ && "can't reserve blocks while committed"); 136 | assert(reserved_blocks_ < kMaxBlocks && 137 | "didn't expect to allocate this many blocks"); 138 | const std::size_t bytes = RoundUp<kAlignment>(n * sizeof(T)); 139 | const std::size_t offset = reserved_bytes_; 140 | const std::size_t index = reserved_blocks_; 141 | 142 | reserved_blocks_offsets_[index] = offset; 143 | Handle h; 144 | h.index_ = index; 145 | h.generation_ = generation_; 146 | h.type_ = GetTypeId<T>(); 147 | 148 | reserved_blocks_++; 149 | reserved_bytes_ += bytes; 150 | 151 | return h; 152 | } 153 | 154 | // Returns the pointer to the allocated buffer for the given handle. 155 | // Must be called after committing.
156 | template <typename T> 157 | T* GetPointer(const Handle& h) const { 158 | assert(committed_ && "can't get block pointers unless committed"); 159 | assert(h.index_ < reserved_blocks_ && 160 | "bad handle, points to nonexistent block"); 161 | assert(h.generation_ == generation_ && 162 | "handle from earlier generation, have decommitted since"); 163 | assert(h.type_ == GetTypeId<T>() && "type mismatch"); 164 | std::size_t offset = reserved_blocks_offsets_[h.index_]; 165 | std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(storage_) + offset; 166 | return reinterpret_cast<T*>(addr); 167 | } 168 | 169 | private: 170 | void DeallocateStorage() { 171 | assert(!committed_); 172 | aligned_free(storage_); 173 | storage_size_ = 0; 174 | } 175 | 176 | // Set to true by Commit() and to false by Decommit(). Initially false. 177 | bool committed_; 178 | 179 | // The actually allocated storage size and buffer pointer. 180 | std::size_t storage_size_; 181 | mutable void* storage_; 182 | 183 | // The number of blocks that have been reserved by Reserve(). 184 | std::size_t reserved_blocks_; 185 | // The number of bytes that have been reserved by Reserve(). 186 | std::size_t reserved_bytes_; 187 | // The offsets of reserved blocks into the storage buffer. 188 | std::size_t reserved_blocks_offsets_[kMaxBlocks]; 189 | 190 | // The 'generation' is incremented on Decommit() and allows catching 191 | // bad GetPointer() calls still referring to a previous commit. 192 | generation_t generation_; 193 | }; 194 | 195 | } // namespace gemmlowp 196 | 197 | #endif // GEMMLOWP_INTERNAL_ALLOCATOR_H_ 198 | -------------------------------------------------------------------------------- /internal/compute.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // compute.h: the central stage of the Gemm computation, operates 16 | // on already-packed LHS and RHS blocks and calls the Gemm kernel 17 | // to compute a block of the product.
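//
// As a reading aid, the loop structure implemented below is, in outline
// (using the BlockParams field names from block_params.h):
//
//   for (d over l2_depth, in steps of l1_depth)            // Compute()
//     for (r over l2_rows, in steps of l1_rows)            // Compute()
//       for (c over l2_cols, in steps of Format::kCols)    // ComputeL1()
//         for (r2 over l1 rows, in steps of Format::kRows) // ComputeL1()
//           kernel.Run(...)                                // ComputeRun()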
18 | 19 | #ifndef GEMMLOWP_INTERNAL_COMPUTE_H_ 20 | #define GEMMLOWP_INTERNAL_COMPUTE_H_ 21 | 22 | #include "block_params.h" 23 | #include "kernel.h" 24 | #include "pack.h" 25 | 26 | namespace gemmlowp { 27 | 28 | template <typename PackedLhs, typename PackedRhs, typename PackedResult> 29 | class ComputeImpl { 30 | typedef typename PackedLhs::KernelSideFormat KernelLhsFormat; 31 | typedef typename PackedRhs::KernelSideFormat KernelRhsFormat; 32 | typedef KernelFormat<KernelLhsFormat, KernelRhsFormat> Format; 33 | 34 | const KernelBase& kernel_; 35 | const BlockParams& block_params_; 36 | 37 | PackedResult* const packed_result_; 38 | const PackedLhs& packed_lhs_; 39 | const PackedRhs& packed_rhs_; 40 | 41 | public: 42 | ComputeImpl(const KernelBase& _kernel, const BlockParams& _block_params, 43 | PackedResult* _packed_result, const PackedLhs& _packed_lhs, 44 | const PackedRhs& _packed_rhs) 45 | : kernel_(_kernel), 46 | block_params_(_block_params), 47 | packed_result_(_packed_result), 48 | packed_lhs_(_packed_lhs), 49 | packed_rhs_(_packed_rhs) {} 50 | 51 | void Compute(int depth) { 52 | depth = RoundUp<Format::kDepth>(depth); 53 | assert(depth <= block_params_.l2_depth); 54 | for (int d = 0; d < depth; d += block_params_.l1_depth) { 55 | int ds = std::min(block_params_.l1_depth, depth - d); 56 | 57 | for (int r = 0; r < block_params_.l2_rows; r += block_params_.l1_rows) { 58 | int rs = std::min(block_params_.l1_rows, block_params_.l2_rows - r); 59 | 60 | ComputeL1(r, rs, 0, block_params_.l2_cols, d, ds); 61 | } 62 | } 63 | } 64 | 65 | private: 66 | static void MarkPackedResultBlockAsInitialized( 67 | const MatrixMap<std::int32_t, MapOrder::ColMajor>& packed_result_block) { 68 | #ifdef GEMMLOWP_MARK_MEMORY_AS_INITIALIZED 69 | for (int col = 0; col < packed_result_block.cols(); col++) { 70 | MarkMemoryAsInitialized( 71 | packed_result_block.data() + col * packed_result_block.cols_stride(), 72 | packed_result_block.rows()); 73 | } 74 | #else 75 | (void)packed_result_block; 76 | #endif 77 | } 78 | 79 | void ComputeRun(int start_row, int start_col, int start_depth, 80 | int depth) GEMMLOWP_NOINLINE { 81 | packed_lhs_.seek_run(start_row, start_depth); 82 | packed_rhs_.seek_run(start_col, start_depth); 83 | auto packed_result_block = packed_result_->Map().block( 84 | start_row, start_col, Format::kRows, Format::kCols); 85 | kernel_.Run(packed_result_block.data(), packed_result_block.rows_stride(), 86 | packed_result_block.cols_stride(), packed_lhs_.current_data(), 87 | packed_rhs_.current_data(), start_depth, depth); 88 | MarkPackedResultBlockAsInitialized(packed_result_block); 89 | } 90 | 91 | void ComputeL1(int start_row, int rows, int start_col, int cols, 92 | int start_depth, int depth) { 93 | assert(rows % Format::kRows == 0); 94 | assert(cols % Format::kCols == 0); 95 | assert(depth % Format::kDepth == 0); 96 | 97 | for (int c = 0; c < cols; c += Format::kCols) { 98 | for (int r = 0; r < rows; r += Format::kRows) { 99 | ComputeRun(start_row + r, start_col + c, start_depth, depth); 100 | } 101 | } 102 | } 103 | }; 104 | 105 | template <typename PackedLhs, typename PackedRhs, typename PackedResult> 106 | void Compute(const KernelBase& kernel, const BlockParams& block_params, 107 | PackedResult* packed_result, const PackedLhs& packed_lhs, 108 | const PackedRhs& packed_rhs, int depth) { 109 | ScopedProfilingLabel label("compute"); 110 | ComputeImpl<PackedLhs, PackedRhs, PackedResult> impl( 111 | kernel, block_params, packed_result, packed_lhs, packed_rhs); 112 | 113 | impl.Compute(depth); 114 | } 115 | 116 | } // namespace gemmlowp 117 | 118 | #endif // GEMMLOWP_INTERNAL_COMPUTE_H_ 119 | -------------------------------------------------------------------------------- /internal/detect_platform.h:
-------------------------------------------------------------------------------- 1 | // Copyright 2018 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // detect_platform.h: Sets up macros that control architecture-specific 16 | // features of gemmlowp's implementation. 17 | 18 | #ifndef GEMMLOWP_INTERNAL_DETECT_PLATFORM_H_ 19 | #define GEMMLOWP_INTERNAL_DETECT_PLATFORM_H_ 20 | 21 | // Our inline assembly paths assume GCC/Clang syntax. 22 | // Native Client doesn't seem to support inline assembly(?). 23 | #if (defined(__GNUC__) || defined(__clang__)) && !defined(__native_client__) 24 | #define GEMMLOWP_ALLOW_INLINE_ASM 25 | #endif 26 | 27 | // Define macro statement that avoids inlining for GCC. 28 | // For non-GCC, define as empty macro. 29 | #if defined(__GNUC__) 30 | #define GEMMLOWP_NOINLINE __attribute__((noinline)) 31 | #else 32 | #define GEMMLOWP_NOINLINE 33 | #endif 34 | 35 | // Detect ARM, 32-bit or 64-bit 36 | #ifdef __arm__ 37 | #define GEMMLOWP_ARM_32 38 | #endif 39 | 40 | #ifdef __aarch64__ 41 | #define GEMMLOWP_ARM_64 42 | #endif 43 | 44 | #if defined(GEMMLOWP_ARM_32) || defined(GEMMLOWP_ARM_64) 45 | #define GEMMLOWP_ARM 46 | #endif 47 | 48 | // Detect MIPS, 32-bit or 64-bit 49 | #if defined(__mips) && !defined(__LP64__) 50 | #define GEMMLOWP_MIPS_32 51 | #endif 52 | 53 | #if defined(__mips) && defined(__LP64__) 54 | #define GEMMLOWP_MIPS_64 55 | #endif 56 | 57 | #if defined(GEMMLOWP_MIPS_32) || defined(GEMMLOWP_MIPS_64) 58 | #define GEMMLOWP_MIPS 59 | #endif 60 | 61 | // Detect x86, 32-bit or 64-bit 62 | #if defined(__i386__) || defined(_M_IX86) || defined(_X86_) || defined(__i386) 63 | #define GEMMLOWP_X86_32 64 | #endif 65 | 66 | #if defined(__x86_64__) || defined(_M_X64) || defined(__amd64) 67 | #define GEMMLOWP_X86_64 68 | #endif 69 | 70 | #if defined(GEMMLOWP_X86_32) || defined(GEMMLOWP_X86_64) 71 | #define GEMMLOWP_X86 72 | #endif 73 | 74 | // Detect WebAssembly SIMD. 75 | #if defined(__wasm_simd128__) 76 | #define GEMMLOWP_WASMSIMD 77 | #endif 78 | 79 | // Some of our optimized paths use inline assembly and for 80 | // now we don't bother enabling some other optimized paths using intrinsics 81 | // where we can't use inline assembly paths. 82 | #ifdef GEMMLOWP_ALLOW_INLINE_ASM 83 | 84 | // Detect NEON. It's important to check for both tokens. 85 | #if (defined __ARM_NEON) || (defined __ARM_NEON__) 86 | #define GEMMLOWP_NEON 87 | #endif 88 | 89 | // Convenience NEON tokens for 32-bit or 64-bit 90 | #if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_32) 91 | #define GEMMLOWP_NEON_32 92 | #endif 93 | 94 | #if defined(GEMMLOWP_NEON) && defined(GEMMLOWP_ARM_64) 95 | #define GEMMLOWP_NEON_64 96 | #endif 97 | 98 | // Detect MIPS MSA. 99 | // Limit MSA optimizations to little-endian CPUs for now. 100 | // TODO: Perhaps eventually support MSA optimizations on big-endian CPUs?
101 | #if defined(GEMMLOWP_MIPS) && (__mips_isa_rev >= 5) && defined(__mips_msa) && \ 102 | defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) 103 | #define GEMMLOWP_MSA 104 | #endif 105 | 106 | // Convenience MIPS MSA tokens for 32-bit or 64-bit. 107 | #if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_32) 108 | #define GEMMLOWP_MSA_32 109 | #endif 110 | 111 | #if defined(GEMMLOWP_MSA) && defined(GEMMLOWP_MIPS_64) 112 | #define GEMMLOWP_MSA_64 113 | #endif 114 | 115 | // compiler define for AVX2 -D GEMMLOWP_ENABLE_AVX2 116 | // Detect AVX2 117 | #if defined(__AVX2__) && defined(GEMMLOWP_ENABLE_AVX2) 118 | #define GEMMLOWP_AVX2 119 | // Detect SSE4. 120 | // MSVC does not have __SSE4_1__ macro, but will enable SSE4 121 | // when AVX is turned on. 122 | #elif defined(__SSE4_1__) || (defined(_MSC_VER) && defined(__AVX__)) 123 | #define GEMMLOWP_SSE4 124 | // Detect SSE3. 125 | #elif defined(__SSE3__) 126 | #define GEMMLOWP_SSE3 127 | #endif 128 | 129 | // Convenience SSE4 tokens for 32-bit or 64-bit 130 | #if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_32) && \ 131 | !defined(GEMMLOWP_DISABLE_SSE4) 132 | #define GEMMLOWP_SSE4_32 133 | #endif 134 | 135 | #if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_32) 136 | #define GEMMLOWP_SSE3_32 137 | #endif 138 | 139 | #if defined(GEMMLOWP_SSE4) && defined(GEMMLOWP_X86_64) && \ 140 | !defined(GEMMLOWP_DISABLE_SSE4) 141 | #define GEMMLOWP_SSE4_64 142 | #endif 143 | 144 | #if defined(GEMMLOWP_SSE3) && defined(GEMMLOWP_X86_64) 145 | #define GEMMLOWP_SSE3_64 146 | #endif 147 | 148 | #if defined(GEMMLOWP_AVX2) && defined(GEMMLOWP_X86_64) 149 | #define GEMMLOWP_AVX2_64 150 | #endif 151 | 152 | #if defined(__has_feature) 153 | #if __has_feature(memory_sanitizer) 154 | #include <sanitizer/msan_interface.h> 155 | #define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __msan_unpoison 156 | #elif __has_feature(address_sanitizer) 157 | #include <sanitizer/asan_interface.h> 158 | #define GEMMLOWP_MARK_MEMORY_AS_INITIALIZED __asan_unpoison_memory_region 159 | #endif 160 | #endif 161 | 162 | #endif // GEMMLOWP_ALLOW_INLINE_ASM 163 | 164 | // Detect Android. Don't conflate with ARM - we care about tuning 165 | // for non-ARM Android devices too. This can be used in conjunction 166 | // with x86 to tune differently for mobile x86 CPUs (Atom) vs. desktop x86 CPUs. 167 | #if defined(__ANDROID__) || defined(ANDROID) 168 | #define GEMMLOWP_ANDROID 169 | #endif 170 | 171 | #endif // GEMMLOWP_INTERNAL_DETECT_PLATFORM_H_ 172 | -------------------------------------------------------------------------------- /internal/kernel_default.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // kernel_default.h: Chooses default GEMM and GEMV kernels for the 16 | // host platform.
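//
// Worked example of the selection logic below (an illustration, assuming the
// default 8-bit unsigned bit-depth parameters where Lhs/Rhs values span
// 0..255): MaxProductIsLessThan4096 is (255 * 255 < 4096) == false,
// IsUnsigned is true, and LhsAlwaysNonZero is false, so DefaultKernel
// resolves to whichever kernel this platform registered via
// GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, ...).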
17 | 18 | #ifndef GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ 19 | #define GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_ 20 | 21 | #include "../public/bit_depth.h" 22 | #include "common.h" 23 | #include "kernel.h" 24 | #include "kernel_reference.h" 25 | 26 | namespace gemmlowp { 27 | 28 | template <bool MaxProductIsLessThan4096, bool IsUnsigned, bool LhsNonZero> 29 | struct DefaultKernelImpl {}; 30 | 31 | // Partial specialization implementing the logic that if we want to use 32 | // a kernel for MaxProductIsLessThan4096 but do not have such a kernel, then we 33 | // fall back to a generic kernel not taking advantage of 34 | // MaxProductIsLessThan4096. 35 | template <bool LhsNonZero> 36 | struct DefaultKernelImpl<true, true, LhsNonZero> 37 | : DefaultKernelImpl<false, true, LhsNonZero> {}; 38 | 39 | // Partial specialization implementing the logic that if we want to use 40 | // a kernel for LhsNonZero but do not have such a kernel, then we fall 41 | // back to a generic kernel not taking advantage of LhsNonZero. 42 | template <bool MaxProductIsLessThan4096> 43 | struct DefaultKernelImpl<MaxProductIsLessThan4096, true, true> 44 | : DefaultKernelImpl<MaxProductIsLessThan4096, true, false> {}; 45 | 46 | template <typename BitDepthParams> 47 | struct DefaultKernel 48 | : DefaultKernelImpl<(BitDepthParams::LhsRange::kMaxValue * 49 | BitDepthParams::RhsRange::kMaxValue < 50 | 4096), 51 | (BitDepthParams::LhsRange::kMinValue >= 0), 52 | (BitDepthParams::LhsRange::kMinValue > 0 || 53 | (BitDepthParams::LhsRange::kMaxValue <= 127 && 54 | BitDepthParams::LhsRange::kMinValue > -128))> {}; 55 | 56 | } // end namespace gemmlowp 57 | 58 | #define GEMMLOWP_SET_DEFAULT_KERNEL(MaxProductIsLessThan4096, IsUnsigned, \ 59 | LhsAlwaysNonZero, Kernel) \ 60 | namespace gemmlowp { \ 61 | template <> \ 62 | struct DefaultKernelImpl<MaxProductIsLessThan4096, IsUnsigned, \ 63 | LhsAlwaysNonZero> : Kernel {}; \ 64 | } 65 | 66 | // User-provided int8 inputs are only supported in the NEON path currently. 67 | #if defined GEMMLOWP_NEON_32 68 | #include "kernel_neon.h" 69 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, NEON_32_Kernel12x4Depth2) 70 | GEMMLOWP_SET_DEFAULT_KERNEL(true, true, false, 71 | NEON_32_Kernel12x4Depth2Assuming12BitProducts) 72 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, 73 | NEON_32bit_GEMM_Int8Operands_LhsNonzero) 74 | GEMMLOWP_SET_DEFAULT_KERNEL(false, false, true, 75 | NEON_32bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs) 76 | #elif defined GEMMLOWP_NEON_64 77 | #include "kernel_neon.h" 78 | #if defined GEMMLOWP_DOTPROD_KERNEL 79 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, 80 | NEON_64_Kernel12x8Depth4_dotprod) 81 | #else 82 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, NEON_64_Kernel12x8Depth2) 83 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, 84 | NEON_64bit_GEMM_Int8Operands_LhsNonzero) 85 | #endif 86 | GEMMLOWP_SET_DEFAULT_KERNEL(false, false, true, 87 | NEON_64bit_GEMM_Int8Operands_LhsNonzero_Int8Inputs) 88 | #elif defined(GEMMLOWP_MSA) 89 | #include "kernel_msa.h" 90 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, MSA_Kernel12x8Depth2) 91 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, true, MSA_GEMM_Int8Operands_LhsNonzero) 92 | #elif defined GEMMLOWP_SSE4_32 93 | #include "kernel_sse.h" 94 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, SSE4_32_Kernel4x4Depth2) 95 | #elif defined GEMMLOWP_SSE4_64 96 | #include "kernel_sse.h" 97 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, SSE4_64_Kernel12x4Depth2) 98 | #elif defined GEMMLOWP_AVX2_64 99 | #include "kernel_avx.h" 100 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, AVX2_64_Kernel24x8Depth2) 101 | #else 102 | #include "kernel_reference.h" 103 | namespace gemmlowp { 104 | typedef ReferenceKernel<KernelFormat< 105 | KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1>, 106 | KernelSideFormat<CellFormat<4, 16, CellOrder::WidthMajor>, 1> > > 107 | DefaultReferenceKernel; 108 | } 109 | GEMMLOWP_SET_DEFAULT_KERNEL(false, true, false, DefaultReferenceKernel) 110 | #endif 111 |
112 | #endif  // GEMMLOWP_INTERNAL_KERNEL_DEFAULT_H_
113 |
--------------------------------------------------------------------------------
/internal/kernel_reference.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // kernel_reference.h: a reference kernel for CPU architectures where we don't
16 | // have optimized kernels yet. Also useful for testing, as it's templatized
17 | // to have any arbitrary format, allowing tests to cover all sorts of corner
18 | // cases.
19 |
20 | #ifndef GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
21 | #define GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
22 |
23 | #include "kernel.h"
24 |
25 | #include <cstdio>
26 | #include <cstring>
27 |
28 | namespace gemmlowp {
29 |
30 | // This kernel is templatized in an arbitrary Format template parameter,
31 | // allowing it to have any arbitrary format.
32 | template <typename tFormat>
33 | struct ReferenceKernel : KernelBase {
34 |   typedef tFormat Format;
35 |
36 |   const char* Name() const override {
37 |     static char buf[256];
38 |     snprintf(buf, sizeof(buf),
39 |              "reference(Lhs: %d cells %dx%d %s, Rhs: %d cells %dx%d %s)",
40 |              Format::Lhs::kCells, Format::Lhs::Cell::kWidth,
41 |              Format::Lhs::Cell::kDepth,
42 |              CellOrderName(Format::Lhs::Cell::kOrder), Format::Rhs::kCells,
43 |              Format::Rhs::Cell::kDepth, Format::Rhs::Cell::kWidth,
44 |              CellOrderName(Format::Rhs::Cell::kOrder));
45 |     return buf;
46 |   }
47 |
48 |   void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
49 |            std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
50 |            const std::uint8_t* rhs_ptr, std::size_t start_depth,
51 |            std::size_t run_depth) const override {
52 |     std::int32_t accumulator[Format::kRows * Format::kCols];
53 |     memset(accumulator, 0, sizeof(accumulator));
54 |
55 |     const int run_depth_cells = static_cast<int>(run_depth / Format::kDepth);
56 |
57 |     // The outer loop is over the depth dimension.
58 |     for (int dc = 0; dc < run_depth_cells; dc++) {
59 |       // The next two loops are over cells of the Lhs (stacked vertically),
60 |       // and over cells of the Rhs (stacked horizontally).
61 |       for (int rc = 0; rc < Format::Lhs::kCells; rc++) {
62 |         const std::uint8_t* lhs_cell_ptr =
63 |             lhs_ptr + (dc * Format::Lhs::kCells + rc) *
64 |                           Format::Lhs::Cell::kWidth * Format::kDepth;
65 |         for (int cc = 0; cc < Format::Rhs::kCells; cc++) {
66 |           const std::uint8_t* rhs_cell_ptr =
67 |               rhs_ptr + (dc * Format::Rhs::kCells + cc) *
68 |                             Format::Rhs::Cell::kWidth * Format::kDepth;
69 |
70 |           // Now we are inside one cell of the Lhs and inside one cell
71 |           // of the Rhs, so the remaining inner loops are just
72 |           // traditional three loops of matrix multiplication.
73 |           for (int di = 0; di < Format::kDepth; di++) {
74 |             for (int ri = 0; ri < Format::Lhs::Cell::kWidth; ri++) {
75 |               for (int ci = 0; ci < Format::Rhs::Cell::kWidth; ci++) {
76 |                 const std::uint8_t* lhs_coeff_ptr =
77 |                     lhs_cell_ptr +
78 |                     OffsetIntoCell<typename Format::Lhs::Cell>(ri, di);
79 |                 const std::uint8_t* rhs_coeff_ptr =
80 |                     rhs_cell_ptr +
81 |                     OffsetIntoCell<typename Format::Rhs::Cell>(ci, di);
82 |                 std::int32_t* accumulator_coeff_ptr =
83 |                     accumulator + (ri + rc * Format::Lhs::Cell::kWidth) +
84 |                     (ci + cc * Format::Rhs::Cell::kWidth) * Format::kRows;
85 |                 *accumulator_coeff_ptr +=
86 |                     std::int32_t(*lhs_coeff_ptr) * std::int32_t(*rhs_coeff_ptr);
87 |               }
88 |             }
89 |           }
90 |         }
91 |       }
92 |     }
93 |
94 |     if (start_depth == 0) {
95 |       // start_depth == 0 means we haven't accumulated anything yet, so we need
96 |       // to overwrite the accumulator, as it hasn't been initialized to zero.
97 |       for (int r = 0; r < Format::kRows; r++) {
98 |         for (int c = 0; c < Format::kCols; c++) {
99 |           dst_ptr[r * dst_row_stride + c * dst_col_stride] =
100 |               accumulator[r + c * Format::kRows];
101 |         }
102 |       }
103 |     } else {
104 |       // We have already accumulated stuff, so we need to continue accumulating
105 |       // instead of just overwriting.
106 |       for (int r = 0; r < Format::kRows; r++) {
107 |         for (int c = 0; c < Format::kCols; c++) {
108 |           dst_ptr[r * dst_row_stride + c * dst_col_stride] +=
109 |               accumulator[r + c * Format::kRows];
110 |         }
111 |       }
112 |     }
113 |   }
114 | };
115 |
116 | }  // namespace gemmlowp
117 |
118 | #endif  // GEMMLOWP_INTERNAL_KERNEL_REFERENCE_H_
119 |
--------------------------------------------------------------------------------
/internal/output_avx.h:
--------------------------------------------------------------------------------
1 | //
2 | // Licensed under the Apache License, Version 2.0 (the "License");
3 | // you may not use this file except in compliance with the License.
4 | // You may obtain a copy of the License at
5 | //
6 | //     http://www.apache.org/licenses/LICENSE-2.0
7 | //
8 | // Unless required by applicable law or agreed to in writing, software
9 | // distributed under the License is distributed on an "AS IS" BASIS,
10 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | // See the License for the specific language governing permissions and
12 | // limitations under the License.
13 |
14 | // output_avx.h: optimized AVX2 specializations of the templates in output.h.
15 |
16 | #ifndef GEMMLOWP_INTERNAL_OUTPUT_AVX_H_
17 | #define GEMMLOWP_INTERNAL_OUTPUT_AVX_H_
18 |
19 | #endif  // GEMMLOWP_INTERNAL_OUTPUT_AVX_H_
20 |
--------------------------------------------------------------------------------
/internal/pack_sse.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // pack_sse.h: optimized SSE specializations of the templates in pack.h.
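Before the SIMD packing code below, it may help to see a scalar model of what a gemmlowp packing routine computes: besides reordering a width x depth block into contiguous storage, it accumulates the sum of each width-slice, which the unpacking stage later uses to apply the lhs_offset/rhs_offset terms of the quantized GEMM. A simplified sketch (hypothetical function name, and ignoring the interleaved cell layout that the real code produces):

#include <cstdint>
#include <cstdio>
#include <vector>

// Scalar model of packing: copy a width x depth block into a contiguous
// buffer and accumulate the sum of each width-slice. The real SIMD code
// below does the same thing 8 depth-elements at a time, additionally
// interleaving the entries into the kernel's cell layout.
void PackWidthMajorScalar(const std::uint8_t* src, int width_stride, int width,
                          int depth, std::vector<std::uint8_t>* packed,
                          std::vector<std::int32_t>* sums_of_each_slice) {
  sums_of_each_slice->assign(width, 0);
  for (int d = 0; d < depth; d++) {
    for (int w = 0; w < width; w++) {
      const std::uint8_t value = src[w * width_stride + d];
      packed->push_back(value);
      (*sums_of_each_slice)[w] += value;
    }
  }
}

int main() {
  // A 2 (width) x 4 (depth) width-major block.
  const std::uint8_t src[2 * 4] = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<std::uint8_t> packed;
  std::vector<std::int32_t> sums;
  PackWidthMajorScalar(src, /*width_stride=*/4, /*width=*/2, /*depth=*/4,
                       &packed, &sums);
  std::printf("slice sums: %d %d\n", sums[0], sums[1]);  // prints 10 and 26
  return 0;
}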
16 |
17 | #ifndef GEMMLOWP_INTERNAL_PACK_SSE_H_
18 | #define GEMMLOWP_INTERNAL_PACK_SSE_H_
19 |
20 | #include <smmintrin.h>
21 | #include "pack.h"
22 |
23 | namespace gemmlowp {
24 |
25 | // TODO: Add DepthMajorUint8SideMap
26 |
27 | typedef SideMap<const std::uint8_t, SideMapOrder::WidthMajor>
28 |     WidthMajorUint8SideMap;
29 |
30 | template <int Cells>
31 | using WidthMajorSideFormatNCells4x2 =
32 |     KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, Cells>;
33 |
34 | template <int Cells>
35 | class PackingRegisterBlock<
36 |     WidthMajorUint8SideMap,
37 |     PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > >
38 |     : public PackingRegisterBlockBase<
39 |           WidthMajorUint8SideMap,
40 |           PackedSideBlock<WidthMajorSideFormatNCells4x2<Cells> > > {
41 |  public:
42 |   typedef WidthMajorSideFormatNCells4x2<Cells> KernelSideFormat;
43 |   typedef typename KernelSideFormat::Cell CellFormat;
44 |   static constexpr int kCells = KernelSideFormat::kCells;
45 |   static constexpr int kCellWidth = CellFormat::kWidth;
46 |   static constexpr int kKernelWidth = CellFormat::kWidth * kCells;
47 |   static constexpr int kCellDepth = CellFormat::kDepth;
48 |   static constexpr int kCellSize = CellFormat::kSize;
49 |
50 |   void Pack(PackedSideBlock<KernelSideFormat>* dst, int start_width) {
51 |     std::uint8_t* dst_ptr = dst->current_data();
52 |     const int width_stride = this->complete_src_.width_stride();
53 |     int depth_step = 8;
54 |
55 |     __m128i one = _mm_set1_epi16(1);
56 |     for (int cell_start_depth = 0; cell_start_depth < kRegisterSize;
57 |          cell_start_depth += depth_step) {
58 |       for (int cell_start_width = 0; cell_start_width < kKernelWidth;
59 |            cell_start_width += kCellWidth) {
60 |         std::int32_t* cell_sums_of_each_slice_ptr =
61 |             dst->sums_of_each_slice() + start_width + cell_start_width;
62 |         const std::uint8_t* src_data =
63 |             this->complete_src_.data(cell_start_width, cell_start_depth);
64 |
65 |         __m128i xmm1 =
66 |             _mm_loadl_epi64(reinterpret_cast<const __m128i*>(&src_data[0]));
67 |         __m128i xmm2 = _mm_loadl_epi64(
68 |             reinterpret_cast<const __m128i*>(&src_data[1 * width_stride]));
69 |         __m128i xmm3 = _mm_loadl_epi64(
70 |             reinterpret_cast<const __m128i*>(&src_data[2 * width_stride]));
71 |         __m128i xmm4 = _mm_loadl_epi64(
72 |             reinterpret_cast<const __m128i*>(&src_data[3 * width_stride]));
73 |
74 |         __m128i xmm5 = _mm_unpacklo_epi16(xmm1, xmm2);
75 |         __m128i xmm8 = _mm_shuffle_epi32(xmm5, 0x31);
76 |
77 |         __m128i xmm6 = _mm_unpacklo_epi16(xmm3, xmm4);
78 |         __m128i xmm7 = _mm_shuffle_epi32(xmm6, 0x80);
79 |
80 |         __m128i xmm9 = _mm_blend_epi16(xmm5, xmm7, 0xcc);
81 |         __m128i xmm10 = _mm_blend_epi16(xmm8, xmm6, 0xcc);
82 |
83 |         _mm_storel_epi64(reinterpret_cast<__m128i*>(&dst_ptr[0]), xmm9);
84 |         _mm_storel_epi64(
85 |             reinterpret_cast<__m128i*>(&dst_ptr[kCellSize * kCells]), xmm10);
86 |
87 |         __m128i xmm11 = _mm_shuffle_epi32(xmm9, 0xee);
88 |         __m128i xmm12 = _mm_shuffle_epi32(xmm10, 0xee);
89 |
90 |         _mm_storel_epi64(
91 |             reinterpret_cast<__m128i*>(&dst_ptr[2 * kCellSize * kCells]),
92 |             xmm11);
93 |         _mm_storel_epi64(
94 |             reinterpret_cast<__m128i*>(&dst_ptr[3 * kCellSize * kCells]),
95 |             xmm12);
96 |
97 |         xmm1 = _mm_cvtepu8_epi16(xmm9);
98 |         xmm2 = _mm_madd_epi16(xmm1, one);
99 |         __m128i sums_of_each_slice_xmm = _mm_loadu_si128(
100 |             reinterpret_cast<const __m128i*>(&cell_sums_of_each_slice_ptr[0]));
101 |         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
102 |
103 |         xmm1 = _mm_cvtepu8_epi16(xmm10);
104 |         xmm2 = _mm_madd_epi16(xmm1, one);
105 |         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
106 |
107 |         xmm1 = _mm_cvtepu8_epi16(xmm11);
108 |         xmm2 = _mm_madd_epi16(xmm1, one);
109 |         sums_of_each_slice_xmm = _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
110 |
111 |         xmm1 = _mm_cvtepu8_epi16(xmm12);
112 |         xmm2 = _mm_madd_epi16(xmm1, one);
113 |         sums_of_each_slice_xmm =
    _mm_add_epi32(sums_of_each_slice_xmm, xmm2);
114 |
115 |         _mm_storeu_si128(
116 |             reinterpret_cast<__m128i*>(&cell_sums_of_each_slice_ptr[0]),
117 |             sums_of_each_slice_xmm);
118 |         dst_ptr += kCellSize;
119 |       }
120 |       dst_ptr += 3 * kCellSize * kCells;
121 |     }
122 |     dst->seek_forward_n_cells(kCells * kRegisterSize / kCellDepth);
123 |   }
124 | };
125 |
126 | }  // namespace gemmlowp
127 |
128 | #endif  // GEMMLOWP_INTERNAL_PACK_SSE_H_
129 |
--------------------------------------------------------------------------------
/internal/platform.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // internal/platform.h: a place to put platform-specific code
16 |
17 | #ifndef GEMMLOWP_INTERNAL_PLATFORM_H_
18 | #define GEMMLOWP_INTERNAL_PLATFORM_H_
19 |
20 | #ifdef _WIN32
21 | #include <malloc.h>
22 | #include <windows.h>
23 | #else
24 | #include <stdlib.h>
25 | #include <time.h>
26 | #include <unistd.h>
27 | #endif
28 |
29 | #ifdef __APPLE__
30 | #include <sys/time.h>
31 | #endif
32 |
33 | #if defined ANDROID || defined __ANDROID__
34 | #include <android/api-level.h>
35 | #include <malloc.h>
36 | // The 18 here should be 16, but has to be 18 for now due
37 | // to a Google-internal issue.
38 | #if __ANDROID_API__ < 18
39 | #define GEMMLOWP_USE_MEMALIGN
40 | #endif
41 | // posix_memalign is missing on some 4.1 x86 devices
42 | #if __ANDROID_API__ == 18
43 | #ifdef GEMMLOWP_X86_32
44 | #define GEMMLOWP_USE_MEMALIGN
45 | #endif
46 | #endif
47 | #endif
48 |
49 | // Needed by chrome native builds
50 | #ifndef _SC_NPROCESSORS_CONF
51 | #define _SC_NPROCESSORS_CONF _SC_NPROCESSORS_ONLN
52 | #endif
53 |
54 | namespace gemmlowp {
55 |
56 | #ifdef _WIN32
57 | inline void *aligned_alloc(size_t alignment, size_t size) {
58 |   return _aligned_malloc(size, alignment);
59 | }
60 |
61 | inline void aligned_free(void *memptr) { _aligned_free(memptr); }
62 |
63 | inline int GetHardwareConcurrency(int max_threads) {
64 |   if (max_threads == 0) {
65 |     SYSTEM_INFO sysinfo;
66 |     GetSystemInfo(&sysinfo);
67 |     return sysinfo.dwNumberOfProcessors;
68 |   }
69 |   return max_threads;
70 | }
71 |
72 | inline double real_time_in_seconds() {
73 |   __int64 wintime;
74 |   GetSystemTimeAsFileTime((FILETIME *)&wintime);
75 |   wintime -= 116444736000000000LL;  // 1jan1601 to 1jan1970
76 |   return wintime / 10000000LL + wintime % 10000000LL * 100 * 1e-9;
77 | }
78 |
79 | #else
80 | inline void *aligned_alloc(size_t alignment, size_t size) {
81 | #ifdef GEMMLOWP_USE_MEMALIGN
82 |   return memalign(alignment, size);
83 | #else
84 |   void *memptr;
85 |   if (posix_memalign(&memptr, alignment, size)) {
86 |     memptr = nullptr;
87 |   }
88 |   return memptr;
89 | #endif
90 | }
91 |
92 | inline int GetHardwareConcurrency(int max_threads) {
93 |   if (max_threads == 0) {
94 |     static const int hardware_threads_count =
95 |         static_cast<int>(sysconf(_SC_NPROCESSORS_CONF));
96 |     return hardware_threads_count;
97 |   }
98 |   return max_threads;
99 | }
100 |
101 | inline void aligned_free(void *memptr) { free(memptr); }
102 |
103 | inline double real_time_in_seconds() {
104 | #ifdef __APPLE__
105 |   timeval t;
106 |   gettimeofday(&t, nullptr);
107 |   return t.tv_sec + 1e-6 * t.tv_usec;
108 | #else
109 |   timespec t;
110 |   clock_gettime(CLOCK_REALTIME, &t);
111 |   return t.tv_sec + 1e-9 * t.tv_nsec;
112 | #endif
113 | }
114 |
115 | #endif
116 | }  // namespace gemmlowp
117 | #endif  // GEMMLOWP_INTERNAL_PLATFORM_H_
118 |
--------------------------------------------------------------------------------
/internal/simd_wrappers_msa.h:
--------------------------------------------------------------------------------
1 | // Copyright 2018 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
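A short usage sketch of the portable helpers defined in internal/platform.h above; the include path and the surrounding main() are illustrative assumptions. Since gemmlowp::aligned_alloc dispatches to _aligned_malloc, memalign, or posix_memalign depending on the platform, allocations must be released with gemmlowp::aligned_free rather than plain free:

#include "internal/platform.h"

#include <cstdio>

int main() {
  // 4 KiB of scratch storage aligned to a 64-byte boundary.
  void* scratch = gemmlowp::aligned_alloc(64, 4096);
  if (scratch != nullptr) {
    // ... use as packing scratch space ...
    gemmlowp::aligned_free(scratch);
  }
  // 0 means "use all hardware threads"; other values pass through unchanged.
  const int threads = gemmlowp::GetHardwareConcurrency(0);
  std::printf("worker threads: %d\n", threads);
  std::printf("now: %f seconds\n", gemmlowp::real_time_in_seconds());
  return 0;
}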
14 |
15 | // simd_wrappers_msa.h: MSA specialization of simd_wrappers.h
16 |
17 | #ifndef GEMMLOWP_INTERNAL_SIMD_WRAPPERS_MSA_H_
18 | #define GEMMLOWP_INTERNAL_SIMD_WRAPPERS_MSA_H_
19 |
20 | #include <msa.h>
21 |
22 | namespace gemmlowp {
23 |
24 | using Int32x4 = v4i32;
25 | using Int16x8 = v8i16;
26 | using Uint8x16 = v16i8;
27 |
28 | template <int ScalarCount>
29 | struct RegisterType<std::int32_t, ScalarCount> {
30 |   using Type =
31 |       typename std::conditional<ScalarCount >= 4, Int32x4, std::int32_t>::type;
32 | };
33 |
34 | template <int ScalarCount>
35 | struct RegisterType<std::int16_t, ScalarCount> {
36 |   using Type = typename std::conditional<ScalarCount >= 8, Int16x8, std::int16_t>::type;
37 | };
38 |
39 | template <int ScalarCount>
40 | struct RegisterType<std::uint8_t, ScalarCount> {
41 |   using Type = typename std::conditional<
42 |       ScalarCount >= 16, Uint8x16,
43 |       typename std::conditional<ScalarCount >= 4, std::uint32_t,
44 |                                 std::uint8_t>::type>::type;
45 | };
46 |
47 | inline Int32x4 LoadInt32x4(const std::int32_t* src) {
48 |   return __builtin_msa_ld_w(const_cast<std::int32_t*>(src), 0);
49 | }
50 |
51 | inline Int32x4 LoadInt32x4(const Int32x4* src) {
52 |   return __builtin_msa_ld_w(const_cast<Int32x4*>(src), 0);
53 | }
54 |
55 | inline void StoreInt32x4(std::int32_t* dst, Int32x4 value) {
56 |   __builtin_msa_st_w(value, dst, 0);
57 | }
58 |
59 | inline void StoreInt32x4(Int32x4* dst, Int32x4 value) {
60 |   __builtin_msa_st_w(value, dst, 0);
61 | }
62 |
63 | inline Int16x8 LoadInt16x8(const std::int16_t* src) {
64 |   return __builtin_msa_ld_h(const_cast<std::int16_t*>(src), 0);
65 | }
66 |
67 | inline Int16x8 LoadInt16x8(const Int16x8* src) {
68 |   return __builtin_msa_ld_h(const_cast<Int16x8*>(src), 0);
69 | }
70 |
71 | inline void StoreInt16x8(std::int16_t* dst, Int16x8 value) { __builtin_msa_st_h(value, dst, 0); }
72 |
73 | inline void StoreInt16x8(Int16x8* dst, Int16x8 value) { __builtin_msa_st_h(value, dst, 0); }
74 |
75 | inline Uint8x16 LoadUint8x16(const std::uint8_t* src) {
76 |   return __builtin_msa_ld_b(const_cast<std::uint8_t*>(src), 0);
77 | }
78 |
79 | inline Uint8x16 LoadUint8x16(const Uint8x16* src) {
80 |   return __builtin_msa_ld_b(const_cast<Uint8x16*>(src), 0);
81 | }
82 |
83 | inline void StoreUint8x16(std::uint8_t* dst, Uint8x16 value) {
84 |   __builtin_msa_st_b(value, dst, 0);
85 | }
86 |
87 | inline void StoreUint8x16(Uint8x16* dst, Uint8x16 value) {
88 |   __builtin_msa_st_b(value, dst, 0);
89 | }
90 |
91 | template <int Lane>
92 | std::int32_t GetLane(Int32x4 value) {
93 |   return __builtin_msa_copy_s_w(value, Lane);
94 | }
95 |
96 | template <int Lane>
97 | Int32x4 DupLane(Int32x4 value) {
98 |   static_assert(Lane >= 0 && Lane <= 3, "");
99 |   return __builtin_msa_splati_w(value, Lane);
100 | }
101 |
102 | inline Int32x4 Mul(Int32x4 a, std::int32_t b) {
103 |   return __builtin_msa_mulv_w(a, __builtin_msa_fill_w(b));
104 | }
105 |
106 | inline Int32x4 Min(Int32x4 a, Int32x4 b) { return __builtin_msa_min_s_w(a, b); }
107 |
108 | inline Int32x4 Max(Int32x4 a, Int32x4 b) { return __builtin_msa_max_s_w(a, b); }
109 |
110 | inline Int32x4 SaturatingRoundingDoublingHighMul(Int32x4 a, std::int32_t b) {
111 |   return __builtin_msa_mulr_q_w(a, __builtin_msa_fill_w(b));
112 | }
113 |
114 | template <int Lane>
115 | Int32x4 MulByRhsLane(Int32x4 a, Int32x4 b) {
116 |   static_assert(Lane >= 0 && Lane <= 3, "");
117 |   return __builtin_msa_mulv_w(a, __builtin_msa_splati_w(b, Lane));
118 | }
119 |
120 | static inline v4i32 workaround_msa_maddv_w(v4i32 a, v4i32 b, v4i32 c) {
121 |   // Workaround for incorrect encoding of maddv.df in gcc (a exchanged with c).
122 | #if 0
123 |   return __builtin_msa_maddv_w(a, b, c);
124 | #else
125 |   asm volatile("maddv.w %w[a], %w[b], %w[c]\n"
126 |                // Outputs
127 |                : [a] "+f"(a)
128 |                // Inputs
129 |                : [b] "f"(b), [c] "f"(c));
130 |   return a;
131 | #endif
132 | }
133 |
134 | inline void MulAdd(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
135 |   Int32x4 tmp = LoadInt32x4(acc);
136 |   tmp = workaround_msa_maddv_w(tmp, lhs, rhs);
137 |   StoreInt32x4(acc, tmp);
138 | }
139 |
140 | inline void MulAdd(Int32x4 lhs, std::int32_t rhs, Int32x4* acc) {
141 |   Int32x4 tmp = LoadInt32x4(acc);
142 |   tmp = workaround_msa_maddv_w(tmp, lhs, __builtin_msa_fill_w(rhs));
143 |   StoreInt32x4(acc, tmp);
144 | }
145 |
146 | template <int Lane>
147 | inline void MulAddByRhsLane(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
148 |   static_assert(Lane >= 0 && Lane <= 3, "");
149 |   Int32x4 tmp = LoadInt32x4(acc);
150 |   tmp = workaround_msa_maddv_w(tmp, lhs, __builtin_msa_splati_w(rhs, Lane));
151 |   StoreInt32x4(acc, tmp);
152 | }
153 |
154 | template <>
155 | struct LoadContiguousImpl<RegBlockUint8<8, 8>> {
156 |   static RegBlockUint8<8, 8> Run(const std::uint8_t* src) {
157 |     RegBlockUint8<8, 8> result;
158 |     for (int i = 0; i < 4; i++) {
159 |       result.buf.reg[i] = LoadUint8x16(src + 16 * i);
160 |     }
161 |     return result;
162 |   }
163 | };
164 |
165 | template <>
166 | struct LoadContiguousImpl<RegBlockInt32<8, 8>> {
167 |   static RegBlockInt32<8, 8> Run(const std::int32_t* src) {
168 |     RegBlockInt32<8, 8> result;
169 |     for (int i = 0; i < 16; i++) {
170 |       result.buf.reg[i] = LoadInt32x4(src + 4 * i);
171 |     }
172 |     return result;
173 |   }
174 | };
175 |
176 | template <>
177 | struct LoadContiguousImpl<RegBlockInt16<8, 8>> {
178 |   static RegBlockInt16<8, 8> Run(const std::int16_t* src) {
179 |     RegBlockInt16<8, 8> result;
180 |     for (int i = 0; i < 8; i++) {
181 |       result.buf.reg[i] = LoadInt16x8(src + 8 * i);
182 |     }
183 |     return result;
184 |   }
185 | };
186 |
187 | }  // end namespace gemmlowp
188 |
189 | #include "simd_wrappers_common_neon_sse.h"
190 |
191 | #endif  // GEMMLOWP_INTERNAL_SIMD_WRAPPERS_MSA_H_
192 |
--------------------------------------------------------------------------------
/internal/simd_wrappers_sse.h:
--------------------------------------------------------------------------------
1 | // Copyright 2017 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
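The RegisterType traits above (and their SSE twins below) choose a storage type from the block's scalar count at compile time. A self-contained sketch of the same std::conditional trick, with a stand-in vector type replacing v4i32/__m128i:

#include <cstdint>
#include <type_traits>

struct FakeInt32x4 {};  // stand-in for the real SIMD register type

template <typename ScalarType, int ScalarCount>
struct RegisterType;

template <int ScalarCount>
struct RegisterType<std::int32_t, ScalarCount> {
  using Type = typename std::conditional<ScalarCount >= 4, FakeInt32x4,
                                         std::int32_t>::type;
};

// Blocks of at least 4 int32 lanes are backed by a SIMD register...
static_assert(
    std::is_same<RegisterType<std::int32_t, 4>::Type, FakeInt32x4>::value, "");
// ...while smaller blocks degrade to plain scalars.
static_assert(
    std::is_same<RegisterType<std::int32_t, 1>::Type, std::int32_t>::value,
    "");

int main() { return 0; }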
14 |
15 | // simd_wrappers_sse.h: SSE SIMD wrappers
16 |
17 | #ifndef GEMMLOWP_INTERNAL_SIMD_WRAPPERS_SSE_H_
18 | #define GEMMLOWP_INTERNAL_SIMD_WRAPPERS_SSE_H_
19 |
20 | #include <smmintrin.h>
21 |
22 | namespace gemmlowp {
23 |
24 | using Int32x4 = __m128i;
25 | using Int16x8 = __m128i;
26 | using Uint8x16 = __m128i;
27 |
28 | template <int ScalarCount>
29 | struct RegisterType<std::int32_t, ScalarCount> {
30 |   using Type =
31 |       typename std::conditional<ScalarCount >= 4, Int32x4, std::int32_t>::type;
32 | };
33 |
34 | template <int ScalarCount>
35 | struct RegisterType<std::int16_t, ScalarCount> {
36 |   using Type =
37 |       typename std::conditional<ScalarCount >= 8, Int16x8, std::int16_t>::type;
38 | };
39 |
40 | template <int ScalarCount>
41 | struct RegisterType<std::uint8_t, ScalarCount> {
42 |   using Type = typename std::conditional<
43 |       ScalarCount >= 16, Uint8x16,
44 |       typename std::conditional<ScalarCount >= 4, std::uint32_t,
45 |                                 std::uint8_t>::type>::type;
46 | };
47 |
48 | inline Int32x4 LoadInt32x4(const std::int32_t* src) {
49 |   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
50 | }
51 |
52 | inline Int16x8 LoadInt16x8(const std::int16_t* src) {
53 |   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
54 | }
55 |
56 | inline void StoreInt32x4(std::int32_t* dst, Int32x4 value) {
57 |   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), value);
58 | }
59 |
60 | inline void StoreInt16x8(std::int16_t* dst, Int16x8 value) {
61 |   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), value);
62 | }
63 |
64 | inline Uint8x16 LoadUint8x16(const std::uint8_t* src) {
65 |   return _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));
66 | }
67 |
68 | inline void StoreUint8x16(std::uint8_t* dst, Uint8x16 value) {
69 |   _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), value);
70 | }
71 |
72 | template <int Lane>
73 | std::int32_t GetLane(Int32x4 value) {
74 |   return _mm_extract_epi32(value, Lane);
75 | }
76 |
77 | template <int Lane>
78 | Int32x4 DupLane(Int32x4 value) {
79 |   return _mm_shuffle_epi32(value, _MM_SHUFFLE(Lane, Lane, Lane, Lane));
80 | }
81 |
82 | inline Int32x4 Mul(Int32x4 a, std::int32_t b) {
83 |   return Mul(a, Dup<Int32x4>(b));
84 | }
85 |
86 | inline Int32x4 Min(Int32x4 a, Int32x4 b) { return _mm_min_epi32(a, b); }
87 |
88 | inline Int32x4 Max(Int32x4 a, Int32x4 b) { return _mm_max_epi32(a, b); }
89 |
90 | inline Int32x4 SaturatingRoundingDoublingHighMul(Int32x4 a, std::int32_t b) {
91 |   return SaturatingRoundingDoublingHighMul(a, Dup<Int32x4>(b));
92 | }
93 |
94 | template <int Lane>
95 | Int32x4 MulByRhsLane(Int32x4 a, Int32x4 b) {
96 |   return Mul(a, DupLane<Lane>(b));
97 | }
98 |
99 | inline void MulAdd(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
100 |   *acc = Add(*acc, Mul(lhs, rhs));
101 | }
102 |
103 | inline void MulAdd(Int32x4 lhs, std::int32_t rhs, Int32x4* acc) {
104 |   *acc = Add(*acc, Mul(lhs, rhs));
105 | }
106 |
107 | template <int Lane>
108 | inline void MulAddByRhsLane(Int32x4 lhs, Int32x4 rhs, Int32x4* acc) {
109 |   *acc = Add(*acc, MulByRhsLane<Lane>(lhs, rhs));
110 | }
111 |
112 | template <>
113 | struct LoadContiguousImpl<RegBlockUint8<8, 8>> {
114 |   static RegBlockUint8<8, 8> Run(const std::uint8_t* src) {
115 |     RegBlockUint8<8, 8> result;
116 |     for (int i = 0; i < 4; i++) {
117 |       result.buf.reg[i] = LoadUint8x16(src + 16 * i);
118 |     }
119 |     return result;
120 |   }
121 | };
122 |
123 | template <>
124 | struct LoadContiguousImpl<RegBlockInt32<8, 8>> {
125 |   static RegBlockInt32<8, 8> Run(const std::int32_t* src) {
126 |     RegBlockInt32<8, 8> result;
127 |     for (int i = 0; i < 16; i++) {
128 |       result.buf.reg[i] = LoadInt32x4(src + 4 * i);
129 |     }
130 |     return result;
131 |   }
132 | };
133 |
134 | template <>
135 | struct LoadContiguousImpl<RegBlockInt16<8, 8>> {
136 |   static RegBlockInt16<8, 8> Run(const std::int16_t* src) {
137 |     RegBlockInt16<8, 8> result;
138 |     for (int i = 0; i < 8; i++) {
139 |
      result.buf.reg[i] = LoadInt16x8(src + 8 * i);
140 |     }
141 |     return result;
142 |   }
143 | };
144 |
145 | }  // end namespace gemmlowp
146 |
147 | #include "simd_wrappers_common_neon_sse.h"
148 |
149 | #endif  // GEMMLOWP_INTERNAL_SIMD_WRAPPERS_SSE_H_
150 |
--------------------------------------------------------------------------------
/internal/single_thread_gemm.h:
--------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | // single_thread_gemm.h: Single-threaded GEMM implementation.
16 | // This is a good place to start reading code, as it shows the overall
17 | // structure of a GEMM and is much simpler than multi_thread_gemm.h.
18 |
19 | #ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
20 | #define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
21 |
22 | #include <cassert>
23 |
24 | #include "../public/map.h"
25 | #include "allocator.h"
26 | #include "compute.h"
27 | #include "kernel.h"
28 | #include "pack.h"
29 | #include "unpack.h"
30 |
31 | #ifdef GEMMLOWP_PROFILING_SIZES
32 | #ifndef GEMMLOWP_PROFILING
33 | #error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
34 | #endif
35 | #include <string>
36 | #include <unordered_map>
37 | #endif
38 |
39 | namespace gemmlowp {
40 |
41 | class SingleThreadGemmContext {
42 |  public:
43 |   Allocator* allocator() { return &allocator_; }
44 |
45 |   void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
46 |   void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
47 |   void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }
48 |
49 |   int l1_bytes_to_use() const { return l1_bytes_to_use_; }
50 |   int l2_bytes_to_use() const { return l2_bytes_to_use_; }
51 |   float l2_rhs_factor() const { return l2_rhs_factor_; }
52 |
53 |  protected:
54 |   Allocator allocator_;
55 |
56 |   // The cache configuration to use.
57 |   int l1_bytes_to_use_ = kDefaultL1CacheSize;
58 |   int l2_bytes_to_use_ = kDefaultL2CacheSize;
59 |   float l2_rhs_factor_ = kDefaultL2RhsFactor;
60 | };
61 |
62 | template <typename KernelFormat, typename InputScalar, typename OutputScalar,
63 |           typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
64 |           MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
65 |           typename OutputPipelineType>
66 | void SingleThreadGemm(SingleThreadGemmContext* context,
67 |                       const KernelBase& kernel,
68 |                       const MatrixMap<const InputScalar, LhsOrder>& lhs,
69 |                       const MatrixMap<const InputScalar, RhsOrder>& rhs,
70 |                       MatrixMap<OutputScalar, ResultOrder>* result,
71 |                       const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
72 |                       const OutputPipelineType& output_pipeline) {
73 |   ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");
74 |
75 |   assert(lhs.cols() == rhs.rows());
76 |
77 |   int rows = result->rows();
78 |   int cols = result->cols();
79 |   int depth = lhs.cols();
80 |
81 |   // zero sizes should have been caught earlier and early-returned.
82 |   assert(rows > 0);
83 |   assert(cols > 0);
84 |   assert(depth > 0);
85 |
86 |   // The case of rows<cols should have been caught earlier and transposed.
87 |   assert(rows >= cols);
88 |
89 |   Allocator* allocator = context->allocator();
90 |
91 |   BlockParams block_params;
92 |   block_params.Init(
93 |       rows, cols, depth, 1, context->l1_bytes_to_use(),
94 |       context->l2_bytes_to_use(), context->l2_rhs_factor());
95 |
96 | #ifdef GEMMLOWP_PROFILING_SIZES
97 |   // Using a static map of label strings. Not reentrant at all!
98 |   static std::unordered_map<std::uint64_t, std::string> labels_map;
99 |   std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
100 |                              (static_cast<std::uint64_t>(depth) << 16) ^
101 |                              (static_cast<std::uint64_t>(cols) << 32);
102 |   if (!labels_map.count(sizes_hash)) {
103 |     char label[256];
104 |     snprintf(label, sizeof(label),
105 |              "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
106 |              "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
107 |              rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
108 |              block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
109 |              block_params.l1_cols);
110 |     labels_map[sizes_hash] = label;
111 |   }
112 |   ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
113 | #endif
114 |
115 |   PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
116 |                                                          block_params);
117 |   PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
118 |                                                          block_params);
119 |
120 |   PackedResult packed_result(allocator, block_params);
121 |
122 |   allocator->Commit();
123 |
124 |   const bool pack_rhs_once = block_params.l2_cols >= cols;
125 |
126 |   if (pack_rhs_once) {
127 |     PackRhs(&packed_rhs, rhs);
128 |   }
129 |
130 |   for (int r = 0; r < rows; r += block_params.l2_rows) {
131 |     int rs = std::min(block_params.l2_rows, rows - r);
132 |
133 |     PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
134 |
135 |     for (int c = 0; c < cols; c += block_params.l2_cols) {
136 |       int cs = std::min(block_params.l2_cols, cols - c);
137 |
138 |       if (!pack_rhs_once) {
139 |         PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
140 |       }
141 |
142 |       Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
143 |               depth);
144 |
145 |       UnpackResult<KernelFormat>(
146 |           result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
147 |           packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
148 |           lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
149 |     }
150 |   }
151 |
152 |   allocator->Decommit();
153 | }
154 |
155 | }  // namespace gemmlowp
156 |
157 | #endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
158 |
--------------------------------------------------------------------------------
/jni/Android.mk:
--------------------------------------------------------------------------------
1 | LOCAL_PATH := $(call my-dir)
2 |
3 | include $(CLEAR_VARS)
4 |
5 | LOCAL_ARM_NEON := true
6 | LOCAL_MODULE := correctness_meta_gemm
7 | LOCAL_SRC_FILES := ../test/correctness_meta_gemm.cc
8 |
9 | include $(BUILD_EXECUTABLE)
10 |
11 | include $(CLEAR_VARS)
12 |
13 | LOCAL_ARM_NEON := true
14 | LOCAL_MODULE := benchmark_meta_gemm
15 | LOCAL_CFLAGS := -DNDEBUG -DGEMMLOWP_USE_META_FASTPATH
16 | LOCAL_SRC_FILES := ../test/benchmark_meta_gemm.cc ../eight_bit_int_gemm/eight_bit_int_gemm.cc
17 |
18 | include $(BUILD_EXECUTABLE)
19 |
20 | include $(CLEAR_VARS)
21 |
22 | LOCAL_ARM_NEON := true
23 | LOCAL_MODULE := benchmark
24 | LOCAL_SRC_FILES := ../test/benchmark.cc
25 |
26 | include $(BUILD_EXECUTABLE)
27 |
--------------------------------------------------------------------------------
/jni/Application.mk:
--------------------------------------------------------------------------------
1 |
NDK_TOOLCHAIN_VERSION := clang
2 | APP_STL := gnustl_static
3 | APP_ABI := armeabi-v7a
4 | APP_CPPFLAGS := -std=c++11 -Wall -Wextra -pedantic -Wno-unused-variable -Wno-unused-parameter
5 | APP_LDFLAGS := -L$(SYSROOT)/usr/lib -lstdc++ -latomic
6 | APP_PIE := true
7 |
--------------------------------------------------------------------------------
/meta/README:
--------------------------------------------------------------------------------
1 | METAPROGRAMMED GEMM
2 | ===================
3 |
4 | The two main goals of this library are:
5 | - providing a new matrix multiplication kernel.
6 | - providing optimized codepaths for as many user scenarios as possible, without
7 |   enforcing additional input data constraints (padding, sizes, strides, layout).
8 |
9 | To enable this code, add -DGEMMLOWP_USE_META_FASTPATH to your build setup.
10 |
11 | The new kernel
12 | --------------
13 |
14 | The multiplication kernel - the innermost loop of the matrix multiplication,
15 | responsible for the row/column products - was rewritten. The new code
16 | produces a 3x3 result patch and processes the row/column arrays in 8-element
17 | packs (the kernel 'shape' is 3x3x8, compared to the previous 12x4x2). By using
18 | specialized 8-bit multiplication, accumulating into vector aggregators, and then
19 | reducing with parallel horizontal addition, we devised code that achieves
20 | higher arithmetical density (arithmetical operations per assembly instruction).
21 | The arithmetical performance of the new kernel exceeds 18 GOps/s on a vanilla
22 | Nexus 5 phone (which is practically peak for this device).
23 |
24 | In order to feed the kernel with input data and minimize the number of
25 | instructions other than the arithmetical operations, a different packing
26 | scheme was used. Three rows (columns) are interleaved every 8 elements, so that
27 | they can be read from contiguous memory in one op inside the kernel. Additional
28 | memory preload hints are inserted into the kernel to hide memory
29 | latency behind arithmetical operations.
30 |
31 | Generated code
32 | --------------
33 |
34 | The basic kernel used in this approach is of shape 3x3x8. Obviously this
35 | kernel can be easily applied to multiplications where the matrix sizes are
36 | M x K and K x N, with M and N multiples of 3 and K a multiple of 8.
37 |
38 | We rejected two obvious solutions: padding the matrix sizes to appropriate
39 | values, or using the reference implementation for the leftovers. Neither did
40 | we consider enforcing extra constraints on the caller.
41 |
42 | In order to allow all matrix sizes, kernels processing all combinations of
43 | 1, 2 or 3 rows and 1, 2 or 3 columns are required. Similarly, to allow all
44 | possible depths, the leftover values (up to 7 elements) need to be handled.
45 |
46 | Instead of writing those kernels by hand we decided to generate them with
47 | some Python scripts. Nine versions of the multiplication kernel were prepared.
48 | Additionally, packing and unpacking code for different row/column counts and
49 | depth leftovers was generated. Finally, different code was generated for
50 | aligned and unaligned memory reads/writes.
51 |
52 | Using those multiplication and packing/unpacking primitives, 144 gemm function
53 | versions were prepared, along with one high-level gemm function that switches
54 | between those preoptimized versions at runtime (see the sketch below).
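The figure of 144 follows from the combinatorics above: 3 row-leftover cases x 3 column-leftover cases (the nine kernels) x 8 depth-leftover cases x 2 for aligned versus unaligned access = 144. A minimal sketch of the dispatch idea, with hypothetical names (the real generated dispatcher enumerates every combination):

#include <cstdint>
#include <cstdio>

// Hypothetical stand-in for one of the 144 generated functions
// (3 row leftovers x 3 column leftovers x 8 depth leftovers x 2 alignments).
template <int kRowLeftovers, int kColLeftovers, int kDepthLeftovers>
void GemmSpecialized(int m, int n, int k) {
  std::printf("specialized gemm for leftovers (%d, %d, %d), sizes %dx%dx%d\n",
              kRowLeftovers, kColLeftovers, kDepthLeftovers, m, n, k);
}

// The high-level entry point computes the leftover sizes once and hands off
// to a preoptimized version, so no leftover branching remains in the inner
// loops. A real dispatcher covers all 9 * 8 (* 2 for alignment) cases.
void GemmDispatchSketch(int m, int n, int k) {
  switch (((m % 3) * 3 + (n % 3)) * 8 + (k % 8)) {
    case ((1 * 3) + 2) * 8 + 3:
      GemmSpecialized<1, 2, 3>(m, n, k);
      break;
    // ... remaining combinations ...
    default:
      GemmSpecialized<0, 0, 0>(m, n, k);
      break;
  }
}

int main() {
  GemmDispatchSketch(10, 11, 19);  // leftovers (1, 2, 3)
  return 0;
}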
55 |
56 | This approach allowed moving all unnecessary branching and conditional execution
57 | outside of the inner loops. It also allowed removing all short loops required
58 | for leftover handling. Finally, aligned memory reads/writes are used wherever
59 | the provided input data allows.
60 |
61 | Results
62 | -------
63 |
64 | The library shows up to 35% faster gemm execution in some cases (e.g. the
65 | ImageNet benchmark).
66 |
67 | Files
68 | -----
69 |
70 | single_thread_gemm.h
71 | -- generated ARM/NEON 8bit x 8bit gemm implementation. Contains all the
72 |    optimized, unrolled and curried pack/unpack and multiply procedures, and
73 |    a single gemm function that switches between the optimized versions based
74 |    on the runtime parameters.
75 |
76 | multi_thread_gemm.h
77 | -- a simple parallelization scheme for the gemm function.
78 |
79 | generators/gemm_NxMxK_neon.py
80 | -- script that generates the single_thread_gemm.h header library.
81 |    Usage: python gemm_NxMxK_neon.py > single_thread_gemm.h
82 |
--------------------------------------------------------------------------------
/meta/base.h:
--------------------------------------------------------------------------------
1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | //     http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 |
15 | #ifndef GEMMLOWP_META_BASE_H_
16 | #define GEMMLOWP_META_BASE_H_
17 |
18 | #include <cassert>
19 | #include <cstdint>
20 |
21 | #include "../internal/common.h"
22 |
23 | namespace gemmlowp {
24 | namespace meta {
25 |
26 | template <int align>
27 | inline int AlignTo(int value) {
28 |   return ((value + align - 1) / align) * align;
29 | }
30 |
31 | inline int AlignTo(int align, int value) {
32 |   return ((value + align - 1) / align) * align;
33 | }
34 |
35 | template <typename Kernel_, typename OutputStream_>
36 | struct FusedKernelParams {
37 |  public:
38 |   typedef Kernel_ Kernel;
39 |   typedef OutputStream_ OutputStream;
40 |
41 |   Kernel kernel;
42 |   OutputStream output_stream;
43 | };
44 |
45 | template <typename InType_, typename OutType_, typename LeftStream_,
46 |           typename RightStream_, typename Kernel_, typename OutputStream_>
47 | struct GemmParams {
48 |  public:
49 |   typedef InType_ InType;
50 |   typedef OutType_ OutType;
51 |   typedef LeftStream_ LeftStream;
52 |   typedef RightStream_ RightStream;
53 |   typedef Kernel_ Kernel;
54 |   typedef OutputStream_ OutputStream;
55 |
56 |   typedef FusedKernelParams<Kernel, OutputStream> FusedKernel;
57 |
58 |   // Common parameters.
59 |
60 |   int m;
61 |   int n;
62 |   int k;
63 |
64 |   const InType* lhs;
65 |   const InType* rhs;
66 |   OutType* result;
67 |   std::uint8_t* scratch;
68 |
69 |   // Specialized parameters.
70 |
71 |   LeftStream left_stream;
72 |   RightStream right_stream;
73 |   FusedKernel fused_kernel;
74 | };
75 |
76 | template <typename InType, int lanes_count, int pack_size, int leftovers,
77 |           typename StreamParams>
78 | class Stream {
79 |  public:
80 |   static void Pack(const InType* in, const StreamParams& params, InType* out);
81 |
82 |   static int UnpackedAdvance(const StreamParams& params);
83 |
84 |   static int PackedAdvance(const StreamParams& params);
85 |
86 |   static int UnpackedStride(const StreamParams& params);
87 |
88 |   static int PackedStride(const StreamParams& params);
89 | };
90 |
91 | template <typename InType, typename StreamType>
92 | class StreamUtil {
93 |  public:
94 |   static const InType* Offset(const StreamType& params, const InType* source,
95 |                               int offset_stride, int offset_advance);
96 |
97 |   static int Scratch(const StreamType& params, int lanes);
98 | };
99 |
100 | template <typename InType, typename OutType, typename Kernel,
101 |           typename OutputStream, int kernel_m, int kernel_n, int pack_size>
102 | class MulKernel {
103 |  public:
104 |   static void Multiply(const InType* lhs, const InType* rhs,
105 |                        const FusedKernelParams<Kernel, OutputStream>& params,
106 |                        OutType* result);
107 | };
108 |
109 | template <typename InType_, typename OutType_, typename Kernel_>
110 | struct Transform1DParams {
111 |   typedef InType_ InType;
112 |   typedef OutType_ OutType;
113 |   typedef Kernel_ Kernel;
114 |
115 |   const InType* input;
116 |   OutType* output;
117 |   std::uint8_t* scratch;
118 |
119 |   Kernel kernel;
120 | };
121 |
122 | template <typename InType, typename OutType, typename Kernel,
123 |           int kernel_size, int leftovers>
124 | class Transform1DKernel {
125 |  public:
126 |   static void Transform(const InType* input, const Kernel& params,
127 |                         OutType* output);
128 | };
129 |
130 | template <typename InType, typename OutType, typename Transform>
131 | class Transform1DUtil {
132 |  public:
133 |   static int EstimateComputeCost(const Transform& params);
134 |
135 |   static const InType* OffsetInput(const Transform& params, const InType* input,
136 |                                    int offset);
137 |
138 |   static OutType* OffsetOutput(const Transform& params, OutType* output,
139 |                                int offset);
140 | };
141 |
142 | }  // namespace meta
143 | }  // namespace gemmlowp
144 |
145 | #endif  // GEMMLOWP_META_BASE_H_
146 |
--------------------------------------------------------------------------------
/meta/generators/common.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """."""
15 | import collections
16 |
17 | _HEADER_COPYRIGHT = (
18 |     '''// Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
19 | //
20 | // Licensed under the Apache License, Version 2.0 (the "License");
21 | // you may not use this file except in compliance with the License.
22 | // You may obtain a copy of the License at
23 | //
24 | //     http://www.apache.org/licenses/LICENSE-2.0
25 | //
26 | // Unless required by applicable law or agreed to in writing, software
27 | // distributed under the License is distributed on an "AS IS" BASIS,
28 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
29 | // See the License for the specific language governing permissions and
30 | // limitations under the License.
31 | ''')
32 |
33 |
34 | def GenerateHeader(cc, header_name, preprocessor_directive):
35 |   cc.EmitCodeNoSemicolon(_HEADER_COPYRIGHT)
36 |   cc.EmitHeaderBegin(header_name)
37 |
38 |   cc.EmitPreprocessor1('ifdef', preprocessor_directive)
39 |   cc.EmitNewline()
40 |
41 |   cc.EmitInclude('<cassert>')
42 |   cc.EmitInclude('<cstdint>')
43 |   cc.EmitNewline()
44 |
45 |
46 | def GenerateFooter(cc, message):
47 |   cc.EmitPreprocessor('else')
48 |   cc.EmitPreprocessor1('warning', '"%s"' % message)
49 |   cc.EmitPreprocessor('endif')
50 |   cc.EmitNewline()
51 |   cc.EmitHeaderEnd()
52 |
53 |
54 | def GenerateDebugLog(cc, message):
55 |   cc.EmitPreprocessor1('ifdef', 'DEBUG')
56 |   cc.EmitPreprocessor1('ifdef', 'DEBUG_METAGEMM_VERBOSE')
57 |   cc.EmitCode('std::cout << __FILE__ << \"(\" << __LINE__ << \") %s\" '
58 |               '<< std::endl << std::flush' % message)
59 |   cc.EmitPreprocessor('endif')
60 |   cc.EmitPreprocessor('endif')
61 |
62 |
63 | def _TemplateName(base, params):
64 |   return '%s<%s>' % (base, ', '.join(map(str, params)))
65 |
66 |
67 | class StreamGenerator(object):
68 |   """."""
69 |
70 |   def __init__(self, emitter, name):
71 |     self.name = name
72 |     self.emitter = emitter
73 |
74 |   def SpecializeStream(self, in_type, lanes_count, pack_size, leftovers):
75 |     if isinstance(getattr(self, 'EmitPack', None), collections.Callable):
76 |       template_params = [in_type, lanes_count, pack_size, leftovers, self.name]
77 |       self.emitter.EmitMemberFunctionBegin(
78 |           'Stream', [], template_params, 'Pack',
79 |           [['const %s*' % in_type, 'in'], ['const %s&' % self.name, 'params'],
80 |            ['%s*' % in_type, 'out']], 'inline void')
81 |       GenerateDebugLog(self.emitter,
82 |                        '%s::Pack()' % _TemplateName(self.name, template_params))
83 |       self.EmitPack(in_type, lanes_count, pack_size, leftovers)
84 |       self.emitter.EmitFunctionEnd()
85 |
86 |
87 | class MulKernelGenerator(object):
88 |   """."""
89 |
90 |   def __init__(self, emitter, kernel_name, output_stream_name):
91 |     self.kernel_name = kernel_name
92 |     self.output_stream_name = output_stream_name
93 |     self.emitter = emitter
94 |
95 |   def SpecializeMulKernel(self, in_type, out_type, kernel_m, kernel_n,
96 |                           pack_size):
97 |     """Generates the kernel wrapped in a MulKernel template specialization."""
98 |     template_params = [
99 |         in_type, out_type, self.kernel_name, self.output_stream_name, kernel_m,
100 |         kernel_n, pack_size
101 |     ]
102 |     self.emitter.EmitMemberFunctionBegin(
103 |         'MulKernel', [], template_params, 'Multiply',
104 |         [['const %s*' % in_type, 'lhs'], ['const %s*' % in_type, 'rhs'], [
105 |             'const FusedKernelParams<%s, %s>&' % (self.kernel_name,
106 |                                                   self.output_stream_name),
107 |             'params'
108 |         ], ['%s*' % out_type, 'result']], 'inline void')
109 |     GenerateDebugLog(self.emitter, '%s::Multiply()' %
110 |                      _TemplateName(self.kernel_name + self.output_stream_name,
111 |                                    template_params))
112 |     self.EmitMultiply(in_type, out_type, kernel_m, kernel_n, pack_size)
113 |     self.emitter.EmitFunctionEnd()
114 |
115 |
116 | class Transform1DKernelGenerator(object):
117 |   """."""
118 |
119 |   def __init__(self, emitter, kernel_name):
120 |     self.kernel_name = kernel_name
121 |     self.emitter = emitter
122 |
123 |   def SpecializeTransform1DKernel(self, in_type, out_type, kernel_size,
124 |                                   leftovers):
125 |     """Generates the kernel wrapped in a Transform1DKernel specialization."""
126 |     template_params = [
127 |         in_type, out_type, self.kernel_name, kernel_size, leftovers
128 |     ]
129 |     self.emitter.EmitMemberFunctionBegin(
130 |         'Transform1DKernel', [], template_params, 'Transform',
131 |         [['const %s*' % in_type,
'input'], 132 | ['const %s&' % self.kernel_name, 'params'], 133 | ['%s*' % out_type, 'output']], 'inline void') 134 | GenerateDebugLog(self.emitter, '%s::Transform()' % 135 | _TemplateName(self.kernel_name, template_params)) 136 | self.EmitTransform(in_type, out_type, kernel_size, leftovers) 137 | self.emitter.EmitFunctionEnd() 138 | -------------------------------------------------------------------------------- /meta/generators/metagemm_generate_headers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python streams_arm_32.py > ../streams_arm_32.h 3 | python streams_arm_64.py > ../streams_arm_64.h 4 | python quantized_mul_kernels_arm_32.py > ../quantized_mul_kernels_arm_32.h 5 | python quantized_mul_kernels_arm_64.py > ../quantized_mul_kernels_arm_64.h 6 | python transform_kernels_arm_32.py > ../transform_kernels_arm_32.h 7 | python transform_kernels_arm_64.py > ../transform_kernels_arm_64.h 8 | 9 | -------------------------------------------------------------------------------- /meta/generators/quantized_mul_kernels_arm_32.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter 19 | import quantized_mul_kernels_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_quantized_mul_kernels_arm_32', 26 | 'GEMMLOWP_NEON_32') 27 | 28 | cc.EmitNamespaceBegin('gemmlowp') 29 | cc.EmitNamespaceBegin('meta') 30 | cc.EmitNewline() 31 | 32 | shapes = [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), 33 | (2, 1), (2, 2), (2, 3), (2, 4), (3, 1), (3, 2), (3, 3)] 34 | 35 | quantized_mul_kernels_common.GenerateKernels(cc, 36 | neon_emitter.NeonEmitter(), 37 | shapes) 38 | 39 | cc.EmitNamespaceEnd() 40 | cc.EmitNamespaceEnd() 41 | cc.EmitNewline() 42 | 43 | common.GenerateFooter(cc, 'Meta gemm for arm32 requires: GEMMLOWP_NEON_32!') 44 | 45 | 46 | if __name__ == '__main__': 47 | Main() 48 | -------------------------------------------------------------------------------- /meta/generators/quantized_mul_kernels_arm_64.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Generates the arm64 headers used by the gemm/gemv lib."""
15 |
16 | import cc_emitter
17 | import common
18 | import neon_emitter_64
19 | import quantized_mul_kernels_common
20 |
21 |
22 | def Main():
23 |   """."""
24 |   cc = cc_emitter.CCEmitter()
25 |   common.GenerateHeader(cc, 'gemmlowp_meta_quantized_mul_kernels_arm_64',
26 |                         'GEMMLOWP_NEON_64')
27 |
28 |   cc.EmitNamespaceBegin('gemmlowp')
29 |   cc.EmitNamespaceBegin('meta')
30 |   cc.EmitNewline()
31 |
32 |   shapes = [(1, 1), (1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8),
33 |             (2, 1), (2, 2), (2, 3), (2, 4), (3, 1), (3, 2), (3, 3)]
34 |
35 |   quantized_mul_kernels_common.GenerateKernels(cc,
36 |                                                neon_emitter_64.NeonEmitter64(),
37 |                                                shapes)
38 |
39 |   cc.EmitNamespaceEnd()
40 |   cc.EmitNamespaceEnd()
41 |   cc.EmitNewline()
42 |
43 |   common.GenerateFooter(cc, 'Meta gemm for arm64 requires: GEMMLOWP_NEON_64!')
44 |
45 |
46 | if __name__ == '__main__':
47 |   Main()
48 |
--------------------------------------------------------------------------------
/meta/generators/streams_arm_32.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Generates the arm32 headers used by the gemm/gemv lib."""
15 |
16 | import cc_emitter
17 | import common
18 | import neon_emitter
19 | import streams_common
20 |
21 |
22 | def Main():
23 |   """."""
24 |   cc = cc_emitter.CCEmitter()
25 |   common.GenerateHeader(cc, 'gemmlowp_meta_streams_arm_32', 'GEMMLOWP_NEON_32')
26 |
27 |   cc.EmitNamespaceBegin('gemmlowp')
28 |   cc.EmitNamespaceBegin('meta')
29 |   cc.EmitNewline()
30 |
31 |   streams_common.GenerateUInt8x8Streams(cc, neon_emitter.NeonEmitter(), 8)
32 |
33 |   cc.EmitNamespaceEnd()
34 |   cc.EmitNamespaceEnd()
35 |   cc.EmitNewline()
36 |
37 |   common.GenerateFooter(cc, 'Meta gemm for arm32 requires: GEMMLOWP_NEON_32!')
38 |
39 |
40 | if __name__ == '__main__':
41 |   Main()
42 |
--------------------------------------------------------------------------------
/meta/generators/streams_arm_64.py:
--------------------------------------------------------------------------------
1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | #     http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
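For orientation, every generated header shares the skeleton that GenerateHeader() and GenerateFooter() in common.py emit: a header guard, the architecture #ifdef, and a fallback #warning. Sketched here for streams_arm_32.h, with the generated body elided and the two emitted includes assumed to be <cassert> and <cstdint>:

// Sketch of a generated header's structure, not verbatim generator output.
#ifndef GEMMLOWP_META_STREAMS_ARM_32_H_
#define GEMMLOWP_META_STREAMS_ARM_32_H_

#ifdef GEMMLOWP_NEON_32

#include <cassert>
#include <cstdint>

namespace gemmlowp {
namespace meta {

// ... generated Stream<...>::Pack() specializations ...

}  // namespace meta
}  // namespace gemmlowp

#else
#warning "Meta gemm for arm32 requires: GEMMLOWP_NEON_32!"
#endif

#endif  // GEMMLOWP_META_STREAMS_ARM_32_H_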
14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter_64 19 | import streams_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_streams_arm_64', 'GEMMLOWP_NEON_64') 26 | 27 | cc.EmitNamespaceBegin('gemmlowp') 28 | cc.EmitNamespaceBegin('meta') 29 | cc.EmitNewline() 30 | 31 | streams_common.GenerateUInt8x8Streams(cc, neon_emitter_64.NeonEmitter64(), 8) 32 | 33 | cc.EmitNamespaceEnd() 34 | cc.EmitNamespaceEnd() 35 | cc.EmitNewline() 36 | 37 | common.GenerateFooter(cc, 'Meta gemm for arm64 requires: GEMMLOWP_NEON_64!') 38 | 39 | 40 | if __name__ == '__main__': 41 | Main() 42 | -------------------------------------------------------------------------------- /meta/generators/transform_kernels_arm_32.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter 19 | import transform_kernels_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_transform_kernels_arm_32', 26 | 'GEMMLOWP_NEON_32') 27 | 28 | cc.EmitNamespaceBegin('gemmlowp') 29 | cc.EmitNamespaceBegin('meta') 30 | cc.EmitNewline() 31 | 32 | transform_kernels_common.GenerateKernels(cc, 33 | neon_emitter.NeonEmitter(), 34 | [(16, x) for x in range(16)]) 35 | 36 | cc.EmitNamespaceEnd() 37 | cc.EmitNamespaceEnd() 38 | cc.EmitNewline() 39 | 40 | common.GenerateFooter(cc, 'Meta gemm for arm32 requires: GEMMLOWP_NEON_32!') 41 | 42 | 43 | if __name__ == '__main__': 44 | Main() 45 | -------------------------------------------------------------------------------- /meta/generators/transform_kernels_arm_64.py: -------------------------------------------------------------------------------- 1 | # Copyright 2016 The Gemmlowp Authors. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Generates the arm32 headers used by the gemm/gemv lib.""" 15 | 16 | import cc_emitter 17 | import common 18 | import neon_emitter_64 19 | import transform_kernels_common 20 | 21 | 22 | def Main(): 23 | """.""" 24 | cc = cc_emitter.CCEmitter() 25 | common.GenerateHeader(cc, 'gemmlowp_meta_transform_kernels_arm_64', 26 | 'GEMMLOWP_NEON_64') 27 | 28 | cc.EmitNamespaceBegin('gemmlowp') 29 | cc.EmitNamespaceBegin('meta') 30 | cc.EmitNewline() 31 | 32 | transform_kernels_common.GenerateKernels(cc, 33 | neon_emitter_64.NeonEmitter64(), 34 | [(16, x) for x in range(16)]) 35 | 36 | cc.EmitNamespaceEnd() 37 | cc.EmitNamespaceEnd() 38 | cc.EmitNewline() 39 | 40 | common.GenerateFooter(cc, 'Meta gemm for arm64 requires: GEMMLOWP_NEON_64!') 41 | 42 | 43 | if __name__ == '__main__': 44 | Main() 45 | -------------------------------------------------------------------------------- /meta/legacy_multi_thread_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // multi_thread_common.h: Multithreading code shared by different meta gemm 16 | // versions. 17 | 18 | #ifndef GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 19 | #define GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 20 | 21 | #include "../internal/multi_thread_gemm.h" 22 | 23 | namespace gemmlowp { 24 | namespace meta { 25 | namespace internal { 26 | 27 | const std::int32_t kMinTaskSize = 16000; 28 | const std::int32_t kMinTaskDimension = 4; 29 | 30 | struct TaskRect { 31 | std::int32_t m_offset; 32 | std::int32_t m; 33 | std::int32_t n_offset; 34 | std::int32_t n; 35 | 36 | TaskRect(std::int32_t m_offset, std::int32_t m, std::int32_t n_offset, 37 | std::int32_t n) 38 | : m_offset(m_offset), m(m), n_offset(n_offset), n(n) {} 39 | }; 40 | 41 | template 42 | struct MetaTask : gemmlowp::Task { 43 | std::uint8_t* scratch; 44 | const IN_TYPE* lhs; 45 | const IN_TYPE* rhs; 46 | TaskRect task_rect; 47 | std::int32_t k; 48 | OUT_TYPE* result; 49 | std::int32_t result_stride; 50 | const F& operation; 51 | 52 | MetaTask(std::uint8_t* scratch, const IN_TYPE* lhs, const IN_TYPE* rhs, 53 | const TaskRect& task_rect, std::int32_t k, OUT_TYPE* result, 54 | std::int32_t result_stride, const F& operation) 55 | : scratch(scratch), 56 | lhs(lhs), 57 | rhs(rhs), 58 | task_rect(task_rect), 59 | k(k), 60 | result(result), 61 | result_stride(result_stride), 62 | operation(operation) {} 63 | 64 | void Run() override { 65 | const IN_TYPE* task_lhs = lhs + task_rect.m_offset * k; 66 | const IN_TYPE* task_rhs = rhs + task_rect.n_offset * k; 67 | OUT_TYPE* task_result = 68 | result + task_rect.m_offset * result_stride + task_rect.n_offset; 69 | operation.ExecuteMatrixMatrix(scratch, task_lhs, task_rhs, task_rect.m, 70 | task_rect.n, k, task_result, result_stride); 71 | } 72 | }; 73 | 74 | std::int32_t ResolveMaxThreads(std::int32_t max_threads) { 75 | if (max_threads == 0) { 76 | 
static const int hardware_threads_count = 77 | static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); 78 | return hardware_threads_count; 79 | } 80 | return max_threads; 81 | } 82 | 83 | inline void PrepareTasks(std::int32_t max_tasks, std::int32_t m, std::int32_t n, 84 | std::int32_t k, std::vector<TaskRect>* tasks) { 85 | const std::int32_t max_tasks_by_size = (m * n * k) / kMinTaskSize; 86 | const std::int32_t max_tasks_m = m / kMinTaskDimension; 87 | const std::int32_t max_tasks_n = n / kMinTaskDimension; 88 | const std::int32_t max_tasks_dimension = std::max(max_tasks_m, max_tasks_n); 89 | 90 | std::int32_t real_tasks = std::max( 91 | 1, std::min(max_tasks, std::min(max_tasks_by_size, max_tasks_dimension))); 92 | 93 | if (real_tasks == 1) { 94 | tasks->push_back(TaskRect(0, m, 0, n)); 95 | return; 96 | } 97 | 98 | if (max_tasks_m > max_tasks_n) { 99 | const std::int32_t m_chunk = m / real_tasks; 100 | for (int i = 0; i < real_tasks - 1; ++i) { 101 | tasks->push_back(TaskRect(i * m_chunk, m_chunk, 0, n)); 102 | } 103 | const std::int32_t last_m_offset = (real_tasks - 1) * m_chunk; 104 | tasks->push_back(TaskRect(last_m_offset, m - last_m_offset, 0, n)); 105 | } else { 106 | const std::int32_t n_chunk = n / real_tasks; 107 | for (int i = 0; i < real_tasks - 1; ++i) { 108 | tasks->push_back(TaskRect(0, m, i * n_chunk, n_chunk)); 109 | } 110 | const std::int32_t last_n_offset = (real_tasks - 1) * n_chunk; 111 | tasks->push_back(TaskRect(0, m, last_n_offset, n - last_n_offset)); 112 | } 113 | } 114 | 115 | template <typename IN_TYPE, typename OUT_TYPE, typename F> 116 | void MultiThreadedMatrixMatrix(gemmlowp::WorkersPool* pool, 117 | std::int32_t max_threads, std::uint8_t* scratch, 118 | const IN_TYPE* lhs, const IN_TYPE* rhs, 119 | std::int32_t m, std::int32_t n, std::int32_t k, 120 | OUT_TYPE* result, std::int32_t result_stride, 121 | const F& operation) { 122 | max_threads = internal::ResolveMaxThreads(max_threads); 123 | 124 | std::vector<internal::TaskRect> task_rects; 125 | internal::PrepareTasks(max_threads, m, n, k, &task_rects); 126 | 127 | if (task_rects.size() == 1) { 128 | operation.ExecuteMatrixMatrix(scratch, lhs, rhs, m, n, k, result, 129 | result_stride); 130 | return; 131 | } 132 | 133 | std::uint8_t* task_scratch = scratch; 134 | std::int32_t scratch_per_thread = operation.ScratchPerThread(m, n, k); 135 | std::vector<gemmlowp::Task*> tasks; 136 | std::for_each( 137 | task_rects.begin(), task_rects.end(), 138 | [&tasks, &task_scratch, lhs, rhs, k, result, result_stride, &operation, 139 | scratch_per_thread](internal::TaskRect& rect) { 140 | tasks.push_back(new internal::MetaTask<IN_TYPE, OUT_TYPE, F>( 141 | task_scratch, lhs, rhs, rect, k, result, result_stride, operation)); 142 | task_scratch += scratch_per_thread; 143 | }); 144 | pool->Execute(tasks); 145 | } 146 | 147 | } // namespace internal 148 | } // namespace meta 149 | } // namespace gemmlowp 150 | 151 | #endif // GEMMLOWP_META_LEGACY_MULTI_THREAD_COMMON_H_ 152 | -------------------------------------------------------------------------------- /meta/legacy_operations_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_OPERATIONS_COMMON_H_ 16 | #define GEMMLOWP_META_OPERATIONS_COMMON_H_ 17 | 18 | class Quantized8BitOperation { 19 | public: 20 | Quantized8BitOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, 21 | std::int32_t sum_offset, std::int32_t multiplier, 22 | std::int32_t shift) 23 | : lhs_offset(lhs_offset), 24 | rhs_offset(rhs_offset), 25 | sum_offset(sum_offset), 26 | multiplier(multiplier), 27 | shift(shift) {} 28 | 29 | protected: 30 | std::int32_t lhs_offset; 31 | std::int32_t rhs_offset; 32 | std::int32_t sum_offset; 33 | std::int32_t multiplier; 34 | std::int32_t shift; 35 | }; 36 | 37 | class FloatOperation { 38 | public: 39 | FloatOperation(std::int32_t lhs_offset, std::int32_t rhs_offset, 40 | float result_offset) 41 | : lhs_offset(lhs_offset), 42 | rhs_offset(rhs_offset), 43 | result_offset(result_offset) {} 44 | 45 | protected: 46 | std::int32_t lhs_offset; 47 | std::int32_t rhs_offset; 48 | float result_offset; 49 | }; 50 | 51 | class Int32Operation { 52 | public: 53 | Int32Operation(std::int32_t lhs_offset, std::int32_t rhs_offset) 54 | : lhs_offset(lhs_offset), rhs_offset(rhs_offset) {} 55 | 56 | protected: 57 | std::int32_t lhs_offset; 58 | std::int32_t rhs_offset; 59 | }; 60 | 61 | #endif // GEMMLOWP_META_OPERATIONS_COMMON_H_ 62 | -------------------------------------------------------------------------------- /meta/multi_thread_common.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
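// multi_thread_common.h: Thread-count resolution and a minimal threading
// context shared by the multithreaded meta gemm and transform entry points.
//
// A usage sketch (values are hypothetical): passing max_threads == 0 asks
// ResolveMaxThreads to query the hardware concurrency.
//
//   gemmlowp::WorkersPool pool;
//   gemmlowp::meta::SimpleContext<gemmlowp::WorkersPool> context(
//       gemmlowp::meta::ResolveMaxThreads(0), &pool);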
14 | 15 | #ifndef GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 16 | #define GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 17 | 18 | #include "../internal/multi_thread_gemm.h" 19 | 20 | namespace gemmlowp { 21 | namespace meta { 22 | 23 | inline int ResolveMaxThreads(int max_threads) { 24 | if (max_threads == 0) { 25 | #ifdef _WIN32 26 | SYSTEM_INFO sysinfo; 27 | GetSystemInfo(&sysinfo); 28 | return sysinfo.dwNumberOfProcessors; 29 | #else 30 | static const int hardware_threads_count = 31 | static_cast<int>(sysconf(_SC_NPROCESSORS_CONF)); 32 | return hardware_threads_count; 33 | #endif 34 | } 35 | return max_threads; 36 | } 37 | 38 | template <typename WorkersPool> 39 | class SimpleContext { 40 | public: 41 | SimpleContext(int max_num_threads, WorkersPool* pool) 42 | : max_num_threads_(max_num_threads), pool_(pool) {} 43 | 44 | WorkersPool* workers_pool() { return pool_; } 45 | 46 | int max_num_threads() { return max_num_threads_; } 47 | 48 | private: 49 | int max_num_threads_; 50 | WorkersPool* pool_; 51 | }; 52 | 53 | } // namespace meta 54 | } // namespace gemmlowp 55 | 56 | #endif // GEMMLOWP_META_MULTI_THREAD_COMMON_H_ 57 | -------------------------------------------------------------------------------- /meta/multi_thread_gemm.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
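// multi_thread_gemm.h: Entry point for the multithreaded meta gemm. It splits
// the result matrix into per-thread tasks along the larger of the m and n
// dimensions, hands them to the context's worker pool, and falls back to the
// single-threaded Gemm when the problem is too small to split profitably.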
14 | 15 | #ifndef GEMMLOWP_META_MULTI_THREAD_GEMM_H_ 16 | #define GEMMLOWP_META_MULTI_THREAD_GEMM_H_ 17 | 18 | #include "multi_thread_common.h" 19 | #include "single_thread_gemm.h" 20 | 21 | namespace gemmlowp { 22 | namespace meta { 23 | namespace internal { 24 | 25 | const std::int32_t kMinGemmTaskSize = 16000; 26 | const std::int32_t kMinGemmTaskDimension = 4; 27 | 28 | template 29 | std::uint8_t* PrepareGemmTask(const Params& params, int kernel_m, int kernel_n, 30 | int kernel_k, std::uint8_t* scratch, int m_start, 31 | int m, int n_start, int n, 32 | std::vector* tasks) { 33 | tasks->push_back(params); 34 | Params& task = tasks->back(); 35 | task.scratch = scratch; 36 | 37 | task.m = m; 38 | task.lhs = 39 | StreamUtil::Offset( 40 | params.left_stream, params.lhs, m_start, 0); 41 | 42 | task.n = n; 43 | task.rhs = 44 | StreamUtil::Offset( 45 | params.right_stream, params.rhs, n_start, 0); 46 | 47 | task.result = 48 | StreamUtil:: 49 | Offset(params.fused_kernel.output_stream, params.result, m_start, 50 | n_start); 51 | 52 | return scratch + Executor::template EstimateScratchSize( 53 | task, kernel_m, kernel_n, kernel_k); 54 | } 55 | 56 | template 57 | bool PrepareGemmTasks(MultiThreadingContext* context, const Params& params, 58 | int kernel_m, int kernel_n, int kernel_k, 59 | std::vector* task_params) { 60 | const int max_threads = ResolveMaxThreads(context->max_num_threads()); 61 | const int max_tasks_by_size = 62 | (params.m * params.n * params.k) / kMinGemmTaskSize; 63 | const int max_tasks_m = params.m / kMinGemmTaskDimension; 64 | const int max_tasks_n = params.n / kMinGemmTaskDimension; 65 | const int max_tasks_dimension = std::max(max_tasks_m, max_tasks_n); 66 | 67 | const int real_tasks = std::max( 68 | 1, 69 | std::min(max_threads, std::min(max_tasks_by_size, max_tasks_dimension))); 70 | 71 | if (real_tasks == 1) { 72 | return false; 73 | } 74 | 75 | std::uint8_t* scratch = params.scratch; 76 | 77 | if (max_tasks_m > max_tasks_n) { 78 | const int m_chunk = params.m / real_tasks; 79 | for (int i = 0; i < real_tasks - 1; ++i) { 80 | scratch = PrepareGemmTask( 81 | params, kernel_m, kernel_n, kernel_k, scratch, i * m_chunk, m_chunk, 82 | 0, params.n, task_params); 83 | } 84 | const int sum_m = (real_tasks - 1) * m_chunk; 85 | PrepareGemmTask(params, kernel_m, kernel_n, kernel_k, 86 | scratch, sum_m, params.m - sum_m, 0, 87 | params.n, task_params); 88 | } else { 89 | const int n_chunk = params.n / real_tasks; 90 | for (int i = 0; i < real_tasks - 1; ++i) { 91 | scratch = PrepareGemmTask( 92 | params, kernel_m, kernel_n, kernel_k, scratch, 0, params.m, 93 | i * n_chunk, n_chunk, task_params); 94 | } 95 | int sum_n = (real_tasks - 1) * n_chunk; 96 | PrepareGemmTask(params, kernel_m, kernel_n, kernel_k, 97 | scratch, 0, params.m, sum_n, 98 | params.n - sum_n, task_params); 99 | } 100 | 101 | return true; 102 | } 103 | 104 | template 106 | struct GemmTaskRunner : gemmlowp::Task { 107 | GemmTaskRunner(const Params& params) : params(params) {} 108 | 109 | void Run() override { 110 | Gemm(params); 111 | } 112 | 113 | Params params; 114 | }; 115 | 116 | } // namespace internal 117 | 118 | template 120 | inline void MultiThreadGemm(MultiThreadingContext* context, 121 | const Params& params) { 122 | typedef internal::GemmTaskRunner 124 | TaskRunnerType; 125 | 126 | std::vector task_params; 127 | if (!internal::PrepareGemmTasks( 128 | context, params, kernel_m, kernel_n, kernel_k, &task_params)) { 129 | Gemm(params); 130 | return; 131 | } 132 | 133 | auto workers_pool = 
context->workers_pool(); 134 | std::vector tasks; 135 | for (auto& task_param : task_params) { 136 | tasks.push_back(new TaskRunnerType(task_param)); 137 | }; 138 | workers_pool->Execute(tasks); 139 | } 140 | 141 | } // namespace meta 142 | } // namespace gemmlowp 143 | 144 | #endif // GEMMLOWP_META_MULTI_THREAD_GEMM_H_ 145 | -------------------------------------------------------------------------------- /meta/multi_thread_transform.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 16 | #define GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 17 | 18 | #include "multi_thread_common.h" 19 | #include "single_thread_transform.h" 20 | 21 | namespace gemmlowp { 22 | namespace meta { 23 | namespace internal { 24 | 25 | const int kTransformTaskOverhead = 128000; 26 | const int kMinTransformTaskSize = 32000; 27 | 28 | template 29 | inline bool PrepareTransform1DTasks(MultiThreadingContext* context, 30 | const Params& params, int kernel_size, 31 | std::vector* task_params) { 32 | typedef Transform1DUtil 34 | Util; 35 | 36 | const int max_threads = ResolveMaxThreads(context->max_num_threads()); 37 | const int task_size = Util::EstimateComputeCost(params.kernel); 38 | const int max_tasks_by_size = 39 | (task_size - kTransformTaskOverhead) / kMinTransformTaskSize; 40 | 41 | const int real_tasks = std::max(1, std::min(max_threads, max_tasks_by_size)); 42 | 43 | if (real_tasks == 1) { 44 | return false; 45 | } 46 | 47 | const int chunk = params.kernel.count / real_tasks; 48 | for (int i = 0; i < real_tasks - 1; ++i) { 49 | task_params->push_back(params); 50 | Params& task = task_params->back(); 51 | task.kernel.count = chunk; 52 | task.input = Util::OffsetInput(params.kernel, params.input, i * chunk); 53 | task.output = Util::OffsetOutput(params.kernel, params.output, i * chunk); 54 | } 55 | task_params->push_back(params); 56 | Params& task = task_params->back(); 57 | const int sum_chunk = (real_tasks - 1) * chunk; 58 | task.kernel.count = params.kernel.count - sum_chunk; 59 | task.input = Util::OffsetInput(params.kernel, params.input, sum_chunk); 60 | task.output = Util::OffsetOutput(params.kernel, params.output, sum_chunk); 61 | return true; 62 | } 63 | 64 | template 65 | struct Transform1DTaskRunner : gemmlowp::Task { 66 | Transform1DTaskRunner(const Params& params) : params(params) {} 67 | 68 | void Run() override { Transform1D(params); } 69 | 70 | Params params; 71 | }; 72 | 73 | } // namespace internal 74 | 75 | template 76 | inline void MultiThreadTransform1D(MultiThreadingContext* context, 77 | const Params& params) { 78 | typedef internal::Transform1DTaskRunner TaskRunnerType; 79 | 80 | std::vector task_params; 81 | if (!internal::PrepareTransform1DTasks( 82 | context, params, kernel_size, &task_params)) { 83 | Transform1D(params); 84 | return; 85 | } 86 
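// More than one task was prepared: wrap each per-task Params copy in a
// runner and execute the whole batch on the shared worker pool.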
| 87 | auto workers_pool = context->workers_pool(); 88 | std::vector tasks; 89 | for (auto& task_param : task_params) { 90 | tasks.push_back(new TaskRunnerType(task_param)); 91 | } 92 | workers_pool->Execute(tasks); 93 | } 94 | 95 | } // namespace meta 96 | } // namespace gemmlowp 97 | 98 | #endif // GEMMLOWP_META_MULTI_THREAD_TRANSFORM_H_ 99 | -------------------------------------------------------------------------------- /meta/quantized_mul_kernels.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_QUANTIZED_MUL_KERNELS_H_ 16 | #define GEMMLOWP_META_QUANTIZED_MUL_KERNELS_H_ 17 | 18 | #include 19 | #include 20 | 21 | #include "base.h" 22 | #include "streams.h" 23 | 24 | namespace gemmlowp { 25 | namespace meta { 26 | 27 | struct QuantizedStaticPreprocessed { 28 | public: 29 | int multiplicative_offset; 30 | int rounding_offset; 31 | int shift; 32 | int count; 33 | }; 34 | 35 | template 36 | class MulKernel { 38 | public: 39 | typedef FusedKernelParams FusedKernel; 40 | 41 | static void Multiply(const InType* lhs, const InType*, 42 | const FusedKernel& params, OutType* result) { 43 | #ifdef DEBUG 44 | #ifdef DEBUG_METAGEMM_VERBOSE 45 | std::cout << "MulQSPR(" << typeid(InType).name() << ", " 46 | << typeid(OutType).name() << ")::Multiply() -- " << m << "x" << n 47 | << "x" << k << std::endl; 48 | #endif 49 | #else 50 | if (m != 0 && n != 0) { 51 | std::cerr << "FATAL: QuantizedStaticPreprocessed_RowMajor::Multiply not " 52 | << "implemented." 
<< std::endl; 53 | std::exit(1); 54 | } 55 | #endif 56 | } 57 | 58 | #ifdef DEBUG 59 | #ifdef DEBUG_METAGEMM_VERBOSE 60 | static void Debug(const FusedKernel& params) { 61 | std::cout << "MulQSPR(" << typeid(InType).name() << ", " 62 | << typeid(OutType).name() << ") -- " << m << "x" << n << "x" << k 63 | << std::endl; 64 | std::cout << " params:" << std::endl; 65 | std::cout << " kernel.multiplicative_offset: " 66 | << params.kernel.multiplicative_offset << std::endl; 67 | std::cout << " kernel.rounding_offset: " << params.kernel.rounding_offset 68 | << std::endl; 69 | std::cout << " kernel.shift: " << params.kernel.shift << std::endl; 70 | std::cout << " kernel.count: " << params.kernel.count << std::endl; 71 | std::cout << " output_stream.stride: " << params.output_stream.stride 72 | << std::endl; 73 | } 74 | #endif 75 | #endif 76 | }; 77 | 78 | struct QuantizedStaticPreprocessedAsInt32 { 79 | public: 80 | int count; 81 | }; 82 | 83 | template 84 | class MulKernel { 86 | public: 87 | typedef FusedKernelParams 88 | FusedKernel; 89 | 90 | static void Multiply(const InType* lhs, const InType*, 91 | const FusedKernel& params, OutType* result) { 92 | #ifdef DEBUG 93 | #ifdef DEBUG_METAGEMM_VERBOSE 94 | std::cout << "MulQSPI32R(" << typeid(InType).name() << ", " 95 | << typeid(OutType).name() << ")::Multiply() -- " << m << "x" << n 96 | << "x" << k << std::endl; 97 | #endif 98 | #else 99 | if (m != 0 && n != 0) { 100 | std::cerr << "FATAL: QuantizedStaticPreprocessedAsInt32_RowMajor::" 101 | << "Multiply not implemented." << std::endl; 102 | std::exit(1); 103 | } 104 | #endif 105 | } 106 | 107 | #ifdef DEBUG 108 | #ifdef DEBUG_METAGEMM_VERBOSE 109 | static void Debug(const FusedKernel& params) { 110 | std::cout << "MulQSPI32R(" << typeid(InType).name() << ", " 111 | << typeid(OutType).name() << ") -- " << m << "x" << n << "x" << k 112 | << std::endl; 113 | std::cout << " params:" << std::endl; 114 | std::cout << " kernel.count: " << params.kernel.count << std::endl; 115 | std::cout << " output_stream.stride: " << params.output_stream.stride 116 | << std::endl; 117 | } 118 | #endif 119 | #endif 120 | }; 121 | 122 | struct QuantizedStaticPreprocessedAsFloat { 123 | public: 124 | int count; 125 | float scale; 126 | }; 127 | 128 | template 129 | class MulKernel { 131 | public: 132 | typedef FusedKernelParams 133 | FusedKernel; 134 | 135 | static void Multiply(const InType* lhs, const InType*, 136 | const FusedKernel& params, OutType* result) { 137 | #ifdef DEBUG 138 | #ifdef DEBUG_METAGEMM_VERBOSE 139 | std::cout << "MulQSPFR(" << typeid(InType).name() << ", " 140 | << typeid(OutType).name() << ")::Multiply() -- " << m << "x" << n 141 | << "x" << k << std::endl; 142 | #endif 143 | #else 144 | if (m != 0 && n != 0) { 145 | std::cerr << "FATAL: QuantizedStaticPreprocessedAsFloat_RowMajor::" 146 | << "Multiply not implemented." 
<< std::endl; 147 | std::exit(1); 148 | } 149 | #endif 150 | } 151 | 152 | #ifdef DEBUG 153 | #ifdef DEBUG_METAGEMM_VERBOSE 154 | static void Debug(const FusedKernel& params) { 155 | std::cout << "MulQSPFR(" << typeid(InType).name() << ", " 156 | << typeid(OutType).name() << ") -- " << m << "x" << n << "x" << k 157 | << std::endl; 158 | std::cout << " params:" << std::endl; 159 | std::cout << " kernel.count: " << params.kernel.count << std::endl; 160 | std::cout << " kernel.scale: " << params.kernel.scale << std::endl; 161 | std::cout << " output_stream.stride: " << params.output_stream.stride 162 | << std::endl; 163 | } 164 | #endif 165 | #endif 166 | }; 167 | 168 | } // namespace meta 169 | } // namespace gemmlowp 170 | 171 | #ifdef GEMMLOWP_NEON_32 172 | #include "quantized_mul_kernels_arm_32.h" 173 | #elif defined(GEMMLOWP_NEON_64) 174 | #include "quantized_mul_kernels_arm_64.h" 175 | #endif 176 | 177 | #endif // GEMMLOWP_META_QUANTIZED_MUL_KERNELS_H_ 178 | -------------------------------------------------------------------------------- /meta/single_thread_transform.h: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #ifndef GEMMLOWP_META_SINGLE_THREAD_TRANSFORM_H_ 16 | #define GEMMLOWP_META_SINGLE_THREAD_TRANSFORM_H_ 17 | 18 | #include 19 | #include "base.h" 20 | 21 | namespace gemmlowp { 22 | namespace meta { 23 | 24 | template 25 | void Transform1D(const Params& params); 26 | 27 | namespace internal { 28 | 29 | class Transform1DExecutor { 30 | public: 31 | template 32 | static void ExecuteDispatch1D(const P& params) { 33 | Transform1DKernel::Transform(params.input, params.kernel, 36 | params.output); 37 | } 38 | }; 39 | 40 | template 41 | struct Dispatch1D { 42 | static void Execute(const P& params, int leftovers) { 43 | #ifdef DEBUG 44 | #ifdef DEBUG_METAGEMM_VERBOSE 45 | std::cout << "Dispatch(1): " << kernel_size << ":" << variable_leftovers 46 | << std::endl 47 | << std::flush; 48 | #endif 49 | #endif 50 | if (leftovers == variable_leftovers) { 51 | E::template ExecuteDispatch1D(params); 52 | } else { 53 | Dispatch1D::Execute(params, 54 | leftovers); 55 | } 56 | } 57 | }; 58 | 59 | template 60 | struct Dispatch1D { 61 | static void Execute(const P& params, int leftovers) { 62 | #ifdef DEBUG 63 | #ifdef DEBUG_METAGEMM_VERBOSE 64 | std::cout << "Dispatch(1): " << kernel_size << ": 0" << std::endl 65 | << std::flush; 66 | #endif 67 | #endif 68 | if (leftovers == 0) { 69 | E::template ExecuteDispatch1D(params); 70 | } else { 71 | std::cerr << "FATAL: dispatch1D failed: ran out of cases." 
<< std::endl 72 | << std::flush; 73 | std::exit(1); 74 | } 75 | } 76 | }; 77 | 78 | } // namespace internal 79 | 80 | template 81 | inline void Transform1D(const Params& params) { 82 | internal::Dispatch1D::Execute(params, params.kernel.count % 84 | kernel_size); 85 | } 86 | 87 | } // namespace meta 88 | } // namespace gemmlowp 89 | 90 | #endif // GEMMLOWP_META_SINGLE_THREAD_TRANSFORM_H_ 91 | -------------------------------------------------------------------------------- /meta/test_streams_correctness.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | #include 16 | #ifdef __APPLE__ 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "streams.h" 30 | 31 | #define MUL_OFFSET (3) 32 | #define ADD_OFFSET (100) 33 | 34 | using namespace gemmlowp::meta; 35 | 36 | void prepare_row_major_data(int rows, int elements, int stride, std::uint8_t* data) { 37 | for (int i = 0; i < rows * stride; ++i) { 38 | data[i] = 255; 39 | } 40 | for (int i = 0; i < rows; ++i) { 41 | for (int j = 0; j < elements; ++j) { 42 | data[i * stride + j] = j % 256; 43 | } 44 | } 45 | } 46 | 47 | void prepare_column_major_data(int columns, int elements, int stride, 48 | std::uint8_t* data) { 49 | for (int i = 0; i < elements * stride; ++i) { 50 | data[i] = 255; 51 | } 52 | for (int i = 0; i < elements; ++i) { 53 | for (int j = 0; j < columns; ++j) { 54 | data[i * stride + j] = i % 256; 55 | } 56 | } 57 | } 58 | 59 | void print_out(std::uint8_t* result, int rows, int elements) { 60 | int size = rows * ((elements + 7) / 8) * 8; 61 | for (int i = 0; i < size; ++i) { 62 | std::cout << static_cast(result[i]) << " "; 63 | } 64 | std::cout << std::endl << std::flush; 65 | } 66 | 67 | bool check(std::uint8_t* result, int rows, int elements) { 68 | int chunks = elements / 8; 69 | int leftover = elements % 8; 70 | for (int i = 0; i < chunks; ++i) { 71 | int chunk_index = i * rows * 8; 72 | int chunk_start_value = i * 8; 73 | for (int j = 0; j < rows; ++j) { 74 | for (int k = 0; k < 8; ++k) { 75 | if (result[chunk_index + j * 8 + k] != chunk_start_value + k) { 76 | return false; 77 | } 78 | } 79 | } 80 | } 81 | 82 | int leftover_index = chunks * rows * 8; 83 | int leftover_start_value = chunks * 8; 84 | for (int i = 0; i < rows; ++i) { 85 | for (int j = 0; j < leftover; ++j) { 86 | if (result[leftover_index + i * 8 + j] != leftover_start_value + j) { 87 | return false; 88 | } 89 | } 90 | } 91 | 92 | int expected_sum = 93 | ((elements * (elements - 1)) / 2) * MUL_OFFSET + ADD_OFFSET; 94 | int sums_offset = rows * ((elements + 7) / 8) * 8; 95 | std::int32_t* sums = reinterpret_cast(result + sums_offset); 96 | for (int i = 0; i < rows; ++i) { 97 | if (sums[i] != expected_sum) { 98 | return false; 99 | } 100 | } 101 | 102 | return 
true; 103 | } 104 | 105 | template 106 | void test_2(std::uint8_t* in, std::uint8_t* out) { 107 | for (int elements = 8; elements < 64; elements += 8) { 108 | int all_elements = elements + leftover; 109 | for (int stride = all_elements; stride < all_elements + 4; ++stride) { 110 | RowMajorWithSum params; 111 | params.count = all_elements; 112 | params.stride = stride; 113 | params.multiplicative_sum_offset = MUL_OFFSET; 114 | params.additive_sum_offset = ADD_OFFSET; 115 | 116 | prepare_row_major_data(lanes, all_elements, stride, in); 117 | Stream::Pack(in, params, 118 | out); 119 | if (check(out, lanes, all_elements)) { 120 | // std::cout << "Row: " << lanes << "x8x" << leftover << " : " 121 | // << all_elements << "@" << stride << " -- OK" << 122 | // std::endl; 123 | } else { 124 | std::cout << "Row: " << lanes << "x8x" << leftover << " : " 125 | << all_elements << "@" << stride << " -- ERROR" << std::endl; 126 | std::cout << "Exiting." << std::endl; 127 | std::exit(1); 128 | } 129 | } 130 | 131 | for (int stride = lanes; stride < lanes + 4; ++stride) { 132 | ColumnMajorWithSum params; 133 | params.count = all_elements; 134 | params.stride = stride; 135 | params.multiplicative_sum_offset = MUL_OFFSET; 136 | params.additive_sum_offset = ADD_OFFSET; 137 | 138 | prepare_column_major_data(lanes, all_elements, stride, in); 139 | Stream::Pack(in, params, 140 | out); 141 | if (check(out, lanes, all_elements)) { 142 | // std::cout << "Column: " << lanes << "x8x" << leftover << " : " 143 | // << all_elements << "@" << stride << " -- OK" << 144 | // std::endl; 145 | } else { 146 | std::cout << "Column: " << lanes << "x8x" << leftover << " : " 147 | << all_elements << "@" << stride << " -- ERROR" << std::endl; 148 | std::cout << "Exiting." << std::endl; 149 | std::exit(1); 150 | } 151 | } 152 | } 153 | } 154 | 155 | template 156 | void test(std::uint8_t* in, std::uint8_t* out) { 157 | test_2(in, out); 158 | test_2(in, out); 159 | test_2(in, out); 160 | test_2(in, out); 161 | test_2(in, out); 162 | test_2(in, out); 163 | test_2(in, out); 164 | test_2(in, out); 165 | } 166 | 167 | int main() { 168 | std::unique_ptr in(new std::uint8_t[128 * 1024]); 169 | std::unique_ptr out(new std::uint8_t[128 * 1024]); 170 | 171 | test<1>(in.get(), out.get()); 172 | test<2>(in.get(), out.get()); 173 | test<3>(in.get(), out.get()); 174 | test<4>(in.get(), out.get()); 175 | test<5>(in.get(), out.get()); 176 | test<6>(in.get(), out.get()); 177 | test<7>(in.get(), out.get()); 178 | test<8>(in.get(), out.get()); 179 | 180 | std::cout << "Ok." << std::endl; 181 | return 0; 182 | } 183 | -------------------------------------------------------------------------------- /meta/test_transform_benchmark.cc: -------------------------------------------------------------------------------- 1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 
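// test_transform_benchmark.cc: Benchmarks the single-threaded and
// multithreaded 1D transforms (requantize, dequantize, quantize) over a
// 4M-element buffer, reporting average wall time and elements/s for each.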
14 | 15 | #include 16 | #ifdef __APPLE__ 17 | #include 18 | #endif 19 | 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | #include 27 | #include 28 | 29 | #include "multi_thread_transform.h" 30 | #include "transform_kernels.h" 31 | 32 | using namespace gemmlowp::meta; 33 | 34 | double time() { 35 | #ifdef __APPLE__ 36 | timeval t; 37 | gettimeofday(&t, nullptr); 38 | return t.tv_sec + 1e-6 * t.tv_usec; 39 | #else 40 | timespec t; 41 | clock_gettime(CLOCK_REALTIME, &t); 42 | return t.tv_sec + 1e-9 * t.tv_nsec; 43 | #endif 44 | } 45 | 46 | #define kernel_size (16) 47 | 48 | template 49 | void run_benchmark(const std::string& name, int repetitions, int elements, 50 | Context* context, const Params& params) { 51 | std::cout << "Benchmark: " << name << std::endl; 52 | std::cout << "Warmup single." << std::endl; 53 | 54 | for (int i = 0; i < 10; ++i) { 55 | Transform1D(params); 56 | } 57 | 58 | std::cout << "Benchmark single." << std::endl; 59 | 60 | double start = time(); 61 | 62 | for (int i = 0; i < repetitions; ++i) { 63 | Transform1D(params); 64 | } 65 | 66 | double wall_time = time() - start; 67 | double ops = static_cast(elements) * repetitions; 68 | std::cout << "Avg: " << (wall_time / repetitions) << std::endl; 69 | std::cout << "Perf: " << static_cast(ops / wall_time) << "/s." 70 | << std::endl; 71 | 72 | std::cout << "Warmup single." << std::endl; 73 | 74 | for (int i = 0; i < 10; ++i) { 75 | MultiThreadTransform1D(context, params); 76 | } 77 | 78 | std::cout << "Benchmark multi." << std::endl; 79 | 80 | start = time(); 81 | 82 | for (int i = 0; i < repetitions; ++i) { 83 | MultiThreadTransform1D(context, params); 84 | } 85 | 86 | wall_time = time() - start; 87 | ops = static_cast(elements) * repetitions; 88 | std::cout << "Avg: " << (wall_time / repetitions) << std::endl; 89 | std::cout << "Perf: " << static_cast(ops / wall_time) << "/s." 
90 | << std::endl; 91 | } 92 | 93 | int main() { 94 | const int repetitions = 500; 95 | const int elements = 4 * 1024 * 1024; 96 | 97 | std::unique_ptr int32_array(new std::int32_t[elements]); 98 | std::unique_ptr uint8_array(new std::uint8_t[elements]); 99 | std::unique_ptr float_array(new float[elements]); 100 | 101 | typedef SimpleContext Context; 102 | Context context(4, new gemmlowp::WorkersPool()); 103 | 104 | typedef Transform1DParams RequantizeParams; 105 | RequantizeParams requantize_params; 106 | requantize_params.input = int32_array.get(); 107 | requantize_params.output = uint8_array.get(); 108 | requantize_params.kernel.count = elements; 109 | requantize_params.kernel.input_range_min = -100.0f; 110 | requantize_params.kernel.input_range_scale = 111 | 200.0f / ((static_cast(1) << 32) - 1); 112 | requantize_params.kernel.input_range_offset = 113 | static_cast(std::numeric_limits::lowest()); 114 | requantize_params.kernel.output_range_min = -200.0f; 115 | requantize_params.kernel.one_over_output_range_scale = 116 | static_cast((static_cast(1) << 8) - 1) / 500.0f; 117 | requantize_params.kernel.output_range_offset = 118 | static_cast(std::numeric_limits::lowest()); 119 | 120 | run_benchmark("Requantize", repetitions, elements, &context, 121 | requantize_params); 122 | 123 | typedef Transform1DParams DequantizeParams; 124 | DequantizeParams dequantize_params; 125 | dequantize_params.input = uint8_array.get(); 126 | dequantize_params.output = float_array.get(); 127 | dequantize_params.kernel.count = elements; 128 | dequantize_params.kernel.range_min = -100.0f; 129 | dequantize_params.kernel.range_scale = 130 | static_cast((static_cast(1) << 8) - 1) / 200.0f; 131 | dequantize_params.kernel.range_offset = 132 | static_cast(std::numeric_limits::lowest()); 133 | 134 | run_benchmark("Dequantize", repetitions, elements, &context, 135 | dequantize_params); 136 | 137 | typedef Transform1DParams QuantizeParams; 138 | QuantizeParams quantize_params; 139 | quantize_params.input = float_array.get(); 140 | quantize_params.output = uint8_array.get(); 141 | quantize_params.kernel.count = elements; 142 | quantize_params.kernel.range_min = -100.0f; 143 | quantize_params.kernel.range_scale = 144 | 200.0f / ((static_cast(1) << 8) - 1); 145 | quantize_params.kernel.range_offset = 146 | static_cast(std::numeric_limits::lowest()); 147 | 148 | run_benchmark("Quantize", repetitions, elements, &context, quantize_params); 149 | 150 | return 0; 151 | } 152 | -------------------------------------------------------------------------------- /profiling/pthread_everywhere.h: -------------------------------------------------------------------------------- 1 | // Copyright 2017 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // pthread_everywhere.h: Either includes or implements a 16 | // subset of pthread functionality on top of C++11 for portability. 
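//
// A minimal usage sketch (the worker function is hypothetical): code inside
// namespace gemmlowp makes the same unqualified calls whether they resolve
// to the real pthread functions or to the C++11-backed polyfill below.
//
//   void *Worker(void *arg) { return arg; }
//   ...
//   pthread_t thread;
//   pthread_create(&thread, nullptr, Worker, nullptr);
//   pthread_join(thread, nullptr);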
17 | 18 | #ifndef GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ 19 | #define GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ 20 | 21 | #ifndef _WIN32 22 | #define GEMMLOWP_USE_PTHREAD 23 | #endif 24 | 25 | #if defined GEMMLOWP_USE_PTHREAD 26 | #include <pthread.h> 27 | #else 28 | // Implement a small subset of pthread on top of C++11 threads. 29 | // The function signatures differ from true pthread functions in two ways: 30 | // - True pthread functions return int error codes, ours return void. 31 | //   Rationale: the c++11 equivalent functions return void 32 | //   and use exceptions to report errors; we don't want to deal with 33 | //   exceptions in this code, so we couldn't meaningfully return errors 34 | //   in the polyfill. Also, the gemmlowp code using these pthread functions 35 | //   never checks their return values anyway. 36 | // - True pthread *_create/*_init functions take pointers to 'attribute' 37 | //   structs; ours take nullptr_t. That is because gemmlowp always passes 38 | //   nullptr at the moment, so any support we would code for non-null 39 | //   attribs would be unused. 40 | #include <condition_variable> 41 | #include <cstddef> 42 | #include <mutex> 43 | #include <thread> 44 | namespace gemmlowp { 45 | using pthread_t = std::thread *; 46 | using pthread_mutex_t = std::mutex *; 47 | using pthread_cond_t = std::condition_variable *; 48 | inline void pthread_create(pthread_t *thread, std::nullptr_t, 49 | void *(*start_routine)(void *), void *arg) { 50 | *thread = new std::thread(start_routine, arg); 51 | } 52 | inline void pthread_join(pthread_t thread, std::nullptr_t) { thread->join(); } 53 | inline void pthread_mutex_init(pthread_mutex_t *mutex, std::nullptr_t) { 54 | *mutex = new std::mutex; 55 | } 56 | inline void pthread_mutex_lock(pthread_mutex_t *mutex) { (*mutex)->lock(); } 57 | inline void pthread_mutex_unlock(pthread_mutex_t *mutex) { (*mutex)->unlock(); } 58 | inline void pthread_mutex_destroy(pthread_mutex_t *mutex) { delete *mutex; } 59 | inline void pthread_cond_init(pthread_cond_t *cond, std::nullptr_t) { 60 | *cond = new std::condition_variable; 61 | } 62 | inline void pthread_cond_signal(pthread_cond_t *cond) { (*cond)->notify_one(); } 63 | inline void pthread_cond_broadcast(pthread_cond_t *cond) { 64 | (*cond)->notify_all(); 65 | } 66 | inline void pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex) { 67 | std::unique_lock<std::mutex> lock(**mutex, std::adopt_lock); 68 | (*cond)->wait(lock); 69 | // detach lock from mutex so when we leave this context 70 | // the lock is not released 71 | lock.release(); 72 | } 73 | inline void pthread_cond_destroy(pthread_cond_t *cond) { delete *cond; } 74 | } // end namespace gemmlowp 75 | #endif 76 | 77 | #endif // GEMMLOWP_PROFILING_PTHREAD_EVERYWHERE_H_ 78 | -------------------------------------------------------------------------------- /public/bit_depth.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License.
14 | 15 | // bit_depth.h: defines the settings controlling LHS/RHS bit depth 16 | 17 | #ifndef GEMMLOWP_PUBLIC_BIT_DEPTH_H_ 18 | #define GEMMLOWP_PUBLIC_BIT_DEPTH_H_ 19 | 20 | namespace gemmlowp { 21 | 22 | // The range of allowed values for an operand. 23 | template <int tMinValue, int tMaxValue> 24 | struct OperandRange { 25 | static constexpr int kMinValue = tMinValue; 26 | static constexpr int kMaxValue = tMaxValue; 27 | static_assert(kMinValue < kMaxValue, ""); 28 | }; 29 | 30 | using Uint8Range = OperandRange<0, 255>; 31 | using Uint8RangeExcludingZero = OperandRange<1, 255>; 32 | 33 | using Int8Range = OperandRange<-128, 127>; 34 | using Int8RangeExcludingLow = OperandRange<-127, 127>; 35 | 36 | template <typename tLhsRange, typename tRhsRange> 37 | struct BitDepthParams { 38 | using LhsRange = tLhsRange; 39 | using RhsRange = tRhsRange; 40 | }; 41 | 42 | // Default: LHS and RHS are 8bit. 43 | using DefaultL8R8BitDepthParams = BitDepthParams<Uint8Range, Uint8Range>; 44 | 45 | // Variant: LHS may not take the value 0. This allows using 46 | // faster kernels using signed arithmetic, see 47 | // NEON_64bit_GEMM_Int8Operands_Int32Accumulators_AccumTwoWithin16Bits 48 | using L8R8WithLhsNonzeroBitDepthParams = 49 | BitDepthParams<Uint8RangeExcludingZero, Uint8Range>; 50 | 51 | // Signed Variant: This allows using faster kernels using signed arithmetic, see 52 | // NEON_64bit_GEMM_Int8Operands_Int32Accumulators_AccumTwoWithin16Bits 53 | using SignedL8R8WithLhsNonzeroBitDepthParams = 54 | BitDepthParams<Int8RangeExcludingLow, Int8Range>; 55 | 56 | // Deprecated: when gemmlowp used to allow requantizing 8bit 57 | // inputs to less-than-8-bit depths, the public setting allowing 58 | // that was DefaultL7R5BitDepthParams. That requantization 59 | // feature has been removed, but as the whole point of that 60 | // requantization was to make less-than-8-bit an internal 61 | // optimization without any impact on the API (other than lowering 62 | // accuracy), we can temporarily support users who were using it 63 | // by mapping it to the default 8bit behavior. 64 | using DefaultL7R5BitDepthParams = DefaultL8R8BitDepthParams; 65 | 66 | } // namespace gemmlowp 67 | 68 | #endif // GEMMLOWP_PUBLIC_BIT_DEPTH_H_ 69 | -------------------------------------------------------------------------------- /public/gemmlowp.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // gemmlowp.h: the main public interface header of gemmlowp. 16 | 17 | #ifndef GEMMLOWP_PUBLIC_GEMMLOWP_H_ 18 | #define GEMMLOWP_PUBLIC_GEMMLOWP_H_ 19 | #include "../internal/dispatch_gemm_shape.h" 20 | #include "bit_depth.h" 21 | #include "map.h" 22 | #include "output_stages.h" 23 | 24 | namespace gemmlowp { 25 | 26 | class GemmContext : public MultiThreadGemmContext {}; 27 | 28 | // Computes a general matrix product ("GEMM"). 29 | // This is a version that supports per channel quantization.
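// "Per channel" means that lhs_offset and rhs_offset are vector-like objects
// (anything offering operator()(int) and block(), such as VectorMap or
// VectorDup from public/map.h), so each row of the LHS and each column of the
// RHS can carry its own zero-point offset; the non-PC variant below wraps its
// scalar offsets in VectorDup before dispatching.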
30 | template <typename InputScalar, typename OutputScalar, typename BitDepthParams, 31 | MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, 32 | typename LhsOffset, typename RhsOffset, typename OutputPipelineType, 33 | typename GemmContextType> 34 | void GemmWithOutputPipelinePC(GemmContextType* context, 35 | const MatrixMap<const InputScalar, LhsOrder>& lhs, 36 | const MatrixMap<const InputScalar, RhsOrder>& rhs, 37 | MatrixMap<OutputScalar, ResultOrder>* result, 38 | const LhsOffset& lhs_offset, 39 | const RhsOffset& rhs_offset, 40 | const OutputPipelineType& output_pipeline) { 41 | DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>( 42 | context, lhs, rhs, result, lhs_offset, rhs_offset, output_pipeline); 43 | } 44 | 45 | // Computes a general matrix product ("GEMM"). 46 | // This is the legacy version that does not support per channel quantization. 47 | // The meaning of the offsets, result_mult_int and result_shift 48 | // parameters is the same as in the standard EightBitIntGemm interface 49 | // (which is also implemented in the eight_bit_int_gemm directory). 50 | template <typename InputScalar, typename OutputScalar, typename BitDepthParams, 51 | MapOrder LhsOrder, MapOrder RhsOrder, MapOrder ResultOrder, 52 | typename OutputPipelineType, typename GemmContextType> 53 | void GemmWithOutputPipeline(GemmContextType* context, 54 | const MatrixMap<const InputScalar, LhsOrder>& lhs, 55 | const MatrixMap<const InputScalar, RhsOrder>& rhs, 56 | MatrixMap<OutputScalar, ResultOrder>* result, 57 | int lhs_offset, int rhs_offset, 58 | const OutputPipelineType& output_pipeline) { 59 | typedef VectorDup<const std::int32_t, VectorShape::Col> OffsetColDup; 60 | typedef VectorDup<const std::int32_t, VectorShape::Row> OffsetRowDup; 61 | const OffsetColDup lhs_offset_vector(lhs_offset, lhs.rows()); 62 | const OffsetRowDup rhs_offset_vector(rhs_offset, rhs.cols()); 63 | DispatchGemmShape<InputScalar, OutputScalar, BitDepthParams>( 64 | context, lhs, rhs, result, lhs_offset_vector, rhs_offset_vector, 65 | output_pipeline); 66 | } 67 | 68 | // Computes a general matrix product ("GEMM"). 69 | // The meaning of the offsets, result_mult_int and result_shift 70 | // parameters is the same as in the standard EightBitIntGemm interface 71 | // (which is also implemented in the eight_bit_int_gemm directory). 72 | template <typename Scalar, typename BitDepthParams, MapOrder LhsOrder, 73 | MapOrder RhsOrder, MapOrder ResultOrder, typename GemmContextType> 74 | void Gemm(GemmContextType* context, 75 | const MatrixMap<const Scalar, LhsOrder>& lhs, 76 | const MatrixMap<const Scalar, RhsOrder>& rhs, 77 | MatrixMap<Scalar, ResultOrder>* result, int lhs_offset, 78 | int rhs_offset, int result_offset, int result_mult_int, 79 | int result_shift) { 80 | GemmWithOutputPipeline<Scalar, Scalar, BitDepthParams>( 81 | context, lhs, rhs, result, lhs_offset, rhs_offset, 82 | MakeStandardOutputPipeline(result_offset, result_mult_int, result_shift)); 83 | } 84 | 85 | } // namespace gemmlowp 86 | 87 | #endif // GEMMLOWP_PUBLIC_GEMMLOWP_H_ 88 | -------------------------------------------------------------------------------- /public/map.h: -------------------------------------------------------------------------------- 1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 2 | // 3 | // Licensed under the Apache License, Version 2.0 (the "License"); 4 | // you may not use this file except in compliance with the License. 5 | // You may obtain a copy of the License at 6 | // 7 | // http://www.apache.org/licenses/LICENSE-2.0 8 | // 9 | // Unless required by applicable law or agreed to in writing, software 10 | // distributed under the License is distributed on an "AS IS" BASIS, 11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | // See the License for the specific language governing permissions and 13 | // limitations under the License. 14 | 15 | // map.h: a minimalist view-existing-buffer-as-a-matrix class, 16 | // which is how gemmlowp interfaces with external matrix data. 17 | 18 | #ifndef GEMMLOWP_PUBLIC_MAP_H_ 19 | #define GEMMLOWP_PUBLIC_MAP_H_ 20 | 21 | #include "../internal/common.h" 22 | 23 | namespace gemmlowp { 24 | 25 | // The two storage orders allowed to map buffers as matrices: ColMajor 26 | // means column-major, RowMajor means row-major. 27 | enum class MapOrder { ColMajor, RowMajor }; 28 | 29 | // A MatrixMap is a view of an existing buffer as a matrix. It does not own 30 | // the buffer.
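//
// A minimal usage sketch (buffer and sizes are hypothetical):
//
//   std::uint8_t storage[8 * 8];
//   MatrixMap<std::uint8_t, MapOrder::RowMajor> m(storage, 8, 8);
//   m(2, 3) = 42;                       // element access through the view
//   MatrixMap<std::uint8_t, MapOrder::RowMajor> sub =
//       m.block(4, 4, 4, 4);            // sub-view sharing the same storage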
31 | template <typename tScalar, MapOrder tOrder> 32 | class MatrixMap { 33 | public: 34 | typedef tScalar Scalar; 35 | static constexpr MapOrder kOrder = tOrder; 36 | 37 | protected: 38 | Scalar* data_;  // not owned. 39 | int rows_, cols_, stride_; 40 | 41 | public: 42 | MatrixMap() : data_(nullptr), rows_(0), cols_(0), stride_(0) {} 43 | MatrixMap(Scalar* data, int rows, int cols) 44 | : data_(data), 45 | rows_(rows), 46 | cols_(cols), 47 | stride_(kOrder == MapOrder::ColMajor ? rows : cols) {} 48 | MatrixMap(Scalar* data, int rows, int cols, int stride) 49 | : data_(data), rows_(rows), cols_(cols), stride_(stride) {} 50 | MatrixMap(const MatrixMap& other) 51 | : data_(other.data_), 52 | rows_(other.rows_), 53 | cols_(other.cols_), 54 | stride_(other.stride_) {} 55 | 56 | int rows() const { return rows_; } 57 | int cols() const { return cols_; } 58 | int stride() const { return stride_; } 59 | int rows_stride() const { return kOrder == MapOrder::ColMajor ? 1 : stride_; } 60 | int cols_stride() const { return kOrder == MapOrder::RowMajor ? 1 : stride_; } 61 | Scalar* data() const { return data_; } 62 | Scalar* data(int row, int col) const { 63 | return data_ + row * rows_stride() + col * cols_stride(); 64 | } 65 | Scalar& operator()(int row, int col) const { return *data(row, col); } 66 | 67 | MatrixMap block(int start_row, int start_col, int block_rows, 68 | int block_cols) const { 69 | assert(start_row >= 0); 70 | assert(start_row + block_rows <= rows_); 71 | assert(start_col >= 0); 72 | assert(start_col + block_cols <= cols_); 73 | 74 | return MatrixMap(data(start_row, start_col), block_rows, block_cols, 75 | stride_); 76 | } 77 | }; 78 | 79 | enum class VectorShape { Col, Row }; 80 | 81 | // A VectorMap is a view of an existing buffer as a vector. It does not own 82 | // the buffer. 83 | template <typename tScalar, VectorShape tShape> 84 | class VectorMap { 85 | public: 86 | typedef tScalar Scalar; 87 | static constexpr VectorShape kShape = tShape; 88 | 89 | protected: 90 | Scalar* data_;  // not owned. 91 | int size_; 92 | 93 | public: 94 | VectorMap() : data_(nullptr), size_(0) {} 95 | VectorMap(Scalar* data, int size) : data_(data), size_(size) {} 96 | VectorMap(const VectorMap& other) = default; 97 | VectorMap& operator=(const VectorMap& other) = default; 98 | 99 | int size() const { return size_; } 100 | Scalar* data() const { return data_; } 101 | Scalar* data(int index) const { return data_ + index; } 102 | Scalar& operator()(int index) const { return *data(index); } 103 | 104 | VectorMap block(int start, int len) const { 105 | assert(start >= 0); 106 | assert(start + len <= size_); 107 | 108 | return VectorMap(data(start), len); 109 | } 110 | }; 111 | 112 | // A VectorDup is a (duplicated value) vector where all components are the same.
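//
// It stores one scalar plus a length, so e.g.
// VectorDup<const std::int32_t, VectorShape::Col> v(-128, 100) behaves as a
// 100-entry column whose every component reads as -128. GemmWithOutputPipeline
// above uses exactly this to route scalar offsets through the vector-shaped
// per-channel interface.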
113 | template <typename tScalar, VectorShape tShape> 114 | class VectorDup { 115 | public: 116 | typedef tScalar Scalar; 117 | static constexpr VectorShape kShape = tShape; 118 | 119 | protected: 120 | Scalar data_; 121 | int size_; 122 | 123 | public: 124 | VectorDup() : data_(0), size_(0) {} 125 | VectorDup(Scalar data, int size) : data_(data), size_(size) {} 126 | VectorDup(const VectorDup& other) : data_(other.data_), size_(other.size_) {} 127 | 128 | int size() const { return size_; } 129 | Scalar& operator()(int) const { return data_; } 130 | 131 | VectorDup block(int start, int len) const { 132 | assert(start >= 0); 133 | assert(start + len <= size_); 134 | 135 | (void)start; 136 | return VectorDup(data_, len); 137 | } 138 | }; 139 | 140 | } // namespace gemmlowp 141 | 142 | #endif // GEMMLOWP_PUBLIC_MAP_H_ 143 | -------------------------------------------------------------------------------- /scripts/ci-before.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ $TEST == "arm" ]; then 3 | curl -L https://dl.google.com/android/repository/android-ndk-${NDK_VERSION}-linux-x86_64.zip -O 4 | unzip android-ndk-${NDK_VERSION}-linux-x86_64.zip 2> /dev/null > /dev/null 5 | echo no | android create avd --force -n test -t android-22 --abi armeabi-v7a 6 | emulator -avd test -no-audio -no-window & 7 | fi 8 | -------------------------------------------------------------------------------- /scripts/ci-test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ $TEST == "arm" ]; then 3 | ./android-ndk-${NDK_VERSION}/ndk-build 4 | android-wait-for-emulator 5 | # adb shell input keyevent 82 & 6 | adb push ./libs/* /data/local/tmp 7 | adb shell /data/local/tmp/benchmark 8 | adb shell /data/local/tmp/correctness_meta_gemm 9 | # too slow 10 | # adb shell /data/local/tmp/benchmark_meta_gemm 11 | fi 12 | if [ $TEST == "x86" ]; then 13 | make -f Makefile.travis unittest 14 | fi 15 | -------------------------------------------------------------------------------- /scripts/test-android.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # Copyright 2015 The Gemmlowp Authors. All Rights Reserved. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | if [ -z "$CXX" ] 17 | then 18 | echo "please set the CXX environment variable to point to your native Android toolchain C++ compiler" 19 | exit 1 20 | fi 21 | 22 | default_cflags="-O3" 23 | 24 | if [ "$#" -eq 0 ] 25 | then 26 | echo "Usage: $0 files... [cflags...]" 27 | echo "All command-line parameters are passed along to the C++ compiler, so they can \ 28 | be either source files, or compiler flags." 29 | echo "Default cflags: $default_cflags" 30 | echo "Relies on the CXX environment variable to point to an Android C++ toolchain compiler."
31 | exit 1 32 | fi 33 | 34 | EXE=gemmlowp-android-binary 35 | 36 | if [[ $CXX =~ .*aarch64.* ]] 37 | then 38 | NEON_FLAGS= 39 | else 40 | NEON_FLAGS="-mfpu=neon -mfloat-abi=softfp" 41 | fi 42 | 43 | $CXX \ 44 | --std=c++11 \ 45 | -Wall -Wextra -pedantic \ 46 | -fPIE -pie $NEON_FLAGS \ 47 | -lstdc++ -latomic \ 48 | -I . -I .. \ 49 | -o $EXE \ 50 | -Wno-unused-variable -Wno-unused-parameter \ 51 | $default_cflags \ 52 | $* 53 | 54 | if [ $? != 0 ]; then 55 | echo "build failed" 56 | exit 1 57 | fi 58 | 59 | adb root 60 | 61 | if [ $? != 0 ]; then 62 | echo "$0: adb root failed" 63 | exit 1 64 | fi 65 | 66 | adb shell mkdir -p /data/local/tmp 67 | 68 | if [ $? != 0 ]; then 69 | echo "$0: adb shell failed to mkdir /data/local/tmp" 70 | exit 1 71 | fi 72 | 73 | adb push $EXE /data/local/tmp 74 | 75 | if [ $? != 0 ]; then 76 | echo "$0: adb push failed to write to /data/local/tmp" 77 | exit 1 78 | fi 79 | 80 | echo adb shell "/data/local/tmp/$EXE $TESTARGS" 81 | 82 | adb shell "/data/local/tmp/$EXE $TESTARGS" | tee "log-$EXE" 83 | 84 | if [ $? != 0 ]; then 85 | echo "$0: adb shell failed to run binary on device" 86 | exit 1 87 | fi 88 | -------------------------------------------------------------------------------- /standalone/encode.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 The gemmlowp Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Encodes ARM asm code for certain instructions into the corresponding machine code encoding, as a .word directive in the asm code, preserving the original code in a comment. 16 | 17 | Reads from stdin, writes to stdout. 18 | 19 | Example diff: 20 | - "udot v16.4s, v4.16b, v0.16b\n" 21 | + ".word 0x6e809490 // udot v16.4s, v4.16b, v0.16b\n" 22 | 23 | The intended use case is to make asm code easier to compile on toolchains that 24 | do not support certain new instructions. 
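Typical invocations (file names are hypothetical):

  python encode.py < kernel.S > kernel_encoded.S
  python encode.py --fix < kernel_encoded.S > kernel_fixed.S

Without --fix, an existing .word encoding that disagrees with the asm
instruction in its trailing comment is reported as an error; with --fix it is
rewritten in place.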
25 | """ 26 | 27 | import sys 28 | import re 29 | import argparse 30 | 31 | 32 | def encode_udot_sdot_vector(line): 33 | m = re.search( 34 | r'\b([us])dot[ ]+v([0-9]+)[ ]*\.[ ]*4s[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*16b[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*16b', 35 | line) 36 | if not m: 37 | return 0, line 38 | 39 | match = m.group(0) 40 | unsigned = 1 if m.group(1) == 'u' else 0 41 | accum = int(m.group(2)) 42 | lhs = int(m.group(3)) 43 | rhs = int(m.group(4)) 44 | assert accum >= 0 and accum <= 31 45 | assert lhs >= 0 and lhs <= 31 46 | assert rhs >= 0 and rhs <= 31 47 | mcode = 0x4e809400 | (accum << 0) | (lhs << 5) | (rhs << 16) | ( 48 | unsigned << 29) 49 | return mcode, match 50 | 51 | 52 | def encode_udot_sdot_element(line): 53 | m = re.search( 54 | r'\b([us])dot[ ]+v([0-9]+)[ ]*\.[ ]*4s[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*16b[ ]*\,[ ]*v([0-9]+)[ ]*\.[ ]*4b[ ]*\[([0-9])\]', 55 | line) 56 | if not m: 57 | return 0, line 58 | 59 | match = m.group(0) 60 | unsigned = 1 if m.group(1) == 'u' else 0 61 | accum = int(m.group(2)) 62 | lhs = int(m.group(3)) 63 | rhs = int(m.group(4)) 64 | lanegroup = int(m.group(5)) 65 | assert accum >= 0 and accum <= 31 66 | assert lhs >= 0 and lhs <= 31 67 | assert rhs >= 0 and rhs <= 31 68 | assert lanegroup >= 0 and lanegroup <= 3 69 | l = 1 if lanegroup & 1 else 0 70 | h = 1 if lanegroup & 2 else 0 71 | mcode = 0x4f80e000 | (accum << 0) | (lhs << 5) | (rhs << 16) | (l << 21) | ( 72 | h << 11) | ( 73 | unsigned << 29) 74 | return mcode, match 75 | 76 | 77 | def encode(line): 78 | for encode_func in [encode_udot_sdot_vector, encode_udot_sdot_element]: 79 | mcode, match = encode_func(line) 80 | if mcode: 81 | return mcode, match 82 | return 0, line 83 | 84 | 85 | def read_existing_encoding(line): 86 | m = re.search(r'\.word\ (0x[0-9a-f]+)', line) 87 | if m: 88 | return int(m.group(1), 16) 89 | return 0 90 | 91 | 92 | parser = argparse.ArgumentParser(description='Encode some A64 instructions.') 93 | parser.add_argument( 94 | '-f', 95 | '--fix', 96 | help='fix existing wrong encodings in-place and continue', 97 | action='store_true') 98 | args = parser.parse_args() 99 | 100 | lineno = 0 101 | found_existing_encodings = False 102 | found_error = False 103 | found_fixes = False 104 | for line in sys.stdin: 105 | lineno = lineno + 1 106 | mcode, match = encode(line) 107 | if mcode: 108 | existing_encoding = read_existing_encoding(line) 109 | if existing_encoding: 110 | found_existing_encodings = True 111 | if mcode != existing_encoding: 112 | if args.fix: 113 | line = line.replace('.word 0x%x // %s' % (existing_encoding, match), 114 | '.word 0x%x // %s' % (mcode, match)) 115 | found_fixes = True 116 | else: 117 | sys.stderr.write( 118 | "Error at line %d: existing encoding 0x%x differs from encoding 0x%x for instruction '%s':\n\n%s\n\n" 119 | % (lineno, existing_encoding, mcode, match, line)) 120 | found_error = True 121 | else: 122 | line = line.replace(match, '.word 0x%x // %s' % (mcode, match)) 123 | sys.stdout.write(line) 124 | if found_error: 125 | sys.exit(1) 126 | if found_existing_encodings: 127 | if found_fixes: 128 | sys.stderr.write( 129 | 'Note: some instructions that this program is able to encode, were already encoded and their existing encodings didn\'t match the specified asm instructions. Since --fix was passed, these were fixed in-place.\n' 130 | ) 131 | else: 132 | sys.stderr.write( 133 | 'Note: some instructions that this program is able to encode, were already encoded. 
134 |     )
135 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/AppDelegate.h: --------------------------------------------------------------------------------
1 | //
2 | // AppDelegate.h
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import <UIKit/UIKit.h>
10 | 
11 | @interface AppDelegate : UIResponder <UIApplicationDelegate>
12 | 
13 | @property(strong, nonatomic) UIWindow *window;
14 | 
15 | @end
16 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/AppDelegate.mm: --------------------------------------------------------------------------------
1 | //
2 | // AppDelegate.m
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import "AppDelegate.h"
10 | 
11 | namespace gemmlowp {
12 | extern void benchmark_all();
13 | extern void test();
14 | }
15 | 
16 | @interface AppDelegate ()
17 | 
18 | @end
19 | 
20 | @implementation AppDelegate
21 | 
22 | 
23 | - (BOOL)application:(UIApplication *)application didFinishLaunchingWithOptions:(NSDictionary *)launchOptions {
24 |   // Override point for customization after application launch.
25 | 
26 |   gemmlowp::benchmark_all();
27 |   gemmlowp::test();
28 | 
29 |   return YES;
30 | }
31 | 
32 | - (void)applicationWillResignActive:(UIApplication *)application {
33 |   // Sent when the application is about to move from active to inactive state. This can occur for certain types of temporary interruptions (such as an incoming phone call or SMS message) or when the user quits the application and it begins the transition to the background state.
34 |   // Use this method to pause ongoing tasks, disable timers, and throttle down OpenGL ES frame rates. Games should use this method to pause the game.
35 | }
36 | 
37 | - (void)applicationDidEnterBackground:(UIApplication *)application {
38 |   // Use this method to release shared resources, save user data, invalidate timers, and store enough application state information to restore your application to its current state in case it is terminated later.
39 |   // If your application supports background execution, this method is called instead of applicationWillTerminate: when the user quits.
40 | }
41 | 
42 | - (void)applicationWillEnterForeground:(UIApplication *)application {
43 |   // Called as part of the transition from the background to the inactive state; here you can undo many of the changes made on entering the background.
44 | }
45 | 
46 | - (void)applicationDidBecomeActive:(UIApplication *)application {
47 |   // Restart any tasks that were paused (or not yet started) while the application was inactive. If the application was previously in the background, optionally refresh the user interface.
48 | }
49 | 
50 | - (void)applicationWillTerminate:(UIApplication *)application {
51 |   // Called when the application is about to terminate. Save data if appropriate. See also applicationDidEnterBackground:.
52 | }
53 | 
54 | @end
55 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Base.lproj/LaunchScreen.xib: --------------------------------------------------------------------------------
(Interface Builder XML document; its markup did not survive extraction and is not reproduced here.)
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Base.lproj/Main.storyboard: --------------------------------------------------------------------------------
(Interface Builder XML document; its markup did not survive extraction and is not reproduced here.)
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Images.xcassets/AppIcon.appiconset/Contents.json: --------------------------------------------------------------------------------
1 | {
2 |   "images" : [
3 |     {
4 |       "idiom" : "iphone",
5 |       "size" : "29x29",
6 |       "scale" : "2x"
7 |     },
8 |     {
9 |       "idiom" : "iphone",
10 |       "size" : "29x29",
11 |       "scale" : "3x"
12 |     },
13 |     {
14 |       "idiom" : "iphone",
15 |       "size" : "40x40",
16 |       "scale" : "2x"
17 |     },
18 |     {
19 |       "idiom" : "iphone",
20 |       "size" : "40x40",
21 |       "scale" : "3x"
22 |     },
23 |     {
24 |       "idiom" : "iphone",
25 |       "size" : "60x60",
26 |       "scale" : "2x"
27 |     },
28 |     {
29 |       "idiom" : "iphone",
30 |       "size" : "60x60",
31 |       "scale" : "3x"
32 |     },
33 |     {
34 |       "idiom" : "ipad",
35 |       "size" : "29x29",
36 |       "scale" : "1x"
37 |     },
38 |     {
39 |       "idiom" : "ipad",
40 |       "size" : "29x29",
41 |       "scale" : "2x"
42 |     },
43 |     {
44 |       "idiom" : "ipad",
45 |       "size" : "40x40",
46 |       "scale" : "1x"
47 |     },
48 |     {
49 |       "idiom" : "ipad",
50 |       "size" : "40x40",
51 |       "scale" : "2x"
52 |     },
53 |     {
54 |       "idiom" : "ipad",
55 |       "size" : "76x76",
56 |       "scale" : "1x"
57 |     },
58 |     {
59 |       "idiom" : "ipad",
60 |       "size" : "76x76",
61 |       "scale" : "2x"
62 |     }
63 |   ],
64 |   "info" : {
65 |     "version" : 1,
66 |     "author" : "xcode"
67 |   }
68 | }
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/Info.plist: --------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 |   <key>CFBundleDevelopmentRegion</key>
6 |   <string>en</string>
7 |   <key>CFBundleExecutable</key>
8 |   <string>$(EXECUTABLE_NAME)</string>
9 |   <key>CFBundleIdentifier</key>
10 |   <string>com.google.$(PRODUCT_NAME:rfc1034identifier)</string>
11 |   <key>CFBundleInfoDictionaryVersion</key>
12 |   <string>6.0</string>
13 |   <key>CFBundleName</key>
14 |   <string>$(PRODUCT_NAME)</string>
15 |   <key>CFBundlePackageType</key>
16 |   <string>APPL</string>
17 |   <key>CFBundleShortVersionString</key>
18 |   <string>1.0</string>
19 |   <key>CFBundleSignature</key>
20 |   <string>????</string>
21 |   <key>CFBundleVersion</key>
22 |   <string>1</string>
23 |   <key>LSRequiresIPhoneOS</key>
24 |   <true/>
25 |   <key>UILaunchStoryboardName</key>
26 |   <string>LaunchScreen</string>
27 |   <key>UIMainStoryboardFile</key>
28 |   <string>Main</string>
29 |   <key>UIRequiredDeviceCapabilities</key>
30 |   <array>
31 |     <string>armv7</string>
32 |   </array>
33 |   <key>UISupportedInterfaceOrientations</key>
34 |   <array>
35 |     <string>UIInterfaceOrientationPortrait</string>
36 |     <string>UIInterfaceOrientationLandscapeLeft</string>
37 |     <string>UIInterfaceOrientationLandscapeRight</string>
38 |   </array>
39 |   <key>UISupportedInterfaceOrientations~ipad</key>
40 |   <array>
41 |     <string>UIInterfaceOrientationPortrait</string>
42 |     <string>UIInterfaceOrientationPortraitUpsideDown</string>
43 |     <string>UIInterfaceOrientationLandscapeLeft</string>
44 |     <string>UIInterfaceOrientationLandscapeRight</string>
45 |   </array>
46 | </dict>
47 | </plist>
48 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/ViewController.h: --------------------------------------------------------------------------------
1 | //
2 | // ViewController.h
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import <UIKit/UIKit.h>
10 | 
11 | @interface ViewController : UIViewController
12 | 
13 | @end
14 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/ViewController.m: --------------------------------------------------------------------------------
1 | //
2 | // ViewController.m
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import "ViewController.h"
10 | 
11 | @interface ViewController ()
12 | 
13 | @end
14 | 
15 | @implementation ViewController
16 | 
17 | - (void)viewDidLoad {
18 |   [super viewDidLoad];
19 |   // Do any additional setup after loading the view, typically from a nib.
20 | }
21 | 
22 | - (void)didReceiveMemoryWarning {
23 |   [super didReceiveMemoryWarning];
24 |   // Dispose of any resources that can be recreated.
25 | }
26 | 
27 | @end
28 | 
-------------------------------------------------------------------------------- /test/ios/gemmlowp_test/main.m: --------------------------------------------------------------------------------
1 | //
2 | // main.m
3 | // gemmlowp_test
4 | //
5 | // Created by petewarden on 9/28/15.
6 | // Copyright (c) 2015 petewarden. All rights reserved.
7 | //
8 | 
9 | #import <UIKit/UIKit.h>
10 | #import "AppDelegate.h"
11 | 
12 | int main(int argc, char * argv[]) {
13 |   @autoreleasepool {
14 |     return UIApplicationMain(argc, argv, nil, NSStringFromClass([AppDelegate class]));
15 |   }
16 | }
17 | 
-------------------------------------------------------------------------------- /test/test.h: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | // test.h: shared testing helpers.
16 | 
17 | #ifndef GEMMLOWP_TEST_TEST_H_
18 | #define GEMMLOWP_TEST_TEST_H_
19 | 
20 | #ifdef GEMMLOWP_TEST_PROFILE
21 | #define GEMMLOWP_PROFILING
22 | #include "../profiling/profiler.h"
23 | #endif
24 | 
25 | #include <cstring>
26 | #include <limits>
27 | #include <random>
28 | #include <vector>
29 | 
30 | #include "../public/gemmlowp.h"
31 | 
32 | namespace gemmlowp {
33 | 
34 | #define GEMMLOWP_STRINGIFY2(x) #x
35 | #define GEMMLOWP_STRINGIFY(x) GEMMLOWP_STRINGIFY2(x)
36 | 
37 | #define Check(b)                                                         \
38 |   do {                                                                   \
39 |     ReleaseBuildAssertion(                                               \
40 |         b, "test failed at " __FILE__ ":" GEMMLOWP_STRINGIFY(__LINE__)); \
41 |   } while (false)
42 | 
43 | // gemmlowp itself doesn't have a Matrix class, only a MatrixMap class,
44 | // since it only maps existing data. In tests though, we need to
45 | // create our own matrices.
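// Example:
//   Matrix<std::uint8_t, MapOrder::ColMajor> m(3, 4);
//   MakeZero(&m);  // helper defined below
// m.map() and m.const_map() then yield the MatrixMap views that gemmlowp's
// public entry points consume.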
46 | template <typename tScalar, MapOrder tOrder>
47 | class Matrix : public MatrixMap<tScalar, tOrder> {
48 |  public:
49 |   typedef MatrixMap<tScalar, tOrder> Map;
50 |   typedef MatrixMap<const tScalar, tOrder> ConstMap;
51 |   typedef typename Map::Scalar Scalar;
52 |   static constexpr MapOrder Order = tOrder;
53 |   using Map::kOrder;
54 |   using Map::rows_;
55 |   using Map::cols_;
56 |   using Map::stride_;
57 |   using Map::data_;
58 | 
59 |  public:
60 |   Matrix() : Map(nullptr, 0, 0, 0) {}
61 | 
62 |   Matrix(int rows, int cols) : Map(nullptr, 0, 0, 0) { Resize(rows, cols); }
63 | 
64 |   Matrix(const Matrix& other) : Map(nullptr, 0, 0, 0) { *this = other; }
65 | 
66 |   Matrix& operator=(const Matrix& other) {
67 |     Resize(other.rows_, other.cols_);
68 |     std::memcpy(data_, other.data_, size() * sizeof(Scalar));
69 |     return *this;
70 |   }
71 | 
72 |   friend bool operator==(const Matrix& a, const Matrix& b) {
73 |     return a.rows_ == b.rows_ && a.cols_ == b.cols_ &&
74 |            !std::memcmp(a.data_, b.data_, a.size() * sizeof(Scalar));
75 |   }
76 | 
77 |   void Resize(int rows, int cols) {
78 |     rows_ = rows;
79 |     cols_ = cols;
80 |     stride_ = kOrder == MapOrder::ColMajor ? rows : cols;
81 |     storage.resize(size());
82 |     data_ = storage.data();
83 |   }
84 | 
85 |   int size() const { return rows_ * cols_; }
86 | 
87 |   Map& map() { return *static_cast<Map*>(this); }
88 | 
89 |   ConstMap const_map() const { return ConstMap(data_, rows_, cols_, stride_); }
90 | 
91 |  protected:
92 |   std::vector<Scalar> storage;
93 | };
94 | 
95 | inline std::mt19937& RandomEngine() {
96 |   static std::mt19937 engine;
97 |   return engine;
98 | }
99 | 
100 | inline int Random() {
101 |   std::uniform_int_distribution<int> dist(0, std::numeric_limits<int>::max());
102 |   return dist(RandomEngine());
103 | }
104 | 
105 | #ifdef _MSC_VER
106 | // msvc does not support 8bit types in uniform_int_distribution<>.
107 | // Take 32 bit uniform_int_distribution<> and only use the lower 8 bits.
108 | template <typename OperandRange, typename MatrixType>
109 | void MakeRandom(MatrixType* m) {
110 |   ScopedProfilingLabel label("MakeRandom(matrix)");
111 |   for (int c = 0; c < m->cols(); c++) {
112 |     for (int r = 0; r < m->rows(); r++) {
113 |       (*m)(r, c) = Random() % OperandRange::kMaxValue;
114 |     }
115 |   }
116 | }
117 | #else
118 | template <typename OperandRange, typename MatrixType>
119 | void MakeRandom(MatrixType* m) {
120 |   ScopedProfilingLabel label("MakeRandom(matrix)");
121 |   typedef typename MatrixType::Scalar Scalar;
122 |   std::uniform_int_distribution<Scalar> dist(OperandRange::kMinValue,
123 |                                              OperandRange::kMaxValue);
124 |   for (int c = 0; c < m->cols(); c++) {
125 |     for (int r = 0; r < m->rows(); r++) {
126 |       (*m)(r, c) = dist(RandomEngine());
127 |     }
128 |   }
129 | }
130 | #endif
131 | 
132 | template <typename MatrixType>
133 | void MakeConstant(MatrixType* m, typename MatrixType::Scalar val) {
134 |   ScopedProfilingLabel label("MakeConstant(matrix)");
135 |   for (int c = 0; c < m->cols(); c++) {
136 |     for (int r = 0; r < m->rows(); r++) {
137 |       (*m)(r, c) = val;
138 |     }
139 |   }
140 | }
141 | 
142 | template <typename MatrixType>
143 | void MakeZero(MatrixType* m) {
144 |   ScopedProfilingLabel label("MakeZero(matrix)");
145 |   MakeConstant(m, 0);
146 | }
147 | 
148 | }  // namespace gemmlowp
149 | 
150 | #endif  // GEMMLOWP_TEST_TEST_H_
151 | 
-------------------------------------------------------------------------------- /test/test_allocator.cc: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "test.h"
16 | #include "../internal/allocator.h"
17 | 
18 | namespace gemmlowp {
19 | 
20 | void test_allocator(Allocator* a, int max_array_size) {
21 |   const std::size_t int32_array_size = Random() % max_array_size;
22 |   auto handle_to_int32_array = a->Reserve<std::int32_t>(int32_array_size);
23 |   const std::size_t int8_array_size = Random() % max_array_size;
24 |   auto handle_to_int8_array = a->Reserve<std::int8_t>(int8_array_size);
25 |   a->Commit();
26 |   std::int32_t* int32_array =
27 |       a->GetPointer<std::int32_t>(handle_to_int32_array);
28 |   std::int8_t* int8_array = a->GetPointer<std::int8_t>(handle_to_int8_array);
29 |   Check(int32_array == a->GetPointer<std::int32_t>(handle_to_int32_array));
30 |   Check(int8_array == a->GetPointer<std::int8_t>(handle_to_int8_array));
31 |   Check(
32 |       !(reinterpret_cast<std::uintptr_t>(int32_array) % Allocator::kAlignment));
33 |   Check(
34 |       !(reinterpret_cast<std::uintptr_t>(int8_array) % Allocator::kAlignment));
35 |   Check(reinterpret_cast<std::uintptr_t>(int8_array) >=
36 |         reinterpret_cast<std::uintptr_t>(int32_array + int32_array_size));
37 |   memset(int32_array, 0, sizeof(*int32_array) * int32_array_size);
38 |   memset(int8_array, 0, sizeof(*int8_array) * int8_array_size);
39 |   a->Decommit();
40 | }
41 | 
42 | void test_allocator() {
43 |   Allocator allocator;
44 | 
45 |   // Test allocating increasingly large sizes on the same allocator,
46 |   // starting with size 0.
47 |   for (int i = 1; i < 1000; i += 10) {
48 |     test_allocator(&allocator, i);
49 |   }
50 | }
51 | 
52 | }  // namespace gemmlowp
53 | 
54 | int main() { gemmlowp::test_allocator(); }
55 | 
-------------------------------------------------------------------------------- /test/test_blocking_counter.cc: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include <atomic>  // NOLINT
16 | #include <cstdlib>
17 | #include <iostream>
18 | #include <vector>
19 | 
20 | #include "../internal/multi_thread_gemm.h"
21 | #include "../profiling/pthread_everywhere.h"
22 | #include "test.h"
23 | 
24 | namespace gemmlowp {
25 | 
26 | class Thread {
27 |  public:
28 |   Thread(BlockingCounter* blocking_counter, int number_of_times_to_decrement)
29 |       : blocking_counter_(blocking_counter),
30 |         number_of_times_to_decrement_(number_of_times_to_decrement),
31 |         made_the_last_decrement_(false),
32 |         finished_(false) {
33 | #if defined GEMMLOWP_USE_PTHREAD
34 |     // Limit the stack size so as not to deplete memory when creating
35 |     // many threads.
36 |     pthread_attr_t attr;
37 |     int err = pthread_attr_init(&attr);
38 |     if (!err) {
39 |       size_t stack_size;
40 |       err = pthread_attr_getstacksize(&attr, &stack_size);
41 |       if (!err && stack_size > max_stack_size_) {
42 |         err = pthread_attr_setstacksize(&attr, max_stack_size_);
43 |       }
44 |       if (!err) {
45 |         err = pthread_create(&thread_, &attr, ThreadFunc, this);
46 |       }
47 |     }
48 |     if (err) {
49 |       std::cerr << "Failed to create a thread.\n";
50 |       std::abort();
51 |     }
52 | #else
53 |     pthread_create(&thread_, nullptr, ThreadFunc, this);
54 | #endif
55 |   }
56 | 
57 |   ~Thread() { Join(); }
58 | 
59 |   bool Join() {
60 |     while (!finished_.load()) {
61 |     }
62 |     return made_the_last_decrement_;
63 |   }
64 | 
65 |  private:
66 |   Thread(const Thread& other) = delete;
67 | 
68 |   void ThreadFunc() {
69 |     for (int i = 0; i < number_of_times_to_decrement_; i++) {
70 |       Check(!made_the_last_decrement_);
71 |       made_the_last_decrement_ = blocking_counter_->DecrementCount();
72 |     }
73 |     finished_.store(true);
74 |   }
75 | 
76 |   static void* ThreadFunc(void* ptr) {
77 |     static_cast<Thread*>(ptr)->ThreadFunc();
78 |     return nullptr;
79 |   }
80 | 
81 |   static constexpr size_t max_stack_size_ = 256 * 1024;
82 |   BlockingCounter* const blocking_counter_;
83 |   const int number_of_times_to_decrement_;
84 |   pthread_t thread_;
85 |   bool made_the_last_decrement_;
86 |   // finished_ is used to manually implement Join() by busy-waiting.
87 |   // I wanted to use pthread_join / std::thread::join, but the behavior
88 |   // observed on Android was that pthread_join aborts when the thread has
89 |   // already joined before calling pthread_join, making that hard to use.
90 |   // It appeared simplest to just implement this simple spinlock, and that
91 |   // is good enough as this is just a test.
92 |   std::atomic<bool> finished_;
93 | };
94 | 
95 | void test_blocking_counter(BlockingCounter* blocking_counter, int num_threads,
96 |                            int num_decrements_per_thread,
97 |                            int num_decrements_to_wait_for) {
98 |   std::vector<Thread*> threads;
99 |   blocking_counter->Reset(num_decrements_to_wait_for);
100 |   for (int i = 0; i < num_threads; i++) {
101 |     threads.push_back(new Thread(blocking_counter, num_decrements_per_thread));
102 |   }
103 |   blocking_counter->Wait();
104 | 
105 |   int num_threads_that_made_the_last_decrement = 0;
106 |   for (int i = 0; i < num_threads; i++) {
107 |     if (threads[i]->Join()) {
108 |       num_threads_that_made_the_last_decrement++;
109 |     }
110 |     delete threads[i];
111 |   }
112 |   Check(num_threads_that_made_the_last_decrement == 1);
113 | }
114 | 
115 | void test_blocking_counter() {
116 |   BlockingCounter* blocking_counter = new BlockingCounter;
117 | 
118 |   // Repeating the entire test sequence ensures that we test
119 |   // non-monotonic changes.
120 |   for (int repeat = 1; repeat <= 2; repeat++) {
121 |     for (int num_threads = 1; num_threads <= 5; num_threads++) {
122 |       for (int num_decrements_per_thread = 1;
123 |            num_decrements_per_thread <= 4 * 1024;
124 |            num_decrements_per_thread *= 16) {
125 |         test_blocking_counter(blocking_counter, num_threads,
126 |                               num_decrements_per_thread,
127 |                               num_threads * num_decrements_per_thread);
128 |       }
129 |     }
130 |   }
131 |   delete blocking_counter;
132 | }
133 | 
134 | }  // end namespace gemmlowp
135 | 
136 | int main() { gemmlowp::test_blocking_counter(); }
137 | 
-------------------------------------------------------------------------------- /test/test_data.h: --------------------------------------------------------------------------------
1 | // Copyright 2016 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #ifndef GEMMLOWP_TEST_TEST_DATA_H_
16 | #define GEMMLOWP_TEST_TEST_DATA_H_
17 | 
18 | namespace test_data {
19 | 
20 | extern const bool is_a_transposed;
21 | extern const bool is_b_transposed;
22 | extern const bool is_c_transposed;
23 | extern const int m;
24 | extern const int n;
25 | extern const int k;
26 | extern const int a_offset;
27 | extern const int b_offset;
28 | extern const int c_shift;
29 | extern const int c_mult_int;
30 | extern const int c_offset;
31 | 
32 | extern const int a_count;
33 | extern const int b_count;
34 | extern const int c_count;
35 | 
36 | extern unsigned char a_data[];
37 | extern unsigned char b_data[];
38 | extern unsigned char expected_c_data[];
39 | 
40 | }  // namespace test_data
41 | 
42 | #endif  // GEMMLOWP_TEST_TEST_DATA_H_
43 | 
-------------------------------------------------------------------------------- /test/test_math_helpers.cc: --------------------------------------------------------------------------------
1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2 | //
3 | // Licensed under the Apache License, Version 2.0 (the "License");
4 | // you may not use this file except in compliance with the License.
5 | // You may obtain a copy of the License at
6 | //
7 | // http://www.apache.org/licenses/LICENSE-2.0
8 | //
9 | // Unless required by applicable law or agreed to in writing, software
10 | // distributed under the License is distributed on an "AS IS" BASIS,
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | // See the License for the specific language governing permissions and
13 | // limitations under the License.
14 | 
15 | #include "test.h"
16 | 
17 | #include <limits>
18 | 
19 | #include "../internal/common.h"
20 | 
21 | namespace gemmlowp {
22 | 
23 | // Our math helpers don't intend to be reliable all the way to the
24 | // limit of representable range, wrt overflow.
25 | // We don't care for 2G sized matrices.
26 | // This test stops at half of the representable range.
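// For example, with Integer = std::int8_t, the cutoff below is 127 / 2 = 63.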
27 | template <typename Integer>
28 | Integer ValueRangeCutoff() {
29 |   return std::numeric_limits<Integer>::max() / 2;
30 | }
31 | 
32 | int RandomNonnegativeFarAwayFromOverflow() { return Random() % (1 << 24); }
33 | 
34 | template <int Modulus>
35 | void test_round_up_down(int x) {
36 |   Check(x >= RoundDown<Modulus>(x));
37 |   Check(x < RoundDown<Modulus>(x) + Modulus);
38 |   Check(RoundDown<Modulus>(x) % Modulus == 0);
39 | 
40 |   Check(x <= RoundUp<Modulus>(x));
41 |   Check(x > RoundUp<Modulus>(x) - Modulus);
42 |   Check(RoundUp<Modulus>(x) % Modulus == 0);
43 | }
44 | 
45 | template <int Modulus>
46 | void test_round_up_down() {
47 |   for (int i = 0; i < 100; i++) {
48 |     test_round_up_down<Modulus>(i);
49 |     const int N = ValueRangeCutoff<int>();
50 |     test_round_up_down<Modulus>(Random() % N);
51 |   }
52 | }
53 | 
54 | template <typename Integer>
55 | void test_ceil_quotient(Integer x, Integer y) {
56 |   Check(CeilQuotient(x, y) * y >= x);
57 |   Check(CeilQuotient(x, y) * y < x + y);
58 | }
59 | 
60 | template <typename Integer>
61 | void test_ceil_quotient() {
62 |   const Integer N = ValueRangeCutoff<Integer>();
63 |   const Integer K = std::min(N, Integer(100));
64 |   for (Integer x = 0; x < K; x++) {
65 |     for (Integer y = 1; y < K; y++) {
66 |       test_ceil_quotient(x, y);
67 |       test_ceil_quotient(x, Integer(1 + (Random() % (N - 1))));
68 |       test_ceil_quotient(Integer(Random() % N), y);
69 |       test_ceil_quotient(Integer(Random() % N),
70 |                          Integer(1 + (Random() % (N - 1))));
71 |     }
72 |   }
73 | }
74 | 
75 | template <typename Integer>
76 | void test_round_up_to_next_power_of_two(Integer x) {
77 |   Check(RoundUpToPowerOfTwo(RoundUpToPowerOfTwo(x)) == RoundUpToPowerOfTwo(x));
78 |   Check(RoundUpToPowerOfTwo(x) >= x);
79 |   Check(x == 0 || RoundUpToPowerOfTwo(x) < 2 * x);
80 |   Check((RoundUpToPowerOfTwo(x) & (RoundUpToPowerOfTwo(x) - 1)) == 0);
81 | }
82 | 
83 | template <typename Integer>
84 | void test_round_up_to_next_power_of_two() {
85 |   const Integer N = ValueRangeCutoff<Integer>();
86 |   const Integer K = std::min(N, Integer(100));
87 |   for (Integer x = 0; x < K; x++) {
88 |     test_round_up_to_next_power_of_two(x);
89 |     test_round_up_to_next_power_of_two(Integer(Random() % N));
90 |   }
91 | }
92 | 
93 | void test_math_helpers() {
94 |   test_round_up_down<1>();
95 |   test_round_up_down<2>();
96 |   test_round_up_down<3>();
97 |   test_round_up_down<4>();
98 |   test_round_up_down<5>();
99 |   test_round_up_down<6>();
100 |   test_round_up_down<7>();
101 |   test_round_up_down<8>();
102 |   test_round_up_down<9>();
103 |   test_round_up_down<10>();
104 |   test_round_up_down<11>();
105 |   test_round_up_down<12>();
106 |   test_round_up_down<13>();
107 |   test_round_up_down<14>();
108 |   test_round_up_down<15>();
109 |   test_round_up_down<16>();
110 | 
111 |   test_round_up_down<50>();
112 |   test_round_up_down<51>();
113 | 
114 |   test_round_up_down<500>();
115 |   test_round_up_down<501>();
116 | 
117 |   test_ceil_quotient<std::int8_t>();
118 |   test_ceil_quotient<std::uint8_t>();
119 |   test_ceil_quotient<std::int16_t>();
120 |   test_ceil_quotient<std::uint16_t>();
121 |   test_ceil_quotient<std::int32_t>();
122 |   test_ceil_quotient<std::uint32_t>();
123 | 
124 |   test_round_up_to_next_power_of_two<std::int8_t>();
125 |   test_round_up_to_next_power_of_two<std::uint8_t>();
126 |   test_round_up_to_next_power_of_two<std::int16_t>();
127 |   test_round_up_to_next_power_of_two<std::uint16_t>();
128 |   test_round_up_to_next_power_of_two<std::int32_t>();
129 |   test_round_up_to_next_power_of_two<std::uint32_t>();
130 | }
131 | 
132 | }  // end namespace gemmlowp
133 | 
134 | int main() { gemmlowp::test_math_helpers(); }
135 | 
-------------------------------------------------------------------------------- /todo/armv8-64bit-kernel-for-less-than-8-bit.txt: --------------------------------------------------------------------------------
1 | TODO: Port the ARMv7 (32bit) less-than-8-bit GEMM kernel to ARMv8 (64bit)
2 | 
3 | Platforms: ARM NEON
4 | 
5 | Coding time: M
6 | Experimentation time: M
7 | Skill required: M
8 | 
9 | Prerequisite reading:
10 |   doc/kernel.md
11 |   doc/packing.md
12 | 
13 | Model to follow/adapt:
14 |   internal/kernel_neon.h
15 | 
16 | In internal/kernel_neon.h, for ARMv7 (32bit), we have a kernel
17 | specifically designed to take advantage of smaller operand ranges
18 | to use 16-bit local accumulators to achieve higher arithmetic throughput:
19 | 
20 |   NEON_32_Kernel12x4Depth2Assuming12BitProducts
21 | 
22 | This is the kernel used with BitDepthSetting::L7R5 and is what allows
23 | this bit depth setting to outperform L8R8.
24 | 
25 | This TODO item is about porting it to ARMv8 (64bit) assembly. It can
26 | be approached in two parts:
27 | 
28 | 1. Make a trivial port of the existing ARMv7 assembly code in
29 |    NEON_32_Kernel12x4Depth2Assuming12BitProducts to ARMv8 assembly.
30 | 
31 | 2. Consider ways to make use of the larger register space available on ARMv8:
32 |    there are 32 128-bit vector registers, instead of 16 on ARMv7.
33 |    A simple way, and quite possibly the best, would be to take the same
34 |    approach already implemented in NEON_64_Kernel12x8Depth2:
35 |    When porting a 12x4 kernel from ARMv7 to ARMv8, the extra register
36 |    space can be put to good use by doubling the RHS kernel width, from
37 |    4 to 8, thus changing the 12x4 kernel size to 12x8. Since cells
38 |    are of width 4, this means switching from 1 RHS cell to 2 RHS cells.
39 |    Since everything else remains unchanged, this should be a rather
40 |    simple change to implement. Compare NEON_64_Kernel12x8Depth2
41 |    to NEON_32_Kernel12x4Depth2.
42 | 
-------------------------------------------------------------------------------- /todo/error-diffusion-experiments.txt: --------------------------------------------------------------------------------
1 | TODO: Error diffusion experiments
2 | 
3 | Platforms: all
4 | 
5 | Coding time: M
6 | Experimentation time: XL
7 | Skill required: XL
8 | 
9 | Prerequisite reading:
10 |   doc/less-than-8-bit.md
11 | 
12 | 
13 | Overview
14 | ========
15 | 
16 | In internal/pack.h, the Requantize function takes care of requantizing
17 | input 8 bit values to less than 8 bit. This is currently done either by
18 | rounding-to-nearest, or by probabilistic rounding.
19 | 
20 | People have suggested trying error diffusion instead.
21 |   https://en.wikipedia.org/wiki/Error_diffusion
22 | This technique, originally from graphics, might be adaptable to GEMM; however,
23 | that is far from trivial.
24 | 
25 | Still, it may be worth experimenting with it, as the reward of higher accuracy
26 | could be very worthwhile, especially if it allows us to explore even smaller
27 | bit depths.
28 | 
29 | 
30 | Why getting error diffusion to work is nontrivial
31 | =================================================
32 | 
33 | In graphics, there is only one array to
34 | apply error diffusion to, and the criteria are mostly aesthetic. Here in GEMM,
35 | there are two arrays involved, allowing for unwanted interaction between the
36 | error diffusion terms added on either side separately; and we have stringent
37 | accuracy criteria.
38 | 
39 | Here is a toy example showing how naive approaches to error diffusion may
40 | suffer from unwanted interactions between the LHS and RHS separate error
41 | diffusion terms:
42 | 
43 | Say that we're working on 1-dimensional data (as opposed to 2-D matrices) to
44 | simplify the discussion.
45 | 
46 | Say that our input values are real numbers in [0, 1] and that we're quantizing
47 | them to either 0 or 1.
48 | 
49 | Say that the left-hand-side is filled with the constant value 0.9 and that our
50 | error-diffusion filter results in the following sequence of quantized values:
51 |   1 (repeated 9 times), 0, ... (repeat).
52 | 
53 | Say that the right-hand-side is filled with the constant value 0.1 and that our
54 | error-diffusion filter results in the following sequence of quantized values:
55 |   0 (repeated 9 times), 1, ... (repeat).
56 | 
57 | So if we compute the dot product (which is what we really do in a GEMM) of
58 | these quantized vectors, we're computing
59 |   1*0 + ... (repeated 9 times) + 0*1 + ... (repeat)
60 | 
61 | So we get exactly 0! This shows how a naive approach to error diffusion may
62 | suffer from bias issues similar to round-to-nearest.
63 | 
64 | 
65 | Some avenues to explore to make error diffusion work
66 | ====================================================
67 | 
68 | 1. Maybe some fixed error diffusion kernels just happen to avoid that issue?
69 | 
70 | 2. Maybe it's just a matter of doing error diffusion for a different vector
71 |    error metric, e.g. l^2 instead of l^1?
72 | 
73 | 3. Maybe some randomization (adding some random term to the error term being
74 |    diffused) would be acceptable? It seems like it would allow us to avoid the
75 |    interference problem discussed above.
76 | 
77 | 
78 | Performance considerations
79 | ==========================
80 | 
81 | Error diffusion is going to be relatively expensive compared to the current
82 | requantization methods. It may be acceptable for large enough GEMM depth,
83 | since it only needs to be applied once for the n^2 input matrix entries, thus
84 | becoming negligible compared to the n^3 arithmetic cost of GEMM for large
85 | enough n.
86 | 
87 | Alternatively, we may consider doing requantization of some matrices once and
88 | for all, but that would likely be the case only for one of LHS or RHS;
89 | otherwise one might as well precompute the whole GEMM.
90 | 
-------------------------------------------------------------------------------- /todo/less-than-8-bit-without-requantization.txt: --------------------------------------------------------------------------------
1 | TODO: Discard the old requantization stuff, keep less-than-8-bit kernels, expose
2 | them as a different contract whereby the user specifies that operands use less
3 | than 8 bits.
4 | 
5 | Read: doc/less-than-8-bit.md
6 | 
7 | This is about going from "the present" to "the future" as described there.
8 | 
9 | Discard all requantization stuff.
10 | 
11 | Probably no need to worry about compatibility; this was little used.
12 | 
13 | Instead, add a new option, in the form of new "bit depth params", whereby the user can specify that operands use less than 8 bits (even though they are represented as std::uint8_t). For example, specifying 6 bits would mean that the contract is that the user guarantees that LHS matrix entries are in the [0, 63] interval.
14 | 
15 | Then make use of that to select kernels that take advantage of the lower bit depth. The existing less-than-8-bit kernels would work as-is, only now no requantization would be needed anymore in the packing stage, and no rescaling in the unpacking stage.
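As a purely illustrative sketch of what such "bit depth params" could look
like (the struct and member names below are hypothetical, not an existing
gemmlowp API):

  // Hypothetical params: the user guarantees that LHS and RHS entries,
  // while still stored as std::uint8_t, fit in 6 bits, i.e. in [0, 63].
  struct Custom6BitDepthParams {
    static const int kLhsBits = 6;
    static const int kRhsBits = 6;
  };

Kernel selection would then key off these constants, e.g. via something like
GemmWithOutputPipeline<std::uint8_t, std::uint8_t, Custom6BitDepthParams>(...).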
16 | 
-------------------------------------------------------------------------------- /todo/multi-threading-experiments.txt: --------------------------------------------------------------------------------
1 | TODO: Multi-threading experiments for better performance on smaller GEMM sizes
2 | 
3 | Platforms: all, but special focus should be put on mobile OSes (Android...)
4 | where thread scheduling seems to be unfavorable to throughput.
5 | 
6 | Coding time: Unknown
7 | Experimentation time: XL
8 | Skill required: XL
9 | 
10 | Relevant file:
11 |   internal/multi_thread_gemm.h
12 | 
13 | 
14 | The problem, and what we have done about it so far
15 | ==================================================
16 | 
17 | It's easy to get a multi-threaded GEMM implementation to perform well
18 | for large enough GEMM sizes, because then the parallel workloads are large
19 | enough compared to the synchronization overhead. In gemmlowp however,
20 | we are specifically interested in "medium" GEMM sizes, of the order of 100,
21 | which are small enough to make synchronization overhead dominant in many
22 | situations.
23 | 
24 | We have already implemented some changes that were very effective at getting
25 | good multi-threading benefits for smaller GEMM sizes:
26 | https://github.com/google/gemmlowp/commit/210ac891d6d2d0749f7856103c928d9be70ded94
27 | Let us paste the commit message:
28 |   1. Use only N-1 worker threads while the master plays the role
29 |      of the Nth worker, where N is the number of cores.
30 |      This 1:1 mapping of threads to cores gives much better perf
31 |      esp. for not-very-large GEMMs and esp. on Android.
32 |   2. Implement waiting by actually busy-waiting for a little while
33 |      before eventually falling back to passive waiting. That
34 |      ensures that we wake up quickly from short naps, which helps
35 |      with not-very-large GEMMs esp. on Android.
36 | 
37 | These changes revolved around the idea that when the GEMM size is too small to
38 | be efficiently supported by the OS's threading primitives, we can instead
39 | present the OS with a very simple workload: exactly as many threads as there
40 | are CPU cores, and these threads being always busy, never waiting. This makes
41 | it easy for the OS to decide to bring all CPU cores online and give each of our
42 | threads its own CPU core, and occupy it nearly 100% of the time, thus avoiding
43 | having to wait to get scheduled.
44 | 
45 | The cost of waiting (or in particular, of locking) is not just the time it
46 | takes; especially on mobile platforms, it is also the side effects of getting
47 | our threads de-scheduled by the OS, of getting CPUs spun down, etc. With that
48 | in mind, anything that can help us avoid waiting/locking in an OS-visible way
49 | is worth experimenting with.
50 | 
51 | 
52 | Other things that would be worth experimenting with
53 | ===================================================
54 | 
55 | 
56 | Busy-waiting in mutex-locking too
57 | ---------------------------------
58 | 
59 | While we have replaced most of the pthread_cond_wait waiting with busy-waiting
60 | in WaitForVariableChange, on the other hand we are still calling
61 | pthread_mutex_lock in a couple of places outside of WaitForVariableChange.
62 | It might be interesting to avoid that too, by having a mutex-locking
63 | implementation that first spends some time busy-waiting before actually
64 | resorting to calling pthread_mutex_lock.
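For illustration, here is a minimal sketch of such a wrapper (not existing
gemmlowp code; the spin count is a made-up tuning parameter that would have to
be chosen empirically):

  // Spin on pthread_mutex_trylock for a while before falling back to a
  // regular, OS-visible blocking pthread_mutex_lock.
  inline void SpinThenLock(pthread_mutex_t* mutex) {
    const int kMaxSpins = 10 * 1000;  // hypothetical tuning parameter
    for (int i = 0; i < kMaxSpins; i++) {
      if (pthread_mutex_trylock(mutex) == 0) {
        return;  // acquired the lock without blocking
      }
    }
    pthread_mutex_lock(mutex);  // fall back to passive waiting
  }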
65 | 
66 | 
67 | Minimizing locking
68 | ------------------
69 | 
70 | The inherent synchronization points of the GEMM, which we essentially can't
71 | avoid, are already implemented using WaitForVariableChange, so they are
72 | already using busy-waiting over short periods of time, which is the
73 | best that we can do. On the other hand, we are also using mutex locking
74 | in a couple of places: around updates to the State of worker threads, and
75 | around updates to the counter value in BlockingCounter. The locking done
76 | there is unnecessary: it could be replaced by atomic operations, and in
77 | fact, because our thread structure is so simple and rigid, even atomic
78 | operations might not be needed at all, as long as we ensure basic
79 | memory ordering. A precise understanding of the CPU's memory model
80 | is needed here, and the outcome could depend on the CPU architecture.
81 | 
82 | 
83 | Restructuring the GEMM to remove synchronization points
84 | -------------------------------------------------------
85 | 
86 | Compared to the above ideas, this one is a much bigger departure from
87 | what we are currently doing.
88 | 
89 | The current structure of our multi-threaded GEMM is:
90 |   for_each(slice_of_RHS) {
91 |     pack(slice_of_RHS);
92 |     for_each(slice_of_LHS) {
93 |       do_gemm_on_some_thread(slice_of_LHS, packed_slice_of_RHS)
94 |     }
95 |     wait_for_all_threads();  // synchronization point
96 |   }
97 | 
98 | Thus we have a synchronization point at the end of each slice of RHS.
99 | 
100 | The motivation for this design is to have all threads work on a single
101 | large slice of RHS, occupying top-level (shared among cores) CPU cache.
102 | 
103 | Thus the current approach is optimized for cache-friendliness at the
104 | expense of parallelization. Maybe we should consider amending it
105 | to strike a better balance of cache-friendliness vs. parallelization.
106 | 
107 | For instance, we could have a "pipeline" where at a given time we have
108 | *two* slices of RHS packed into top-level CPU cache. We would normally
109 | schedule thread tasks to work with the first of these two RHS slices;
110 | whenever a thread task is done, we would immediately give the thread
111 | a new task, and if we are already done with the first RHS slice, we
112 | could then immediately start a task against the second RHS slice.
113 | 
114 | There could still be some necessary waiting, if one thread is lagging
115 | behind another by more than one full RHS slice; but that should be a
116 | lot better than the current situation, where we wait at the end of
117 | each slice.
-------------------------------------------------------------------------------- /todo/neon-depth-major-sources-packing.txt: --------------------------------------------------------------------------------
1 | TODO: Implement depth-major-sources packing paths for NEON
2 | 
3 | Platforms: ARM NEON
4 | 
5 | Coding time: M
6 | Experimentation time: M
7 | Skill required: M
8 | 
9 | Prerequisite reading:
10 |   doc/kernel.md
11 |   doc/packing.md
12 | 
13 | Model to follow/adapt:
14 |   internal/pack_neon.h
15 | 
16 | At the moment we have NEON optimized packing paths for WidthMajor sources.
17 | We also need paths for DepthMajor sources.
18 | 
19 | This is harder because for DepthMajor sources, the size of each slice that
20 | we have to load is the kernel's width, which is typically 12 (for the LHS)
21 | or 4 (for the RHS). That's not very friendly to NEON vector-load instructions
22 | which would allow us to load 8 or 16 entries, but not 4 or 12.
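For concreteness, here is a rough sketch of the lane-load approach discussed
just below (src0..src3 are hypothetical pointers into four depth-major source
slices; assumes #include <arm_neon.h>, and beware that vld1q_lane_u32 expects
4-byte-aligned pointers):

  // Gather four 4-byte groups (4 uint8 entries each) into one 128-bit
  // register, one lane at a time.
  uint32x4_t q = vdupq_n_u32(0);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src0), q, 0);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src1), q, 1);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src2), q, 2);
  q = vld1q_lane_u32(reinterpret_cast<const uint32_t*>(src3), q, 3);
  uint8x16_t bytes = vreinterpretq_u8_u32(q);  // back to 16 uint8 entries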
23 | 
24 | So, as the sketch above illustrates, you will have to load 4 entries at a
25 | time only. For that, the vld1q_lane_u32 intrinsic seems to be as good as
26 | you'll get. The other possible approach would be to load (with plain scalar
27 | C++) four uint32 values into a temporary local buffer, and use vld1q_u8 on
28 | that. Some experimentation will be useful here. For that, you can generate
29 | assembly with -save-temps and make assembly easier to inspect by inserting
30 | inline assembly comments such as
31 |   asm volatile("#hello");
32 | 
-------------------------------------------------------------------------------- /todo/remove-default-template-param-values.txt: --------------------------------------------------------------------------------
1 | TODO: Remove default template parameter values
2 | 
3 | Platforms: all
4 | 
5 | Coding time: S
6 | Experimentation time: S
7 | Skill required: S
8 | 
9 | We should generally not have default values for template parameters: this makes
10 | code harder to read, and is discouraged by the Google C++ style guide for good
11 | reason.
12 | 
13 | Specifically, I'm concerned about CellFormat having the CellOrder parameter
14 | defaulting to WidthMajor. This specific case has been causing confusion.
15 | 
16 | There might be other instances to fix. Part of this TODO item is to audit that.
17 | 
18 | One exception in which default template parameters are OK is for locally
19 | contained metaprogramming helpers that are a local implementation detail.
20 | However, I don't know if we have any such case in gemmlowp. We're generally
21 | conservative with template metaprogramming around here.
22 | 
-------------------------------------------------------------------------------- /todo/x86-kernels.txt: --------------------------------------------------------------------------------
1 | TODO: Implement a full set of kernels for x86
2 | 
3 | Platforms: x86, different variants: 32/64bit, SSE*/AVX* etc.
4 | 
5 | Coding time: XL
6 | Experimentation time: XL
7 | Skill required: XL
8 | 
9 | Prerequisite reading:
10 |   doc/kernel.md
11 | 
12 | Model to follow/adapt:
13 |   internal/kernel_neon.h
14 | 
15 | We need a full set of kernels for x86 architectures.
16 | By "a full set" we mean: covering all the variants of x86 instruction sets,
17 | and covering our different cases, by decreasing order of importance:
18 |   1. GEMM, BitDepthSetting::L8R8
19 |   2. GEMM, BitDepthSetting::L7R5
20 |   3. GEMV, BitDepthSetting::L8R8 (that one may be deprecated when we eventually
21 |      implement GEMV more efficiently as a fully specialized operation)
22 | 
23 | This generally has to be done separately for 32bit vs 64bit because an
24 | efficient GEMM kernel needs to use all the register space that it
25 | can get, and:
26 |   - That register space is generally different on 32bit vs 64bit;
27 |   - C++ compilers have a hard time doing good register allocation for
28 |     intrinsics-using code that's very tight on vector registers, so in
29 |     practice we generally prefer to implement kernels in (inline) assembly.
30 | 
31 | At the moment we have a couple of kernels targeting SSE4 for the
32 | (GEMM, BitDepthSetting::L8R8) case, contributed by Intel.
33 | We need to cover the other x86 instruction set variants and to cover the
34 | other cases, at least the L7R5 case.
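For orientation, whatever the instruction set, the computation such a kernel
must implement is just the reference accumulation (see
internal/kernel_reference.h), sketched here with hypothetical array names:

  // Reference semantics: int32 accumulators over uint8 operand products.
  for (int d = 0; d < depth; d++) {
    acc[r][c] += static_cast<std::int32_t>(lhs[r][d]) *
                 static_cast<std::int32_t>(rhs[d][c]);
  }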
35 | 
36 | We label this TODO item as XL because, unless one knows the CPU inside out
37 | (only CPU vendors do), it generally takes a lot of trial-and-error to arrive
38 | at an optimally performing solution, and in any case, given the number of
39 | different x86 instruction set flavors, the end result will be a large body
40 | of assembly code.
41 | 
--------------------------------------------------------------------------------