├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bench ├── conv1x1.cc ├── convolution-inference.cc ├── convolution.c ├── fully-connected.c ├── gemm.c ├── hxgemm.cc ├── median.c ├── memread.c ├── memread.py ├── perf_counter.c ├── perf_counter.h ├── pooling.c ├── relu.c ├── sgemm.cc ├── sxgemm.cc ├── transform.c ├── ugemm.c └── winograd.cc ├── benchmark.py ├── cmake ├── DownloadCpuinfo.cmake ├── DownloadEnum.cmake ├── DownloadFP16.cmake ├── DownloadFXdiv.cmake ├── DownloadGoogleTest.cmake ├── DownloadOpcodes.cmake ├── DownloadPSimd.cmake ├── DownloadPThreadPool.cmake ├── DownloadPeachPy.cmake └── DownloadSix.cmake ├── configure.py ├── confu.yaml ├── include ├── nnpack.h └── nnpack │ ├── AlignedAllocator.h │ ├── activations.h │ ├── arm_neon.h │ ├── assembly.h │ ├── blas.h │ ├── complex.h │ ├── fft-constants.h │ ├── fft.h │ ├── hwinfo.h │ ├── macros.h │ ├── pooling.h │ ├── reference.h │ ├── relu.h │ ├── softmax.h │ ├── system.h │ ├── transform.h │ ├── utils.h │ ├── validation.h │ └── winograd.h ├── logo └── NNPACK.png ├── src ├── convolution-inference.c ├── convolution-input-gradient.c ├── convolution-kernel-gradient.c ├── convolution-output.c ├── fully-connected-inference.c ├── fully-connected-output.c ├── init.c ├── neon │ ├── 2d-winograd-8x8-3x3-fp16.c │ ├── 2d-winograd-8x8-3x3.c │ ├── blas │ │ ├── c4gemm-conjb-transc.c │ │ ├── c4gemm-conjb.c │ │ ├── c4gemm.c │ │ ├── conv1x1.c │ │ ├── h4gemm-aarch32.S │ │ ├── h4gemm.c │ │ ├── s4c2gemm-conjb-transc.c │ │ ├── s4c2gemm-conjb.c │ │ ├── s4c2gemm.c │ │ ├── s4gemm-aarch32.S │ │ ├── s4gemm.c │ │ ├── sdotxf.c │ │ ├── sgemm-aarch32.S │ │ └── sgemm.c │ ├── relu.c │ ├── transpose.h │ ├── winograd-f6k3.c │ └── winograd │ │ └── f6x6k3x3.h ├── pooling-output.c ├── psimd │ ├── 2d-fourier-16x16.c │ ├── 2d-fourier-8x8.c │ ├── 2d-winograd-8x8-3x3.c │ ├── blas │ │ ├── c4gemm-conjb-transc.c │ │ ├── c4gemm-conjb.c │ │ ├── c4gemm.c │ │ ├── conv1x1.c │ │ ├── s4c2gemm-conjb-transc.c │ │ ├── s4c2gemm-conjb.c │ │ ├── s4c2gemm.c │ │ ├── s4gemm.c │ │ ├── sdotxf.c │ │ ├── sgemm.c │ │ └── shdotxf.c │ ├── butterfly.h │ ├── exp.c │ ├── exp.h │ ├── fft-aos.c │ ├── fft-dualreal.c │ ├── fft-real.c │ ├── fft-soa.c │ ├── fft │ │ ├── aos.h │ │ ├── dualreal.h │ │ ├── real.h │ │ └── soa.h │ ├── relu.c │ ├── softmax.c │ ├── transpose.h │ ├── winograd-f6k3.c │ └── winograd │ │ └── f6x6k3x3.h ├── ref │ ├── convolution-input-gradient.c │ ├── convolution-kernel.c │ ├── convolution-output.c │ ├── fft │ │ ├── aos.c │ │ ├── complex.h │ │ ├── forward-dualreal.c │ │ ├── forward-real.c │ │ ├── inverse-dualreal.c │ │ ├── inverse-real.c │ │ └── soa.c │ ├── fully-connected-output.c │ ├── max-pooling-output.c │ ├── relu-input-gradient.c │ ├── relu-output.c │ └── softmax-output.c ├── relu-input-gradient.c ├── relu-output.c ├── scalar │ ├── 2d-fourier-16x16.c │ ├── 2d-fourier-8x8.c │ ├── 2d-winograd-8x8-3x3.c │ ├── blas │ │ ├── cgemm-conjb-transc.c │ │ ├── cgemm-conjb.c │ │ ├── cgemm.c │ │ ├── conv1x1.c │ │ ├── s2gemm-transc.c │ │ ├── s2gemm.c │ │ ├── sdotxf.c │ │ ├── sgemm.c │ │ └── shdotxf.c │ ├── butterfly.h │ ├── fft-aos.c │ ├── fft-dualreal.c │ ├── fft-real.c │ ├── fft-soa.c │ ├── fft │ │ ├── aos.h │ │ ├── dualreal.h │ │ ├── real.h │ │ └── soa.h │ ├── relu.c │ ├── softmax.c │ ├── winograd-f6k3.c │ └── winograd │ │ └── f6x6k3x3.h ├── softmax-output.c └── x86_64-fma │ ├── 2d-fourier-16x16.py │ ├── 2d-fourier-8x8.py │ ├── 2d-winograd-8x8-3x3.py │ ├── __init__.py │ ├── blas │ ├── c8gemm.py │ ├── conv1x1.py │ ├── s4c6gemm.py │ ├── s8gemm.py │ ├── sdotxf.py │ ├── 
sgemm.py │ └── shdotxf.py │ ├── block8x8.py │ ├── common.py │ ├── exp.c │ ├── exp.py │ ├── fft-aos.py │ ├── fft-dualreal.py │ ├── fft-real.py │ ├── fft-soa.py │ ├── fft │ ├── __init__.py │ ├── complex_soa.py │ ├── complex_soa_perm_to_real.py │ ├── real_to_complex_soa_perm.py │ ├── two_complex_soa_perm_to_two_real_planar.py │ └── two_real_to_two_complex_soa_perm_planar.py │ ├── fft16x16.py │ ├── ifft-dualreal.py │ ├── ifft-real.py │ ├── max-pooling.py │ ├── relu.py │ ├── softmax.c │ ├── softmax.py │ ├── vecmath │ ├── __init__.py │ └── exp.py │ ├── winograd-f6k3.py │ └── winograd │ ├── __init__.py │ └── o6x6k3x3.py ├── test ├── convolution-inference │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── convolution-input-gradient │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── convolution-kernel-gradient │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── convolution-output │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── fft-samples.h ├── fourier │ ├── psimd.cc │ ├── reference.cc │ ├── scalar.cc │ └── x86_64-avx2.cc ├── fully-connected-inference │ ├── alexnet.cc │ ├── overfeat-fast.cc │ └── vgg-a.cc ├── fully-connected-output │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── hxgemm │ └── neon.cc ├── max-pooling-output │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── models │ ├── alexnet.h │ ├── overfeat-fast.h │ └── vgg-a.h ├── relu-input-gradient │ ├── alexnet.cc │ ├── overfeat-fast.cc │ └── vgg-a.cc ├── relu-output │ ├── alexnet.cc │ ├── overfeat-fast.cc │ └── vgg-a.cc ├── sgemm │ ├── neon.cc │ ├── psimd.cc │ ├── scalar.cc │ └── x86_64-fma3.cc ├── softmax-output │ ├── imagenet.cc │ └── smoke.cc ├── sxgemm │ └── neon.cc ├── testers │ ├── convolution.h │ ├── fourier.h │ ├── fully-connected.h │ ├── gemm-ukernel.h │ ├── padding.h │ ├── pooling.h │ ├── relu.h │ ├── softmax.h │ └── winograd.h └── winograd │ ├── neon.cc │ ├── psimd.cc │ ├── scalar.cc │ └── x86_64-fma3.cc └── web ├── nnpack.html └── nnpack.nmf /.gitignore: -------------------------------------------------------------------------------- 1 | # Ninja files 2 | build.ninja 3 | 4 | # Build objects and artifacts 5 | deps/ 6 | build/ 7 | build-*/ 8 | bin/ 9 | lib/ 10 | out/ 11 | obj/ 12 | libs/ 13 | *.pyc 14 | *.pyo 15 | 16 | # System files 17 | .DS_Store 18 | .DS_Store? 
19 | ._* 20 | .Spotlight-V100 21 | .Trashes 22 | ehthumbs.db 23 | Thumbs.db 24 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | compiler: clang 3 | install: 4 | - git clone https://github.com/ninja-build/ninja.git /tmp/ninja 5 | - pushd /tmp/ninja 6 | - git checkout release 7 | - python configure.py --bootstrap 8 | - mkdir -p $HOME/.local/bin 9 | - install -m 755 /tmp/ninja/ninja $HOME/.local/bin/ninja 10 | - popd 11 | - export PATH=$HOME/.local/bin:$PATH 12 | - pip install --user git+https://github.com/Maratyszcza/PeachPy 13 | - pip install --user git+https://github.com/Maratyszcza/confu 14 | before_script: 15 | - confu setup 16 | - python ./configure.py --toolchain=clang --backend=$BACKEND 17 | - ninja 18 | script: 19 | - ninja smoketest 20 | addons: 21 | apt: 22 | packages: 23 | - python-pip 24 | env: 25 | - BACKEND=psimd 26 | - BACKEND=scalar 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Facebook Inc. 2 | Copyright (c) 2015-2017, Georgia Institute of Technology 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /bench/conv1x1.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | 17 | template 18 | class CONV1x1 : public benchmark::Fixture { 19 | public: 20 | inline CONV1x1() { 21 | cpuinfo_initialize(); 22 | const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size; 23 | const size_t l1d_reserve = 512; 24 | kc_ = ((l1d_size - l1d_reserve) / sizeof(float) - mr() * nr()) / (mr() + nr()); 25 | } 26 | 27 | virtual void SetUp(const benchmark::State&) override { 28 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 29 | auto rng = std::bind(std::uniform_real_distribution(), std::mt19937(seed)); 30 | 31 | i_.resize(mr() * kc()); 32 | std::generate(i_.begin(), i_.end(), std::ref(rng)); 33 | k_.resize(mr() * kc() + nr()); 34 | std::fill(k_.begin(), k_.end(), std::nanf("")); 35 | o_.resize(nr() * kc()); 36 | std::generate(o_.begin(), o_.end(), std::ref(rng)); 37 | } 38 | 39 | virtual void TearDown(benchmark::State& state) override { 40 | state.SetItemsProcessed(uint64_t(state.iterations()) * 2 * mr() * nr() * kc()); 41 | i_.clear(); 42 | k_.clear(); 43 | o_.clear(); 44 | } 45 | 46 | inline const float* i() const { 47 | return i_.data(); 48 | } 49 | 50 | inline const float* k() const { 51 | return k_.data(); 52 | } 53 | 54 | inline float* o() { 55 | return o_.data(); 56 | } 57 | 58 | inline uint32_t mr() const { 59 | return mr_; 60 | } 61 | 62 | inline uint32_t nr() const { 63 | return nr_; 64 | } 65 | 66 | inline uint32_t kc() const { 67 | return kc_; 68 | } 69 | 70 | private: 71 | std::vector i_; 72 | std::vector k_; 73 | std::vector o_; 74 | uint32_t kc_; 75 | }; 76 | 77 | #if NNP_BACKEND_X86_64 78 | BENCHMARK_TEMPLATE_F(CONV1x1, fast__neon, 2, 4)(benchmark::State& state) { 79 | for (auto _ : state) { 80 | nnp_conv1x1_only_2x4__fma3(mr(), kc(), i(), k(), o()); 81 | } 82 | } 83 | #endif 84 | 85 | #if NNP_BACKEND_ARM 86 | BENCHMARK_TEMPLATE_F(CONV1x1, fast__neon, 4, 4)(benchmark::State& state) { 87 | for (auto _ : state) { 88 | nnp_conv1x1_only_4x4__neon(mr(), kc(), i(), k(), o()); 89 | } 90 | } 91 | #endif 92 | 93 | #if NNP_BACKEND_PSIMD 94 | BENCHMARK_TEMPLATE_F(CONV1x1, psimd, 2, 8)(benchmark::State& state) { 95 | for (auto _ : state) { 96 | nnp_conv1x1_only_2x4__psimd(mr(), kc(), i(), k(), o()); 97 | } 98 | } 99 | #endif 100 | 101 | #if NNP_BACKEND_SCALAR 102 | BENCHMARK_TEMPLATE_F(CONV1x1, scalar, 2, 4)(benchmark::State& state) { 103 | for (auto _ : state) { 104 | nnp_conv1x1_only_2x4__scalar(mr(), kc(), i(), k(), o()); 105 | } 106 | } 107 | #endif 108 | 109 | BENCHMARK_MAIN(); 110 | -------------------------------------------------------------------------------- /bench/median.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | static int compare_ulonglong(const void *a_ptr, const void *b_ptr) { 6 | const unsigned long long a = *((unsigned long long*) a_ptr); 7 | const unsigned long long b = *((unsigned long long*) b_ptr); 8 | if (a < b) { 9 | return -1; 10 | } else if (a > b) { 11 | return 1; 12 | } else { 13 | return 0; 14 | } 15 | } 16 | 17 | static int compare_profile(const void *a_ptr, const void *b_ptr) { 18 | const double a_total = ((const struct nnp_profile*) a_ptr)->total; 19 | const 
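/*
 * (Editorial sketch, not NNPACK source.) compare_ulonglong above returns an
 * explicit -1/0/+1 instead of the tempting `return a - b;`. For unsigned
 * long long the difference can never be negative, and for wide signed types
 * it can overflow the int return type, so the subtraction idiom misorders
 * values. A minimal self-contained illustration (hypothetical helper name):
 */
#include <assert.h>
static void comparator_pitfall_sketch(void) {
	const unsigned long long a = 1ull;
	const unsigned long long b = ~0ull; /* ULLONG_MAX */
	const unsigned long long diff = a - b; /* wraps around to 2 */
	assert(diff == 2ull); /* "positive", wrongly implying a > b */
	assert(a < b);        /* the explicit three-way comparison gets it right */
}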
double b_total = ((const struct nnp_profile*) b_ptr)->total; 20 | if (a_total < b_total) { 21 | return -1; 22 | } else if (a_total > b_total) { 23 | return 1; 24 | } else { 25 | return 0; 26 | } 27 | } 28 | 29 | static inline unsigned long long average(unsigned long long a, unsigned long long b) { 30 | return (a / 2) + (b / 2) + (a & b & 1ull); 31 | } 32 | 33 | static inline struct nnp_profile average_profile(struct nnp_profile a, struct nnp_profile b) { 34 | return (struct nnp_profile) { 35 | .total = 0.5 * (a.total + b.total), 36 | .input_transform = 0.5 * (a.input_transform + b.input_transform), 37 | .kernel_transform = 0.5 * (a.kernel_transform + b.kernel_transform), 38 | .output_transform = 0.5 * (a.output_transform + b.output_transform), 39 | .block_multiplication = 0.5 * (a.block_multiplication + b.block_multiplication) 40 | }; 41 | } 42 | 43 | unsigned long long median(unsigned long long array[], size_t length) { 44 | qsort(array, length, sizeof(unsigned long long), &compare_ulonglong); 45 | if (length % 2 == 0) { 46 | const unsigned long long median_lo = array[length / 2 - 1]; 47 | const unsigned long long median_hi = array[length / 2]; 48 | return average(median_lo, median_hi); 49 | } else { 50 | return array[length / 2]; 51 | } 52 | } 53 | 54 | struct nnp_profile median_profile(struct nnp_profile array[], size_t length) { 55 | qsort(array, length, sizeof(struct nnp_profile), &compare_profile); 56 | if (length % 2 == 0) { 57 | return average_profile(array[length / 2 - 1], array[length / 2]); 58 | } else { 59 | return array[length / 2]; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /bench/memread.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int read_memory(const void* pointer, size_t bytes) { 4 | int hash = 0; 5 | while (bytes >= 64) { 6 | hash ^= *((const int*) pointer); 7 | pointer += 64; 8 | bytes -= 64; 9 | } 10 | return hash; 11 | } 12 | -------------------------------------------------------------------------------- /bench/memread.py: -------------------------------------------------------------------------------- 1 | arg_mem = Argument(ptr(), "mem") 2 | arg_len = Argument(size_t, "n") 3 | with Function("read_memory", (arg_mem, arg_len)): 4 | reg_mem = GeneralPurposeRegister64() 5 | LOAD.ARGUMENT(reg_mem, arg_mem) 6 | 7 | reg_len = GeneralPurposeRegister64() 8 | LOAD.ARGUMENT(reg_len, arg_len) 9 | 10 | main_loop = Loop() 11 | SUB(reg_len, 64) 12 | JB(main_loop.end) 13 | with main_loop: 14 | MOVAPS(xmm0, [reg_mem]) 15 | ADD(reg_mem, 64) 16 | SUB(reg_len, 64) 17 | JAE(main_loop.begin) 18 | 19 | RETURN() 20 | -------------------------------------------------------------------------------- /bench/perf_counter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #if defined(__linux__) 7 | #include 8 | #include 9 | #include 10 | #if !defined(__ANDROID__) 11 | #include 12 | #endif 13 | #elif defined(__native_client__) 14 | #include 15 | #elif defined(EMSCRIPTEN) 16 | #include 17 | #else 18 | #if defined(__MACH__) 19 | #include 20 | #include 21 | #endif 22 | #if defined(__x86_64__) 23 | #include 24 | #endif 25 | #endif 26 | 27 | struct performance_counter { 28 | const char* name; 29 | int file_descriptor; 30 | }; 31 | 32 | static inline bool enable_perf_counter(int file_descriptor) { 33 | #if defined(__linux__) && defined(__x86_64__) && !defined(__ANDROID__) 34 | return 
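/*
 * (Editorial sketch, not NNPACK source.) average() in bench/median.c earlier
 * computes the midpoint of two unsigned long long values without the overflow
 * that `(a + b) / 2` would risk near ULLONG_MAX: each operand is halved
 * first, and `a & b & 1` restores the unit lost when both operands are odd.
 * Worked example: average(7, 9) = 7/2 + 9/2 + (7 & 9 & 1) = 3 + 4 + 1 = 8.
 * A self-contained check (hypothetical helper names):
 */
#include <assert.h>
static unsigned long long average_sketch(unsigned long long a, unsigned long long b) {
	return (a / 2) + (b / 2) + (a & b & 1ull); /* same formula as bench/median.c */
}
static void average_sketch_test(void) {
	assert(average_sketch(7ull, 9ull) == 8ull);
	/* no overflow even at the top of the range: average(MAX, MAX-2) == MAX-1 */
	assert(average_sketch(~0ull, ~0ull - 2ull) == ~0ull - 1ull);
}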
ioctl(file_descriptor, PERF_EVENT_IOC_ENABLE, 0) == 0; 35 | #else 36 | return true; 37 | #endif 38 | } 39 | 40 | static inline bool disable_perf_counter(int file_descriptor) { 41 | #if defined(__linux__) && defined(__x86_64__) && !defined(__ANDROID__) 42 | return ioctl(file_descriptor, PERF_EVENT_IOC_DISABLE, 0) == 0; 43 | #else 44 | return true; 45 | #endif 46 | } 47 | 48 | static inline bool read_perf_counter(int file_descriptor, unsigned long long output[restrict static 1]) { 49 | #if defined(__linux__) && defined(__x86_64__) && !defined(__ANDROID__) 50 | return read(file_descriptor, output, sizeof(*output)) == sizeof(*output); 51 | #elif defined(EMSCRIPTEN) || (defined(__native_client__) && !defined(__x86_64__)) 52 | return false; 53 | #elif (defined(__native_client__) || defined(__ANDROID__)) && (defined(__x86_64__) || defined(__i386__)) 54 | unsigned int lo, hi; 55 | asm volatile( 56 | "XORL %%eax, %%eax;" 57 | "CPUID;" 58 | "RDTSC;" 59 | : "=a" (lo), "=d" (hi) 60 | : 61 | : "%rbx", "%rcx" 62 | ); 63 | *output = (((unsigned long long) hi) << 32) | ((unsigned long long) lo); 64 | return true; 65 | #elif defined(__x86_64__) 66 | unsigned int aux; 67 | *output = __rdtscp(&aux); 68 | return true; 69 | #else 70 | return false; 71 | #endif 72 | } 73 | 74 | static inline bool read_timer(unsigned long long output[restrict static 1]) { 75 | #if defined(__linux__) 76 | struct timespec ts; 77 | if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { 78 | return false; 79 | } else { 80 | *output = ts.tv_sec * 1000000000ull + ts.tv_nsec; 81 | return true; 82 | } 83 | #elif defined(__MACH__) 84 | static mach_timebase_info_data_t timebase_info; 85 | if (timebase_info.denom == 0) { 86 | mach_timebase_info(&timebase_info); 87 | } 88 | 89 | *output = mach_absolute_time() * timebase_info.numer / timebase_info.denom; 90 | return true; 91 | #elif defined(__native_client__) 92 | struct timeval walltime; 93 | if (gettimeofday(&walltime, NULL) == 0) { 94 | *output = walltime.tv_sec * 1000000000ull + walltime.tv_usec * 1000ull; 95 | return true; 96 | } else { 97 | return false; 98 | } 99 | #elif defined(EMSCRIPTEN) 100 | *output = (unsigned long long) (emscripten_get_now() * 1.0e+6); 101 | return true; 102 | #else 103 | #error No implementation available 104 | #endif 105 | } 106 | 107 | #if defined(__linux__) && defined(__x86_64__) 108 | const struct performance_counter* init_performance_counters(size_t* count_ptr); 109 | #else 110 | static inline const struct performance_counter* init_performance_counters(size_t* count_ptr) { 111 | static const struct performance_counter performance_counter = { 112 | .name = "Cycles" 113 | }; 114 | *count_ptr = 1; 115 | return &performance_counter; 116 | } 117 | #endif 118 | -------------------------------------------------------------------------------- /bench/sgemm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | 17 | template 18 | class SGEMM : public benchmark::Fixture { 19 | public: 20 | inline SGEMM() { 21 | cpuinfo_initialize(); 22 | const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size; 23 | const size_t l1d_reserve = 512; 24 | kc_ = ((l1d_size - l1d_reserve) / sizeof(float) - mr() * nr()) / (mr() + nr()); 25 | } 26 | 27 | virtual void SetUp(const benchmark::State&) override { 28 | const uint_fast32_t seed = 
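/*
 * (Editorial note on the benchmark fixtures.) The kc_ computation in the
 * CONV1x1 and SGEMM constructors above sizes the reduction dimension so that
 * one mr-by-kc panel, one nr-by-kc panel, and the mr-by-nr accumulator tile
 * together fit in the L1d cache, minus a 512-byte reserve: solving
 *   mr*kc + nr*kc + mr*nr <= (l1d_size - 512) / sizeof(float)
 * for kc gives kc = ((l1d_size - 512)/sizeof(float) - mr*nr) / (mr + nr).
 * Worked example for the 4x24 FMA3 kernel on a 32 KiB L1d:
 *   ((32768 - 512)/4 - 4*24) / (4 + 24) = (8064 - 96) / 28 = 284 (rounded down).
 * Each iteration then performs 2*mr*nr*kc FLOPs (one multiply and one add per
 * accumulator update), which is what TearDown reports via SetItemsProcessed.
 */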
std::chrono::system_clock::now().time_since_epoch().count(); 29 | auto rng = std::bind(std::uniform_real_distribution(), std::mt19937(seed)); 30 | 31 | a_.resize(mr() * kc()); 32 | std::generate(a_.begin(), a_.end(), std::ref(rng)); 33 | b_.resize(nr() * kc()); 34 | std::generate(b_.begin(), b_.end(), std::ref(rng)); 35 | c_.resize(mr() * nr()); 36 | std::fill(c_.begin(), c_.end(), std::nanf("")); 37 | } 38 | 39 | virtual void TearDown(benchmark::State& state) override { 40 | state.SetItemsProcessed(uint64_t(state.iterations()) * 2 * mr() * nr() * kc()); 41 | a_.clear(); 42 | b_.clear(); 43 | c_.clear(); 44 | } 45 | 46 | inline const float* a() const { 47 | return a_.data(); 48 | } 49 | 50 | inline const float* b() const { 51 | return b_.data(); 52 | } 53 | 54 | inline float* c() { 55 | return c_.data(); 56 | } 57 | 58 | inline uint32_t mr() const { 59 | return mr_; 60 | } 61 | 62 | inline uint32_t nr() const { 63 | return nr_; 64 | } 65 | 66 | inline uint32_t kc() const { 67 | return kc_; 68 | } 69 | 70 | private: 71 | std::vector> a_; 72 | std::vector> b_; 73 | std::vector c_; 74 | uint32_t kc_; 75 | }; 76 | 77 | #if NNP_BACKEND_X86_64 78 | BENCHMARK_TEMPLATE_F(SGEMM, fma3, 4, 24)(benchmark::State& state) { 79 | for (auto _ : state) { 80 | nnp_sgemm_only_4x24__fma3(kc(), 0, a(), b(), c(), nr()); 81 | } 82 | } 83 | #endif 84 | 85 | #if NNP_BACKEND_ARM && CPUINFO_ARCH_ARM 86 | BENCHMARK_TEMPLATE_F(SGEMM, aarch32_neon, 6, 8)(benchmark::State& state) { 87 | for (auto _ : state) { 88 | nnp_sgemm_only_6x8__aarch32_neon(kc(), 0, a(), b(), c(), nr()); 89 | } 90 | } 91 | #endif 92 | 93 | #if NNP_BACKEND_ARM 94 | BENCHMARK_TEMPLATE_F(SGEMM, neon, 6, 8)(benchmark::State& state) { 95 | for (auto _ : state) { 96 | nnp_sgemm_only_6x8__neon(kc(), 0, a(), b(), c(), nr()); 97 | } 98 | } 99 | #endif 100 | 101 | #if NNP_BACKEND_PSIMD 102 | BENCHMARK_TEMPLATE_F(SGEMM, psimd, 4, 8)(benchmark::State& state) { 103 | for (auto _ : state) { 104 | nnp_sgemm_only_4x8__psimd(kc(), 0, a(), b(), c(), nr()); 105 | } 106 | } 107 | #endif 108 | 109 | #if NNP_BACKEND_SCALAR 110 | BENCHMARK_TEMPLATE_F(SGEMM, scalar, 4, 3)(benchmark::State& state) { 111 | for (auto _ : state) { 112 | nnp_sgemm_only_4x3__scalar(kc(), 0, a(), b(), c(), nr()); 113 | } 114 | } 115 | #endif 116 | 117 | BENCHMARK_MAIN(); 118 | -------------------------------------------------------------------------------- /cmake/DownloadCpuinfo.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(cpuinfo-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(cpuinfo 7 | GIT_REPOSITORY https://github.com/Maratyszcza/cpuinfo.git 8 | GIT_TAG main 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/cpuinfo" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/cpuinfo" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadEnum.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(enum-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(enum 7 | URL https://pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz 8 | URL_HASH SHA256=8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1 9 | SOURCE_DIR 
"${CONFU_DEPENDENCIES_SOURCE_DIR}/enum" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/enum" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadFP16.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(fp16-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(fp16 7 | GIT_REPOSITORY https://github.com/Maratyszcza/FP16.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/fp16" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/fp16" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadFXdiv.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(fxdiv-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(fxdiv 7 | GIT_REPOSITORY https://github.com/Maratyszcza/FXdiv.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/fxdiv" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/fxdiv" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadGoogleTest.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(googletest-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(googletest 7 | URL https://github.com/google/googletest/archive/release-1.8.0.zip 8 | URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadOpcodes.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(opcodes-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(opcodes 7 | URL https://pypi.python.org/packages/e8/59/8c2e293c9c8d60f206fd5d0f6c8236a2e0a97832379ac319077441552c6a/opcodes-0.3.13.tar.gz 8 | URL_HASH SHA256=1859c23143fe20daa4110be87a947cbf3eefa048da71dde642290213f251590c 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/opcodes" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/opcodes" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadPSimd.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(psimd-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(psimd 7 | GIT_REPOSITORY https://github.com/Maratyszcza/psimd.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/psimd" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/psimd" 11 | 
CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadPThreadPool.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(pthreadpool-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(pthreadpool 7 | GIT_REPOSITORY https://github.com/Maratyszcza/pthreadpool.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/pthreadpool" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadPeachPy.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(peachpy-download NONE) 4 | 5 | FIND_PACKAGE(PythonInterp REQUIRED) 6 | 7 | INCLUDE(ExternalProject) 8 | ExternalProject_Add(peachpy 9 | GIT_REPOSITORY https://github.com/Maratyszcza/PeachPy.git 10 | GIT_TAG master 11 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/peachpy" 12 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/peachpy" 13 | PATCH_COMMAND "PYTHONPATH=${PYTHON_SIX_SOURCE_DIR}:${PYTHON_ENUM_SOURCE_DIR}:${PYTHON_OPCODES_SOURCE_DIR}" ${PYTHON_EXECUTABLE} setup.py generate 14 | CONFIGURE_COMMAND "" 15 | BUILD_COMMAND "" 16 | INSTALL_COMMAND "" 17 | TEST_COMMAND "" 18 | ) 19 | -------------------------------------------------------------------------------- /cmake/DownloadSix.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(six-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(six 7 | URL https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe/six-1.11.0.tar.gz 8 | URL_HASH SHA256=70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/six" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/six" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /confu.yaml: -------------------------------------------------------------------------------- 1 | name: nnpack 2 | title: Neural Networks acceleration PACKage 3 | license: Simplified BSD 4 | deps: 5 | - name: pthreadpool 6 | url: https://github.com/Maratyszcza/pthreadpool.git 7 | - name: cpuinfo 8 | url: https://github.com/pytorch/cpuinfo.git 9 | - name: fxdiv 10 | url: https://github.com/Maratyszcza/FXdiv.git 11 | - name: fp16 12 | url: https://github.com/Maratyszcza/FP16.git 13 | - name: psimd 14 | url: https://github.com/Maratyszcza/psimd.git 15 | - name: clog 16 | - name: googletest 17 | - name: googlebenchmark 18 | -------------------------------------------------------------------------------- /include/nnpack/AlignedAllocator.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | template 7 | class AlignedAllocator; 8 | 9 | template 10 | class AlignedAllocator 11 | { 12 | public: 13 | typedef void* pointer; 14 | typedef const void* const_pointer; 15 | typedef void value_type; 16 
| 17 | template 18 | struct rebind { 19 | typedef AlignedAllocator other; 20 | }; 21 | }; 22 | 23 | template 24 | class AlignedAllocator 25 | { 26 | public: 27 | typedef T value_type; 28 | typedef T* pointer; 29 | typedef const T* const_pointer; 30 | typedef T& reference; 31 | typedef const T& const_reference; 32 | typedef size_t size_type; 33 | typedef ptrdiff_t difference_type; 34 | 35 | #if __cplusplus >= 201402L 36 | typedef std::true_type propagate_on_container_move_assignment; 37 | #endif 38 | 39 | template 40 | struct rebind { 41 | typedef AlignedAllocator other; 42 | }; 43 | 44 | public: 45 | inline AlignedAllocator() noexcept { 46 | } 47 | 48 | template 49 | inline AlignedAllocator(const AlignedAllocator& other) noexcept { 50 | } 51 | 52 | inline size_type max_size() const noexcept { 53 | return (std::numeric_limits::max() - size_type(Alignment)) / sizeof(T); 54 | } 55 | 56 | inline pointer address(reference x) const noexcept { 57 | return std::addressof(x); 58 | } 59 | 60 | inline const_pointer address(const_reference x) const noexcept { 61 | return std::addressof(x); 62 | } 63 | 64 | inline pointer allocate(size_type n, typename AlignedAllocator::const_pointer hint = 0) { 65 | #if defined(__ANDROID__) 66 | void* memory = memalign(Alignment, n * sizeof(T)); 67 | if (memory == 0) { 68 | #if !defined(__GNUC__) || defined(__EXCEPTIONS) 69 | throw std::bad_alloc(); 70 | #endif 71 | } 72 | #else 73 | void* memory = nullptr; 74 | if (posix_memalign(&memory, Alignment, n * sizeof(T)) != 0) { 75 | #if !defined(__GNUC__) || defined(__EXCEPTIONS) 76 | throw std::bad_alloc(); 77 | #endif 78 | } 79 | #endif 80 | return static_cast(memory); 81 | } 82 | 83 | inline void deallocate(pointer p, size_type n) noexcept { 84 | free(static_cast(p)); 85 | } 86 | 87 | template 88 | inline void construct(U* p, Args&&... args) { 89 | ::new(static_cast(p)) U(std::forward(args)...); 90 | } 91 | 92 | template 93 | inline void destroy(U* p) { 94 | p->~U(); 95 | } 96 | }; 97 | -------------------------------------------------------------------------------- /include/nnpack/activations.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline float relu(float data, float negative_slope) { 7 | return signbit(data) ? data * negative_slope : data; 8 | } 9 | 10 | static inline float grad_relu(float grad_output_data, float input_data, float negative_slope) { 11 | return signbit(input_data) ? 
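/*
 * (Editorial sketch, not NNPACK source.) AlignedAllocator::allocate in
 * nnpack/AlignedAllocator.h above uses memalign() on Android and
 * posix_memalign() elsewhere, so the SIMD kernels can assume buffers start on
 * the requested boundary. The same pattern in plain C, with hypothetical
 * names:
 */
#include <stdlib.h>
static float* allocate_aligned_floats_sketch(size_t count, size_t alignment) {
	void* memory = NULL;
	/* alignment must be a power of two and a multiple of sizeof(void*) */
	if (posix_memalign(&memory, alignment, count * sizeof(float)) != 0) {
		return NULL;
	}
	return (float*) memory; /* release with free() */
}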
grad_output_data * negative_slope : grad_output_data; 12 | } 13 | 14 | #ifdef PSIMD_H 15 | static inline psimd_f32 psimd_relu_f32(psimd_f32 data, psimd_f32 negative_slope) { 16 | return psimd_signblend_f32(data, data * negative_slope, data); 17 | } 18 | 19 | static inline psimd_f32 psimd_grad_relu_f32(psimd_f32 grad_output_data, psimd_f32 input_data, psimd_f32 negative_slope) { 20 | return psimd_signblend_f32(input_data, grad_output_data * negative_slope, grad_output_data); 21 | } 22 | #endif 23 | 24 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) 25 | #include 26 | 27 | static inline float32x4_t neon_reluq_f32(float32x4_t data, float32x4_t negative_slope) { 28 | const uint32x4_t negative_mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(data), 31)); 29 | return vbslq_f32(negative_mask, vmulq_f32(data, negative_slope), data); 30 | } 31 | 32 | static inline float32x4_t neon_grad_reluq_f32(float32x4_t grad_output_data, float32x4_t input_data, float32x4_t negative_slope) { 33 | const uint32x4_t negative_mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(input_data), 31)); 34 | return vbslq_f32(negative_mask, vmulq_f32(grad_output_data, negative_slope), grad_output_data); 35 | } 36 | 37 | static inline float32x2_t neon_relu_f32(float32x2_t data, float32x2_t negative_slope) { 38 | const uint32x2_t negative_mask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(data), 31)); 39 | return vbsl_f32(negative_mask, vmul_f32(data, negative_slope), data); 40 | } 41 | 42 | static inline float32x2_t neon_grad_relu_f32(float32x2_t grad_output_data, float32x2_t input_data, float32x2_t negative_slope) { 43 | const uint32x2_t negative_mask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(input_data), 31)); 44 | return vbsl_f32(negative_mask, vmul_f32(grad_output_data, negative_slope), grad_output_data); 45 | } 46 | #endif 47 | -------------------------------------------------------------------------------- /include/nnpack/assembly.h: -------------------------------------------------------------------------------- 1 | #ifdef __ELF__ 2 | .macro BEGIN_FUNCTION name 3 | .text 4 | .align 2 5 | .global \name 6 | .type \name, %function 7 | \name: 8 | .endm 9 | 10 | .macro END_FUNCTION name 11 | .size \name, .-\name 12 | .endm 13 | #elif defined(__MACH__) 14 | .macro BEGIN_FUNCTION name 15 | .text 16 | .align 2 17 | .global _\name 18 | _\name: 19 | .endm 20 | 21 | .macro END_FUNCTION name 22 | .endm 23 | #endif 24 | -------------------------------------------------------------------------------- /include/nnpack/complex.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifndef CMPLXF 6 | #define CMPLXF(real, imag) ((real) + _Complex_I * (imag)) 7 | #endif 8 | 9 | #ifdef __ANDROID__ 10 | /* Work-around for pre-API 23 Android, where libc does not provide crealf */ 11 | #if __ANDROID_API__ < 23 12 | static inline float crealf(_Complex float c) { 13 | return __real__ c; 14 | } 15 | 16 | static inline float cimagf(_Complex float c) { 17 | return __imag__ c; 18 | } 19 | #endif 20 | #endif 21 | -------------------------------------------------------------------------------- /include/nnpack/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | #if defined(__GNUC__) 5 | #if defined(__clang__) || ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))) 6 | #define NNP_UNREACHABLE do { __builtin_unreachable(); } while (0) 7 | #else 8 | #define 
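/*
 * (Editorial sketch, not NNPACK source.) The NEON ReLU helpers above build a
 * lane mask by arithmetically shifting the IEEE sign bit across all 32 bits
 * (vshrq_n_s32(..., 31)) and then bit-select between x and x*negative_slope
 * with vbslq_f32; the scalar relu() gets the same effect from signbit().
 * Unlike an `x < 0.0f` comparison, this also routes -0.0f through the
 * negative-slope branch. A scalar rendition of the bit trick (the arithmetic
 * right shift of a negative value is implementation-defined in ISO C but
 * arithmetic on GCC/Clang, which this assumes):
 */
#include <stdint.h>
#include <string.h>
static float leaky_relu_signbit_sketch(float x, float negative_slope) {
	const float scaled = x * negative_slope;
	uint32_t x_bits, scaled_bits;
	memcpy(&x_bits, &x, sizeof(x_bits));
	memcpy(&scaled_bits, &scaled, sizeof(scaled_bits));
	/* smear the sign bit into an all-ones/all-zeros mask */
	const uint32_t mask = (uint32_t) ((int32_t) x_bits >> 31);
	const uint32_t result_bits = (mask & scaled_bits) | (~mask & x_bits); /* like vbslq_f32 */
	float result;
	memcpy(&result, &result_bits, sizeof(result));
	return result;
}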
NNP_UNREACHABLE do { __builtin_trap(); } while (0) 9 | #endif 10 | #else 11 | #define NNP_UNREACHABLE do { } while (0) 12 | #endif 13 | 14 | 15 | #if defined(NNP_BACKEND_PSIMD) 16 | #if !(NNP_BACKEND_PSIMD) 17 | #error NNP_BACKEND_PSIMD predefined as 0 18 | #endif 19 | #elif defined(NNP_BACKEND_SCALAR) 20 | #if !(NNP_BACKEND_SCALAR) 21 | #error NNP_BACKEND_SCALAR predefined as 0 22 | #endif 23 | #elif defined(__arm__) || defined(__aarch64__) 24 | #define NNP_BACKEND_ARM 1 25 | #elif defined(__ANDROID__) && (defined(__i686__) || defined(__x86_64__)) 26 | #define NNP_BACKEND_PSIMD 1 27 | #elif defined(__x86_64__) 28 | #define NNP_BACKEND_X86_64 1 29 | #elif defined(__ANDROID__) && defined(__mips__) 30 | #define NNP_BACKEND_SCALAR 1 31 | #else 32 | #define NNP_BACKEND_PSIMD 1 33 | #endif 34 | 35 | #ifndef NNP_BACKEND_PSIMD 36 | #define NNP_BACKEND_PSIMD 0 37 | #endif 38 | #ifndef NNP_BACKEND_SCALAR 39 | #define NNP_BACKEND_SCALAR 0 40 | #endif 41 | #ifndef NNP_BACKEND_ARM 42 | #define NNP_BACKEND_ARM 0 43 | #endif 44 | #ifndef NNP_BACKEND_X86_64 45 | #define NNP_BACKEND_X86_64 0 46 | #endif 47 | 48 | #define NNP_ALIGN(alignment) __attribute__((__aligned__(alignment))) 49 | #define NNP_SIMD_ALIGN NNP_ALIGN(64) 50 | #define NNP_CACHE_ALIGN NNP_ALIGN(64) 51 | 52 | #define NNP_COUNT_OF(array) (sizeof(array) / sizeof(0[array])) 53 | 54 | #if defined(__GNUC__) 55 | #define NNP_LIKELY(condition) (__builtin_expect(!!(condition), 1)) 56 | #define NNP_UNLIKELY(condition) (__builtin_expect(!!(condition), 0)) 57 | #else 58 | #define NNP_LIKELY(condition) (!!(condition)) 59 | #define NNP_UNLIKELY(condition) (!!(condition)) 60 | #endif 61 | 62 | #if defined(__GNUC__) 63 | #define NNP_INLINE inline __attribute__((__always_inline__)) 64 | #else 65 | #define NNP_INLINE inline 66 | #endif 67 | -------------------------------------------------------------------------------- /include/nnpack/pooling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | typedef void (*nnp_pooling_function)(const float*, float*, size_t, size_t, size_t, size_t, size_t, size_t, uint32_t, uint32_t, uint32_t, uint32_t); 12 | 13 | void nnp_maxpool_2x2_2x2__avx2(const float* src_pointer, float* dst_pointer, size_t src_stride, 14 | uint32_t src_row_offset, uint32_t src_row_count, uint32_t src_column_offset, uint32_t src_column_count, uint32_t dst_column_count); 15 | 16 | #ifdef __cplusplus 17 | } /* extern "C" */ 18 | #endif 19 | -------------------------------------------------------------------------------- /include/nnpack/reference.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | void nnp_convolution_output__reference( 12 | size_t batch_size, 13 | size_t input_channels, 14 | size_t output_channels, 15 | struct nnp_size input_size, 16 | struct nnp_padding input_padding, 17 | struct nnp_size kernel_size, 18 | struct nnp_size output_subsampling, 19 | const float input_pointer[], 20 | const float kernel_pointer[], 21 | const float bias[], 22 | float output_pointer[], 23 | pthreadpool_t threadpool); 24 | 25 | void nnp_convolution_input_gradient__reference( 26 | size_t batch_size, 27 | size_t input_channels, 28 | size_t output_channels, 29 | struct nnp_size input_size, 30 | struct nnp_padding input_padding, 31 | struct nnp_size kernel_size, 32 | 
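/*
 * (Editorial sketch, not NNPACK source.) For the reference convolutions
 * declared above, the output spatial size follows the usual convolution
 * arithmetic: with input extent i, total padding p along that axis, kernel
 * extent k, and subsampling (stride) s, each output extent is
 * (i + p - k) / s + 1. Worked example: a 224x224 input with 2+2 padding, a
 * 5x5 kernel, and stride 1 gives (224 + 4 - 5) / 1 + 1 = 224, i.e. "same"
 * size. The padding convention is assumed here, not taken from nnpack.h:
 */
#include <stddef.h>
static size_t conv_output_dim_sketch(size_t input, size_t total_padding, size_t kernel, size_t stride) {
	return (input + total_padding - kernel) / stride + 1;
}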
const float grad_output[], 33 | const float kernel[], 34 | float grad_input[], 35 | pthreadpool_t threadpool); 36 | 37 | void nnp_convolution_kernel_gradient__reference( 38 | size_t batch_size, 39 | size_t input_channels, 40 | size_t output_channels, 41 | struct nnp_size input_size, 42 | struct nnp_padding input_padding, 43 | struct nnp_size kernel_size, 44 | const float input[], 45 | const float grad_output[], 46 | float grad_kernel[], 47 | pthreadpool_t threadpool); 48 | 49 | void nnp_fully_connected_output_f32__reference( 50 | size_t batch_size, 51 | size_t input_channels, 52 | size_t output_channels, 53 | const float* input, 54 | const float* kernel, 55 | float* output, 56 | pthreadpool_t threadpool); 57 | 58 | void nnp_fully_connected_output_f16f32__reference( 59 | size_t batch_size, 60 | size_t input_channels, 61 | size_t output_channels, 62 | const float* input, 63 | const void* kernel, 64 | float* output, 65 | pthreadpool_t threadpool); 66 | 67 | void nnp_max_pooling_output__reference( 68 | size_t batch_size, 69 | size_t channels, 70 | struct nnp_size input_size, 71 | struct nnp_padding input_padding, 72 | struct nnp_size pooling_size, 73 | struct nnp_size pooling_stride, 74 | const float input[], 75 | float output[], 76 | pthreadpool_t threadpool); 77 | 78 | void nnp_relu_output__reference( 79 | size_t batch_size, 80 | size_t channels, 81 | const float input[], 82 | float output[], 83 | float negative_slope, 84 | pthreadpool_t threadpool); 85 | 86 | void nnp_relu_input_gradient__reference( 87 | size_t batch_size, 88 | size_t channels, 89 | const float grad_output[], 90 | const float input[], 91 | float grad_input[], 92 | float negative_slope, 93 | pthreadpool_t threadpool); 94 | 95 | void nnp_softmax_output__reference( 96 | size_t batch_size, 97 | size_t channels, 98 | const float input[], 99 | float output[], 100 | pthreadpool_t threadpool); 101 | 102 | #ifdef __cplusplus 103 | } /* extern "C" */ 104 | #endif 105 | -------------------------------------------------------------------------------- /include/nnpack/relu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | void nnp_relu__avx2(const float* input, float* output, size_t length, float negative_slope); 11 | void nnp_inplace_relu__avx2(float* data, size_t length, float negative_slope); 12 | void nnp_grad_relu__avx2(const float* output_gradient, const float* input, float* input_gradient, size_t length, float negative_slope); 13 | 14 | void nnp_relu__neon(const float* input, float* output, size_t length, float negative_slope); 15 | void nnp_inplace_relu__neon(float* data, size_t length, float negative_slope); 16 | void nnp_grad_relu__neon(const float* output_gradient, const float* input, float* input_gradient, size_t length, float negative_slope); 17 | 18 | void nnp_relu__psimd(const float* input, float* output, size_t length, float negative_slope); 19 | void nnp_inplace_relu__psimd(float* data, size_t length, float negative_slope); 20 | void nnp_grad_relu__psimd(const float* output_gradient, const float* input, float* input_gradient, size_t length, float negative_slope); 21 | 22 | void nnp_relu__scalar(const float* input, float* output, size_t length, float negative_slope); 23 | void nnp_inplace_relu__scalar(float* data, size_t length, float negative_slope); 24 | void nnp_grad_relu__scalar(const float* output_gradient, const float* input, float* input_gradient, size_t length, float 
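/*
 * (Editorial sketch, not the actual nnp_relu__scalar implementation.) The
 * per-backend ReLU entry points declared in nnpack/relu.h below all share one
 * contract: negative_slope parameterizes leaky ReLU (slope 0 gives plain
 * ReLU), and the SIMD variants additionally require length to be a multiple
 * of the vector width. A reference-style scalar rendition of the forward op:
 */
#include <math.h>
#include <stddef.h>
static void relu_forward_sketch(const float* input, float* output, size_t length, float negative_slope) {
	for (size_t i = 0; i < length; i++) {
		const float x = input[i];
		/* same predicate as relu() in nnpack/activations.h */
		output[i] = signbit(x) ? x * negative_slope : x;
	}
}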
negative_slope); 25 | 26 | #ifdef __cplusplus 27 | } /* extern "C" */ 28 | #endif 29 | -------------------------------------------------------------------------------- /include/nnpack/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | typedef void (*nnp_exp_function)(size_t, const float*, float*); 10 | 11 | void nnp_vector_exp__psimd(size_t n, const float* x, float* y); 12 | 13 | void nnp_softmax__avx2(size_t n, const float* x, float* y); 14 | void nnp_inplace_softmax__avx2(size_t n, float* v); 15 | 16 | void nnp_softmax__psimd(size_t n, const float* x, float* y); 17 | void nnp_inplace_softmax__psimd(size_t n, float* v); 18 | 19 | void nnp_softmax__scalar(size_t n, const float* x, float* y); 20 | void nnp_inplace_softmax__scalar(size_t n, float* v); 21 | 22 | #ifdef __cplusplus 23 | } /* extern "C" */ 24 | #endif 25 | -------------------------------------------------------------------------------- /include/nnpack/system.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #if defined(__linux__) || defined(__native_client__) 9 | #include 10 | #include 11 | #include 12 | #elif defined(__MACH__) 13 | #include 14 | #include 15 | #elif defined(EMSCRIPTEN) 16 | #include 17 | #endif 18 | 19 | inline static double read_timer() { 20 | #if defined(__linux__) || defined(__native_client__) 21 | struct timespec ts; 22 | int result = clock_gettime(CLOCK_MONOTONIC, &ts); 23 | assert(result == 0); 24 | return ((double) ts.tv_sec) + ((double) ts.tv_nsec) * 1.0e-9; 25 | #elif defined(__MACH__) 26 | static mach_timebase_info_data_t timebase_info; 27 | if (timebase_info.denom == 0) { 28 | mach_timebase_info(&timebase_info); 29 | } 30 | 31 | return ((double) (mach_absolute_time() * timebase_info.numer / timebase_info.denom)) * 1.0e-9; 32 | #elif defined(EMSCRIPTEN) 33 | return emscripten_get_now() * 1.0e-3; 34 | #else 35 | #error No implementation available 36 | #endif 37 | } 38 | 39 | #define NNP_TOTAL_START(profile_ptr) \ 40 | double total_start; \ 41 | if (profile_ptr != NULL) { \ 42 | *profile_ptr = (struct nnp_profile) { 0 }; \ 43 | total_start = read_timer(); \ 44 | } 45 | 46 | #define NNP_KERNEL_TRANSFORM_START(profile_ptr) \ 47 | double kernel_transform_start; \ 48 | if (profile_ptr != NULL) { \ 49 | kernel_transform_start = read_timer(); \ 50 | } 51 | 52 | #define NNP_INPUT_TRANSFORM_START(profile_ptr) \ 53 | double input_transform_start; \ 54 | if (profile_ptr != NULL) { \ 55 | input_transform_start = read_timer(); \ 56 | } 57 | 58 | #define NNP_OUTPUT_TRANSFORM_START(profile_ptr) \ 59 | double output_transform_start; \ 60 | if (profile_ptr != NULL) { \ 61 | output_transform_start = read_timer(); \ 62 | } 63 | 64 | #define NNP_BLOCK_MULTIPLICATION_START(profile_ptr) \ 65 | double block_multiplication_start; \ 66 | if (profile_ptr != NULL) { \ 67 | block_multiplication_start = read_timer(); \ 68 | } 69 | 70 | #define NNP_TOTAL_END(profile_ptr) \ 71 | if (profile_ptr != NULL) { \ 72 | profile_ptr->total = read_timer() - total_start; \ 73 | } 74 | 75 | #define NNP_KERNEL_TRANSFORM_END(profile_ptr) \ 76 | if (profile_ptr != NULL) { \ 77 | profile_ptr->kernel_transform += read_timer() - kernel_transform_start; \ 78 | } 79 | 80 | #define NNP_INPUT_TRANSFORM_END(profile_ptr) \ 81 | if (profile_ptr != NULL) { \ 82 | profile_ptr->input_transform += read_timer() - 
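/*
 * (Editorial sketch, not the actual nnp_softmax__scalar implementation.) The
 * softmax entry points declared in nnpack/softmax.h above compute
 * y[i] = exp(x[i]) / sum_j exp(x[j]). The numerically robust formulation
 * subtracts the maximum first, so every exponent argument is <= 0 and expf
 * cannot overflow. A reference-style scalar rendition (assumes n >= 1):
 */
#include <math.h>
#include <stddef.h>
static void softmax_sketch(size_t n, const float* x, float* y) {
	float max_x = x[0];
	for (size_t i = 1; i < n; i++) {
		if (x[i] > max_x) max_x = x[i];
	}
	float sum = 0.0f;
	for (size_t i = 0; i < n; i++) {
		y[i] = expf(x[i] - max_x); /* arguments are <= 0: no overflow */
		sum += y[i];
	}
	const float scale = 1.0f / sum;
	for (size_t i = 0; i < n; i++) {
		y[i] *= scale;
	}
}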
input_transform_start; \ 83 | } 84 | 85 | #define NNP_OUTPUT_TRANSFORM_END(profile_ptr) \ 86 | if (profile_ptr != NULL) { \ 87 | profile_ptr->output_transform += read_timer() - output_transform_start; \ 88 | } 89 | 90 | #define NNP_BLOCK_MULTIPLICATION_END(profile_ptr) \ 91 | if (profile_ptr != NULL) { \ 92 | profile_ptr->block_multiplication += read_timer() - block_multiplication_start; \ 93 | } 94 | 95 | inline static void* allocate_memory(size_t memory_size) { 96 | #if defined(__linux__) 97 | #if !defined(__ANDROID__) 98 | /* Try to use large page TLB */ 99 | void* memory_block = mmap(NULL, memory_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0); 100 | #else 101 | void* memory_block = MAP_FAILED; 102 | #endif 103 | if (memory_block == MAP_FAILED) { 104 | /* Fallback to standard pages */ 105 | memory_block = mmap(NULL, memory_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); 106 | if (memory_block == MAP_FAILED) { 107 | return NULL; 108 | } 109 | } 110 | return memory_block; 111 | #else 112 | void* memory_block = NULL; 113 | int allocation_result = posix_memalign(&memory_block, 64, memory_size); 114 | return (allocation_result == 0) ? memory_block : NULL; 115 | #endif 116 | } 117 | 118 | inline static void release_memory(void* memory_block, size_t memory_size) { 119 | #if defined(__linux__) 120 | if (memory_block != NULL) { 121 | munmap(memory_block, memory_size); 122 | } 123 | #else 124 | free(memory_block); 125 | #endif 126 | } 127 | -------------------------------------------------------------------------------- /include/nnpack/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | static inline float maxf(float a, float b) { 10 | return a > b ? a : b; 11 | } 12 | 13 | static inline size_t doz(size_t a, size_t b) { 14 | return a > b ? a - b : 0; 15 | } 16 | 17 | static inline size_t max(size_t a, size_t b) { 18 | return a > b ? a : b; 19 | } 20 | 21 | static inline size_t min(size_t a, size_t b) { 22 | return a > b ? 
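/*
 * (Editorial sketch.) allocate_memory() in nnpack/system.h above first tries
 * mmap with MAP_HUGETLB on non-Android Linux, so large scratch buffers can
 * land on huge pages and consume fewer TLB entries, then silently falls back
 * to regular anonymous pages, and to posix_memalign on other systems. Because
 * the Linux path is mmap-based, release_memory() needs the original size for
 * munmap. A usage sketch (assumes nnpack/system.h is included):
 */
#include <stddef.h>
static void scratch_buffer_usage_sketch(void) {
	const size_t scratch_size = 16 * 1024 * 1024; /* e.g. 16 MiB of transform scratch */
	void* scratch = allocate_memory(scratch_size);
	if (scratch != NULL) {
		/* ... use the buffer ... */
		release_memory(scratch, scratch_size); /* size is required by the munmap path */
	}
}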
b : a; 23 | } 24 | 25 | static inline size_t round_up(size_t number, size_t factor) { 26 | return (number + factor - 1) / factor * factor; 27 | } 28 | 29 | static inline size_t round_up_by_power_of_2(size_t number, size_t power_of_2_factor) { 30 | return (number + power_of_2_factor - 1) & ~(power_of_2_factor - 1); 31 | } 32 | 33 | static inline size_t round_down(size_t number, size_t factor) { 34 | return number / factor * factor; 35 | } 36 | 37 | static inline size_t divide_round_up(size_t dividend, size_t divisor) { 38 | if (dividend % divisor == 0) { 39 | return dividend / divisor; 40 | } else { 41 | return dividend / divisor + 1; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /include/nnpack/winograd.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef void (*nnp_wt_function)(const float*, float*); 11 | 12 | void nnp_iwt_f6k3__fma3(const float d[], float w[]); 13 | void nnp_kwt_f6k3__fma3(const float g[], float w[]); 14 | void nnp_owt_f6k3__fma3(const float m[], float s[]); 15 | 16 | void nnp_iwt_f6k3__psimd(const float d[], float w[]); 17 | void nnp_kwt_f6k3__psimd(const float g[], float w[]); 18 | void nnp_owt_f6k3__psimd(const float m[], float s[]); 19 | 20 | void nnp_iwt_f6k3__neon(const float d[], float w[]); 21 | void nnp_kwt_f6k3__neon(const float g[], float w[]); 22 | void nnp_owt_f6k3__neon(const float m[], float s[]); 23 | 24 | void nnp_iwt_f6k3__scalar(const float d[], float w[]); 25 | void nnp_kwt_f6k3__scalar(const float g[], float w[]); 26 | void nnp_owt_f6k3__scalar(const float m[], float s[]); 27 | 28 | #ifdef __cplusplus 29 | } /* extern "C" */ 30 | #endif 31 | -------------------------------------------------------------------------------- /logo/NNPACK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/logo/NNPACK.png -------------------------------------------------------------------------------- /src/neon/relu.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | void nnp_relu__neon( 8 | const float input[restrict static 4], 9 | float output[restrict static 4], 10 | size_t length, 11 | float negative_slope) 12 | { 13 | const float32x4_t vec_negative_slope = vdupq_n_f32(negative_slope); 14 | 15 | /* Length is always non-zero and proportional to SIMD width */ 16 | do { 17 | vst1q_f32(output, 18 | neon_reluq_f32(vld1q_f32(input), vec_negative_slope)); 19 | 20 | input += 4; 21 | output += 4; 22 | length -= 4; 23 | } while (length != 0); 24 | } 25 | 26 | void nnp_inplace_relu__neon( 27 | float data[restrict static 4], 28 | size_t length, 29 | float negative_slope) 30 | { 31 | const float32x4_t vec_negative_slope = vdupq_n_f32(negative_slope); 32 | 33 | /* Length is always non-zero and proportional to SIMD width */ 34 | do { 35 | vst1q_f32(data, 36 | neon_reluq_f32(vld1q_f32(data), vec_negative_slope)); 37 | 38 | data += 4; 39 | length -= 4; 40 | } while (length != 0); 41 | } 42 | 43 | void nnp_grad_relu__neon( 44 | const float output_gradient[restrict static 4], 45 | const float input[restrict static 4], 46 | float input_gradient[restrict static 4], 47 | size_t length, 48 | float negative_slope) 49 | { 50 | const float32x4_t vec_negative_slope = 
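/*
 * (Editorial note.) The rounding helpers in nnpack/utils.h above satisfy, for
 * example, round_up(13, 6) = 18, round_up_by_power_of_2(13, 8) = (13+7) & ~7
 * = 16 (the mask form is only valid when the factor is a power of two), and
 * divide_round_up(13, 4) = 4. In the Winograd transforms declared in
 * nnpack/winograd.h, F(6,3) means each 1-D transform produces 6 outputs from
 * a 3-tap kernel, so the input tile is 6 + 3 - 1 = 8 samples wide; that is
 * why nnp_iwt_f6k3 reads 8 SIMD rows (8 x 4 floats = d[32]) while
 * nnp_owt_f6k3 writes only 6 (s[24]).
 */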
vdupq_n_f32(negative_slope); 51 | 52 | /* Length is always non-zero and proportional to SIMD width */ 53 | do { 54 | vst1q_f32(input_gradient, 55 | neon_grad_reluq_f32(vld1q_f32(output_gradient), vld1q_f32(input), vec_negative_slope)); 56 | 57 | output_gradient += 4; 58 | input += 4; 59 | input_gradient += 4; 60 | length -= 4; 61 | } while (length != 0); 62 | } 63 | -------------------------------------------------------------------------------- /src/neon/transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline void neon_transpose4x4_inplace_f32( 7 | float32x4_t row0[static restrict 1], 8 | float32x4_t row1[static restrict 1], 9 | float32x4_t row2[static restrict 1], 10 | float32x4_t row3[static restrict 1]) 11 | { 12 | /* 13 | * row0 = ( x00 x01 x02 x03 ) 14 | * row1 = ( x10 x11 x12 x13 ) 15 | * row2 = ( x20 x21 x22 x23 ) 16 | * row3 = ( x30 x31 x32 x33 ) 17 | */ 18 | 19 | /* 20 | * row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 ) 21 | * row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 ) 22 | */ 23 | float32x4x2_t row01 = vtrnq_f32(*row0, *row1); 24 | float32x4x2_t row23 = vtrnq_f32(*row2, *row3); 25 | 26 | /* 27 | * row0 = ( x00 x10 x20 x30 ) 28 | * row1 = ( x01 x11 x21 x31 ) 29 | * row2 = ( x02 x12 x22 x32 ) 30 | * row3 = ( x03 x13 x23 x33 ) 31 | */ 32 | *row0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0])); 33 | *row1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1])); 34 | *row2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0])); 35 | *row3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1])); 36 | } 37 | -------------------------------------------------------------------------------- /src/neon/winograd-f6k3.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_iwt_f6k3__neon( 5 | const float d[restrict static 32], 6 | float w[restrict static 32]) 7 | { 8 | float32x4_t w0 = vld1q_f32(d + 0); 9 | float32x4_t w1 = vld1q_f32(d + 4); 10 | float32x4_t w2 = vld1q_f32(d + 8); 11 | float32x4_t w3 = vld1q_f32(d + 12); 12 | float32x4_t w4 = vld1q_f32(d + 16); 13 | float32x4_t w5 = vld1q_f32(d + 20); 14 | float32x4_t w6 = vld1q_f32(d + 24); 15 | float32x4_t w7 = vld1q_f32(d + 28); 16 | 17 | winograd_f6k3_input_transform( 18 | w0, w1, w2, w3, w4, w5, w6, w7, 19 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7); 20 | 21 | vst1q_f32(w + 0, w0); 22 | vst1q_f32(w + 4, w1); 23 | vst1q_f32(w + 8, w2); 24 | vst1q_f32(w + 12, w3); 25 | vst1q_f32(w + 16, w4); 26 | vst1q_f32(w + 20, w5); 27 | vst1q_f32(w + 24, w6); 28 | vst1q_f32(w + 28, w7); 29 | } 30 | 31 | void nnp_kwt_f6k3__neon( 32 | const float g[restrict static 12], 33 | float w[restrict static 32]) 34 | { 35 | const float32x4_t g0 = vld1q_f32(g + 0); 36 | const float32x4_t g1 = vld1q_f32(g + 4); 37 | const float32x4_t g2 = vld1q_f32(g + 8); 38 | 39 | float32x4_t w0, w1, w2, w3, w4, w5, w6, w7; 40 | winograd_f6k3_kernel_transform( 41 | g0, g1, g2, 42 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, 43 | true /* rescale coefficients */); 44 | 45 | vst1q_f32(w + 0, w0); 46 | vst1q_f32(w + 4, w1); 47 | vst1q_f32(w + 8, w2); 48 | vst1q_f32(w + 12, w3); 49 | vst1q_f32(w + 16, w4); 50 | vst1q_f32(w + 20, w5); 51 | vst1q_f32(w + 24, w6); 52 | vst1q_f32(w + 28, w7); 53 | } 54 | 55 | void nnp_owt_f6k3__neon( 56 | const float m[restrict static 32], 57 | float s[restrict static 24]) 58 | { 59 | float32x4_t w0 = vld1q_f32(m + 0); 60 | 
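/*
 * (Editorial note.) neon_transpose4x4_inplace_f32 in src/neon/transpose.h
 * above transposes a 4x4 tile in two steps: vtrnq_f32 swaps elements within
 * 2x2 sub-blocks, then vcombine_f32/vget_{low,high}_f32 swap the off-diagonal
 * 2x2 blocks. Worked example on element values:
 *
 *   ( 0  1  2  3 )          ( 0  4  8 12 )
 *   ( 4  5  6  7 )   --->   ( 1  5  9 13 )
 *   ( 8  9 10 11 )          ( 2  6 10 14 )
 *   (12 13 14 15 )          ( 3  7 11 15 )
 *
 * After vtrnq_f32: row01 = { (0 4 2 6), (1 5 3 7) } and
 * row23 = { (8 12 10 14), (9 13 11 15) }; combining the low halves gives
 * row0 = (0 4 8 12), and the high halves give row2 = (2 6 10 14), etc.
 */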
float32x4_t w1 = vld1q_f32(m + 4); 61 | float32x4_t w2 = vld1q_f32(m + 8); 62 | float32x4_t w3 = vld1q_f32(m + 12); 63 | float32x4_t w4 = vld1q_f32(m + 16); 64 | float32x4_t w5 = vld1q_f32(m + 20); 65 | float32x4_t w6 = vld1q_f32(m + 24); 66 | float32x4_t w7 = vld1q_f32(m + 28); 67 | 68 | float32x4_t s0, s1, s2, s3, s4, s5; 69 | winograd_f6k3_output_transformq( 70 | w0, w1, w2, w3, w4, w5, w6, w7, 71 | &s0, &s1, &s2, &s3, &s4, &s5); 72 | 73 | vst1q_f32(s + 0, s0); 74 | vst1q_f32(s + 4, s1); 75 | vst1q_f32(s + 8, s2); 76 | vst1q_f32(s + 12, s3); 77 | vst1q_f32(s + 16, s4); 78 | vst1q_f32(s + 20, s5); 79 | } 80 | -------------------------------------------------------------------------------- /src/psimd/butterfly.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline void psimd_butterfly_f32( 7 | psimd_f32 a[restrict static 1], 8 | psimd_f32 b[restrict static 1]) 9 | { 10 | const psimd_f32 new_a = *a + *b; 11 | const psimd_f32 new_b = *a - *b; 12 | *a = new_a; 13 | *b = new_b; 14 | } 15 | 16 | static inline void psimd_butterfly_and_negate_b_f32( 17 | psimd_f32 a[restrict static 1], 18 | psimd_f32 b[restrict static 1]) 19 | { 20 | const psimd_f32 new_a = *a + *b; 21 | const psimd_f32 new_b = *b - *a; 22 | *a = new_a; 23 | *b = new_b; 24 | } 25 | 26 | static inline void psimd_butterfly_with_negated_b_f32( 27 | psimd_f32 a[restrict static 1], 28 | psimd_f32 b[restrict static 1]) 29 | { 30 | const psimd_f32 new_a = *a - *b; 31 | const psimd_f32 new_b = *a + *b; 32 | *a = new_a; 33 | *b = new_b; 34 | } 35 | -------------------------------------------------------------------------------- /src/psimd/exp.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | 8 | void nnp_vector_exp__psimd( 9 | size_t n, 10 | const float x[restrict static n], 11 | float y[restrict static n]) 12 | { 13 | do { 14 | psimd_store_f32(y, 15 | psimd_exp_f32(psimd_load_f32(x))); 16 | 17 | y += 4; 18 | x += 4; 19 | n -= 4; 20 | } while (n >= 4); 21 | if (n != 0) { 22 | psimd_store_f32(y + n - 4, 23 | psimd_exp_f32(psimd_load_f32(x + n - 4))); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/psimd/exp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline psimd_f32 psimd_exp_f32(psimd_f32 x) { 7 | const psimd_f32 magic_bias = psimd_splat_f32(0x1.800000p+23f); 8 | const psimd_f32 zero_cutoff = psimd_splat_f32(-0x1.9FE368p+6f); /* The smallest x for which expf(x) is non-zero */ 9 | const psimd_f32 inf_cutoff = psimd_splat_f32(0x1.62E42Ep+6f); /* The largest x for which expf(x) is finite */ 10 | const psimd_f32 log2e = psimd_splat_f32(0x1.715476p+0f); 11 | const psimd_f32 ln2_hi = psimd_splat_f32(0x1.62E400p-1f); /* The lowest 7 bits are zeros */ 12 | const psimd_f32 ln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f); 13 | const psimd_f32 plus_inf = psimd_splat_f32(__builtin_inff()); 14 | 15 | const psimd_f32 c2 = psimd_splat_f32(0x1.FFFFFCp-2f); 16 | const psimd_f32 c3 = psimd_splat_f32(0x1.55548Cp-3f); 17 | const psimd_f32 c4 = psimd_splat_f32(0x1.555834p-5f); 18 | const psimd_f32 c5 = psimd_splat_f32(0x1.123CFEp-7f); 19 | const psimd_f32 c6 = psimd_splat_f32(0x1.6ADCAEp-10f); 20 | 21 | const psimd_s32 min_exponent = psimd_splat_s32((int32_t)((uint32_t) -126 << 23)); 22 | const psimd_s32 max_exponent = psimd_splat_s32(127 << 
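/*
 * (Editorial sketch, not NNPACK source.) The helpers in src/psimd/butterfly.h
 * above are the radix-2 FFT primitive: (a, b) -> (a + b, a - b). Applying the
 * butterfly twice returns the inputs scaled by 2, since (a+b) + (a-b) = 2a
 * and (a+b) - (a-b) = 2b; this is why a forward/inverse FFT round trip
 * through such kernels picks up a power-of-two scale that the inverse
 * transform must divide out. A scalar rendition:
 */
static void butterfly_sketch(float* a, float* b) {
	const float new_a = *a + *b;
	const float new_b = *a - *b;
	*a = new_a; /* after a second application: 2 * (original a) */
	*b = new_b; /* after a second application: 2 * (original b) */
}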
23); 23 | const psimd_s32 default_exponent = psimd_splat_s32(0x3F800000); 24 | 25 | psimd_f32 t = x * log2e + magic_bias; 26 | psimd_s32 e1 = ((psimd_s32) t) << psimd_splat_s32(23); 27 | psimd_s32 e2 = e1; 28 | e1 = psimd_min_s32(psimd_max_s32(e1, min_exponent), max_exponent); 29 | e2 = e2 - e1; 30 | 31 | const psimd_f32 s1 = (psimd_f32) (e1 + default_exponent); 32 | const psimd_f32 s2 = (psimd_f32) (e2 + default_exponent); 33 | 34 | t = t - magic_bias; 35 | const psimd_f32 rx = (x - t * ln2_hi) - t * ln2_lo; 36 | const psimd_f32 rf = rx + rx * rx * (c2 + rx * (c3 + rx * (c4 + rx * (c5 + rx * c6)))); 37 | psimd_f32 f = s2 * (s1 * rf + s1); 38 | 39 | /* Fixup underflow to zero */ 40 | f = psimd_andmask_f32(x > zero_cutoff, f); 41 | 42 | /* Fixup overflow */ 43 | f = psimd_blend_f32(x > inf_cutoff, plus_inf, f); 44 | return f; 45 | } 46 | -------------------------------------------------------------------------------- /src/psimd/fft-aos.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft4_4aos__psimd( 5 | const float t[restrict static 32], 6 | float f[restrict static 32]) 7 | { 8 | psimd_f32 w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; 9 | psimd_fft4_aos_f32( 10 | t, t + 16, 4, 0, 8, 11 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i); 12 | psimd_store_f32(f + 0, w0r); 13 | psimd_store_f32(f + 4, w0i); 14 | psimd_store_f32(f + 8, w1r); 15 | psimd_store_f32(f + 12, w1i); 16 | psimd_store_f32(f + 16, w2r); 17 | psimd_store_f32(f + 20, w2i); 18 | psimd_store_f32(f + 24, w3r); 19 | psimd_store_f32(f + 28, w3i); 20 | } 21 | 22 | void nnp_fft8_4aos__psimd( 23 | const float t[restrict static 64], 24 | float f[restrict static 64]) 25 | { 26 | psimd_f32 w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i; 27 | psimd_fft8_aos_f32( 28 | t, t + 32, 4, 0, 16, 29 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i, &w4r, &w4i, &w5r, &w5i, &w6r, &w6i, &w7r, &w7i); 30 | psimd_store_f32(f + 0, w0r); 31 | psimd_store_f32(f + 4, w0i); 32 | psimd_store_f32(f + 8, w1r); 33 | psimd_store_f32(f + 12, w1i); 34 | psimd_store_f32(f + 16, w2r); 35 | psimd_store_f32(f + 20, w2i); 36 | psimd_store_f32(f + 24, w3r); 37 | psimd_store_f32(f + 28, w3i); 38 | psimd_store_f32(f + 32, w4r); 39 | psimd_store_f32(f + 36, w4i); 40 | psimd_store_f32(f + 40, w5r); 41 | psimd_store_f32(f + 44, w5i); 42 | psimd_store_f32(f + 48, w6r); 43 | psimd_store_f32(f + 52, w6i); 44 | psimd_store_f32(f + 56, w7r); 45 | psimd_store_f32(f + 60, w7i); 46 | } 47 | 48 | void nnp_ifft4_4aos__psimd( 49 | const float f[restrict static 32], 50 | float t[restrict static 32]) 51 | { 52 | const psimd_f32 w0r = psimd_load_f32(f + 0); 53 | const psimd_f32 w0i = psimd_load_f32(f + 4); 54 | const psimd_f32 w1r = psimd_load_f32(f + 8); 55 | const psimd_f32 w1i = psimd_load_f32(f + 12); 56 | const psimd_f32 w2r = psimd_load_f32(f + 16); 57 | const psimd_f32 w2i = psimd_load_f32(f + 20); 58 | const psimd_f32 w3r = psimd_load_f32(f + 24); 59 | const psimd_f32 w3i = psimd_load_f32(f + 28); 60 | 61 | psimd_ifft4_aos_f32( 62 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, 63 | t, t + 16, 4); 64 | } 65 | 66 | void nnp_ifft8_4aos__psimd( 67 | const float f[restrict static 64], 68 | float t[restrict static 64]) 69 | { 70 | const psimd_f32 w0r = psimd_load_f32(f + 0); 71 | const psimd_f32 w0i = psimd_load_f32(f + 4); 72 | const psimd_f32 w1r = psimd_load_f32(f + 8); 73 | const psimd_f32 w1i = psimd_load_f32(f + 12); 74 | const psimd_f32 w2r = psimd_load_f32(f + 16); 75 | const psimd_f32 
w2i = psimd_load_f32(f + 20); 76 | const psimd_f32 w3r = psimd_load_f32(f + 24); 77 | const psimd_f32 w3i = psimd_load_f32(f + 28); 78 | const psimd_f32 w4r = psimd_load_f32(f + 32); 79 | const psimd_f32 w4i = psimd_load_f32(f + 36); 80 | const psimd_f32 w5r = psimd_load_f32(f + 40); 81 | const psimd_f32 w5i = psimd_load_f32(f + 44); 82 | const psimd_f32 w6r = psimd_load_f32(f + 48); 83 | const psimd_f32 w6i = psimd_load_f32(f + 52); 84 | const psimd_f32 w7r = psimd_load_f32(f + 56); 85 | const psimd_f32 w7i = psimd_load_f32(f + 60); 86 | 87 | psimd_ifft8_aos_f32( 88 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i, 89 | t, t + 32, 4); 90 | } 91 | -------------------------------------------------------------------------------- /src/psimd/fft-dualreal.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_dualreal__psimd( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | psimd_f32 s0123 = psimd_load_f32(t + 0); 9 | psimd_f32 s4567 = psimd_load_f32(t + 4); 10 | psimd_f32 h0123 = psimd_load_f32(t + 8); 11 | psimd_f32 h4567 = psimd_load_f32(t + 12); 12 | 13 | psimd_fft8_dualreal_f32(&s0123, &s4567, &h0123, &h4567); 14 | 15 | psimd_store_f32(f + 0, s0123); 16 | psimd_store_f32(f + 4, s4567); 17 | psimd_store_f32(f + 8, h0123); 18 | psimd_store_f32(f + 12, h4567); 19 | } 20 | 21 | void nnp_fft16_dualreal__psimd( 22 | const float t[restrict static 32], 23 | float f[restrict static 32]) 24 | { 25 | psimd_f32 s0123 = psimd_load_f32(t + 0); 26 | psimd_f32 s4567 = psimd_load_f32(t + 4); 27 | psimd_f32 s89AB = psimd_load_f32(t + 8); 28 | psimd_f32 sCDEF = psimd_load_f32(t + 12); 29 | psimd_f32 h0123 = psimd_load_f32(t + 16); 30 | psimd_f32 h4567 = psimd_load_f32(t + 20); 31 | psimd_f32 h89AB = psimd_load_f32(t + 24); 32 | psimd_f32 hCDEF = psimd_load_f32(t + 28); 33 | 34 | psimd_fft16_dualreal_f32(&s0123, &s4567, &s89AB, &sCDEF, &h0123, &h4567, &h89AB, &hCDEF); 35 | 36 | psimd_store_f32(f + 0, s0123); 37 | psimd_store_f32(f + 4, s4567); 38 | psimd_store_f32(f + 8, s89AB); 39 | psimd_store_f32(f + 12, sCDEF); 40 | psimd_store_f32(f + 16, h0123); 41 | psimd_store_f32(f + 20, h4567); 42 | psimd_store_f32(f + 24, h89AB); 43 | psimd_store_f32(f + 28, hCDEF); 44 | } 45 | 46 | void nnp_ifft8_dualreal__psimd( 47 | const float f[restrict static 16], 48 | float t[restrict static 16]) 49 | { 50 | psimd_f32 s0123 = psimd_load_f32(f + 0); 51 | psimd_f32 s4567 = psimd_load_f32(f + 4); 52 | psimd_f32 h0123 = psimd_load_f32(f + 8); 53 | psimd_f32 h4567 = psimd_load_f32(f + 12); 54 | 55 | psimd_ifft8_dualreal_f32(&s0123, &s4567, &h0123, &h4567); 56 | 57 | psimd_store_f32(t + 0, s0123); 58 | psimd_store_f32(t + 4, s4567); 59 | psimd_store_f32(t + 8, h0123); 60 | psimd_store_f32(t + 12, h4567); 61 | } 62 | 63 | void nnp_ifft16_dualreal__psimd( 64 | const float f[restrict static 32], 65 | float t[restrict static 32]) 66 | { 67 | psimd_f32 s0123 = psimd_load_f32(f + 0); 68 | psimd_f32 s4567 = psimd_load_f32(f + 4); 69 | psimd_f32 s89AB = psimd_load_f32(f + 8); 70 | psimd_f32 sCDEF = psimd_load_f32(f + 12); 71 | psimd_f32 h0123 = psimd_load_f32(f + 16); 72 | psimd_f32 h4567 = psimd_load_f32(f + 20); 73 | psimd_f32 h89AB = psimd_load_f32(f + 24); 74 | psimd_f32 hCDEF = psimd_load_f32(f + 28); 75 | 76 | psimd_ifft16_dualreal_f32(&s0123, &s4567, &s89AB, &sCDEF, &h0123, &h4567, &h89AB, &hCDEF); 77 | 78 | psimd_store_f32(t + 0, s0123); 79 | psimd_store_f32(t + 4, s4567); 80 | psimd_store_f32(t + 8, 
s89AB); 81 | psimd_store_f32(t + 12, sCDEF); 82 | psimd_store_f32(t + 16, h0123); 83 | psimd_store_f32(t + 20, h4567); 84 | psimd_store_f32(t + 24, h89AB); 85 | psimd_store_f32(t + 28, hCDEF); 86 | } 87 | -------------------------------------------------------------------------------- /src/psimd/fft-real.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_4real__psimd( 5 | const float t[restrict static 32], 6 | float f[restrict static 32]) 7 | { 8 | psimd_fft8_real_f32( 9 | t, t + 16, 4, 0, 8, 10 | f, 4); 11 | } 12 | 13 | void nnp_fft16_4real__psimd( 14 | const float t[restrict static 64], 15 | float f[restrict static 64]) 16 | { 17 | psimd_fft16_real_f32( 18 | t, t + 32, 4, 0, 16, 19 | f, 4); 20 | } 21 | 22 | void nnp_ifft8_4real__psimd( 23 | const float f[restrict static 32], 24 | float t[restrict static 32]) 25 | { 26 | const psimd_f32 f0r = psimd_load_f32(f + 0); 27 | const psimd_f32 f4r = psimd_load_f32(f + 4); 28 | const psimd_f32 f1r = psimd_load_f32(f + 8); 29 | const psimd_f32 f1i = psimd_load_f32(f + 12); 30 | const psimd_f32 f2r = psimd_load_f32(f + 16); 31 | const psimd_f32 f2i = psimd_load_f32(f + 20); 32 | const psimd_f32 f3r = psimd_load_f32(f + 24); 33 | const psimd_f32 f3i = psimd_load_f32(f + 28); 34 | psimd_ifft8_real_f32( 35 | f0r, f4r, f1r, f1i, f2r, f2i, f3r, f3i, 36 | t, t + 16, 4); 37 | } 38 | 39 | void nnp_ifft16_4real__psimd( 40 | const float f[restrict static 64], 41 | float t[restrict static 64]) 42 | { 43 | const psimd_f32 f0r = psimd_load_f32(f + 0); 44 | const psimd_f32 f8r = psimd_load_f32(f + 4); 45 | const psimd_f32 f1r = psimd_load_f32(f + 8); 46 | const psimd_f32 f1i = psimd_load_f32(f + 12); 47 | const psimd_f32 f2r = psimd_load_f32(f + 16); 48 | const psimd_f32 f2i = psimd_load_f32(f + 20); 49 | const psimd_f32 f3r = psimd_load_f32(f + 24); 50 | const psimd_f32 f3i = psimd_load_f32(f + 28); 51 | const psimd_f32 f4r = psimd_load_f32(f + 32); 52 | const psimd_f32 f4i = psimd_load_f32(f + 36); 53 | const psimd_f32 f5r = psimd_load_f32(f + 40); 54 | const psimd_f32 f5i = psimd_load_f32(f + 44); 55 | const psimd_f32 f6r = psimd_load_f32(f + 48); 56 | const psimd_f32 f6i = psimd_load_f32(f + 52); 57 | const psimd_f32 f7r = psimd_load_f32(f + 56); 58 | const psimd_f32 f7i = psimd_load_f32(f + 60); 59 | psimd_ifft16_real_f32( 60 | f0r, f8r, f1r, f1i, f2r, f2i, f3r, f3i, f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i, 61 | t, t + 32, 4); 62 | } 63 | -------------------------------------------------------------------------------- /src/psimd/fft-soa.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_soa__psimd( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | psimd_f32 w0123r = psimd_load_f32(t + 0); 9 | psimd_f32 w4567r = psimd_load_f32(t + 4); 10 | psimd_f32 w0123i = psimd_load_f32(t + 8); 11 | psimd_f32 w4567i = psimd_load_f32(t + 12); 12 | 13 | psimd_fft8_soa_f32(&w0123r, &w4567r, &w0123i, &w4567i); 14 | 15 | psimd_store_f32(f + 0, w0123r); 16 | psimd_store_f32(f + 4, w4567r); 17 | psimd_store_f32(f + 8, w0123i); 18 | psimd_store_f32(f + 12, w4567i); 19 | } 20 | 21 | void nnp_fft16_soa__psimd( 22 | const float t[restrict static 32], 23 | float f[restrict static 32]) 24 | { 25 | psimd_f32 w0123r = psimd_load_f32(t + 0); 26 | psimd_f32 w4567r = psimd_load_f32(t + 4); 27 | psimd_f32 w89ABr = psimd_load_f32(t + 8); 28 | psimd_f32 wCDEFr = psimd_load_f32(t + 12); 29 | psimd_f32 w0123i = 
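/*
 * Annotation (not part of the original source): the psimd FFT entry points
 * differ only in data layout. "aos" (array-of-structures) stores the real
 * and imaginary parts of each coefficient in adjacent 4-wide blocks, with
 * the SIMD lanes carrying four independent transforms. "soa"
 * (structure-of-arrays, this file) keeps all real parts together, then all
 * imaginary parts. "dualreal" computes the spectra of two real sequences
 * x and y from a single complex FFT of z = x + i*y, separating them
 * afterwards through conjugate symmetry:
 *   X[k] = (Z[k] + conj(Z[N-k])) / 2,  Y[k] = (Z[k] - conj(Z[N-k])) / (2i).
 */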
psimd_load_f32(t + 16); 30 | psimd_f32 w4567i = psimd_load_f32(t + 20); 31 | psimd_f32 w89ABi = psimd_load_f32(t + 24); 32 | psimd_f32 wCDEFi = psimd_load_f32(t + 28); 33 | 34 | psimd_fft16_soa_f32(&w0123r, &w4567r, &w89ABr, &wCDEFr, &w0123i, &w4567i, &w89ABi, &wCDEFi); 35 | 36 | psimd_store_f32(f + 0, w0123r); 37 | psimd_store_f32(f + 4, w4567r); 38 | psimd_store_f32(f + 8, w89ABr); 39 | psimd_store_f32(f + 12, wCDEFr); 40 | psimd_store_f32(f + 16, w0123i); 41 | psimd_store_f32(f + 20, w4567i); 42 | psimd_store_f32(f + 24, w89ABi); 43 | psimd_store_f32(f + 28, wCDEFi); 44 | } 45 | 46 | void nnp_ifft8_soa__psimd( 47 | const float f[restrict static 16], 48 | float t[restrict static 16]) 49 | { 50 | psimd_f32 w0123r = psimd_load_f32(f + 0); 51 | psimd_f32 w4567r = psimd_load_f32(f + 4); 52 | psimd_f32 w0123i = psimd_load_f32(f + 8); 53 | psimd_f32 w4567i = psimd_load_f32(f + 12); 54 | 55 | psimd_ifft8_soa_f32(&w0123r, &w4567r, &w0123i, &w4567i); 56 | 57 | psimd_store_f32(t + 0, w0123r); 58 | psimd_store_f32(t + 4, w4567r); 59 | psimd_store_f32(t + 8, w0123i); 60 | psimd_store_f32(t + 12, w4567i); 61 | } 62 | 63 | void nnp_ifft16_soa__psimd( 64 | const float f[restrict static 32], 65 | float t[restrict static 32]) 66 | { 67 | psimd_f32 w0123r = psimd_load_f32(f + 0); 68 | psimd_f32 w4567r = psimd_load_f32(f + 4); 69 | psimd_f32 w89ABr = psimd_load_f32(f + 8); 70 | psimd_f32 wCDEFr = psimd_load_f32(f + 12); 71 | psimd_f32 w0123i = psimd_load_f32(f + 16); 72 | psimd_f32 w4567i = psimd_load_f32(f + 20); 73 | psimd_f32 w89ABi = psimd_load_f32(f + 24); 74 | psimd_f32 wCDEFi = psimd_load_f32(f + 28); 75 | 76 | psimd_ifft16_soa_f32(&w0123r, &w4567r, &w89ABr, &wCDEFr, &w0123i, &w4567i, &w89ABi, &wCDEFi); 77 | 78 | psimd_store_f32(t + 0, w0123r); 79 | psimd_store_f32(t + 4, w4567r); 80 | psimd_store_f32(t + 8, w89ABr); 81 | psimd_store_f32(t + 12, wCDEFr); 82 | psimd_store_f32(t + 16, w0123i); 83 | psimd_store_f32(t + 20, w4567i); 84 | psimd_store_f32(t + 24, w89ABi); 85 | psimd_store_f32(t + 28, wCDEFi); 86 | } 87 | -------------------------------------------------------------------------------- /src/psimd/relu.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | 8 | void nnp_relu__psimd( 9 | const float input[restrict static 4], 10 | float output[restrict static 4], 11 | size_t length, 12 | float negative_slope) 13 | { 14 | const psimd_f32 vec_negative_slope = psimd_splat_f32(negative_slope); 15 | 16 | /* Length is always non-zero and proportional to SIMD width */ 17 | do { 18 | psimd_store_f32(output, 19 | psimd_relu_f32(psimd_load_f32(input), vec_negative_slope)); 20 | 21 | input += 4; 22 | output += 4; 23 | length -= 4; 24 | } while (length != 0); 25 | } 26 | 27 | void nnp_inplace_relu__psimd( 28 | float data[restrict static 4], 29 | size_t length, 30 | float negative_slope) 31 | { 32 | const psimd_f32 vec_negative_slope = psimd_splat_f32(negative_slope); 33 | 34 | /* Length is always non-zero and proportional to SIMD width */ 35 | do { 36 | psimd_store_f32(data, 37 | psimd_relu_f32(psimd_load_f32(data), vec_negative_slope)); 38 | 39 | data += 4; 40 | length -= 4; 41 | } while (length != 0); 42 | } 43 | 44 | void nnp_grad_relu__psimd( 45 | const float output_gradient[restrict static 4], 46 | const float input[restrict static 4], 47 | float input_gradient[restrict static 4], 48 | size_t length, 49 | float negative_slope) 50 | { 51 | const psimd_f32 vec_negative_slope = psimd_splat_f32(negative_slope); 52 | 53 | 
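/*
 * Annotation (not part of the original source): for leaky ReLU with slope
 * s, the backward pass below computes, lane-wise,
 *   grad_input = (input > 0) ? grad_output : s * grad_output,
 * i.e. grad_output scaled by the derivative of the activation. Ragged
 * buffer lengths are not handled here: the scalar prologue/epilogue lives
 * in the caller (src/relu-input-gradient.c), so this kernel may assume the
 * length is a positive multiple of the 4-lane SIMD width.
 */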
/* Length is always non-zero and proportional to SIMD width */ 54 | do { 55 | psimd_store_f32(input_gradient, 56 | psimd_grad_relu_f32(psimd_load_f32(output_gradient), psimd_load_f32(input), vec_negative_slope)); 57 | 58 | output_gradient += 4; 59 | input += 4; 60 | input_gradient += 4; 61 | length -= 4; 62 | } while (length != 0); 63 | } 64 | -------------------------------------------------------------------------------- /src/psimd/transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline void psimd_transpose4x4_f32( 7 | const psimd_f32 row0, const psimd_f32 row1, const psimd_f32 row2, const psimd_f32 row3, 8 | psimd_f32 col0[restrict static 1], 9 | psimd_f32 col1[restrict static 1], 10 | psimd_f32 col2[restrict static 1], 11 | psimd_f32 col3[restrict static 1]) 12 | { 13 | /* 14 | * row0 = ( x00 x01 x02 x03 ) 15 | * row1 = ( x10 x11 x12 x13 ) 16 | * row2 = ( x20 x21 x22 x23 ) 17 | * row3 = ( x30 x31 x32 x33 ) 18 | */ 19 | 20 | /* 21 | * row01lo = ( x00 x10 x01 x11 ) 22 | * row01hi = ( x02 x12 x03 x13 ) 23 | * row23lo = ( x20 x30 x21 x31 ) 24 | * row23hi = ( x22 x32 x23 x33 ) 25 | */ 26 | const psimd_f32 row01lo = psimd_interleave_lo_f32(row0, row1); 27 | const psimd_f32 row01hi = psimd_interleave_hi_f32(row0, row1); 28 | const psimd_f32 row23lo = psimd_interleave_lo_f32(row2, row3); 29 | const psimd_f32 row23hi = psimd_interleave_hi_f32(row2, row3); 30 | 31 | /* 32 | * col0 = ( x00 x10 x20 x30 ) 33 | * col1 = ( x01 x11 x21 x31 ) 34 | * col2 = ( x02 x12 x22 x32 ) 35 | * col3 = ( x03 x13 x23 x33 ) 36 | */ 37 | *col0 = psimd_concat_lo_f32(row01lo, row23lo); 38 | *col1 = psimd_concat_hi_f32(row01lo, row23lo); 39 | *col2 = psimd_concat_lo_f32(row01hi, row23hi); 40 | *col3 = psimd_concat_hi_f32(row01hi, row23hi); 41 | } 42 | -------------------------------------------------------------------------------- /src/psimd/winograd-f6k3.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_iwt_f6k3__psimd( 5 | const float d[restrict static 32], 6 | float w[restrict static 32]) 7 | { 8 | const psimd_f32 d0 = psimd_load_f32(d + 0); 9 | const psimd_f32 d1 = psimd_load_f32(d + 4); 10 | const psimd_f32 d2 = psimd_load_f32(d + 8); 11 | const psimd_f32 d3 = psimd_load_f32(d + 12); 12 | const psimd_f32 d4 = psimd_load_f32(d + 16); 13 | const psimd_f32 d5 = psimd_load_f32(d + 20); 14 | const psimd_f32 d6 = psimd_load_f32(d + 24); 15 | const psimd_f32 d7 = psimd_load_f32(d + 28); 16 | 17 | psimd_f32 w0, w1, w2, w3, w4, w5, w6, w7; 18 | winograd_f6k3_input_transform( 19 | d0, d1, d2, d3, d4, d5, d6, d7, 20 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7); 21 | 22 | psimd_store_f32(w + 0, w0); 23 | psimd_store_f32(w + 4, w1); 24 | psimd_store_f32(w + 8, w2); 25 | psimd_store_f32(w + 12, w3); 26 | psimd_store_f32(w + 16, w4); 27 | psimd_store_f32(w + 20, w5); 28 | psimd_store_f32(w + 24, w6); 29 | psimd_store_f32(w + 28, w7); 30 | } 31 | 32 | void nnp_kwt_f6k3__psimd( 33 | const float g[restrict static 12], 34 | float w[restrict static 32]) 35 | { 36 | const psimd_f32 g0 = psimd_load_f32(g + 0); 37 | const psimd_f32 g1 = psimd_load_f32(g + 4); 38 | const psimd_f32 g2 = psimd_load_f32(g + 8); 39 | 40 | psimd_f32 w0, w1, w2, w3, w4, w5, w6, w7; 41 | winograd_f6k3_kernel_transform( 42 | g0, g1, g2, 43 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, 44 | true /* rescale coefficients */); 45 | 46 | psimd_store_f32(w + 0, w0); 47 | psimd_store_f32(w + 4, w1); 48 | 
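/*
 * Annotation (not part of the original source): these functions implement
 * the Winograd F(6,3) fast convolution. The input transform maps an
 * 8-sample tile to 8 values, the kernel transform (this function) maps the
 * 3 filter taps to 8 matching coefficients, and after an element-wise
 * multiply the output transform reduces the 8 products to 6 outputs. That
 * is 8 multiplications per tile column where direct convolution needs
 * 6*3 = 18, traded against extra additions and some numerical headroom.
 * The "rescale coefficients" flag folds the transform's constant scale
 * factors into the kernel side, where they are computed once per kernel
 * rather than once per output tile.
 */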
psimd_store_f32(w + 8, w2); 49 | psimd_store_f32(w + 12, w3); 50 | psimd_store_f32(w + 16, w4); 51 | psimd_store_f32(w + 20, w5); 52 | psimd_store_f32(w + 24, w6); 53 | psimd_store_f32(w + 28, w7); 54 | } 55 | 56 | void nnp_owt_f6k3__psimd( 57 | const float m[restrict static 32], 58 | float s[restrict static 24]) 59 | { 60 | const psimd_f32 m0 = psimd_load_f32(m + 0); 61 | const psimd_f32 m1 = psimd_load_f32(m + 4); 62 | const psimd_f32 m2 = psimd_load_f32(m + 8); 63 | const psimd_f32 m3 = psimd_load_f32(m + 12); 64 | const psimd_f32 m4 = psimd_load_f32(m + 16); 65 | const psimd_f32 m5 = psimd_load_f32(m + 20); 66 | const psimd_f32 m6 = psimd_load_f32(m + 24); 67 | const psimd_f32 m7 = psimd_load_f32(m + 28); 68 | 69 | psimd_f32 s0, s1, s2, s3, s4, s5; 70 | winograd_f6k3_output_transform( 71 | m0, m1, m2, m3, m4, m5, m6, m7, 72 | &s0, &s1, &s2, &s3, &s4, &s5); 73 | 74 | psimd_store_f32(s + 0, s0); 75 | psimd_store_f32(s + 4, s1); 76 | psimd_store_f32(s + 8, s2); 77 | psimd_store_f32(s + 12, s3); 78 | psimd_store_f32(s + 16, s4); 79 | psimd_store_f32(s + 20, s5); 80 | } 81 | -------------------------------------------------------------------------------- /src/ref/convolution-input-gradient.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct convolution_input_gradient_context { 5 | size_t input_channels; 6 | size_t output_channels; 7 | struct nnp_size input_size; 8 | struct nnp_padding input_padding; 9 | struct nnp_size kernel_size; 10 | struct nnp_size output_size; 11 | const float* grad_output_pointer; 12 | const float* kernel_pointer; 13 | float* grad_input_pointer; 14 | }; 15 | 16 | static void compute_convolution_input_gradient( 17 | const struct convolution_input_gradient_context context[restrict static 1], 18 | size_t sample, size_t input_channel) 19 | { 20 | const size_t input_channels = context->input_channels; 21 | const size_t output_channels = context->output_channels; 22 | const struct nnp_size input_size = context->input_size; 23 | const struct nnp_padding input_padding = context->input_padding; 24 | const struct nnp_size kernel_size = context->kernel_size; 25 | const struct nnp_size output_size = context->output_size; 26 | 27 | const float (*grad_output)[output_channels][output_size.height][output_size.width] = 28 | (const float(*)[output_channels][output_size.height][output_size.width]) context->grad_output_pointer; 29 | const float (*kernel)[input_channels][kernel_size.height][kernel_size.width] = 30 | (const float(*)[input_channels][kernel_size.height][kernel_size.width]) context->kernel_pointer; 31 | 32 | float (*grad_input)[input_channels][input_size.height][input_size.width] = 33 | (float(*)[input_channels][input_size.height][input_size.width]) context->grad_input_pointer; 34 | 35 | for (size_t y = 0; y < input_size.height; y++) { 36 | for (size_t x = 0; x < input_size.width; x++) { 37 | double v = 0.0; 38 | for (size_t output_channel = 0; output_channel < output_channels; output_channel++) { 39 | for (size_t i = 0; i < kernel_size.height; i++) { 40 | const size_t s = y - i + input_padding.top; 41 | if (s < output_size.height) { 42 | for (size_t j = 0; j < kernel_size.width; j++) { 43 | const size_t t = x - j + input_padding.left; 44 | if (t < output_size.width) { 45 | v += grad_output[sample][output_channel][s][t] * kernel[output_channel][input_channel][i][j]; 46 | } 47 | } 48 | } 49 | } 50 | } 51 | grad_input[sample][input_channel][y][x] = v; 52 | } 53 | } 54 | } 55 | 56 | void 
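/*
 * Annotation (not part of the original source):
 * compute_convolution_input_gradient() above is the transposed
 * convolution: for each input pixel (y, x) it gathers the output-gradient
 * positions s = y - i + padding.top, t = x - j + padding.left that the
 * forward pass computed from this pixel, which amounts to a full
 * correlation with the kernel flipped in both spatial dimensions.
 * Accumulation is done in double so the reference result is accurate
 * enough to serve as ground truth for the optimized kernels.
 */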
nnp_convolution_input_gradient__reference( 57 | size_t batch_size, 58 | size_t input_channels, 59 | size_t output_channels, 60 | struct nnp_size input_size, 61 | struct nnp_padding input_padding, 62 | struct nnp_size kernel_size, 63 | const float grad_output_pointer[], 64 | const float kernel_pointer[], 65 | float grad_input_pointer[], 66 | pthreadpool_t threadpool) 67 | { 68 | const struct nnp_size output_size = { 69 | .width = input_padding.left + input_size.width + input_padding.right - kernel_size.width + 1, 70 | .height = input_padding.top + input_size.height + input_padding.bottom - kernel_size.height + 1 71 | }; 72 | struct convolution_input_gradient_context convolution_input_gradient_context = { 73 | .input_channels = input_channels, 74 | .output_channels = output_channels, 75 | .input_size = input_size, 76 | .input_padding = input_padding, 77 | .kernel_size = kernel_size, 78 | .output_size = output_size, 79 | .grad_output_pointer = grad_output_pointer, 80 | .kernel_pointer = kernel_pointer, 81 | .grad_input_pointer = grad_input_pointer, 82 | }; 83 | 84 | pthreadpool_parallelize_2d(threadpool, 85 | (pthreadpool_function_2d_t) compute_convolution_input_gradient, 86 | &convolution_input_gradient_context, 87 | batch_size, input_channels, 88 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 89 | } 90 | -------------------------------------------------------------------------------- /src/ref/convolution-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct convolution_output_context { 5 | size_t input_channels; 6 | size_t output_channels; 7 | struct nnp_size input_size; 8 | struct nnp_padding input_padding; 9 | struct nnp_size kernel_size; 10 | struct nnp_size output_size; 11 | struct nnp_size output_subsampling; 12 | const float* input_pointer; 13 | const float* kernel_pointer; 14 | const float* bias; 15 | float* output_pointer; 16 | }; 17 | 18 | static void compute_convolution_output( 19 | const struct convolution_output_context context[restrict static 1], 20 | size_t sample, size_t output_channel) 21 | { 22 | const size_t input_channels = context->input_channels; 23 | const size_t output_channels = context->output_channels; 24 | const struct nnp_size input_size = context->input_size; 25 | const struct nnp_padding input_padding = context->input_padding; 26 | const struct nnp_size kernel_size = context->kernel_size; 27 | const struct nnp_size output_size = context->output_size; 28 | const struct nnp_size output_subsampling = context->output_subsampling; 29 | 30 | const float (*input)[input_channels][input_size.height][input_size.width] = 31 | (const float(*)[input_channels][input_size.height][input_size.width]) context->input_pointer; 32 | const float (*kernel)[input_channels][kernel_size.height][kernel_size.width] = 33 | (const float(*)[input_channels][kernel_size.height][kernel_size.width]) context->kernel_pointer; 34 | float (*output)[output_channels][output_size.height][output_size.width] = 35 | (float(*)[output_channels][output_size.height][output_size.width]) context->output_pointer; 36 | 37 | for (size_t y = 0; y < output_size.height; y++) { 38 | for (size_t x = 0; x < output_size.width; x++) { 39 | double v = 0.0; 40 | for (size_t input_channel = 0; input_channel < input_channels; input_channel++) { 41 | for (size_t i = 0; i < kernel_size.height; i++) { 42 | const size_t s = y * output_subsampling.height + i - input_padding.top; 43 | if (s < input_size.height) { 44 | for (size_t j = 0; j < kernel_size.width; 
j++) { 45 | const size_t t = x * output_subsampling.width + j - input_padding.left; 46 | if (t < input_size.width) { 47 | v += input[sample][input_channel][s][t] * kernel[output_channel][input_channel][i][j]; 48 | } 49 | } 50 | } 51 | } 52 | } 53 | output[sample][output_channel][y][x] = v + context->bias[output_channel]; 54 | } 55 | } 56 | } 57 | 58 | void nnp_convolution_output__reference( 59 | size_t batch_size, 60 | size_t input_channels, 61 | size_t output_channels, 62 | struct nnp_size input_size, 63 | struct nnp_padding input_padding, 64 | struct nnp_size kernel_size, 65 | struct nnp_size output_subsampling, 66 | const float input_pointer[], 67 | const float kernel_pointer[], 68 | const float bias[], 69 | float output_pointer[], 70 | pthreadpool_t threadpool) 71 | { 72 | const struct nnp_size output_size = { 73 | .width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1, 74 | .height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height) / output_subsampling.height + 1 75 | }; 76 | struct convolution_output_context convolution_output_context = { 77 | .input_channels = input_channels, 78 | .output_channels = output_channels, 79 | .input_size = input_size, 80 | .input_padding = input_padding, 81 | .kernel_size = kernel_size, 82 | .output_size = output_size, 83 | .output_subsampling = output_subsampling, 84 | .input_pointer = input_pointer, 85 | .kernel_pointer = kernel_pointer, 86 | .bias = bias, 87 | .output_pointer = output_pointer 88 | }; 89 | 90 | pthreadpool_parallelize_2d(threadpool, 91 | (pthreadpool_task_2d_t) compute_convolution_output, 92 | &convolution_output_context, 93 | batch_size, output_channels, 94 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 95 | } 96 | -------------------------------------------------------------------------------- /src/ref/fully-connected-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | struct fully_connected_output_context { 7 | size_t input_channels; 8 | size_t output_channels; 9 | const void* input_pointer; 10 | const void* kernel_pointer; 11 | void* output_pointer; 12 | }; 13 | 14 | static void compute_fully_connected_output_f32( 15 | const struct fully_connected_output_context* context, 16 | size_t sample, size_t output_channel) 17 | { 18 | const size_t input_channels = context->input_channels; 19 | const size_t output_channels = context->output_channels; 20 | 21 | const float (*input)[input_channels] = (const float(*)[input_channels]) context->input_pointer; 22 | const float (*kernel)[input_channels] = (const float(*)[input_channels]) context->kernel_pointer; 23 | float (*output)[output_channels] = (float(*)[output_channels]) context->output_pointer; 24 | 25 | double v = 0.0; 26 | for (size_t input_channel = 0; input_channel < input_channels; input_channel++) { 27 | v += (double) input[sample][input_channel] * (double) kernel[output_channel][input_channel]; 28 | } 29 | output[sample][output_channel] = v; 30 | } 31 | 32 | static void compute_fully_connected_output_f16f32( 33 | const struct fully_connected_output_context* context, 34 | size_t sample, size_t output_channel) 35 | { 36 | const size_t input_channels = context->input_channels; 37 | const size_t output_channels = context->output_channels; 38 | 39 | const float (*input)[input_channels] = (const float(*)[input_channels]) context->input_pointer; 40 | const uint16_t (*kernel)[input_channels] = (const 
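/*
 * Annotation (not part of the original source): in the f16f32 variant the
 * fully-connected weights are stored as 16-bit values in the alternative
 * (ALT) half-precision format of the FP16 library and expanded to fp32 one
 * element at a time via fp16_alt_to_fp32_value(), halving the memory
 * footprint of the kernel matrix relative to the f32 variant above.
 */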
uint16_t(*)[input_channels]) context->kernel_pointer; 41 | float (*output)[output_channels] = (float(*)[output_channels]) context->output_pointer; 42 | 43 | double v = 0.0; 44 | for (size_t input_channel = 0; input_channel < input_channels; input_channel++) { 45 | v += (double) input[sample][input_channel] * 46 | (double) fp16_alt_to_fp32_value(kernel[output_channel][input_channel]); 47 | } 48 | output[sample][output_channel] = v; 49 | } 50 | 51 | void nnp_fully_connected_output_f32__reference( 52 | size_t batch_size, 53 | size_t input_channels, 54 | size_t output_channels, 55 | const float* input, 56 | const float* kernel, 57 | float* output, 58 | pthreadpool_t threadpool) 59 | { 60 | struct fully_connected_output_context fully_connected_output_context = { 61 | .input_channels = input_channels, 62 | .output_channels = output_channels, 63 | .input_pointer = input, 64 | .kernel_pointer = kernel, 65 | .output_pointer = output 66 | }; 67 | 68 | pthreadpool_parallelize_2d(threadpool, 69 | (pthreadpool_function_2d_t) compute_fully_connected_output_f32, 70 | &fully_connected_output_context, 71 | batch_size, output_channels, 72 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 73 | } 74 | 75 | void nnp_fully_connected_output_f16f32__reference( 76 | size_t batch_size, 77 | size_t input_channels, 78 | size_t output_channels, 79 | const float* input, 80 | const void* kernel, 81 | float* output, 82 | pthreadpool_t threadpool) 83 | { 84 | struct fully_connected_output_context fully_connected_output_context = { 85 | .input_channels = input_channels, 86 | .output_channels = output_channels, 87 | .input_pointer = input, 88 | .kernel_pointer = kernel, 89 | .output_pointer = output 90 | }; 91 | 92 | pthreadpool_parallelize_2d(threadpool, 93 | (pthreadpool_function_2d_t) compute_fully_connected_output_f16f32, 94 | &fully_connected_output_context, 95 | batch_size, output_channels, 96 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 97 | } 98 | -------------------------------------------------------------------------------- /src/ref/max-pooling-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct max_pooling_output_context { 6 | size_t channels; 7 | struct nnp_size input_size; 8 | struct nnp_padding input_padding; 9 | struct nnp_size pooling_size; 10 | struct nnp_size pooling_stride; 11 | struct nnp_size output_size; 12 | const float* input; 13 | float* output; 14 | }; 15 | 16 | static void compute_max_pooling_output( 17 | const struct max_pooling_output_context context[restrict static 1], 18 | size_t sample, size_t channel) 19 | { 20 | const size_t channels = context->channels; 21 | const struct nnp_size input_size = context->input_size; 22 | const struct nnp_padding input_padding = context->input_padding; 23 | const struct nnp_size pooling_size = context->pooling_size; 24 | const struct nnp_size pooling_stride = context->pooling_stride; 25 | const struct nnp_size output_size = context->output_size; 26 | 27 | const float (*input)[channels][input_size.height][input_size.width] = 28 | (const float(*)[channels][input_size.height][input_size.width]) context->input; 29 | float (*output)[channels][output_size.height][output_size.width] = 30 | (float(*)[channels][output_size.height][output_size.width]) context->output; 31 | 32 | for (size_t y = 0; y < output_size.height; y++) { 33 | for (size_t x = 0; x < output_size.width; x++) { 34 | float v = -__builtin_inff(); 35 | for (size_t i = 0; i < pooling_size.height; i++) { 36 | const size_t s = y * 
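/*
 * Annotation (not part of the original source): s and t are size_t, so a
 * logically negative index (a pooling window overhanging the top/left
 * padding) wraps to a huge unsigned value, and the single check
 * s < input_size.height rejects both out-of-range directions at once. The
 * reference convolution kernels above use the same idiom. Example: y = 0,
 * i = 0, input_padding.top = 2 yields s = (size_t) -2, which fails the
 * bound just as an explicit signed check would.
 */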
pooling_stride.height + i - input_padding.top; 37 | if (s < input_size.height) { 38 | for (size_t j = 0; j < pooling_size.width; j++) { 39 | const size_t t = x * pooling_stride.width + j - input_padding.left; 40 | if (t < input_size.width) { 41 | v = maxf(input[sample][channel][s][t], v); 42 | } 43 | } 44 | } 45 | } 46 | output[sample][channel][y][x] = v; 47 | } 48 | } 49 | } 50 | 51 | void nnp_max_pooling_output__reference( 52 | size_t batch_size, 53 | size_t channels, 54 | struct nnp_size input_size, 55 | struct nnp_padding input_padding, 56 | struct nnp_size pooling_size, 57 | struct nnp_size pooling_stride, 58 | const float* input, 59 | float* output, 60 | pthreadpool_t threadpool) 61 | { 62 | const struct nnp_size output_size = { 63 | .height = divide_round_up(doz(input_padding.top + input_size.height + input_padding.bottom, pooling_size.height), pooling_stride.height) + 1, 64 | .width = divide_round_up(doz(input_padding.left + input_size.width + input_padding.right, pooling_size.width), pooling_stride.width) + 1, 65 | }; 66 | 67 | struct max_pooling_output_context max_pooling_output_context = { 68 | .channels = channels, 69 | .input_size = input_size, 70 | .input_padding = input_padding, 71 | .pooling_size = pooling_size, 72 | .pooling_stride = pooling_stride, 73 | .output_size = output_size, 74 | .input = input, 75 | .output = output 76 | }; 77 | 78 | pthreadpool_parallelize_2d(threadpool, 79 | (pthreadpool_function_2d_t) compute_max_pooling_output, 80 | &max_pooling_output_context, 81 | batch_size, channels, 82 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 83 | } 84 | -------------------------------------------------------------------------------- /src/ref/relu-input-gradient.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | struct relu_input_gradient_context { 7 | size_t channels; 8 | const float* grad_output; 9 | const float* input; 10 | float* grad_input; 11 | float negative_slope; 12 | }; 13 | 14 | static void compute_relu_input_gradient( 15 | const struct relu_input_gradient_context context[restrict static 1], 16 | size_t sample) 17 | { 18 | const size_t channels = context->channels; 19 | const float* grad_output = context->grad_output + sample * channels; 20 | const float* input = context->input + sample * channels; 21 | float* grad_input = context->grad_input + sample * channels; 22 | float negative_slope = context->negative_slope; 23 | 24 | for (size_t channel = 0; channel < channels; channel++) { 25 | grad_input[channel] = grad_relu(grad_output[channel], input[channel], negative_slope); 26 | } 27 | } 28 | 29 | void nnp_relu_input_gradient__reference( 30 | size_t batch_size, 31 | size_t channels, 32 | const float grad_output[], 33 | const float input[], 34 | float grad_input[], 35 | float negative_slope, 36 | pthreadpool_t threadpool) 37 | { 38 | struct relu_input_gradient_context relu_input_gradient_context = { 39 | .channels = channels, 40 | .grad_output = grad_output, 41 | .input = input, 42 | .grad_input = grad_input, 43 | .negative_slope = negative_slope, 44 | }; 45 | 46 | pthreadpool_parallelize_1d(threadpool, 47 | (pthreadpool_function_1d_t) compute_relu_input_gradient, 48 | &relu_input_gradient_context, 49 | batch_size, 50 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 51 | } 52 | -------------------------------------------------------------------------------- /src/ref/relu-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
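/*
 * Illustrative sketch (not part of the original source): every kernel in
 * src/ref follows the same template -- capture the arguments in a context
 * struct, then let pthreadpool fan a compute_*() callback out across the
 * batch (and, for the 2-D variants, across channels). The struct and
 * function names below (example_context, compute_example_copy) are made up
 * for illustration:
 */
struct example_context {
	size_t channels;
	const float* input;
	float* output;
};

static void compute_example_copy(
	const struct example_context context[restrict static 1],
	size_t sample)
{
	/* One task per sample; tasks write disjoint slices, so no locking. */
	for (size_t channel = 0; channel < context->channels; channel++) {
		context->output[sample * context->channels + channel] =
			context->input[sample * context->channels + channel];
	}
}
/*
 * It would be launched the same way as the kernels in this file:
 *   pthreadpool_parallelize_1d(threadpool,
 *       (pthreadpool_function_1d_t) compute_example_copy,
 *       &context, batch_size, PTHREADPOOL_FLAG_DISABLE_DENORMALS);
 */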
#include 4 | 5 | struct relu_output_context { 6 | size_t channels; 7 | const float* input; 8 | float* output; 9 | float negative_slope; 10 | }; 11 | 12 | static void compute_relu_output( 13 | const struct relu_output_context context[restrict static 1], 14 | size_t sample) 15 | { 16 | const size_t channels = context->channels; 17 | const float* input = context->input + sample * channels; 18 | float* output = context->output + sample * channels; 19 | float negative_slope = context->negative_slope; 20 | 21 | for (size_t channel = 0; channel < channels; channel++) { 22 | output[channel] = relu(input[channel], negative_slope); 23 | } 24 | } 25 | 26 | void nnp_relu_output__reference( 27 | size_t batch_size, 28 | size_t channels, 29 | const float input[], 30 | float output[], 31 | float negative_slope, 32 | pthreadpool_t threadpool) 33 | { 34 | struct relu_output_context relu_output_context = { 35 | .channels = channels, 36 | .input = input, 37 | .output = output, 38 | .negative_slope = negative_slope, 39 | }; 40 | 41 | pthreadpool_parallelize_1d(threadpool, 42 | (pthreadpool_function_1d_t) compute_relu_output, 43 | &relu_output_context, 44 | batch_size, 45 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 46 | } 47 | -------------------------------------------------------------------------------- /src/ref/softmax-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | static inline float vector_maxf(size_t length, const float array[restrict static length]) { 9 | float max_element = -FLT_MAX; 10 | for (size_t i = 0; i < length; i++) { 11 | max_element = maxf(max_element, array[i]); 12 | } 13 | return max_element; 14 | } 15 | 16 | static inline float vector_sum_expf_minus_c(size_t length, const float array[restrict static length], float c) { 17 | float sum = 0.0f; 18 | for (size_t i = 0; i < length; i++) { 19 | sum += expf(array[i] - c); 20 | } 21 | return sum; 22 | } 23 | 24 | struct softmax_output_context { 25 | size_t channels; 26 | const float* input; 27 | float* output; 28 | }; 29 | 30 | static void compute_softmax_output( 31 | const struct softmax_output_context context[restrict static 1], 32 | size_t sample) 33 | { 34 | const size_t channels = context->channels; 35 | 36 | const float (*input)[channels] = 37 | (const float(*)[channels]) context->input; 38 | float (*output)[channels] = 39 | (float(*)[channels]) context->output; 40 | 41 | const float max_element = vector_maxf(channels, input[sample]); 42 | const float sum_exp = vector_sum_expf_minus_c(channels, input[sample], max_element); 43 | const float norm_factor = 1.0f / sum_exp; 44 | for (size_t channel = 0; channel < channels; channel++) { 45 | output[sample][channel] = norm_factor * expf(input[sample][channel] - max_element); 46 | } 47 | } 48 | 49 | void nnp_softmax_output__reference( 50 | size_t batch_size, 51 | size_t channels, 52 | const float* input, 53 | float* output, 54 | pthreadpool_t threadpool) 55 | { 56 | struct softmax_output_context softmax_output_context = { 57 | .channels = channels, 58 | .input = input, 59 | .output = output, 60 | }; 61 | pthreadpool_parallelize_1d(threadpool, 62 | (pthreadpool_function_1d_t) compute_softmax_output, 63 | &softmax_output_context, 64 | batch_size, 65 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 66 | } 67 | -------------------------------------------------------------------------------- /src/relu-input-gradient.c: -------------------------------------------------------------------------------- 1 
| #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | struct NNP_CACHE_ALIGN relu_context { 15 | nnp_grad_relu_function grad_relu_function; 16 | const float* grad_output; 17 | const float* input; 18 | float* grad_input; 19 | float negative_slope; 20 | }; 21 | 22 | static void compute_grad_relu( 23 | const struct relu_context context[restrict static 1], 24 | size_t block_start, size_t block_size) 25 | { 26 | nnp_grad_relu_function grad_relu = context->grad_relu_function; 27 | const float* grad_output = context->grad_output; 28 | const float* input = context->input; 29 | float* grad_input = context->grad_input; 30 | float negative_slope = context->negative_slope; 31 | 32 | grad_relu(grad_output + block_start, input + block_start, grad_input + block_start, block_size, negative_slope); 33 | } 34 | 35 | enum nnp_status nnp_relu_input_gradient( 36 | size_t batch_size, 37 | size_t channels, 38 | const float grad_output[], 39 | const float input[], 40 | float grad_input[], 41 | float negative_slope, 42 | pthreadpool_t threadpool) 43 | { 44 | enum nnp_status status = validate_relu_arguments(batch_size, channels); 45 | if (status != nnp_status_success) { 46 | return status; 47 | } 48 | 49 | size_t elements = batch_size * channels; 50 | const size_t simd_width = nnp_hwinfo.simd_width; 51 | 52 | assert(((uintptr_t) grad_output) % sizeof(float) == 0); 53 | assert(((uintptr_t) input) % sizeof(float) == 0); 54 | assert(((uintptr_t) grad_input) % sizeof(float) == 0); 55 | 56 | const size_t prologue_elements = min((size_t) (-(((uintptr_t) grad_input) / sizeof(float)) % simd_width), elements); 57 | for (size_t i = 0; i < prologue_elements; i++) { 58 | grad_input[i] = grad_relu(grad_output[i], input[i], negative_slope); 59 | } 60 | elements -= prologue_elements; 61 | grad_output += prologue_elements; 62 | input += prologue_elements; 63 | grad_input += prologue_elements; 64 | 65 | const size_t epilogue_elements = elements % simd_width; 66 | for (size_t i = 0; i < epilogue_elements; i++) { 67 | grad_input[elements - epilogue_elements + i] = grad_relu( 68 | grad_output[elements - epilogue_elements + i], 69 | input[elements - epilogue_elements + i], 70 | negative_slope); 71 | } 72 | elements -= epilogue_elements; 73 | 74 | struct relu_context relu_context = { 75 | .grad_relu_function = nnp_hwinfo.activations.grad_relu, 76 | .grad_output = grad_output, 77 | .input = input, 78 | .grad_input = grad_input, 79 | .negative_slope = negative_slope, 80 | }; 81 | pthreadpool_parallelize_1d_tile_1d(threadpool, 82 | (pthreadpool_function_1d_tiled_t) compute_grad_relu, 83 | &relu_context, 84 | elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width), 85 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 86 | 87 | return nnp_status_success; 88 | } 89 | -------------------------------------------------------------------------------- /src/relu-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | struct NNP_CACHE_ALIGN relu_context { 15 | nnp_relu_function relu_function; 16 | const float* input; 17 | float* output; 18 | float negative_slope; 19 | }; 20 | 21 | static void compute_relu_output( 22 | const struct relu_context context[restrict static 1], 23 | size_t block_start, size_t block_size) 24 | { 25 | nnp_relu_function relu = context->relu_function; 26 | const 
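/*
 * Annotation (not part of the original source): nnp_relu_input_gradient()
 * above and nnp_relu_output() below split the work three ways: a scalar
 * prologue that runs until the destination pointer reaches SIMD alignment,
 * a scalar epilogue for the remainder that does not fill a whole vector,
 * and a vectorized middle section that pthreadpool tiles into blocks of
 * about one L1 cache worth of floats. With simd_width = 4, a destination
 * starting 2 floats past an aligned boundary gets a 2-element prologue, so
 * the parallel section touches vector-aligned data only.
 */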
float* input = context->input; 27 | float* output = context->output; 28 | float negative_slope = context->negative_slope; 29 | 30 | relu(input + block_start, output + block_start, block_size, negative_slope); 31 | } 32 | 33 | struct NNP_CACHE_ALIGN inplace_relu_context { 34 | nnp_inplace_relu_function relu_function; 35 | float* data; 36 | float negative_slope; 37 | }; 38 | 39 | static void compute_inplace_relu_output( 40 | const struct inplace_relu_context context[restrict static 1], 41 | size_t block_start, size_t block_size) 42 | { 43 | nnp_inplace_relu_function relu = context->relu_function; 44 | float* data = context->data; 45 | float negative_slope = context->negative_slope; 46 | 47 | relu(data + block_start, block_size, negative_slope); 48 | } 49 | 50 | enum nnp_status nnp_relu_output( 51 | size_t batch_size, 52 | size_t channels, 53 | const float input[], 54 | float output[], 55 | float negative_slope, 56 | pthreadpool_t threadpool) 57 | { 58 | enum nnp_status status = validate_relu_arguments(batch_size, channels); 59 | if (status != nnp_status_success) { 60 | return status; 61 | } 62 | 63 | size_t elements = batch_size * channels; 64 | const size_t simd_width = nnp_hwinfo.simd_width; 65 | 66 | assert(((uintptr_t) input) % sizeof(float) == 0); 67 | assert(((uintptr_t) output) % sizeof(float) == 0); 68 | 69 | const size_t prologue_elements = min((size_t) (-(((uintptr_t) output) / sizeof(float)) % simd_width), elements); 70 | for (size_t i = 0; i < prologue_elements; i++) { 71 | output[i] = relu(input[i], negative_slope); 72 | } 73 | elements -= prologue_elements; 74 | input += prologue_elements; 75 | output += prologue_elements; 76 | 77 | const size_t epilogue_elements = elements % simd_width; 78 | for (size_t i = 0; i < epilogue_elements; i++) { 79 | output[elements - epilogue_elements + i] = 80 | relu(input[elements - epilogue_elements + i], negative_slope); 81 | } 82 | elements -= epilogue_elements; 83 | 84 | if (input != output) { 85 | /* Out-of-place transformation */ 86 | struct relu_context relu_context = { 87 | .relu_function = nnp_hwinfo.activations.relu, 88 | .input = input, 89 | .output = output, 90 | .negative_slope = negative_slope, 91 | }; 92 | pthreadpool_parallelize_1d_tile_1d(threadpool, 93 | (pthreadpool_function_1d_tiled_t) compute_relu_output, 94 | &relu_context, 95 | elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width), 96 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 97 | } else { 98 | /* In-place transformation */ 99 | struct inplace_relu_context inplace_relu_context = { 100 | .relu_function = nnp_hwinfo.activations.inplace_relu, 101 | .data = output, 102 | .negative_slope = negative_slope, 103 | }; 104 | pthreadpool_parallelize_1d_tile_1d(threadpool, 105 | (pthreadpool_function_1d_tiled_t) compute_inplace_relu_output, 106 | &inplace_relu_context, 107 | elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width), 108 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 109 | } 110 | 111 | return nnp_status_success; 112 | } 113 | -------------------------------------------------------------------------------- /src/scalar/blas/cgemm-conjb-transc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_cgemm_conjb_transc_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, 
acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float a0i = a[1]; 20 | const float a1i = a[3]; 21 | a += 4; 22 | 23 | const float b0r = b[0]; 24 | const float b1r = b[2]; 25 | acc00r += a0r * b0r; 26 | acc01r += a0r * b1r; 27 | acc10r += a1r * b0r; 28 | acc11r += a1r * b1r; 29 | acc00i += a0i * b0r; 30 | acc01i += a0i * b1r; 31 | acc10i += a1i * b0r; 32 | acc11i += a1i * b1r; 33 | 34 | const float b0i = b[1]; 35 | const float b1i = b[3]; 36 | b += 4; 37 | 38 | acc00r += a0i * b0i; 39 | acc01r += a0i * b1i; 40 | acc10r += a1i * b0i; 41 | acc11r += a1i * b1i; 42 | acc00i -= a0r * b0i; 43 | acc01i -= a0r * b1i; 44 | acc10i -= a1r * b0i; 45 | acc11i -= a1r * b1i; 46 | } while (--k); 47 | 48 | if (update != 0) { 49 | c[0] += acc00r; 50 | c[1] += acc00i; 51 | c[2] += acc10r; 52 | c[3] += acc10i; 53 | c += row_stride_c; 54 | c[0] += acc01r; 55 | c[1] += acc01i; 56 | c[2] += acc11r; 57 | c[3] += acc11i; 58 | } else { 59 | c[0] = acc00r; 60 | c[1] = acc00i; 61 | c[2] = acc10r; 62 | c[3] = acc10i; 63 | c += row_stride_c; 64 | c[0] = acc01r; 65 | c[1] = acc01i; 66 | c[2] = acc11r; 67 | c[3] = acc11i; 68 | } 69 | } 70 | 71 | void nnp_cgemm_conjb_transc_upto_2x2__scalar( 72 | uint32_t mr, uint32_t nr, 73 | size_t k, size_t update, 74 | const float a[restrict static 1], 75 | const float b[restrict static 1], 76 | float c[restrict static 1], 77 | size_t row_stride_c) 78 | { 79 | float acc00r, acc01r, acc10r, acc11r; 80 | float acc00i, acc01i, acc10i, acc11i; 81 | acc00r = acc01r = acc10r = acc11r = 0.0f; 82 | acc00i = acc01i = acc10i = acc11i = 0.0f; 83 | do { 84 | const float a0r = a[0]; 85 | const float a0i = a[1]; 86 | a += 2; 87 | 88 | float a1r, a1i; 89 | if (mr > 1) { 90 | a1r = a[0]; 91 | a1i = a[1]; 92 | a += 2; 93 | } 94 | 95 | const float b0r = b[0]; 96 | const float b0i = b[1]; 97 | b += 2; 98 | 99 | acc00r += a0r * b0r; 100 | acc10r += a1r * b0r; 101 | acc00i += a0i * b0r; 102 | acc10i += a1i * b0r; 103 | 104 | acc00r += a0i * b0i; 105 | acc10r += a1i * b0i; 106 | acc00i -= a0r * b0i; 107 | acc10i -= a1r * b0i; 108 | 109 | if (nr > 1) { 110 | const float b1r = b[0]; 111 | const float b1i = b[1]; 112 | b += 2; 113 | 114 | acc01r += a0r * b1r; 115 | acc11r += a1r * b1r; 116 | acc01i += a0i * b1r; 117 | acc11i += a1i * b1r; 118 | 119 | acc01r += a0i * b1i; 120 | acc11r += a1i * b1i; 121 | acc01i -= a0r * b1i; 122 | acc11i -= a1r * b1i; 123 | } 124 | } while (--k); 125 | 126 | if (update != 0) { 127 | c[0] += acc00r; 128 | c[1] += acc00i; 129 | if (mr > 1) { 130 | c[2] += acc10r; 131 | c[3] += acc10i; 132 | } 133 | if (nr > 1) { 134 | c += row_stride_c; 135 | c[0] += acc01r; 136 | c[1] += acc01i; 137 | if (mr > 1) { 138 | c[2] += acc11r; 139 | c[3] += acc11i; 140 | } 141 | } 142 | } else { 143 | c[0] = acc00r; 144 | c[1] = acc00i; 145 | if (mr > 1) { 146 | c[2] = acc10r; 147 | c[3] = acc10i; 148 | } 149 | if (nr > 1) { 150 | c += row_stride_c; 151 | c[0] = acc01r; 152 | c[1] = acc01i; 153 | if (mr > 1) { 154 | c[2] = acc11r; 155 | c[3] = acc11i; 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/scalar/blas/cgemm-conjb.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_cgemm_conjb_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | 
float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float a0i = a[1]; 20 | const float a1i = a[3]; 21 | a += 4; 22 | 23 | const float b0r = b[0]; 24 | const float b1r = b[2]; 25 | acc00r += a0r * b0r; 26 | acc01r += a0r * b1r; 27 | acc10r += a1r * b0r; 28 | acc11r += a1r * b1r; 29 | acc00i += a0i * b0r; 30 | acc01i += a0i * b1r; 31 | acc10i += a1i * b0r; 32 | acc11i += a1i * b1r; 33 | 34 | const float b0i = b[1]; 35 | const float b1i = b[3]; 36 | b += 4; 37 | 38 | acc00r += a0i * b0i; 39 | acc01r += a0i * b1i; 40 | acc10r += a1i * b0i; 41 | acc11r += a1i * b1i; 42 | acc00i -= a0r * b0i; 43 | acc01i -= a0r * b1i; 44 | acc10i -= a1r * b0i; 45 | acc11i -= a1r * b1i; 46 | } while (--k); 47 | 48 | if (update != 0) { 49 | c[0] += acc00r; 50 | c[1] += acc00i; 51 | c[2] += acc01r; 52 | c[3] += acc01i; 53 | c += row_stride_c; 54 | c[0] += acc10r; 55 | c[1] += acc10i; 56 | c[2] += acc11r; 57 | c[3] += acc11i; 58 | } else { 59 | c[0] = acc00r; 60 | c[1] = acc00i; 61 | c[2] = acc01r; 62 | c[3] = acc01i; 63 | c += row_stride_c; 64 | c[0] = acc10r; 65 | c[1] = acc10i; 66 | c[2] = acc11r; 67 | c[3] = acc11i; 68 | } 69 | } 70 | 71 | void nnp_cgemm_conjb_upto_2x2__scalar( 72 | uint32_t mr, uint32_t nr, 73 | size_t k, size_t update, 74 | const float a[restrict static 1], 75 | const float b[restrict static 1], 76 | float c[restrict static 1], 77 | size_t row_stride_c) 78 | { 79 | float acc00r, acc01r, acc10r, acc11r; 80 | float acc00i, acc01i, acc10i, acc11i; 81 | acc00r = acc01r = acc10r = acc11r = 0.0f; 82 | acc00i = acc01i = acc10i = acc11i = 0.0f; 83 | do { 84 | const float a0r = a[0]; 85 | const float a0i = a[1]; 86 | a += 2; 87 | 88 | float a1r, a1i; 89 | if (mr > 1) { 90 | a1r = a[0]; 91 | a1i = a[1]; 92 | a += 2; 93 | } 94 | 95 | const float b0r = b[0]; 96 | const float b0i = b[1]; 97 | b += 2; 98 | 99 | acc00r += a0r * b0r; 100 | acc10r += a1r * b0r; 101 | acc00i += a0i * b0r; 102 | acc10i += a1i * b0r; 103 | 104 | acc00r += a0i * b0i; 105 | acc10r += a1i * b0i; 106 | acc00i -= a0r * b0i; 107 | acc10i -= a1r * b0i; 108 | 109 | if (nr > 1) { 110 | const float b1r = b[0]; 111 | const float b1i = b[1]; 112 | b += 2; 113 | 114 | acc01r += a0r * b1r; 115 | acc11r += a1r * b1r; 116 | acc01i += a0i * b1r; 117 | acc11i += a1i * b1r; 118 | 119 | acc01r += a0i * b1i; 120 | acc11r += a1i * b1i; 121 | acc01i -= a0r * b1i; 122 | acc11i -= a1r * b1i; 123 | } 124 | } while (--k); 125 | 126 | if (update != 0) { 127 | c[0] += acc00r; 128 | c[1] += acc00i; 129 | if (nr > 1) { 130 | c[2] += acc01r; 131 | c[3] += acc01i; 132 | } 133 | if (mr > 1) { 134 | c += row_stride_c; 135 | c[0] += acc10r; 136 | c[1] += acc10i; 137 | if (nr > 1) { 138 | c[2] += acc11r; 139 | c[3] += acc11i; 140 | } 141 | } 142 | } else { 143 | c[0] = acc00r; 144 | c[1] = acc00i; 145 | if (nr > 1) { 146 | c[2] = acc01r; 147 | c[3] = acc01i; 148 | } 149 | if (mr > 1) { 150 | c += row_stride_c; 151 | c[0] = acc10r; 152 | c[1] = acc10i; 153 | if (nr > 1) { 154 | c[2] = acc11r; 155 | c[3] = acc11i; 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/scalar/blas/cgemm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_cgemm_only_2x2__scalar( 
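/*
 * Annotation (not part of the original source): the scalar complex GEMM
 * micro-kernels differ only in which product they accumulate. With
 * a = ar + i*ai and b = br + i*bi:
 *   cgemm (this file):    a * b       = (ar*br - ai*bi) + i*(ar*bi + ai*br)
 *   cgemm-conjb (above):  a * conj(b) = (ar*br + ai*bi) + i*(ai*br - ar*bi)
 * The -transc variants store the 2x2 result tile transposed. "only_2x2"
 * kernels assume a full tile, while "upto_2x2" take mr/nr and guard the
 * second-row/second-column accesses so edge tiles reuse the same code;
 * note that in the upto kernels the a1r/a1i accumulations still execute
 * when mr == 1 -- the results are never stored, but the reads are of
 * indeterminate values.
 */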
6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float a0i = a[1]; 20 | const float a1i = a[3]; 21 | a += 4; 22 | 23 | const float b0r = b[0]; 24 | const float b1r = b[2]; 25 | acc00r += a0r * b0r; 26 | acc01r += a0r * b1r; 27 | acc10r += a1r * b0r; 28 | acc11r += a1r * b1r; 29 | acc00i += a0i * b0r; 30 | acc01i += a0i * b1r; 31 | acc10i += a1i * b0r; 32 | acc11i += a1i * b1r; 33 | 34 | const float b0i = b[1]; 35 | const float b1i = b[3]; 36 | b += 4; 37 | 38 | acc00r -= a0i * b0i; 39 | acc01r -= a0i * b1i; 40 | acc10r -= a1i * b0i; 41 | acc11r -= a1i * b1i; 42 | acc00i += a0r * b0i; 43 | acc01i += a0r * b1i; 44 | acc10i += a1r * b0i; 45 | acc11i += a1r * b1i; 46 | } while (--k); 47 | 48 | if (update != 0) { 49 | c[0] += acc00r; 50 | c[1] += acc00i; 51 | c[2] += acc01r; 52 | c[3] += acc01i; 53 | c += row_stride_c; 54 | c[0] += acc10r; 55 | c[1] += acc10i; 56 | c[2] += acc11r; 57 | c[3] += acc11i; 58 | } else { 59 | c[0] = acc00r; 60 | c[1] = acc00i; 61 | c[2] = acc01r; 62 | c[3] = acc01i; 63 | c += row_stride_c; 64 | c[0] = acc10r; 65 | c[1] = acc10i; 66 | c[2] = acc11r; 67 | c[3] = acc11i; 68 | } 69 | } 70 | 71 | void nnp_cgemm_upto_2x2__scalar( 72 | uint32_t mr, uint32_t nr, 73 | size_t k, size_t update, 74 | const float a[restrict static 1], 75 | const float b[restrict static 1], 76 | float c[restrict static 1], 77 | size_t row_stride_c) 78 | { 79 | float acc00r, acc01r, acc10r, acc11r; 80 | float acc00i, acc01i, acc10i, acc11i; 81 | acc00r = acc01r = acc10r = acc11r = 0.0f; 82 | acc00i = acc01i = acc10i = acc11i = 0.0f; 83 | do { 84 | const float a0r = a[0]; 85 | const float a0i = a[1]; 86 | a += 2; 87 | 88 | float a1r, a1i; 89 | if (mr > 1) { 90 | a1r = a[0]; 91 | a1i = a[1]; 92 | a += 2; 93 | } 94 | 95 | const float b0r = b[0]; 96 | const float b0i = b[1]; 97 | b += 2; 98 | 99 | acc00r += a0r * b0r; 100 | acc10r += a1r * b0r; 101 | acc00i += a0i * b0r; 102 | acc10i += a1i * b0r; 103 | 104 | acc00r -= a0i * b0i; 105 | acc10r -= a1i * b0i; 106 | acc00i += a0r * b0i; 107 | acc10i += a1r * b0i; 108 | 109 | if (nr > 1) { 110 | const float b1r = b[0]; 111 | const float b1i = b[1]; 112 | b += 2; 113 | 114 | acc01r += a0r * b1r; 115 | acc11r += a1r * b1r; 116 | acc01i += a0i * b1r; 117 | acc11i += a1i * b1r; 118 | 119 | acc01r -= a0i * b1i; 120 | acc11r -= a1i * b1i; 121 | acc01i += a0r * b1i; 122 | acc11i += a1r * b1i; 123 | } 124 | } while (--k); 125 | 126 | if (update != 0) { 127 | c[0] += acc00r; 128 | c[1] += acc00i; 129 | if (nr > 1) { 130 | c[2] += acc01r; 131 | c[3] += acc01i; 132 | } 133 | if (mr > 1) { 134 | c += row_stride_c; 135 | c[0] += acc10r; 136 | c[1] += acc10i; 137 | if (nr > 1) { 138 | c[2] += acc11r; 139 | c[3] += acc11i; 140 | } 141 | } 142 | } else { 143 | c[0] = acc00r; 144 | c[1] = acc00i; 145 | if (nr > 1) { 146 | c[2] = acc01r; 147 | c[3] = acc01i; 148 | } 149 | if (mr > 1) { 150 | c += row_stride_c; 151 | c[0] = acc10r; 152 | c[1] = acc10i; 153 | if (nr > 1) { 154 | c[2] = acc11r; 155 | c[3] = acc11i; 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/scalar/blas/s2gemm-transc.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_s2gemm_transc_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float b0r = b[0]; 20 | const float b1r = b[2]; 21 | acc00r += a0r * b0r; 22 | acc01r += a0r * b1r; 23 | acc10r += a1r * b0r; 24 | acc11r += a1r * b1r; 25 | 26 | const float a0i = a[1]; 27 | const float a1i = a[3]; 28 | const float b0i = b[1]; 29 | const float b1i = b[3]; 30 | acc00i += a0i * b0i; 31 | acc01i += a0i * b1i; 32 | acc10i += a1i * b0i; 33 | acc11i += a1i * b1i; 34 | 35 | a += 4; 36 | b += 4; 37 | } while (--k); 38 | 39 | if (update != 0) { 40 | c[0] += acc00r; 41 | c[1] += acc00i; 42 | c[2] += acc10r; 43 | c[3] += acc10i; 44 | c += row_stride_c; 45 | c[0] += acc01r; 46 | c[1] += acc01i; 47 | c[2] += acc11r; 48 | c[3] += acc11i; 49 | } else { 50 | c[0] = acc00r; 51 | c[1] = acc00i; 52 | c[2] = acc10r; 53 | c[3] = acc10i; 54 | c += row_stride_c; 55 | c[0] = acc01r; 56 | c[1] = acc01i; 57 | c[2] = acc11r; 58 | c[3] = acc11i; 59 | } 60 | } 61 | 62 | void nnp_s2gemm_transc_upto_2x2__scalar( 63 | uint32_t mr, uint32_t nr, 64 | size_t k, size_t update, 65 | const float a[restrict static 1], 66 | const float b[restrict static 1], 67 | float c[restrict static 1], 68 | size_t row_stride_c) 69 | { 70 | float acc00r, acc01r, acc10r, acc11r; 71 | float acc00i, acc01i, acc10i, acc11i; 72 | acc00r = acc01r = acc10r = acc11r = 0.0f; 73 | acc00i = acc01i = acc10i = acc11i = 0.0f; 74 | do { 75 | const float a0r = a[0]; 76 | const float a0i = a[1]; 77 | a += 2; 78 | 79 | float a1r, a1i; 80 | if (mr > 1) { 81 | a1r = a[0]; 82 | a1i = a[1]; 83 | a += 2; 84 | } 85 | 86 | const float b0r = b[0]; 87 | const float b0i = b[1]; 88 | b += 2; 89 | 90 | acc00r += a0r * b0r; 91 | acc10r += a1r * b0r; 92 | acc00i += a0i * b0i; 93 | acc10i += a1i * b0i; 94 | 95 | if (nr > 1) { 96 | const float b1r = b[0]; 97 | const float b1i = b[1]; 98 | b += 2; 99 | 100 | acc01r += a0r * b1r; 101 | acc11r += a1r * b1r; 102 | acc01i += a0i * b1i; 103 | acc11i += a1i * b1i; 104 | } 105 | } while (--k); 106 | 107 | if (update != 0) { 108 | c[0] += acc00r; 109 | c[1] += acc00i; 110 | if (mr > 1) { 111 | c[2] += acc10r; 112 | c[3] += acc10i; 113 | } 114 | if (nr > 1) { 115 | c += row_stride_c; 116 | c[0] += acc01r; 117 | c[1] += acc01i; 118 | if (mr > 1) { 119 | c[2] += acc11r; 120 | c[3] += acc11i; 121 | } 122 | } 123 | } else { 124 | c[0] = acc00r; 125 | c[1] = acc00i; 126 | if (mr > 1) { 127 | c[2] = acc10r; 128 | c[3] = acc10i; 129 | } 130 | if (nr > 1) { 131 | c += row_stride_c; 132 | c[0] = acc01r; 133 | c[1] = acc01i; 134 | if (mr > 1) { 135 | c[2] = acc11r; 136 | c[3] = acc11i; 137 | } 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/scalar/blas/s2gemm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_s2gemm_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | 
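/*
 * Annotation (not part of the original source): despite the r/i naming,
 * s2gemm is two independent real GEMMs on interleaved even/odd elements --
 * the acc*r accumulators only ever combine even ("r") inputs and the
 * acc*i accumulators only odd ("i") inputs, with no cross terms. This
 * matches the packed real-FFT spectrum seen earlier (e.g. f0r/f4r in
 * nnp_ifft8_4real), where the purely real DC and Nyquist bins share a
 * tuple and must not be mixed by a complex multiply.
 */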
float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float b0r = b[0]; 20 | const float b1r = b[2]; 21 | acc00r += a0r * b0r; 22 | acc01r += a0r * b1r; 23 | acc10r += a1r * b0r; 24 | acc11r += a1r * b1r; 25 | 26 | const float a0i = a[1]; 27 | const float a1i = a[3]; 28 | const float b0i = b[1]; 29 | const float b1i = b[3]; 30 | acc00i += a0i * b0i; 31 | acc01i += a0i * b1i; 32 | acc10i += a1i * b0i; 33 | acc11i += a1i * b1i; 34 | 35 | a += 4; 36 | b += 4; 37 | } while (--k); 38 | 39 | if (update != 0) { 40 | c[0] += acc00r; 41 | c[1] += acc00i; 42 | c[2] += acc01r; 43 | c[3] += acc01i; 44 | c += row_stride_c; 45 | c[0] += acc10r; 46 | c[1] += acc10i; 47 | c[2] += acc11r; 48 | c[3] += acc11i; 49 | } else { 50 | c[0] = acc00r; 51 | c[1] = acc00i; 52 | c[2] = acc01r; 53 | c[3] = acc01i; 54 | c += row_stride_c; 55 | c[0] = acc10r; 56 | c[1] = acc10i; 57 | c[2] = acc11r; 58 | c[3] = acc11i; 59 | } 60 | } 61 | 62 | void nnp_s2gemm_upto_2x2__scalar( 63 | uint32_t mr, uint32_t nr, 64 | size_t k, size_t update, 65 | const float a[restrict static 1], 66 | const float b[restrict static 1], 67 | float c[restrict static 1], 68 | size_t row_stride_c) 69 | { 70 | float acc00r, acc01r, acc10r, acc11r; 71 | float acc00i, acc01i, acc10i, acc11i; 72 | acc00r = acc01r = acc10r = acc11r = 0.0f; 73 | acc00i = acc01i = acc10i = acc11i = 0.0f; 74 | do { 75 | const float a0r = a[0]; 76 | const float a0i = a[1]; 77 | a += 2; 78 | 79 | float a1r, a1i; 80 | if (mr > 1) { 81 | a1r = a[0]; 82 | a1i = a[1]; 83 | a += 2; 84 | } 85 | 86 | const float b0r = b[0]; 87 | const float b0i = b[1]; 88 | b += 2; 89 | 90 | acc00r += a0r * b0r; 91 | acc10r += a1r * b0r; 92 | acc00i += a0i * b0i; 93 | acc10i += a1i * b0i; 94 | 95 | if (nr > 1) { 96 | const float b1r = b[0]; 97 | const float b1i = b[1]; 98 | b += 2; 99 | 100 | acc01r += a0r * b1r; 101 | acc11r += a1r * b1r; 102 | acc01i += a0i * b1i; 103 | acc11i += a1i * b1i; 104 | } 105 | } while (--k); 106 | 107 | if (update != 0) { 108 | c[0] += acc00r; 109 | c[1] += acc00i; 110 | if (nr > 1) { 111 | c[2] += acc01r; 112 | c[3] += acc01i; 113 | } 114 | if (mr > 1) { 115 | c += row_stride_c; 116 | c[0] += acc10r; 117 | c[1] += acc10i; 118 | if (nr > 1) { 119 | c[2] += acc11r; 120 | c[3] += acc11i; 121 | } 122 | } 123 | } else { 124 | c[0] = acc00r; 125 | c[1] = acc00i; 126 | if (nr > 1) { 127 | c[2] = acc01r; 128 | c[3] = acc01i; 129 | } 130 | if (mr > 1) { 131 | c += row_stride_c; 132 | c[0] = acc10r; 133 | c[1] = acc10i; 134 | if (nr > 1) { 135 | c[2] = acc11r; 136 | c[3] = acc11i; 137 | } 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/scalar/butterfly.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | static inline void scalar_swap(float a[restrict static 1], float b[restrict static 1]) { 5 | const float new_a = *b; 6 | const float new_b = *a; 7 | *a = new_a; 8 | *b = new_b; 9 | } 10 | 11 | static inline void scalar_butterfly(float a[restrict static 1], float b[restrict static 1]) { 12 | const float new_a = *a + *b; 13 | const float new_b = *a - *b; 14 | *a = new_a; 15 | *b = new_b; 16 | } 17 | 18 | static inline void scalar_butterfly_and_negate_b(float a[restrict static 1], float b[restrict static 1]) { 19 | const float new_a = *a + *b; 20 | const float 
new_b = *b - *a; 21 | *a = new_a; 22 | *b = new_b; 23 | } 24 | 25 | static inline void scalar_butterfly_with_negated_b(float a[restrict static 1], float b[restrict static 1]) { 26 | const float new_a = *a - *b; 27 | const float new_b = *a + *b; 28 | *a = new_a; 29 | *b = new_b; 30 | } 31 | -------------------------------------------------------------------------------- /src/scalar/fft-aos.c: -------------------------------------------------------------------------------- 1 | #include <scalar/fft/aos.h> 2 | 3 | 4 | void nnp_fft4_aos__scalar( 5 | const float t[restrict static 8], 6 | float f[restrict static 8]) 7 | { 8 | float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; 9 | scalar_fft4_aos( 10 | t, t + 4, 1, 0, 8, 11 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i); 12 | f[0] = w0r; 13 | f[1] = w0i; 14 | f[2] = w1r; 15 | f[3] = w1i; 16 | f[4] = w2r; 17 | f[5] = w2i; 18 | f[6] = w3r; 19 | f[7] = w3i; 20 | } 21 | 22 | void nnp_fft8_aos__scalar( 23 | const float t[restrict static 16], 24 | float f[restrict static 16]) 25 | { 26 | float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i; 27 | scalar_fft8_aos( 28 | t, t + 8, 1, 0, 16, 29 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i, &w4r, &w4i, &w5r, &w5i, &w6r, &w6i, &w7r, &w7i); 30 | f[ 0] = w0r; 31 | f[ 1] = w0i; 32 | f[ 2] = w1r; 33 | f[ 3] = w1i; 34 | f[ 4] = w2r; 35 | f[ 5] = w2i; 36 | f[ 6] = w3r; 37 | f[ 7] = w3i; 38 | f[ 8] = w4r; 39 | f[ 9] = w4i; 40 | f[10] = w5r; 41 | f[11] = w5i; 42 | f[12] = w6r; 43 | f[13] = w6i; 44 | f[14] = w7r; 45 | f[15] = w7i; 46 | } 47 | 48 | void nnp_ifft4_aos__scalar( 49 | const float f[restrict static 8], 50 | float t[restrict static 8]) 51 | { 52 | const float w0r = f[0]; 53 | const float w0i = f[1]; 54 | const float w1r = f[2]; 55 | const float w1i = f[3]; 56 | const float w2r = f[4]; 57 | const float w2i = f[5]; 58 | const float w3r = f[6]; 59 | const float w3i = f[7]; 60 | 61 | scalar_ifft4_aos( 62 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, 63 | t, t + 4, 1); 64 | } 65 | 66 | void nnp_ifft8_aos__scalar( 67 | const float f[restrict static 16], 68 | float t[restrict static 16]) 69 | { 70 | const float w0r = f[ 0]; 71 | const float w0i = f[ 1]; 72 | const float w1r = f[ 2]; 73 | const float w1i = f[ 3]; 74 | const float w2r = f[ 4]; 75 | const float w2i = f[ 5]; 76 | const float w3r = f[ 6]; 77 | const float w3i = f[ 7]; 78 | const float w4r = f[ 8]; 79 | const float w4i = f[ 9]; 80 | const float w5r = f[10]; 81 | const float w5i = f[11]; 82 | const float w6r = f[12]; 83 | const float w6i = f[13]; 84 | const float w7r = f[14]; 85 | const float w7i = f[15]; 86 | 87 | scalar_ifft8_aos( 88 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i, 89 | t, t + 8, 1); 90 | } 91 | -------------------------------------------------------------------------------- /src/scalar/fft-dualreal.c: -------------------------------------------------------------------------------- 1 | #include <scalar/fft/dualreal.h> 2 | 3 | 4 | void nnp_fft8_dualreal__scalar( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | float x0, y0, x1r, y1r, x2r, y2r, x3r, y3r; 9 | float x4, y4, x1i, y1i, x2i, y2i, x3i, y3i; 10 | scalar_fft8_dualreal(t, 11 | &x0, &y0, &x1r, &y1r, &x2r, &y2r, &x3r, &y3r, 12 | &x4, &y4, &x1i, &y1i, &x2i, &y2i, &x3i, &y3i); 13 | 14 | f[0] = x0; 15 | f[1] = y0; 16 | f[2] = x1r; 17 | f[3] = y1r; 18 | f[4] = x2r; 19 | f[5] = y2r; 20 | f[6] = x3r; 21 | f[7] = y3r; 22 | 23 | f[ 8] = x4; 24 | f[ 9] = y4; 25 | f[10] = x1i; 26 | f[11] = y1i; 27 | f[12] = x2i; 28 | f[13] = y2i; 29 | f[14]
= x3i; 30 | f[15] = y3i; 31 | } 32 | 33 | void nnp_fft16_dualreal__scalar( 34 | const float t[restrict static 32], 35 | float f[restrict static 32]) 36 | { 37 | float x0, y0, x1r, y1r, x2r, y2r, x3r, y3r, x4r, y4r, x5r, y5r, x6r, y6r, x7r, y7r; 38 | float x8, y8, x1i, y1i, x2i, y2i, x3i, y3i, x4i, y4i, x5i, y5i, x6i, y6i, x7i, y7i; 39 | scalar_fft16_dualreal(t, 40 | &x0, &y0, &x1r, &y1r, &x2r, &y2r, &x3r, &y3r, &x4r, &y4r, &x5r, &y5r, &x6r, &y6r, &x7r, &y7r, 41 | &x8, &y8, &x1i, &y1i, &x2i, &y2i, &x3i, &y3i, &x4i, &y4i, &x5i, &y5i, &x6i, &y6i, &x7i, &y7i); 42 | 43 | f[ 0] = x0; 44 | f[ 1] = y0; 45 | f[ 2] = x1r; 46 | f[ 3] = y1r; 47 | f[ 4] = x2r; 48 | f[ 5] = y2r; 49 | f[ 6] = x3r; 50 | f[ 7] = y3r; 51 | f[ 8] = x4r; 52 | f[ 9] = y4r; 53 | f[10] = x5r; 54 | f[11] = y5r; 55 | f[12] = x6r; 56 | f[13] = y6r; 57 | f[14] = x7r; 58 | f[15] = y7r; 59 | 60 | f[16] = x8; 61 | f[17] = y8; 62 | f[18] = x1i; 63 | f[19] = y1i; 64 | f[20] = x2i; 65 | f[21] = y2i; 66 | f[22] = x3i; 67 | f[23] = y3i; 68 | f[24] = x4i; 69 | f[25] = y4i; 70 | f[26] = x5i; 71 | f[27] = y5i; 72 | f[28] = x6i; 73 | f[29] = y6i; 74 | f[30] = x7i; 75 | f[31] = y7i; 76 | } 77 | 78 | void nnp_ifft8_dualreal__scalar( 79 | const float f[restrict static 16], 80 | float t[restrict static 16]) 81 | { 82 | const float x0 = f[ 0]; 83 | const float y0 = f[ 1]; 84 | const float x1r = f[ 2]; 85 | const float y1r = f[ 3]; 86 | const float x2r = f[ 4]; 87 | const float y2r = f[ 5]; 88 | const float x3r = f[ 6]; 89 | const float y3r = f[ 7]; 90 | const float x4 = f[ 8]; 91 | const float y4 = f[ 9]; 92 | const float x1i = f[10]; 93 | const float y1i = f[11]; 94 | const float x2i = f[12]; 95 | const float y2i = f[13]; 96 | const float x3i = f[14]; 97 | const float y3i = f[15]; 98 | 99 | scalar_ifft8_dualreal( 100 | x0, y0, x1r, y1r, x2r, y2r, x3r, y3r, 101 | x4, y4, x1i, y1i, x2i, y2i, x3i, y3i, 102 | t); 103 | } 104 | 105 | void nnp_ifft16_dualreal__scalar( 106 | const float f[restrict static 32], 107 | float t[restrict static 32]) 108 | { 109 | const float x0 = f[ 0]; 110 | const float y0 = f[ 1]; 111 | const float x1r = f[ 2]; 112 | const float y1r = f[ 3]; 113 | const float x2r = f[ 4]; 114 | const float y2r = f[ 5]; 115 | const float x3r = f[ 6]; 116 | const float y3r = f[ 7]; 117 | const float x4r = f[ 8]; 118 | const float y4r = f[ 9]; 119 | const float x5r = f[10]; 120 | const float y5r = f[11]; 121 | const float x6r = f[12]; 122 | const float y6r = f[13]; 123 | const float x7r = f[14]; 124 | const float y7r = f[15]; 125 | 126 | const float x8 = f[16]; 127 | const float y8 = f[17]; 128 | const float x1i = f[18]; 129 | const float y1i = f[19]; 130 | const float x2i = f[20]; 131 | const float y2i = f[21]; 132 | const float x3i = f[22]; 133 | const float y3i = f[23]; 134 | const float x4i = f[24]; 135 | const float y4i = f[25]; 136 | const float x5i = f[26]; 137 | const float y5i = f[27]; 138 | const float x6i = f[28]; 139 | const float y6i = f[29]; 140 | const float x7i = f[30]; 141 | const float y7i = f[31]; 142 | 143 | scalar_ifft16_dualreal( 144 | x0, y0, x1r, y1r, x2r, y2r, x3r, y3r, x4r, y4r, x5r, y5r, x6r, y6r, x7r, y7r, 145 | x8, y8, x1i, y1i, x2i, y2i, x3i, y3i, x4i, y4i, x5i, y5i, x6i, y6i, x7i, y7i, 146 | t); 147 | } 148 | -------------------------------------------------------------------------------- /src/scalar/fft-real.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_real__scalar( 5 | const float t[restrict static 8], 6 | float f[restrict static 8]) 7 
| { 8 | scalar_fft8_real( 9 | t, t + 4, 1, 0, 8, 10 | f, 1); 11 | } 12 | 13 | void nnp_fft16_real__scalar( 14 | const float t[restrict static 16], 15 | float f[restrict static 16]) 16 | { 17 | float w0r, w8r, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i; 18 | scalar_fft16_real( 19 | t, t + 8, 1, 0, 16, 20 | f, 1); 21 | } 22 | 23 | void nnp_ifft8_real__scalar( 24 | const float f[restrict static 8], 25 | float t[restrict static 8]) 26 | { 27 | const float f0r = f[0]; 28 | const float f4r = f[1]; 29 | const float f1r = f[2]; 30 | const float f1i = f[3]; 31 | const float f2r = f[4]; 32 | const float f2i = f[5]; 33 | const float f3r = f[6]; 34 | const float f3i = f[7]; 35 | scalar_ifft8_real( 36 | f0r, f4r, f1r, f1i, f2r, f2i, f3r, f3i, 37 | t, t + 4, 1); 38 | } 39 | 40 | void nnp_ifft16_real__scalar( 41 | const float f[restrict static 16], 42 | float t[restrict static 16]) 43 | { 44 | const float f0r = f[ 0]; 45 | const float f8r = f[ 1]; 46 | const float f1r = f[ 2]; 47 | const float f1i = f[ 3]; 48 | const float f2r = f[ 4]; 49 | const float f2i = f[ 5]; 50 | const float f3r = f[ 6]; 51 | const float f3i = f[ 7]; 52 | const float f4r = f[ 8]; 53 | const float f4i = f[ 9]; 54 | const float f5r = f[10]; 55 | const float f5i = f[11]; 56 | const float f6r = f[12]; 57 | const float f6i = f[13]; 58 | const float f7r = f[14]; 59 | const float f7i = f[15]; 60 | scalar_ifft16_real( 61 | f0r, f8r, f1r, f1i, f2r, f2i, f3r, f3i, f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i, 62 | t, t + 8, 1); 63 | } 64 | -------------------------------------------------------------------------------- /src/scalar/fft-soa.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_soa__scalar( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | float f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r; 9 | float f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i; 10 | scalar_fft8_soa(t, 11 | &f0r, &f1r, &f2r, &f3r, &f4r, &f5r, &f6r, &f7r, 12 | &f0i, &f1i, &f2i, &f3i, &f4i, &f5i, &f6i, &f7i); 13 | 14 | f[0] = f0r; 15 | f[1] = f1r; 16 | f[2] = f2r; 17 | f[3] = f3r; 18 | f[4] = f4r; 19 | f[5] = f5r; 20 | f[6] = f6r; 21 | f[7] = f7r; 22 | 23 | f[ 8] = f0i; 24 | f[ 9] = f1i; 25 | f[10] = f2i; 26 | f[11] = f3i; 27 | f[12] = f4i; 28 | f[13] = f5i; 29 | f[14] = f6i; 30 | f[15] = f7i; 31 | } 32 | 33 | void nnp_fft16_soa__scalar( 34 | const float t[restrict static 32], 35 | float f[restrict static 32]) 36 | { 37 | float f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r, f8r, f9r, f10r, f11r, f12r, f13r, f14r, f15r; 38 | float f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i, f8i, f9i, f10i, f11i, f12i, f13i, f14i, f15i; 39 | scalar_fft16_soa(t, 40 | &f0r, &f1r, &f2r, &f3r, &f4r, &f5r, &f6r, &f7r, &f8r, &f9r, &f10r, &f11r, &f12r, &f13r, &f14r, &f15r, 41 | &f0i, &f1i, &f2i, &f3i, &f4i, &f5i, &f6i, &f7i, &f8i, &f9i, &f10i, &f11i, &f12i, &f13i, &f14i, &f15i); 42 | 43 | f[ 0] = f0r; 44 | f[ 1] = f1r; 45 | f[ 2] = f2r; 46 | f[ 3] = f3r; 47 | f[ 4] = f4r; 48 | f[ 5] = f5r; 49 | f[ 6] = f6r; 50 | f[ 7] = f7r; 51 | f[ 8] = f8r; 52 | f[ 9] = f9r; 53 | f[10] = f10r; 54 | f[11] = f11r; 55 | f[12] = f12r; 56 | f[13] = f13r; 57 | f[14] = f14r; 58 | f[15] = f15r; 59 | 60 | f[16] = f0i; 61 | f[17] = f1i; 62 | f[18] = f2i; 63 | f[19] = f3i; 64 | f[20] = f4i; 65 | f[21] = f5i; 66 | f[22] = f6i; 67 | f[23] = f7i; 68 | f[24] = f8i; 69 | f[25] = f9i; 70 | f[26] = f10i; 71 | f[27] = f11i; 72 | f[28] = f12i; 73 | f[29] = f13i; 74 | f[30] = f14i; 75 | f[31] = f15i; 76 | } 77 | 78 | void 
nnp_ifft8_soa__scalar( 79 | const float f[restrict static 16], 80 | float t[restrict static 16]) 81 | { 82 | const float f0r = f[0]; 83 | const float f1r = f[1]; 84 | const float f2r = f[2]; 85 | const float f3r = f[3]; 86 | const float f4r = f[4]; 87 | const float f5r = f[5]; 88 | const float f6r = f[6]; 89 | const float f7r = f[7]; 90 | 91 | const float f0i = f[ 8]; 92 | const float f1i = f[ 9]; 93 | const float f2i = f[10]; 94 | const float f3i = f[11]; 95 | const float f4i = f[12]; 96 | const float f5i = f[13]; 97 | const float f6i = f[14]; 98 | const float f7i = f[15]; 99 | 100 | scalar_ifft8_soa( 101 | f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r, 102 | f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i, 103 | t); 104 | } 105 | 106 | void nnp_ifft16_soa__scalar( 107 | const float f[restrict static 32], 108 | float t[restrict static 32]) 109 | { 110 | const float f0r = f[ 0]; 111 | const float f1r = f[ 1]; 112 | const float f2r = f[ 2]; 113 | const float f3r = f[ 3]; 114 | const float f4r = f[ 4]; 115 | const float f5r = f[ 5]; 116 | const float f6r = f[ 6]; 117 | const float f7r = f[ 7]; 118 | const float f8r = f[ 8]; 119 | const float f9r = f[ 9]; 120 | const float f10r = f[10]; 121 | const float f11r = f[11]; 122 | const float f12r = f[12]; 123 | const float f13r = f[13]; 124 | const float f14r = f[14]; 125 | const float f15r = f[15]; 126 | 127 | const float f0i = f[16]; 128 | const float f1i = f[17]; 129 | const float f2i = f[18]; 130 | const float f3i = f[19]; 131 | const float f4i = f[20]; 132 | const float f5i = f[21]; 133 | const float f6i = f[22]; 134 | const float f7i = f[23]; 135 | const float f8i = f[24]; 136 | const float f9i = f[25]; 137 | const float f10i = f[26]; 138 | const float f11i = f[27]; 139 | const float f12i = f[28]; 140 | const float f13i = f[29]; 141 | const float f14i = f[30]; 142 | const float f15i = f[31]; 143 | 144 | scalar_ifft16_soa( 145 | f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r, f8r, f9r, f10r, f11r, f12r, f13r, f14r, f15r, 146 | f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i, f8i, f9i, f10i, f11i, f12i, f13i, f14i, f15i, 147 | t); 148 | } 149 | -------------------------------------------------------------------------------- /src/scalar/relu.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | 7 | void nnp_relu__scalar( 8 | const float input[restrict static 1], 9 | float output[restrict static 1], 10 | size_t length, 11 | float negative_slope) 12 | { 13 | while (length >= 4) { 14 | const float data0 = input[0]; 15 | const float data1 = input[1]; 16 | const float data2 = input[2]; 17 | const float data3 = input[3]; 18 | input += 4; 19 | 20 | output[0] = relu(data0, negative_slope); 21 | output[1] = relu(data1, negative_slope); 22 | output[2] = relu(data2, negative_slope); 23 | output[3] = relu(data3, negative_slope); 24 | output += 4; 25 | 26 | length -= 4; 27 | } 28 | while (length != 0) { 29 | *output++ = relu(*input++, negative_slope); 30 | length -= 1; 31 | } 32 | } 33 | 34 | void nnp_inplace_relu__scalar( 35 | float data[restrict static 1], 36 | size_t length, 37 | float negative_slope) 38 | { 39 | while (length >= 4) { 40 | const float data0 = data[0]; 41 | const float data1 = data[1]; 42 | const float data2 = data[2]; 43 | const float data3 = data[3]; 44 | 45 | data[0] = relu(data0, negative_slope); 46 | data[1] = relu(data1, negative_slope); 47 | data[2] = relu(data2, negative_slope); 48 | data[3] = relu(data3, negative_slope); 49 | data += 4; 50 | 51 | length -= 4; 52 | } 53 | while 
(length != 0) { 54 | *data = relu(*data, negative_slope); 55 | 56 | data += 1; 57 | length -= 1; 58 | } 59 | } 60 | 61 | void nnp_grad_relu__scalar( 62 | const float output_gradient[restrict static 4], 63 | const float input[restrict static 4], 64 | float input_gradient[restrict static 4], 65 | size_t length, 66 | float negative_slope) 67 | { 68 | while (length >= 4) { 69 | const float data0 = input[0]; 70 | const float data1 = input[1]; 71 | const float data2 = input[2]; 72 | const float data3 = input[3]; 73 | input += 4; 74 | 75 | const float grad0 = output_gradient[0]; 76 | const float grad1 = output_gradient[1]; 77 | const float grad2 = output_gradient[2]; 78 | const float grad3 = output_gradient[3]; 79 | output_gradient += 4; 80 | 81 | input_gradient[0] = grad_relu(grad0, data0, negative_slope); 82 | input_gradient[1] = grad_relu(grad1, data1, negative_slope); 83 | input_gradient[2] = grad_relu(grad2, data2, negative_slope); 84 | input_gradient[3] = grad_relu(grad3, data3, negative_slope); 85 | input_gradient += 4; 86 | 87 | length -= 4; 88 | } 89 | while (length != 0) { 90 | *input_gradient++ = grad_relu(*output_gradient++, *input++, negative_slope); 91 | length -= 1; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/scalar/softmax.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | 9 | static float max__scalar(size_t n, const float v[restrict static n]) { 10 | float max_v = *v++; 11 | while (--n) { 12 | max_v = maxf(max_v, *v++); 13 | } 14 | return max_v; 15 | } 16 | 17 | static float sum_exp_minus_c__scalar(size_t n, const float v[restrict static n], float c) { 18 | float sum = 0.0f; 19 | do { 20 | sum += expf(*v++ - c); 21 | } while (--n); 22 | return sum; 23 | } 24 | 25 | static void scaled_exp_minus_c__scalar(size_t n, const float x[static n], float y[static n], float scale, float c) { 26 | do { 27 | *y++ = scale * expf(*x++ - c); 28 | } while (--n); 29 | } 30 | 31 | void nnp_softmax__scalar( 32 | size_t n, 33 | const float x[restrict static n], 34 | float y[restrict static n]) 35 | { 36 | const float c = max__scalar(n, x); 37 | const float sum = sum_exp_minus_c__scalar(n, x, c); 38 | const float scale = 1.0f / sum; 39 | scaled_exp_minus_c__scalar(n, x, y, scale, c); 40 | } 41 | 42 | void nnp_inplace_softmax__scalar( 43 | size_t n, 44 | float v[restrict static n]) 45 | { 46 | const float c = max__scalar(n, v); 47 | const float sum = sum_exp_minus_c__scalar(n, v, c); 48 | const float scale = 1.0f / sum; 49 | scaled_exp_minus_c__scalar(n, v, v, scale, c); 50 | } 51 | -------------------------------------------------------------------------------- /src/scalar/winograd-f6k3.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_iwt_f6k3__scalar( 5 | const float d[restrict static 8], 6 | float w[restrict static 8]) 7 | { 8 | winograd_f6k3_input_transform( 9 | d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], 10 | &w[0], &w[1], &w[2], &w[3], &w[4], &w[5], &w[6], &w[7]); 11 | } 12 | 13 | void nnp_kwt_f6k3__scalar( 14 | const float g[restrict static 3], 15 | float w[restrict static 8]) 16 | { 17 | winograd_f6k3_kernel_transform( 18 | g[0], g[1], g[2], 19 | &w[0], &w[1], &w[2], &w[3], &w[4], &w[5], &w[6], &w[7], 20 | true /* rescale coefficients */); 21 | } 22 | 23 | void nnp_owt_f6k3__scalar( 24 | const float m[restrict static 8], 25 | float s[restrict static 6]) 26 
| { 27 | winograd_f6k3_output_transform( 28 | m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], 29 | &s[0], &s[1], &s[2], &s[3], &s[4], &s[5]); 30 | } 31 | -------------------------------------------------------------------------------- /src/softmax-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | struct NNP_CACHE_ALIGN softmax_context { 13 | nnp_softmax_function softmax_function; 14 | size_t channels; 15 | const float* input; 16 | float* output; 17 | }; 18 | 19 | static void compute_softmax_output( 20 | const struct softmax_context context[restrict static 1], 21 | size_t sample) 22 | { 23 | const nnp_softmax_function softmax = context->softmax_function; 24 | const size_t channels = context->channels; 25 | 26 | const float (*input)[channels] = (const float(*)[channels]) context->input; 27 | float (*output)[channels] = (float(*)[channels]) context->output; 28 | 29 | softmax(channels, input[sample], output[sample]); 30 | } 31 | 32 | struct NNP_CACHE_ALIGN inplace_softmax_context { 33 | nnp_inplace_softmax_function softmax_function; 34 | size_t channels; 35 | float* data; 36 | }; 37 | 38 | static void compute_inplace_softmax_output( 39 | const struct inplace_softmax_context context[restrict static 1], 40 | size_t sample) 41 | { 42 | const nnp_inplace_softmax_function softmax = context->softmax_function; 43 | const size_t channels = context->channels; 44 | 45 | float (*data)[channels] = (float(*)[channels]) context->data; 46 | 47 | softmax(channels, data[sample]); 48 | } 49 | 50 | enum nnp_status nnp_softmax_output( 51 | size_t batch_size, 52 | size_t channels, 53 | const float* input, 54 | float* output, 55 | pthreadpool_t threadpool) 56 | { 57 | enum nnp_status status = validate_softmax_arguments(batch_size, channels); 58 | if (status != nnp_status_success) { 59 | return status; 60 | } 61 | 62 | if (input != output) { 63 | /* Out-of-place softmax */ 64 | struct softmax_context softmax_context = { 65 | .softmax_function = nnp_hwinfo.activations.softmax, 66 | .channels = channels, 67 | .input = input, 68 | .output = output, 69 | }; 70 | pthreadpool_parallelize_1d(threadpool, 71 | (pthreadpool_function_1d_t) compute_softmax_output, 72 | &softmax_context, 73 | batch_size, 74 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 75 | } else { 76 | /* In-place softmax */ 77 | struct inplace_softmax_context inplace_softmax_context = { 78 | .softmax_function = nnp_hwinfo.activations.inplace_softmax, 79 | .channels = channels, 80 | .data = output, 81 | }; 82 | pthreadpool_parallelize_1d(threadpool, 83 | (pthreadpool_function_1d_t) compute_inplace_softmax_output, 84 | &inplace_softmax_context, 85 | batch_size, 86 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 87 | } 88 | 89 | return nnp_status_success; 90 | } 91 | -------------------------------------------------------------------------------- /src/x86_64-fma/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/__init__.py -------------------------------------------------------------------------------- /src/x86_64-fma/blas/sdotxf.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | simd_width = YMMRegister.size // float_.size 5 | 6 | for fusion_factor in 
range(1, 8 + 1): 7 | arg_x = Argument(ptr(const_float_), "x") 8 | arg_y = Argument(ptr(const_float_), "y") 9 | arg_stride_y = Argument(size_t, "stride_y") 10 | arg_sum = Argument(ptr(float_), "sum") 11 | arg_n = Argument(size_t, "n") 12 | with Function("nnp_sdotxf{fusion_factor}__avx2".format(fusion_factor=fusion_factor), 13 | (arg_x, arg_y, arg_stride_y, arg_sum, arg_n), 14 | target=uarch.default + isa.fma3 + isa.avx2): 15 | 16 | reg_x = GeneralPurposeRegister64() 17 | LOAD.ARGUMENT(reg_x, arg_x) 18 | 19 | reg_ys = [GeneralPurposeRegister64() for m in range(fusion_factor)] 20 | LOAD.ARGUMENT(reg_ys[0], arg_y) 21 | 22 | reg_stride_y = GeneralPurposeRegister64() 23 | LOAD.ARGUMENT(reg_stride_y, arg_stride_y) 24 | SHL(reg_stride_y, 2) 25 | 26 | reg_sum = GeneralPurposeRegister64() 27 | LOAD.ARGUMENT(reg_sum, arg_sum) 28 | 29 | reg_n = GeneralPurposeRegister64() 30 | LOAD.ARGUMENT(reg_n, arg_n) 31 | 32 | ymm_accs = [YMMRegister() for m in range(fusion_factor)] 33 | VZEROALL() 34 | 35 | for m in range(1, fusion_factor): 36 | LEA(reg_ys[m], [reg_ys[m - 1] + reg_stride_y * 1]) 37 | 38 | main_loop = Loop() 39 | end_block = Block() 40 | 41 | SUB(reg_n, YMMRegister.size // float_.size) 42 | JB(main_loop.end) 43 | 44 | with main_loop: 45 | ymm_x = YMMRegister() 46 | VMOVUPS(ymm_x, [reg_x]) 47 | ADD(reg_x, YMMRegister.size) 48 | 49 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 50 | VFMADD231PS(ymm_acc, ymm_x, [reg_y]) 51 | ADD(reg_y, YMMRegister.size) 52 | 53 | SUB(reg_n, YMMRegister.size // float_.size) 54 | JAE(main_loop.begin) 55 | 56 | ADD(reg_n, YMMRegister.size // float_.size) 57 | JE(end_block.end) 58 | 59 | with end_block: 60 | ymm_mask = YMMRegister() 61 | VMOVD(ymm_mask.as_xmm, reg_n.as_dword) 62 | VPBROADCASTD(ymm_mask, ymm_mask.as_xmm) 63 | VPCMPGTD(ymm_mask, ymm_mask, Constant.uint32x8(0, 1, 2, 3, 4, 5, 6, 7)) 64 | 65 | ymm_x = YMMRegister() 66 | VMASKMOVPS(ymm_x, ymm_mask, [reg_x]) 67 | 68 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 69 | ymm_y = YMMRegister() 70 | VMASKMOVPS(ymm_y, ymm_mask, [reg_y]) 71 | VFMADD231PS(ymm_acc, ymm_x, ymm_y) 72 | 73 | # Reduce the SIMD registers into a single elements 74 | xmm_tmp = XMMRegister() 75 | for i, ymm_acc in enumerate(ymm_accs): 76 | VEXTRACTF128(xmm_tmp, ymm_acc, 1) 77 | VADDPS(ymm_acc.as_xmm, ymm_acc.as_xmm, xmm_tmp) 78 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 79 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 80 | VMOVSS([reg_sum + i * float_.size], ymm_acc.as_xmm) 81 | 82 | RETURN() 83 | 84 | -------------------------------------------------------------------------------- /src/x86_64-fma/blas/shdotxf.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from fp16.avx import fp16_alt_xmm_to_fp32_xmm 5 | from fp16.avx2 import fp16_alt_xmm_to_fp32_ymm 6 | 7 | simd_width = YMMRegister.size // float_.size 8 | 9 | for fusion_factor in range(1, 8 + 1): 10 | arg_x = Argument(ptr(const_float_), "x") 11 | arg_y = Argument(ptr(const_float_), "y") 12 | arg_stride_y = Argument(size_t, "stride_y") 13 | arg_sum = Argument(ptr(float_), "sum") 14 | arg_n = Argument(size_t, "n") 15 | with Function("nnp_shdotxf{fusion_factor}__avx2".format(fusion_factor=fusion_factor), 16 | (arg_x, arg_y, arg_stride_y, arg_sum, arg_n), 17 | target=uarch.default + isa.fma3 + isa.avx2): 18 | 19 | reg_x = GeneralPurposeRegister64() 20 | LOAD.ARGUMENT(reg_x, arg_x) 21 | 22 | reg_ys = [GeneralPurposeRegister64() for m in range(fusion_factor)] 23 | LOAD.ARGUMENT(reg_ys[0], 
arg_y) 24 | 25 | reg_stride_y = GeneralPurposeRegister64() 26 | LOAD.ARGUMENT(reg_stride_y, arg_stride_y) 27 | ADD(reg_stride_y, reg_stride_y) 28 | 29 | reg_sum = GeneralPurposeRegister64() 30 | LOAD.ARGUMENT(reg_sum, arg_sum) 31 | 32 | reg_n = GeneralPurposeRegister64() 33 | LOAD.ARGUMENT(reg_n, arg_n) 34 | 35 | ymm_accs = [YMMRegister() for m in range(fusion_factor)] 36 | VZEROALL() 37 | 38 | for m in range(1, fusion_factor): 39 | LEA(reg_ys[m], [reg_ys[m - 1] + reg_stride_y * 1]) 40 | 41 | main_loop = Loop() 42 | edge_loop = Loop() 43 | 44 | SUB(reg_n, XMMRegister.size // uint16_t.size) 45 | JB(main_loop.end) 46 | 47 | with main_loop: 48 | ymm_x = YMMRegister() 49 | VMOVUPS(ymm_x, [reg_x]) 50 | ADD(reg_x, YMMRegister.size) 51 | 52 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 53 | xmm_half = XMMRegister() 54 | VMOVUPS(xmm_half, [reg_y]) 55 | ADD(reg_y, XMMRegister.size) 56 | 57 | ymm_y = fp16_alt_xmm_to_fp32_ymm(xmm_half) 58 | VFMADD231PS(ymm_acc, ymm_x, ymm_y) 59 | 60 | SUB(reg_n, YMMRegister.size // float_.size) 61 | JAE(main_loop.begin) 62 | 63 | ADD(reg_n, XMMRegister.size // uint16_t.size) 64 | JE(edge_loop.end) 65 | 66 | with edge_loop: 67 | xmm_x = XMMRegister() 68 | VMOVSS(xmm_x, [reg_x]) 69 | ADD(reg_x, YMMRegister.size) 70 | 71 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 72 | reg_half = GeneralPurposeRegister32() 73 | MOVZX(reg_half, word[reg_y]) 74 | 75 | xmm_half = XMMRegister() 76 | VMOVD(xmm_half, reg_half) 77 | ADD(reg_y, uint16_t.size) 78 | 79 | ymm_y = fp16_alt_xmm_to_fp32_ymm(xmm_half) 80 | VFMADD231PS(ymm_acc, xmm_x.as_ymm, ymm_y) 81 | 82 | SUB(reg_n, 1) 83 | JAE(edge_loop.begin) 84 | 85 | # Reduce the SIMD registers into a single elements 86 | xmm_tmp = XMMRegister() 87 | for i, ymm_acc in enumerate(ymm_accs): 88 | VEXTRACTF128(xmm_tmp, ymm_acc, 1) 89 | VADDPS(ymm_acc.as_xmm, ymm_acc.as_xmm, xmm_tmp) 90 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 91 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 92 | VMOVSS([reg_sum + i * float_.size], ymm_acc.as_xmm) 93 | 94 | RETURN() 95 | 96 | -------------------------------------------------------------------------------- /src/x86_64-fma/exp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | __m256 _mm256_exp_ps(__m256 x) { 8 | const __m256 magic_bias = _mm256_set1_ps(0x1.800000p+23f); 9 | const __m256 zero_cutoff = _mm256_set1_ps(-0x1.9FE368p+6f); /* The smallest x for which expf(x) is non-zero */ 10 | const __m256 inf_cutoff = _mm256_set1_ps(0x1.62E42Ep+6f); /* The largest x for which expf(x) is finite */ 11 | const __m256 log2e = _mm256_set1_ps(0x1.715476p+3f); 12 | const __m256 minus_ln2_hi = _mm256_set1_ps(-0x1.62E430p-4f); 13 | const __m256 minus_ln2_lo = _mm256_set1_ps( 0x1.05C610p-32f); 14 | const __m256 plus_inf = _mm256_set1_ps(__builtin_inff()); 15 | 16 | const __m256 c2 = _mm256_set1_ps(0x1.00088Ap-1f); 17 | const __m256 c3 = _mm256_set1_ps(0x1.555A86p-3f); 18 | const __m256 table = _mm256_set_ps(0x1.D5818Ep+0f, 0x1.AE89FAp+0f, 0x1.8ACE54p+0f, 0x1.6A09E6p+0f, 0x1.4BFDAEp+0f, 0x1.306FE0p+0f, 0x1.172B84p+0f, 0x1.000000p+0f); 19 | 20 | const __m256i min_exponent = _mm256_set1_epi32(-126 << 23); 21 | const __m256i max_exponent = _mm256_set1_epi32(127 << 23); 22 | const __m256i default_exponent = _mm256_set1_epi32(0x3F800000u); 23 | const __m256i mantissa_mask = _mm256_set1_epi32(0x007FFFF8); 24 | 25 | __m256 t = _mm256_fmadd_ps(x, log2e, magic_bias); 26 | __m256i e1 = _mm256_slli_epi32(_mm256_and_si256(_mm256_castps_si256(t), 
mantissa_mask), 20); 27 | __m256i e2 = e1; 28 | e1 = _mm256_min_epi32(_mm256_max_epi32(e1, min_exponent), max_exponent); 29 | e2 = _mm256_sub_epi32(e2, e1); 30 | const __m256 s1 = _mm256_castsi256_ps(_mm256_add_epi32(e1, default_exponent)); 31 | const __m256 s2 = _mm256_castsi256_ps(_mm256_add_epi32(e2, default_exponent)); 32 | const __m256 tf = _mm256_permutevar8x32_ps(table, _mm256_castps_si256(t)); 33 | t = _mm256_sub_ps(t, magic_bias); 34 | const __m256 rx = _mm256_fmadd_ps(t, minus_ln2_lo, _mm256_fmadd_ps(t, minus_ln2_hi, x)); 35 | const __m256 rf = _mm256_fmadd_ps(rx, _mm256_mul_ps(rx, _mm256_fmadd_ps(rx, c3, c2)), rx); 36 | __m256 f = _mm256_fmadd_ps(tf, rf, tf); 37 | f = _mm256_mul_ps(s2, _mm256_mul_ps(s1, f)); 38 | /* Fixup underflow to zero */ 39 | f = _mm256_andnot_ps(_mm256_cmp_ps(x, zero_cutoff, _CMP_LT_OS), f); 40 | /* Fixup overflow */ 41 | f = _mm256_blendv_ps(f, plus_inf, _mm256_cmp_ps(x, inf_cutoff, _CMP_GT_OS)); 42 | /* Fixup NaN */ 43 | f = _mm256_blendv_ps(x, f, _mm256_cmp_ps(x, x, _CMP_EQ_OS)); 44 | return f; 45 | } 46 | 47 | static inline uint32_t as_uint32(float x) { 48 | union { 49 | float x; 50 | uint32_t n; 51 | } data = { 52 | .x = x 53 | }; 54 | return data.n; 55 | } 56 | 57 | static inline float as_float(uint32_t n) { 58 | union { 59 | float x; 60 | uint32_t n; 61 | } data = { 62 | .n = n 63 | }; 64 | return data.x; 65 | } 66 | 67 | static inline float ulpf(float x) { 68 | const float absx = fabsf(x); 69 | if (absx < __builtin_inff()) { 70 | return as_float(as_uint32(absx) + 1) - absx; 71 | } else { 72 | return absx; 73 | } 74 | } 75 | 76 | int main() { 77 | float max_error = 0.0f; 78 | for (uint32_t n = INT32_MIN; n < as_uint32(-0x1.9FE368p+6f); n++) { 79 | const float x = as_float(n); 80 | const float ref_y = expf(x); 81 | const float opt_y = _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_exp_ps(_mm256_set1_ps(x)))); 82 | const float error = fabsf(ref_y - opt_y) / ulpf(ref_y); 83 | if (error > max_error) 84 | max_error = error; 85 | } 86 | printf("Max error: %.2f ULP\n", max_error); 87 | 88 | max_error = 0.0f; 89 | for (uint32_t n = 0; n < as_uint32(0x1.62E42Ep+6f); n++) { 90 | const float x = as_float(n); 91 | const float ref_y = expf(x); 92 | const float opt_y = _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_exp_ps(_mm256_set1_ps(x)))); 93 | const float error = fabsf(ref_y - opt_y) / ulpf(ref_y); 94 | if (error > max_error) 95 | max_error = error; 96 | } 97 | printf("Max error: %.2f ULP\n", max_error); 98 | } 99 | -------------------------------------------------------------------------------- /src/x86_64-fma/exp.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from peachpy import * 5 | from peachpy.x86_64 import * 6 | 7 | log2e = float.fromhex("+0x1.715476p+3") 8 | magic_bias = float.fromhex("+0x1.800000p+23") 9 | zero_cutoff = float.fromhex("-0x1.9FE368p+6") 10 | inf_cutoff = float.fromhex("+0x1.62E42Ep+6") 11 | minus_ln2_hi = float.fromhex("-0x1.62E430p-4") 12 | minus_ln2_lo = float.fromhex("+0x1.05C610p-32") 13 | plus_inf = float("inf") 14 | 15 | c2 = float.fromhex("0x1.00088Ap-1") 16 | c3 = float.fromhex("0x1.555A86p-3") 17 | t0 = float.fromhex("0x1.000000p+0") 18 | t1 = float.fromhex("0x1.172B84p+0") 19 | t2 = float.fromhex("0x1.306FE0p+0") 20 | t3 = float.fromhex("0x1.4BFDAEp+0") 21 | t4 = float.fromhex("0x1.6A09E6p+0") 22 | t5 = float.fromhex("0x1.8ACE54p+0") 23 | t6 = float.fromhex("0x1.AE89FAp+0") 24 | t7 = 
float.fromhex("0x1.D5818Ep+0") 25 | 26 | min_exponent = (-126 << 23) & 0xFFFFFFFF 27 | max_exponent = 127 << 23 28 | default_exponent = 0x3F800000 29 | mantissa_mask = 0x007FFFF8 30 | 31 | x_arg = Argument(m256, "x") 32 | with Function("_mm256_exp_ps", (x_arg,), m256, 33 | target=uarch.default + isa.fma3 + isa.avx2): 34 | ymm_x = YMMRegister() 35 | LOAD.ARGUMENT(ymm_x, x_arg) 36 | 37 | ymm_magic_bias = YMMRegister() 38 | VMOVAPS(ymm_magic_bias, Constant.float32x8(magic_bias)) 39 | 40 | ymm_t = YMMRegister() 41 | VMOVAPS(ymm_t, ymm_x) 42 | VFMADD132PS(ymm_t, ymm_magic_bias, Constant.float32x8(log2e)) 43 | 44 | ymm_e1, ymm_e2 = YMMRegister(), YMMRegister() 45 | VPAND(ymm_e2, ymm_t, Constant.uint32x8(mantissa_mask)) 46 | VPSLLD(ymm_e2, ymm_e2, 20) 47 | 48 | ymm_tf = YMMRegister() 49 | VPERMPS(ymm_tf, ymm_t, Constant.float32x8(t0, t1, t2, t3, t4, t5, t6, t7)) 50 | VSUBPS(ymm_t, ymm_t, ymm_magic_bias) 51 | 52 | # rx = fma(t, minus_ln2_lo, fma(t, minus_ln2_hi, x)) 53 | # rx := t * minus_ln2_hi + x 54 | # rx := t * minus_ln2_lo + rx 55 | ymm_rx = YMMRegister() 56 | VMOVAPS(ymm_rx, ymm_x) 57 | VFMADD231PS(ymm_rx, ymm_t, Constant.float32x8(minus_ln2_hi)) 58 | VFMADD231PS(ymm_rx, ymm_t, Constant.float32x8(minus_ln2_lo)) 59 | 60 | VPMAXSD(ymm_e1, ymm_e2, Constant.uint32x8(min_exponent)) 61 | VPMINSD(ymm_e1, ymm_e1, Constant.uint32x8(max_exponent)) 62 | 63 | ymm_default_exponent = YMMRegister() 64 | VMOVDQA(ymm_default_exponent, Constant.uint32x8(default_exponent)) 65 | VPSUBD(ymm_e2, ymm_e2, ymm_e1) 66 | 67 | VPADDD(ymm_e1, ymm_e1, ymm_default_exponent) 68 | VPADDD(ymm_e2, ymm_e2, ymm_default_exponent) 69 | 70 | # rf = fma(rx, rx * fma(rx, c3, c2), rx) 71 | # rf := rx * c3 + c2 72 | # rf := rx * rf 73 | # rf := rx * rf + rx 74 | ymm_rf = YMMRegister() 75 | VMOVAPS(ymm_rf, Constant.float32x8(c2)) 76 | VFMADD231PS(ymm_rf, ymm_rx, Constant.float32x8(c3)) 77 | VMULPS(ymm_rf, ymm_rf, ymm_rx) 78 | VFMADD213PS(ymm_rf, ymm_rx, ymm_rx) 79 | 80 | # f = fma(tf, rf, tf) 81 | VFMADD231PS(ymm_tf, ymm_tf, ymm_rf) 82 | ymm_f = ymm_tf 83 | 84 | VMULPS(ymm_f, ymm_f, ymm_e1) 85 | VMULPS(ymm_f, ymm_f, ymm_e2) 86 | 87 | RETURN(ymm_f) 88 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft-dualreal.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa 2 | import fft.two_real_to_two_complex_soa_perm_planar 3 | 4 | 5 | arg_t = Argument(ptr(const_float_), name="t") 6 | arg_f = Argument(ptr(float_), name="f") 7 | 8 | 9 | with Function("nnp_fft8_dualreal__avx2", 10 | (arg_t, arg_f), 11 | target=uarch.default + isa.fma3 + isa.avx2): 12 | 13 | reg_t = GeneralPurposeRegister64() 14 | LOAD.ARGUMENT(reg_t, arg_t) 15 | 16 | reg_f = GeneralPurposeRegister64() 17 | LOAD.ARGUMENT(reg_f, arg_f) 18 | 19 | ymm_seq_a, ymm_seq_b = YMMRegister(), YMMRegister() 20 | 21 | VMOVUPS(ymm_seq_a, [reg_t]) 22 | VMOVUPS(ymm_seq_b, [reg_t + YMMRegister.size]) 23 | 24 | fft.complex_soa.fft8_within_rows(ymm_seq_a, ymm_seq_b) 25 | ymm_wr, ymm_wi = ymm_seq_a, ymm_seq_b 26 | 27 | fft.two_real_to_two_complex_soa_perm_planar.fft8_within_rows_postprocess(ymm_wr, ymm_wi) 28 | ymm_xhr, ymm_xhi = ymm_wr, ymm_wi 29 | 30 | VMOVUPS([reg_f], ymm_xhr) 31 | VMOVUPS([reg_f + YMMRegister.size], ymm_xhi) 32 | 33 | RETURN() 34 | 35 | 36 | with Function("nnp_fft16_dualreal__avx2", 37 | (arg_t, arg_f), 38 | target=uarch.default + isa.fma3 + isa.avx2): 39 | 40 | reg_t = GeneralPurposeRegister64() 41 | LOAD.ARGUMENT(reg_t, arg_t) 42 | 43 | reg_f = 
GeneralPurposeRegister64() 44 | LOAD.ARGUMENT(reg_f, arg_f) 45 | 46 | ymm_seq_a = YMMRegister(), YMMRegister() 47 | ymm_seq_b = YMMRegister(), YMMRegister() 48 | for i, ymm_a in enumerate(ymm_seq_a + ymm_seq_b): 49 | VMOVUPS(ymm_a, [reg_t + i * YMMRegister.size]) 50 | 51 | fft.complex_soa.fft16_within_rows(ymm_seq_a, ymm_seq_b) 52 | ymm_wr, ymm_wi = ymm_seq_a, ymm_seq_b 53 | 54 | fft.two_real_to_two_complex_soa_perm_planar.fft16_within_rows_postprocess(ymm_wr, ymm_wi) 55 | 56 | for i, ymm_w in enumerate(ymm_wr + ymm_wi): 57 | VMOVUPS([reg_f + i * YMMRegister.size], ymm_w) 58 | 59 | RETURN() 60 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft-real.py: -------------------------------------------------------------------------------- 1 | import fft.real_to_complex_soa_perm 2 | 3 | arg_t = Argument(ptr(const_float_), name="t") 4 | arg_f = Argument(ptr(float_), name="f") 5 | 6 | 7 | with Function("nnp_fft8_8real__fma3", 8 | (arg_t, arg_f), 9 | target=uarch.default + isa.fma3): 10 | 11 | reg_t = GeneralPurposeRegister64() 12 | LOAD.ARGUMENT(reg_t, arg_t) 13 | 14 | reg_f = GeneralPurposeRegister64() 15 | LOAD.ARGUMENT(reg_f, arg_f) 16 | 17 | ymm_data = [YMMRegister() for _ in range(8)] 18 | 19 | for i, ymm_i in enumerate(ymm_data): 20 | VMOVUPS(ymm_i, [reg_t + i * YMMRegister.size]) 21 | 22 | fft.real_to_complex_soa_perm.fft8_across_rows(ymm_data) 23 | 24 | for i, ymm_i in enumerate(ymm_data): 25 | VMOVUPS([reg_f + i * YMMRegister.size], ymm_i) 26 | 27 | RETURN() 28 | 29 | 30 | import fft16x16 31 | 32 | 33 | with Function("nnp_fft16_8real__fma3", 34 | (arg_t, arg_f), 35 | target=uarch.default + isa.fma3): 36 | 37 | reg_t0 = GeneralPurposeRegister64() 38 | LOAD.ARGUMENT(reg_t0, arg_t) 39 | 40 | reg_f = GeneralPurposeRegister64() 41 | LOAD.ARGUMENT(reg_f, arg_f) 42 | 43 | reg_stride = GeneralPurposeRegister64() 44 | MOV(reg_stride, YMMRegister.size) 45 | 46 | reg_t8 = GeneralPurposeRegister64() 47 | LEA(reg_t8, [reg_t0 + 8 * YMMRegister.size]) 48 | 49 | fft16x16.forward_vfft(reg_t0, reg_t8, reg_stride, 50 | data_out=[yword[reg_f + YMMRegister.size * i] for i in range(16)]) 51 | 52 | RETURN() 53 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft-soa.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa 2 | 3 | arg_t = Argument(ptr(const_float_), name="t") 4 | arg_f = Argument(ptr(float_), name="f") 5 | 6 | 7 | with Function("nnp_fft16_soa__avx2", 8 | (arg_t, arg_f), 9 | target=uarch.default + isa.fma3 + isa.avx2): 10 | 11 | reg_t = GeneralPurposeRegister64() 12 | LOAD.ARGUMENT(reg_t, arg_t) 13 | 14 | reg_f = GeneralPurposeRegister64() 15 | LOAD.ARGUMENT(reg_f, arg_f) 16 | 17 | ymm_real = YMMRegister(), YMMRegister() 18 | ymm_imag = YMMRegister(), YMMRegister() 19 | 20 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 21 | VMOVUPS(ymm_data, [reg_t + i * YMMRegister.size]) 22 | 23 | fft.complex_soa.fft16_within_rows(ymm_real, ymm_imag) 24 | 25 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 26 | VMOVUPS([reg_f + i * YMMRegister.size], ymm_data) 27 | 28 | RETURN() 29 | 30 | 31 | with Function("nnp_fft8_soa__avx2", 32 | (arg_t, arg_f), 33 | target=uarch.default + isa.fma3 + isa.avx2): 34 | 35 | reg_t = GeneralPurposeRegister64() 36 | LOAD.ARGUMENT(reg_t, arg_t) 37 | 38 | reg_f = GeneralPurposeRegister64() 39 | LOAD.ARGUMENT(reg_f, arg_f) 40 | 41 | ymm_real, ymm_imag = YMMRegister(), YMMRegister() 42 | 43 | VMOVUPS(ymm_real, [reg_t]) 44 
| VMOVUPS(ymm_imag, [reg_t + YMMRegister.size]) 45 | 46 | fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag) 47 | 48 | VMOVUPS([reg_f], ymm_real) 49 | VMOVUPS([reg_f + YMMRegister.size], ymm_imag) 50 | 51 | RETURN() 52 | 53 | 54 | with Function("nnp_ifft8_soa__avx2", 55 | (arg_t, arg_f), 56 | target=uarch.default + isa.fma3 + isa.avx2): 57 | 58 | reg_t = GeneralPurposeRegister64() 59 | LOAD.ARGUMENT(reg_t, arg_t) 60 | 61 | reg_f = GeneralPurposeRegister64() 62 | LOAD.ARGUMENT(reg_f, arg_f) 63 | 64 | ymm_real, ymm_imag = YMMRegister(), YMMRegister() 65 | 66 | VMOVUPS(ymm_real, [reg_t]) 67 | VMOVUPS(ymm_imag, [reg_t + YMMRegister.size]) 68 | 69 | fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse") 70 | 71 | VMOVUPS([reg_f], ymm_real) 72 | VMOVUPS([reg_f + YMMRegister.size], ymm_imag) 73 | 74 | RETURN() 75 | 76 | 77 | with Function("nnp_ifft16_soa__avx2", 78 | (arg_f, arg_t), 79 | target=uarch.default + isa.fma3 + isa.avx2): 80 | 81 | reg_f = GeneralPurposeRegister64() 82 | LOAD.ARGUMENT(reg_f, arg_f) 83 | 84 | reg_t = GeneralPurposeRegister64() 85 | LOAD.ARGUMENT(reg_t, arg_t) 86 | 87 | ymm_real = YMMRegister(), YMMRegister() 88 | ymm_imag = YMMRegister(), YMMRegister() 89 | 90 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 91 | VMOVUPS(ymm_data, [reg_f + i * YMMRegister.size]) 92 | 93 | fft.complex_soa.ifft16_within_rows(ymm_real, ymm_imag) 94 | 95 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 96 | VMOVUPS([reg_t + i * YMMRegister.size], ymm_data) 97 | 98 | RETURN() 99 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/fft/__init__.py -------------------------------------------------------------------------------- /src/x86_64-fma/fft/complex_soa_perm_to_real.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from peachpy import * 5 | from peachpy.x86_64 import * 6 | 7 | from common import sqrt2_over_2 8 | from common import butterfly 9 | 10 | import fft.complex_soa 11 | 12 | def ifft8_across_rows(ymm_data, bias=None): 13 | assert isinstance(ymm_data, list) and len(ymm_data) == 8 14 | ymm_real = ymm_data[0::2] 15 | ymm_imag = ymm_data[1::2] 16 | 17 | if bias is None: 18 | # Do 1/N scaling before IFFT 19 | ymm_one_eighth = YMMRegister() 20 | VMOVAPS(ymm_one_eighth, Constant.float32x8(0.125)) 21 | for ymm_row in ymm_data: 22 | if ymm_row is ymm_real[2]: 23 | VMULPS(ymm_row, ymm_row, Constant.float32x8(0.25)) 24 | elif ymm_row is ymm_imag[2]: 25 | VMULPS(ymm_row, ymm_row, Constant.float32x8(-0.25)) 26 | else: 27 | VMULPS(ymm_row, ymm_row, ymm_one_eighth) 28 | else: 29 | # Do 1/N scaling after FFT (merge with bias addition) 30 | VMULPS(ymm_real[2], ymm_real[2], Constant.float32x8(2.0)) 31 | VMULPS(ymm_imag[2], ymm_imag[2], Constant.float32x8(-2.0)) 32 | 33 | butterfly(ymm_real[0], ymm_imag[0]) 34 | 35 | # H1.real, H1.imag = W1.real - W3.real, W1.imag + W3.imag 36 | ymm_h1_real, ymm_h1_imag = YMMRegister(), YMMRegister() 37 | VSUBPS(ymm_h1_real, ymm_real[1], ymm_real[3]) 38 | VADDPS(ymm_h1_imag, ymm_imag[1], ymm_imag[3]) 39 | 40 | # G1.real, G1.imag = W1.real + W3.real, W1.imag - W3.imag 41 | ymm_g1_real, ymm_g1_imag = YMMRegister(), YMMRegister() 42 | VADDPS(ymm_g1_real, ymm_real[1], 
ymm_real[3]) 43 | VSUBPS(ymm_g1_imag, ymm_imag[1], ymm_imag[3]) 44 | 45 | # H1+, H1- = H1.real + H1.imag, H1.real - H1.imag 46 | ymm_h1_plus, ymm_h1_minus = YMMRegister(), YMMRegister() 47 | VADDPS(ymm_h1_plus, ymm_h1_real, ymm_h1_imag) 48 | VSUBPS(ymm_h1_minus, ymm_h1_real, ymm_h1_imag) 49 | 50 | ymm_sqrt2_over_2 = YMMRegister() 51 | VMOVAPS(ymm_sqrt2_over_2, Constant.float32x8(sqrt2_over_2)) 52 | 53 | # w1.real = G1.real - SQRT2_OVER_2 * H1.plus; 54 | # w3.real = G1.real + SQRT2_OVER_2 * H1.plus; 55 | VMOVAPS(ymm_real[1], ymm_g1_real) 56 | VFNMADD231PS(ymm_real[1], ymm_h1_plus, ymm_sqrt2_over_2) 57 | VFMADD231PS(ymm_g1_real, ymm_h1_plus, ymm_sqrt2_over_2) 58 | SWAP.REGISTERS(ymm_real[3], ymm_g1_real) 59 | 60 | # w1.imag = G1.imag + SQRT2_OVER_2 * H1.minus; 61 | # w3.imag = -G1.imag + SQRT2_OVER_2 * H1.minus; 62 | VMOVAPS(ymm_imag[1], ymm_g1_imag) 63 | VFMADD231PS(ymm_imag[1], ymm_h1_minus, ymm_sqrt2_over_2) 64 | VFMSUB231PS(ymm_g1_imag, ymm_h1_minus, ymm_sqrt2_over_2) 65 | SWAP.REGISTERS(ymm_imag[3], ymm_g1_imag) 66 | 67 | fft.complex_soa.fft4_across_rows(ymm_real, ymm_imag, transformation="inverse") 68 | 69 | if bias is not None: 70 | ymm_bias = bias 71 | if not isinstance(bias, YMMRegister): 72 | ymm_bias = YMMRegister() 73 | VMOVAPS(ymm_bias, bias) 74 | 75 | ymm_one_eighth = YMMRegister() 76 | VMOVAPS(ymm_one_eighth, Constant.float32x8(0.125)) 77 | 78 | # 1/N scaling 79 | for ymm_row in ymm_data: 80 | VFMADD132PS(ymm_row, ymm_bias, ymm_one_eighth) 81 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft/real_to_complex_soa_perm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from peachpy import * 5 | from peachpy.x86_64 import * 6 | 7 | from common import sqrt2_over_2 8 | from common import butterfly 9 | 10 | import fft.complex_soa 11 | 12 | 13 | def fft8_across_rows(ymm_data): 14 | assert isinstance(ymm_data, list) and len(ymm_data) == 8 15 | ymm_real = ymm_data[0::2] 16 | ymm_imag = ymm_data[1::2] 17 | 18 | fft.complex_soa.fft4_across_rows(ymm_real, ymm_imag) 19 | 20 | butterfly(ymm_real[0], ymm_imag[0]) 21 | 22 | # const float two_gdata1_real = crealf(data1) + crealf(data3); 23 | # const float two_gdata1_imag = cimagf(data1) - cimagf(data3); 24 | ymm_two_gdata1_real, ymm_two_gdata1_imag = YMMRegister(), YMMRegister() 25 | VADDPS(ymm_two_gdata1_real, ymm_real[1], ymm_real[3]) 26 | VSUBPS(ymm_two_gdata1_imag, ymm_imag[1], ymm_imag[3]) 27 | 28 | # const float two_hdata1_real = cimagf(data1) + cimagf(data3); 29 | # const float two_hdata1_imag = crealf(data3) - crealf(data1); 30 | ymm_two_hdata1_real, ymm_two_hdata1_imag = YMMRegister(), YMMRegister() 31 | VADDPS(ymm_two_hdata1_real, ymm_imag[1], ymm_imag[3]) 32 | VSUBPS(ymm_two_hdata1_imag, ymm_real[3], ymm_real[1]) 33 | 34 | # const float two_hdata1_real_plus_imag = two_hdata1_real + two_hdata1_imag; 35 | # const float two_hdata1_real_minus_imag = two_hdata1_real - two_hdata1_imag; 36 | ymm_two_hdata1_plus, ymm_two_hdata1_minus = YMMRegister(), YMMRegister() 37 | VADDPS(ymm_two_hdata1_plus, ymm_two_hdata1_real, ymm_two_hdata1_imag) 38 | VSUBPS(ymm_two_hdata1_minus, ymm_two_hdata1_real, ymm_two_hdata1_imag) 39 | 40 | ymm_sqrt2_over_2 = YMMRegister() 41 | VMOVAPS(ymm_sqrt2_over_2, Constant.float32x8(sqrt2_over_2)) 42 | 43 | # const float two_data1_real = two_gdata1_real + SQRT2_OVER_2 * two_hdata1_real_plus_imag; 44 | # const float two_data1_imag = two_gdata1_imag - 
SQRT2_OVER_2 * two_hdata1_real_minus_imag; 45 | # const float two_data3_real = two_gdata1_real - SQRT2_OVER_2 * two_hdata1_real_plus_imag; 46 | # const float two_data3_imag = -two_gdata1_imag - SQRT2_OVER_2 * two_hdata1_real_minus_imag; 47 | ymm_two_data1_real, ymm_two_data1_imag = YMMRegister(), YMMRegister() 48 | ymm_two_data3_real, ymm_two_data3_imag = YMMRegister(), YMMRegister() 49 | VMOVAPS(ymm_two_data3_real, ymm_two_gdata1_real) 50 | VMOVAPS(ymm_two_data3_imag, ymm_two_gdata1_imag) 51 | VFMADD231PS(ymm_two_gdata1_real, ymm_two_hdata1_plus, ymm_sqrt2_over_2) 52 | VFNMADD231PS(ymm_two_gdata1_imag, ymm_two_hdata1_minus, ymm_sqrt2_over_2) 53 | SWAP.REGISTERS(ymm_two_data1_real, ymm_two_gdata1_real) 54 | SWAP.REGISTERS(ymm_two_data1_imag, ymm_two_gdata1_imag) 55 | VFNMADD231PS(ymm_two_data3_real, ymm_two_hdata1_plus, ymm_sqrt2_over_2) 56 | VFNMSUB231PS(ymm_two_data3_imag, ymm_two_hdata1_minus, ymm_sqrt2_over_2) 57 | 58 | # /* Store outputs */ 59 | # fdata[0] = crealf(data0) + cimagf(data0); 60 | # fdata[1] = crealf(data0) - cimagf(data0); 61 | # fdata[2] = 0.5f * two_data1_real; 62 | # fdata[3] = 0.5f * two_data1_imag; 63 | # fdata[4] = crealf(data2); 64 | # fdata[5] = -cimagf(data2); 65 | # fdata[6] = 0.5f * two_data3_real; 66 | # fdata[7] = 0.5f * two_data3_imag; 67 | 68 | ymm_half = YMMRegister() 69 | VMOVAPS(ymm_half, Constant.float32x8(0.5)) 70 | VMULPS(ymm_real[1], ymm_two_data1_real, ymm_half) 71 | VMULPS(ymm_imag[1], ymm_two_data1_imag, ymm_half) 72 | VXORPS(ymm_imag[2], ymm_imag[2], Constant.float32x8(-0.0)) 73 | VMULPS(ymm_real[3], ymm_two_data3_real, ymm_half) 74 | VMULPS(ymm_imag[3], ymm_two_data3_imag, ymm_half) 75 | -------------------------------------------------------------------------------- /src/x86_64-fma/ifft-dualreal.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa 2 | import fft.two_complex_soa_perm_to_two_real_planar 3 | 4 | 5 | arg_f = Argument(ptr(const_float_), name="f") 6 | arg_t = Argument(ptr(float_), name="t") 7 | 8 | 9 | with Function("nnp_ifft8_dualreal__avx2", 10 | (arg_f, arg_t), 11 | target=uarch.default + isa.fma3 + isa.avx2): 12 | 13 | reg_f = GeneralPurposeRegister64() 14 | LOAD.ARGUMENT(reg_f, arg_f) 15 | 16 | reg_t = GeneralPurposeRegister64() 17 | LOAD.ARGUMENT(reg_t, arg_t) 18 | 19 | ymm_xhr, ymm_xhi = YMMRegister(), YMMRegister() 20 | VMOVUPS(ymm_xhr, [reg_f]) 21 | VMOVUPS(ymm_xhi, [reg_f + YMMRegister.size]) 22 | 23 | fft.two_complex_soa_perm_to_two_real_planar.ifft8_within_rows_preprocess(ymm_xhr, ymm_xhi) 24 | ymm_wr, ymm_wi = ymm_xhr, ymm_xhi 25 | 26 | fft.complex_soa.fft8_within_rows(ymm_wr, ymm_wi, transformation="inverse") 27 | ymm_seq_a, ymm_seq_b = ymm_wr, ymm_wi 28 | 29 | VMOVUPS([reg_t], ymm_seq_a) 30 | VMOVUPS([reg_t + YMMRegister.size], ymm_seq_b) 31 | 32 | RETURN() 33 | 34 | 35 | with Function("nnp_ifft16_dualreal__avx2", 36 | (arg_f, arg_t), 37 | target=uarch.default + isa.fma3 + isa.avx2): 38 | 39 | reg_f = GeneralPurposeRegister64() 40 | LOAD.ARGUMENT(reg_f, arg_f) 41 | 42 | reg_t = GeneralPurposeRegister64() 43 | LOAD.ARGUMENT(reg_t, arg_t) 44 | 45 | ymm_wr = YMMRegister(), YMMRegister() 46 | ymm_wi = YMMRegister(), YMMRegister() 47 | 48 | for i, ymm_w in enumerate(ymm_wr + ymm_wi): 49 | VMOVUPS(ymm_w, [reg_f + i * YMMRegister.size]) 50 | 51 | fft.two_complex_soa_perm_to_two_real_planar.ifft16_within_rows_preprocess(ymm_wr, ymm_wi) 52 | 53 | fft.complex_soa.ifft16_within_rows(ymm_wr, ymm_wi) 54 | 55 | for i, ymm_w in enumerate(ymm_wr + ymm_wi): 56 | VMOVUPS([reg_t 
+ i * YMMRegister.size], ymm_w) 57 | 58 | RETURN() 59 | -------------------------------------------------------------------------------- /src/x86_64-fma/ifft-real.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa_perm_to_real 2 | from common import butterfly, cos_npi_over_8, sqrt2_over_2 3 | 4 | 5 | def fft8_bitreverse(n): 6 | return int(format(n, "03b")[::-1], 2) 7 | 8 | 9 | arg_f = Argument(ptr(const_float_), name="f") 10 | arg_t = Argument(ptr(float_), name="t") 11 | 12 | 13 | with Function("nnp_ifft8_8real__fma3", 14 | (arg_f, arg_t), 15 | target=uarch.default + isa.fma3): 16 | 17 | reg_f = GeneralPurposeRegister64() 18 | LOAD.ARGUMENT(reg_f, arg_f) 19 | 20 | reg_t = GeneralPurposeRegister64() 21 | LOAD.ARGUMENT(reg_t, arg_t) 22 | 23 | ymm_data = [YMMRegister() for _ in range(8)] 24 | ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2] 25 | 26 | for i, ymm_i in enumerate(ymm_data): 27 | VMOVUPS(ymm_i, [reg_f + i * YMMRegister.size]) 28 | 29 | fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data) 30 | 31 | for i, ymm_i in enumerate(ymm_data): 32 | VMOVUPS([reg_t + i * YMMRegister.size], ymm_i) 33 | 34 | RETURN() 35 | 36 | 37 | import fft16x16 38 | 39 | 40 | with Function("nnp_ifft16_8real__fma3", 41 | (arg_f, arg_t), 42 | target=uarch.default + isa.fma3): 43 | 44 | reg_f = GeneralPurposeRegister64() 45 | LOAD.ARGUMENT(reg_f, arg_f) 46 | 47 | reg_t0 = GeneralPurposeRegister64() 48 | LOAD.ARGUMENT(reg_t0, arg_t) 49 | 50 | reg_stride = GeneralPurposeRegister64() 51 | MOV(reg_stride, YMMRegister.size) 52 | 53 | reg_t8 = GeneralPurposeRegister64() 54 | LEA(reg_t8, [reg_t0 + 8 * YMMRegister.size]) 55 | 56 | fft16x16.inverse_vfft(reg_t0, reg_t8, reg_stride, 57 | data_in=[yword[reg_f + YMMRegister.size * i] for i in range(16)]) 58 | 59 | RETURN() 60 | -------------------------------------------------------------------------------- /src/x86_64-fma/softmax.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | float max__avx(size_t n, const float v[restrict static n]); 9 | float sum_exp_minus_c__avx2(size_t n, const float v[restrict static n], float c); 10 | void scaled_exp_minus_c__avx2(size_t n, const float x[restrict static n], float y[restrict static n], float scale, float c); 11 | void inplace_scaled_exp_minus_c__avx2(size_t n, const float v[restrict static n], float scale, float c); 12 | 13 | void nnp_softmax__avx2( 14 | size_t n, 15 | const float x[restrict static n], 16 | float y[restrict static n]) 17 | { 18 | const float c = max__avx(n, x); 19 | const float sum = sum_exp_minus_c__avx2(n, x, c); 20 | const float scale = 1.0f / sum; 21 | scaled_exp_minus_c__avx2(n, x, y, scale, c); 22 | } 23 | 24 | void nnp_inplace_softmax__avx2( 25 | size_t n, 26 | float v[restrict static n]) 27 | { 28 | const float c = max__avx(n, v); 29 | const float sum = sum_exp_minus_c__avx2(n, v, c); 30 | const float scale = 1.0f / sum; 31 | inplace_scaled_exp_minus_c__avx2(n, v, scale, c); 32 | } 33 | -------------------------------------------------------------------------------- /src/x86_64-fma/vecmath/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/vecmath/__init__.py -------------------------------------------------------------------------------- /src/x86_64-fma/winograd-f6k3.py: 
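The generator below emits FMA versions of the F(6x6, 3x3) Winograd transforms whose scalar forms appeared earlier. With m = 6 outputs and r = 3 taps per 1-D transform, a tile spans m + r - 1 = 8 points, which is why the input and kernel transforms produce 8 rows while the output transform reduces 8 rows to 6, and why F(6,3) spends 8 multiplies per 6 outputs where direct convolution spends 18. The minimal member of the same family, F(2,3), shows the three-stage structure in plain C (textbook Toom-Cook/Winograd coefficients, illustrative only; NNPACK's F(6,3) uses different, rescaled constants, as the "rescale coefficients" flag in the scalar kernel transform hints):

#include <stdio.h>

/* F(2,3): 2 outputs of a 3-tap correlation from 4 inputs with 4 products
 * instead of 6. Same input/kernel/output transform structure as F(6,3). */
static void winograd_f2k3(const float d[4], const float g[3], float y[2])
{
	/* input transform: t = B^T d */
	const float t0 = d[0] - d[2], t1 = d[1] + d[2];
	const float t2 = d[2] - d[1], t3 = d[1] - d[3];
	/* kernel transform: u = G g */
	const float u0 = g[0];
	const float u1 = 0.5f * (g[0] + g[1] + g[2]);
	const float u2 = 0.5f * (g[0] - g[1] + g[2]);
	const float u3 = g[2];
	/* elementwise product, then output transform: y = A^T (t .* u) */
	const float m0 = t0 * u0, m1 = t1 * u1, m2 = t2 * u2, m3 = t3 * u3;
	y[0] = m0 + m1 + m2;
	y[1] = m1 - m2 - m3;
}

int main(void)
{
	const float d[4] = { 1, 2, 3, 4 }, g[3] = { 1, 1, 1 };
	float y[2];
	winograd_f2k3(d, g, y);
	printf("%g %g\n", y[0], y[1]); /* prints 6 9: sliding sums of three inputs */
	return 0;
}

The 2-D kernels apply the same 1-D transforms along rows and then columns of each 8x8 tile, so the multiply count per tile drops from 6*6*3*3 = 324 for direct convolution to the 8*8 = 64 elementwise products computed between the transforms.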
-------------------------------------------------------------------------------- 1 | import winograd.o6x6k3x3 2 | 3 | 4 | arg_d_pointer = Argument(ptr(const_float_), name="d") 5 | arg_w_pointer = Argument(ptr(float_), name="w") 6 | with Function("nnp_iwt_f6k3__fma3", (arg_d_pointer, arg_w_pointer), 7 | target=uarch.default + isa.fma3): 8 | 9 | reg_d = GeneralPurposeRegister64() 10 | LOAD.ARGUMENT(reg_d, arg_d_pointer) 11 | 12 | reg_w = GeneralPurposeRegister64() 13 | LOAD.ARGUMENT(reg_w, arg_w_pointer) 14 | 15 | ymm_data = [YMMRegister() for _ in range(8)] 16 | for i, ymm_row in enumerate(ymm_data): 17 | VMOVUPS(ymm_row, [reg_d + i * YMMRegister.size]) 18 | 19 | ymm_data = winograd.o6x6k3x3.input_transform(ymm_data) 20 | 21 | for i, ymm_row in enumerate(ymm_data): 22 | VMOVUPS([reg_w + i * YMMRegister.size], ymm_row) 23 | 24 | RETURN() 25 | 26 | 27 | arg_g_pointer = Argument(ptr(const_float_), name="g") 28 | arg_w_pointer = Argument(ptr(float_), name="w") 29 | with Function("nnp_kwt_f6k3__fma3", (arg_g_pointer, arg_w_pointer), 30 | target=uarch.default + isa.fma3): 31 | 32 | reg_g = GeneralPurposeRegister64() 33 | LOAD.ARGUMENT(reg_g, arg_g_pointer) 34 | 35 | reg_w = GeneralPurposeRegister64() 36 | LOAD.ARGUMENT(reg_w, arg_w_pointer) 37 | 38 | ymm_data = [YMMRegister() for _ in range(3)] 39 | for i, ymm_row in enumerate(ymm_data): 40 | VMOVUPS(ymm_row, [reg_g + i * YMMRegister.size]) 41 | 42 | ymm_data = winograd.o6x6k3x3.kernel_transform(ymm_data) 43 | 44 | for i, ymm_row in enumerate(ymm_data): 45 | VMOVUPS([reg_w + i * YMMRegister.size], ymm_row) 46 | 47 | RETURN() 48 | 49 | 50 | arg_m_pointer = Argument(ptr(const_float_), name="m") 51 | arg_s_pointer = Argument(ptr(float_), name="s") 52 | with Function("nnp_owt_f6k3__fma3", (arg_m_pointer, arg_s_pointer), 53 | target=uarch.default + isa.fma3): 54 | 55 | reg_m = GeneralPurposeRegister64() 56 | LOAD.ARGUMENT(reg_m, arg_m_pointer) 57 | 58 | reg_s = GeneralPurposeRegister64() 59 | LOAD.ARGUMENT(reg_s, arg_s_pointer) 60 | 61 | ymm_m = [YMMRegister() for _ in range(8)] 62 | for i, ymm_row in enumerate(ymm_m): 63 | VMOVUPS(ymm_row, [reg_m + i * YMMRegister.size]) 64 | 65 | ymm_s = winograd.o6x6k3x3.output_transform(ymm_m) 66 | 67 | for i, ymm_row in enumerate(ymm_s): 68 | VMOVUPS([reg_s + i * YMMRegister.size], ymm_row) 69 | 70 | RETURN() 71 | -------------------------------------------------------------------------------- /src/x86_64-fma/winograd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/winograd/__init__.py -------------------------------------------------------------------------------- /test/convolution-input-gradient/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | AlexNet::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | AlexNet::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * AlexNet conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | AlexNet::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-5) 34 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 
| AlexNet::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, conv3) { 45 | AlexNet::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-5) 48 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * AlexNet conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | AlexNet::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-5) 59 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | AlexNet::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, conv4) { 70 | AlexNet::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-5) 73 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * AlexNet conv5 layer 78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | AlexNet::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-5) 84 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | AlexNet::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, conv5) { 95 | AlexNet::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-5) 98 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/convolution-input-gradient/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | OverFeat_Fast::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | OverFeat_Fast::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * OverFeat (Fast model) conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | OverFeat_Fast::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-5) 34 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 | OverFeat_Fast::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, conv3) { 45 | OverFeat_Fast::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-5) 48 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * OverFeat (Fast model) conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | OverFeat_Fast::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-5) 59 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | OverFeat_Fast::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, conv4) { 70 | OverFeat_Fast::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-5) 73 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * OverFeat (Fast model) conv5 layer 
78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | OverFeat_Fast::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-5) 84 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | OverFeat_Fast::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, conv5) { 95 | OverFeat_Fast::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-5) 98 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/convolution-kernel-gradient/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | AlexNet::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-6) 16 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | AlexNet::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * AlexNet conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | AlexNet::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-6) 34 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 | AlexNet::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, DISABLED_conv3) { 45 | AlexNet::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-3) 48 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * AlexNet conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | AlexNet::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-6) 59 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | AlexNet::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, DISABLED_conv4) { 70 | AlexNet::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-3) 73 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * AlexNet conv5 layer 78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | AlexNet::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-6) 84 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | AlexNet::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, DISABLED_conv5) { 95 | AlexNet::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-3) 98 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/convolution-kernel-gradient/overfeat-fast.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | OverFeat_Fast::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-6) 16 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | OverFeat_Fast::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * OverFeat (Fast model) conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | OverFeat_Fast::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-6) 34 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 | OverFeat_Fast::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, DISABLED_conv3) { 45 | OverFeat_Fast::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-3) 48 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * OverFeat (Fast model) conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | OverFeat_Fast::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-6) 59 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | OverFeat_Fast::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, DISABLED_conv4) { 70 | OverFeat_Fast::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-3) 73 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * OverFeat (Fast model) conv5 layer 78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | OverFeat_Fast::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-6) 84 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | OverFeat_Fast::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, DISABLED_conv5) { 95 | OverFeat_Fast::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-3) 98 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/fully-connected-inference/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet fc6 layer 10 | */ 11 | 12 | TEST(F32, fc6) { 13 | AlexNet::fc6() 14 | .errorLimit(2.0e-5) 15 | .testInferenceF32(); 16 | } 17 | 18 | TEST(F16F32, fc6) { 19 | AlexNet::fc6() 20 | .errorLimit(2.0e-5) 21 | .testInferenceF16F32(); 22 | } 23 | 24 | /* 25 | * AlexNet fc7 layer 26 | */ 27 | 28 | TEST(F32, fc7) { 29 | AlexNet::fc7() 30 | .errorLimit(1.0e-5) 31 | .testInferenceF32(); 32 | } 33 | 34 | TEST(F16F32, fc7) { 35 | AlexNet::fc7() 36 | .errorLimit(1.0e-5) 37 | .testInferenceF16F32(); 38 | } 39 | 40 | /* 41 | * AlexNet fc8 layer 42 | */ 43 | 44 | TEST(F32, fc8) { 45 | AlexNet::fc8() 46 | .errorLimit(1.0e-5) 47 | .testInferenceF32(); 48 | } 49 | 50 | 
TEST(F16F32, fc8) { 51 | AlexNet::fc8() 52 | .errorLimit(1.0e-5) 53 | .testInferenceF16F32(); 54 | } 55 | 56 | int main(int argc, char* argv[]) { 57 | const enum nnp_status init_status = nnp_initialize(); 58 | assert(init_status == nnp_status_success); 59 | setenv("TERM", "xterm-256color", 0); 60 | ::testing::InitGoogleTest(&argc, argv); 61 | return RUN_ALL_TESTS(); 62 | } 63 | -------------------------------------------------------------------------------- /test/fully-connected-inference/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) fc6 layer 10 | */ 11 | 12 | TEST(F32, fc6) { 13 | OverFeat_Fast::fc6() 14 | .errorLimit(2.0e-5) 15 | .testInferenceF32(); 16 | } 17 | 18 | TEST(F16F32, fc6) { 19 | OverFeat_Fast::fc6() 20 | .errorLimit(2.0e-5) 21 | .testInferenceF16F32(); 22 | } 23 | 24 | /* 25 | * OverFeat (Fast model) fc7 layer 26 | */ 27 | 28 | TEST(F32, fc7) { 29 | OverFeat_Fast::fc7() 30 | .errorLimit(1.0e-5) 31 | .testInferenceF32(); 32 | } 33 | 34 | TEST(F16F32, fc7) { 35 | OverFeat_Fast::fc7() 36 | .errorLimit(1.0e-5) 37 | .testInferenceF16F32(); 38 | } 39 | 40 | /* 41 | * OverFeat (Fast model) fc8 layer 42 | */ 43 | 44 | TEST(F32, fc8) { 45 | OverFeat_Fast::fc8() 46 | .errorLimit(1.0e-5) 47 | .testInferenceF32(); 48 | } 49 | 50 | TEST(F16F32, fc8) { 51 | OverFeat_Fast::fc8() 52 | .errorLimit(1.0e-5) 53 | .testInferenceF16F32(); 54 | } 55 | 56 | int main(int argc, char* argv[]) { 57 | const enum nnp_status init_status = nnp_initialize(); 58 | assert(init_status == nnp_status_success); 59 | setenv("TERM", "xterm-256color", 0); 60 | ::testing::InitGoogleTest(&argc, argv); 61 | return RUN_ALL_TESTS(); 62 | } 63 | -------------------------------------------------------------------------------- /test/fully-connected-inference/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A fc6 layer 10 | */ 11 | 12 | TEST(F32, fc6) { 13 | VGG_A::fc6() 14 | .errorLimit(2.0e-5) 15 | .testInferenceF32(); 16 | } 17 | 18 | TEST(F16F32, fc6) { 19 | VGG_A::fc6() 20 | .errorLimit(2.0e-5) 21 | .testInferenceF16F32(); 22 | } 23 | 24 | /* 25 | * VGG model A fc7 layer 26 | */ 27 | 28 | TEST(F32, fc7) { 29 | VGG_A::fc7() 30 | .errorLimit(1.0e-5) 31 | .testInferenceF32(); 32 | } 33 | 34 | TEST(F16F32, fc7) { 35 | VGG_A::fc7() 36 | .errorLimit(1.0e-5) 37 | .testInferenceF16F32(); 38 | } 39 | 40 | /* 41 | * VGG model A fc8 layer 42 | */ 43 | 44 | TEST(F32, fc8) { 45 | VGG_A::fc8() 46 | .errorLimit(1.0e-5) 47 | .testInferenceF32(); 48 | } 49 | 50 | TEST(F16F32, fc8) { 51 | VGG_A::fc8() 52 | .errorLimit(1.0e-5) 53 | .testInferenceF16F32(); 54 | } 55 | 56 | int main(int argc, char* argv[]) { 57 | const enum nnp_status init_status = nnp_initialize(); 58 | assert(init_status == nnp_status_success); 59 | setenv("TERM", "xterm-256color", 0); 60 | ::testing::InitGoogleTest(&argc, argv); 61 | return RUN_ALL_TESTS(); 62 | } 63 | -------------------------------------------------------------------------------- /test/fully-connected-output/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet fc6 layer 10 | */ 11 | 12 | TEST(FC, fc6) { 13 | AlexNet::fc6() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testOutput(); 17 | } 18 | 19 
| /* 20 | * AlexNet fc7 layer 21 | */ 22 | 23 | TEST(FC, fc7) { 24 | AlexNet::fc7() 25 | .batchSize(128) 26 | .errorLimit(1.0e-5) 27 | .testOutput(); 28 | } 29 | 30 | /* 31 | * AlexNet fc8 layer 32 | */ 33 | 34 | TEST(FC, fc8) { 35 | AlexNet::fc8() 36 | .batchSize(128) 37 | .errorLimit(1.0e-5) 38 | .testOutput(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | const enum nnp_status init_status = nnp_initialize(); 43 | assert(init_status == nnp_status_success); 44 | setenv("TERM", "xterm-256color", 0); 45 | ::testing::InitGoogleTest(&argc, argv); 46 | return RUN_ALL_TESTS(); 47 | } 48 | -------------------------------------------------------------------------------- /test/fully-connected-output/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) fc6 layer 10 | */ 11 | 12 | TEST(FC, fc6) { 13 | OverFeat_Fast::fc6() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testOutput(); 17 | } 18 | 19 | /* 20 | * OverFeat (Fast model) fc7 layer 21 | */ 22 | 23 | TEST(FC, fc7) { 24 | OverFeat_Fast::fc7() 25 | .batchSize(128) 26 | .errorLimit(1.0e-5) 27 | .testOutput(); 28 | } 29 | 30 | /* 31 | * OverFeat (Fast model) fc8 layer 32 | */ 33 | 34 | TEST(FC, fc8) { 35 | OverFeat_Fast::fc8() 36 | .batchSize(128) 37 | .errorLimit(1.0e-5) 38 | .testOutput(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | const enum nnp_status init_status = nnp_initialize(); 43 | assert(init_status == nnp_status_success); 44 | setenv("TERM", "xterm-256color", 0); 45 | ::testing::InitGoogleTest(&argc, argv); 46 | return RUN_ALL_TESTS(); 47 | } 48 | -------------------------------------------------------------------------------- /test/fully-connected-output/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A fc6 layer 10 | */ 11 | 12 | TEST(FC, fc6) { 13 | VGG_A::fc6() 14 | .batchSize(64) 15 | .errorLimit(1.0e-5) 16 | .testOutput(); 17 | } 18 | 19 | /* 20 | * VGG model A fc7 layer 21 | */ 22 | 23 | TEST(FC, fc7) { 24 | VGG_A::fc7() 25 | .batchSize(64) 26 | .errorLimit(1.0e-5) 27 | .testOutput(); 28 | } 29 | 30 | /* 31 | * VGG model A fc8 layer 32 | */ 33 | 34 | TEST(FC, fc8) { 35 | VGG_A::fc8() 36 | .batchSize(64) 37 | .errorLimit(1.0e-5) 38 | .testOutput(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | const enum nnp_status init_status = nnp_initialize(); 43 | assert(init_status == nnp_status_success); 44 | setenv("TERM", "xterm-256color", 0); 45 | ::testing::InitGoogleTest(&argc, argv); 46 | return RUN_ALL_TESTS(); 47 | } 48 | -------------------------------------------------------------------------------- /test/hxgemm/neon.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 9 | TEST(FAST_H4GEMM_3x3, neonhp) { 10 | ASSERT_TRUE(cpuinfo_initialize()); 11 | if (cpuinfo_has_arm_neon_fma()) { 12 | GemmMicroKernelTester tester = GemmMicroKernelTester() 13 | .simdWidth(4) 14 | .mr(3) 15 | .nr(3) 16 | .errorLimit(1.0e-3f); 17 | 18 | for (uint32_t kc = 1; kc < 10; kc++) { 19 | tester 20 | .kc(kc) 21 | .accumulateC(true) 22 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__neonhp)); 23 | tester 24 | .accumulateC(false) 25 | 
.testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__neonhp)); 26 | } 27 | } 28 | } 29 | 30 | TEST(FULL_H4GEMM_3x3, neon) { 31 | ASSERT_TRUE(cpuinfo_initialize()); 32 | if (cpuinfo_has_arm_neon_fma()) { 33 | GemmMicroKernelTester tester = GemmMicroKernelTester() 34 | .simdWidth(4) 35 | .mr(3) 36 | .nr(3) 37 | .errorLimit(1.0e-3f); 38 | 39 | for (uint32_t kc = 1; kc < 10; kc++) { 40 | tester 41 | .kc(kc) 42 | .accumulateC(true) 43 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__neonhp)); 44 | tester 45 | .accumulateC(false) 46 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__neonhp)); 47 | } 48 | } 49 | } 50 | #endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */ 51 | 52 | #if CPUINFO_ARCH_ARM 53 | TEST(FAST_H4GEMM_3x3, aarch32_neonhp) { 54 | ASSERT_TRUE(cpuinfo_initialize()); 55 | if (cpuinfo_has_arm_neon_fma()) { 56 | GemmMicroKernelTester tester = GemmMicroKernelTester() 57 | .simdWidth(4) 58 | .mr(3) 59 | .nr(3) 60 | .errorLimit(1.0e-3f); 61 | 62 | for (uint32_t kc = 1; kc < 10; kc++) { 63 | tester 64 | .kc(kc) 65 | .accumulateC(true) 66 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhp)); 67 | tester 68 | .accumulateC(false) 69 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhp)); 70 | } 71 | } 72 | } 73 | 74 | TEST(FAST_H4GEMM_3x3, aarch32_neon2) { 75 | ASSERT_TRUE(cpuinfo_initialize()); 76 | if (cpuinfo_has_arm_neon_fma()) { 77 | GemmMicroKernelTester tester = GemmMicroKernelTester() 78 | .simdWidth(4) 79 | .mr(3) 80 | .nr(3) 81 | .errorLimit(1.0e-3f); 82 | 83 | for (uint32_t kc = 1; kc < 10; kc++) { 84 | tester 85 | .kc(kc) 86 | .accumulateC(true) 87 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neon2)); 88 | tester 89 | .accumulateC(false) 90 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neon2)); 91 | } 92 | } 93 | } 94 | 95 | TEST(FULL_H4GEMM_3x3, aarch32_neon2) { 96 | ASSERT_TRUE(cpuinfo_initialize()); 97 | if (cpuinfo_has_arm_neon_fma()) { 98 | GemmMicroKernelTester tester = GemmMicroKernelTester() 99 | .simdWidth(4) 100 | .mr(3) 101 | .nr(3) 102 | .errorLimit(1.0e-3f); 103 | 104 | for (uint32_t kc = 1; kc < 10; kc++) { 105 | tester 106 | .kc(kc) 107 | .accumulateC(true) 108 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__aarch32_neon2)); 109 | tester 110 | .accumulateC(false) 111 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__aarch32_neon2)); 112 | } 113 | } 114 | } 115 | 116 | TEST(FAST_H4GEMM_3x3, aarch32_neonhparith) { 117 | ASSERT_TRUE(cpuinfo_initialize()); 118 | if (cpuinfo_has_arm_neon_fp16_arith()) { 119 | GemmMicroKernelTester tester = GemmMicroKernelTester() 120 | .simdWidth(4) 121 | .mr(3) 122 | .nr(3) 123 | .errorLimit(1.0e-3f); 124 | 125 | for (uint32_t kc = 1; kc < 10; kc++) { 126 | tester 127 | .kc(kc) 128 | .accumulateC(true) 129 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhparith)); 130 | tester 131 | .accumulateC(false) 132 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhparith)); 133 | } 134 | } 135 | } 136 | #endif /* CPUINFO_ARCH_ARM */ 137 | -------------------------------------------------------------------------------- /test/max-pooling-output/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) pool1 layer 10 | */ 11 | 12 | TEST(MaxPooling2x2, pool1) { 13 | 
OverFeat_Fast::pool1() 14 | .batchSize(128) 15 | .testOutput(); 16 | } 17 | 18 | /* 19 | * OverFeat (Fast model) pool2 layer 20 | */ 21 | 22 | TEST(MaxPooling2x2, pool2) { 23 | OverFeat_Fast::pool2() 24 | .batchSize(128) 25 | .testOutput(); 26 | } 27 | 28 | /* 29 | * OverFeat (Fast model) pool3 layer 30 | */ 31 | 32 | TEST(MaxPooling2x2, pool3) { 33 | OverFeat_Fast::pool3() 34 | .batchSize(128) 35 | .testOutput(); 36 | } 37 | 38 | int main(int argc, char* argv[]) { 39 | const enum nnp_status init_status = nnp_initialize(); 40 | assert(init_status == nnp_status_success); 41 | setenv("TERM", "xterm-256color", 0); 42 | ::testing::InitGoogleTest(&argc, argv); 43 | return RUN_ALL_TESTS(); 44 | } 45 | -------------------------------------------------------------------------------- /test/max-pooling-output/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A pool1 layer 10 | */ 11 | 12 | TEST(MaxPooling2x2, pool1) { 13 | VGG_A::pool1() 14 | .batchSize(64) 15 | .testOutput(); 16 | } 17 | 18 | /* 19 | * VGG model A pool2 layer 20 | */ 21 | 22 | TEST(MaxPooling2x2, pool2) { 23 | VGG_A::pool2() 24 | .batchSize(64) 25 | .testOutput(); 26 | } 27 | 28 | /* 29 | * VGG model A pool3 layer 30 | */ 31 | 32 | TEST(MaxPooling2x2, pool3) { 33 | VGG_A::pool3() 34 | .batchSize(64) 35 | .testOutput(); 36 | } 37 | 38 | /* 39 | * VGG model A pool4 layer 40 | */ 41 | 42 | TEST(MaxPooling2x2, pool4) { 43 | VGG_A::pool4() 44 | .batchSize(64) 45 | .testOutput(); 46 | } 47 | 48 | /* 49 | * VGG model A pool5 layer 50 | */ 51 | 52 | TEST(MaxPooling2x2, pool5) { 53 | VGG_A::pool5() 54 | .batchSize(64) 55 | .testOutput(); 56 | } 57 | 58 | int main(int argc, char* argv[]) { 59 | const enum nnp_status init_status = nnp_initialize(); 60 | assert(init_status == nnp_status_success); 61 | setenv("TERM", "xterm-256color", 0); 62 | ::testing::InitGoogleTest(&argc, argv); 63 | return RUN_ALL_TESTS(); 64 | } 65 | -------------------------------------------------------------------------------- /test/relu-input-gradient/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | AlexNet::conv1_relu() 14 | .batchSize(128) 15 | .testInputGradient(); 16 | } 17 | 18 | /* 19 | * AlexNet conv1 ReLU layer 20 | */ 21 | 22 | TEST(OUT_OF_PLACE, conv2_relu) { 23 | AlexNet::conv2_relu() 24 | .batchSize(128) 25 | .testInputGradient(); 26 | } 27 | 28 | /* 29 | * AlexNet conv3 ReLU layer 30 | */ 31 | 32 | TEST(OUT_OF_PLACE, conv3_relu) { 33 | AlexNet::conv3_relu() 34 | .batchSize(128) 35 | .testInputGradient(); 36 | } 37 | 38 | /* 39 | * AlexNet conv4 ReLU layer 40 | */ 41 | 42 | TEST(OUT_OF_PLACE, conv4_relu) { 43 | AlexNet::conv4_relu() 44 | .batchSize(128) 45 | .testInputGradient(); 46 | } 47 | 48 | /* 49 | * AlexNet fc6 ReLU layer 50 | */ 51 | 52 | TEST(OUT_OF_PLACE, fc6_relu) { 53 | AlexNet::fc6_relu() 54 | .batchSize(128) 55 | .testInputGradient(); 56 | } 57 | 58 | /* 59 | * AlexNet fc8 ReLU layer 60 | */ 61 | 62 | TEST(OUT_OF_PLACE, fc8_relu) { 63 | AlexNet::fc8_relu() 64 | .batchSize(128) 65 | .testInputGradient(); 66 | } 67 | 68 | int main(int argc, char* argv[]) { 69 | const enum nnp_status init_status = nnp_initialize(); 70 | assert(init_status == nnp_status_success); 71 | setenv("TERM", "xterm-256color", 0); 72 | 
::testing::InitGoogleTest(&argc, argv); 73 | return RUN_ALL_TESTS(); 74 | } 75 | -------------------------------------------------------------------------------- /test/relu-input-gradient/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | OverFeat_Fast::conv1_relu() 14 | .batchSize(128) 15 | .testInputGradient(); 16 | } 17 | 18 | /* 19 | * OverFeat (Fast model) conv1 ReLU layer 20 | */ 21 | 22 | TEST(OUT_OF_PLACE, conv2_relu) { 23 | OverFeat_Fast::conv2_relu() 24 | .batchSize(128) 25 | .testInputGradient(); 26 | } 27 | 28 | /* 29 | * OverFeat (Fast model) conv3 ReLU layer 30 | */ 31 | 32 | TEST(OUT_OF_PLACE, conv3_relu) { 33 | OverFeat_Fast::conv3_relu() 34 | .batchSize(128) 35 | .testInputGradient(); 36 | } 37 | 38 | /* 39 | * OverFeat (Fast model) conv4 ReLU layer 40 | */ 41 | 42 | TEST(OUT_OF_PLACE, conv4_relu) { 43 | OverFeat_Fast::conv4_relu() 44 | .batchSize(128) 45 | .testInputGradient(); 46 | } 47 | 48 | /* 49 | * OverFeat (Fast model) fc6 ReLU layer 50 | */ 51 | 52 | TEST(OUT_OF_PLACE, fc6_relu) { 53 | OverFeat_Fast::fc6_relu() 54 | .batchSize(128) 55 | .testInputGradient(); 56 | } 57 | 58 | /* 59 | * OverFeat (Fast model) fc7 ReLU layer 60 | */ 61 | 62 | TEST(OUT_OF_PLACE, fc7_relu) { 63 | OverFeat_Fast::fc7_relu() 64 | .batchSize(128) 65 | .testInputGradient(); 66 | } 67 | 68 | /* 69 | * OverFeat (Fast model) fc8 ReLU layer 70 | */ 71 | 72 | TEST(OUT_OF_PLACE, fc8_relu) { 73 | OverFeat_Fast::fc8_relu() 74 | .batchSize(128) 75 | .testInputGradient(); 76 | } 77 | 78 | int main(int argc, char* argv[]) { 79 | const enum nnp_status init_status = nnp_initialize(); 80 | assert(init_status == nnp_status_success); 81 | setenv("TERM", "xterm-256color", 0); 82 | ::testing::InitGoogleTest(&argc, argv); 83 | return RUN_ALL_TESTS(); 84 | } 85 | -------------------------------------------------------------------------------- /test/relu-input-gradient/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | VGG_A::conv1_relu() 14 | .batchSize(64) 15 | .testInputGradient(); 16 | } 17 | 18 | /* 19 | * VGG model A conv1 ReLU layer 20 | */ 21 | 22 | TEST(OUT_OF_PLACE, conv2_relu) { 23 | VGG_A::conv2_relu() 24 | .batchSize(64) 25 | .testInputGradient(); 26 | } 27 | 28 | /* 29 | * VGG model A conv3 ReLU layer 30 | */ 31 | 32 | TEST(OUT_OF_PLACE, conv3_relu) { 33 | VGG_A::conv3_relu() 34 | .batchSize(64) 35 | .testInputGradient(); 36 | } 37 | 38 | /* 39 | * VGG model A conv5 ReLU layer 40 | */ 41 | 42 | TEST(OUT_OF_PLACE, conv5_relu) { 43 | VGG_A::conv5_relu() 44 | .batchSize(64) 45 | .testInputGradient(); 46 | } 47 | 48 | /* 49 | * VGG model A conv8 ReLU layer 50 | */ 51 | 52 | TEST(OUT_OF_PLACE, conv8_relu) { 53 | VGG_A::conv8_relu() 54 | .batchSize(64) 55 | .testInputGradient(); 56 | } 57 | 58 | /* 59 | * VGG model A fc6 ReLU layer 60 | */ 61 | 62 | TEST(OUT_OF_PLACE, fc6_relu) { 63 | VGG_A::fc6_relu() 64 | .batchSize(64) 65 | .testInputGradient(); 66 | } 67 | 68 | /* 69 | * VGG model A fc8 ReLU layer 70 | */ 71 | 72 | TEST(OUT_OF_PLACE, fc8_relu) { 73 | VGG_A::fc8_relu() 74 | .batchSize(64) 75 | .testInputGradient(); 76 | } 77 | 78 | int main(int argc, char* argv[]) { 79 | const enum 
nnp_status init_status = nnp_initialize(); 80 | assert(init_status == nnp_status_success); 81 | setenv("TERM", "xterm-256color", 0); 82 | ::testing::InitGoogleTest(&argc, argv); 83 | return RUN_ALL_TESTS(); 84 | } 85 | -------------------------------------------------------------------------------- /test/relu-output/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | AlexNet::conv1_relu() 14 | .batchSize(128) 15 | .testOutput(); 16 | } 17 | 18 | TEST(IN_PLACE, conv1_relu) { 19 | AlexNet::conv1_relu() 20 | .batchSize(128) 21 | .testOutputInplace(); 22 | } 23 | 24 | /* 25 | * AlexNet conv1 ReLU layer 26 | */ 27 | 28 | TEST(OUT_OF_PLACE, conv2_relu) { 29 | AlexNet::conv2_relu() 30 | .batchSize(128) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, conv2_relu) { 35 | AlexNet::conv2_relu() 36 | .batchSize(128) 37 | .testOutputInplace(); 38 | } 39 | 40 | /* 41 | * AlexNet conv3 ReLU layer 42 | */ 43 | 44 | TEST(OUT_OF_PLACE, conv3_relu) { 45 | AlexNet::conv3_relu() 46 | .batchSize(128) 47 | .testOutput(); 48 | } 49 | 50 | TEST(IN_PLACE, conv3_relu) { 51 | AlexNet::conv3_relu() 52 | .batchSize(128) 53 | .testOutputInplace(); 54 | } 55 | 56 | /* 57 | * AlexNet conv4 ReLU layer 58 | */ 59 | 60 | TEST(OUT_OF_PLACE, conv4_relu) { 61 | AlexNet::conv4_relu() 62 | .batchSize(128) 63 | .testOutput(); 64 | } 65 | 66 | TEST(IN_PLACE, conv4_relu) { 67 | AlexNet::conv4_relu() 68 | .batchSize(128) 69 | .testOutputInplace(); 70 | } 71 | 72 | /* 73 | * AlexNet fc6 ReLU layer 74 | */ 75 | 76 | TEST(OUT_OF_PLACE, fc6_relu) { 77 | AlexNet::fc6_relu() 78 | .batchSize(128) 79 | .testOutput(); 80 | } 81 | 82 | TEST(IN_PLACE, fc6_relu) { 83 | AlexNet::fc6_relu() 84 | .batchSize(128) 85 | .testOutputInplace(); 86 | } 87 | 88 | /* 89 | * AlexNet fc8 ReLU layer 90 | */ 91 | 92 | TEST(OUT_OF_PLACE, fc8_relu) { 93 | AlexNet::fc8_relu() 94 | .batchSize(128) 95 | .testOutput(); 96 | } 97 | 98 | TEST(IN_PLACE, fc8_relu) { 99 | AlexNet::fc8_relu() 100 | .batchSize(128) 101 | .testOutputInplace(); 102 | } 103 | 104 | int main(int argc, char* argv[]) { 105 | const enum nnp_status init_status = nnp_initialize(); 106 | assert(init_status == nnp_status_success); 107 | setenv("TERM", "xterm-256color", 0); 108 | ::testing::InitGoogleTest(&argc, argv); 109 | return RUN_ALL_TESTS(); 110 | } 111 | -------------------------------------------------------------------------------- /test/relu-output/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | OverFeat_Fast::conv1_relu() 14 | .batchSize(128) 15 | .testOutput(); 16 | } 17 | 18 | TEST(IN_PLACE, conv1_relu) { 19 | OverFeat_Fast::conv1_relu() 20 | .batchSize(128) 21 | .testOutputInplace(); 22 | } 23 | 24 | /* 25 | * OverFeat (Fast model) conv1 ReLU layer 26 | */ 27 | 28 | TEST(OUT_OF_PLACE, conv2_relu) { 29 | OverFeat_Fast::conv2_relu() 30 | .batchSize(128) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, conv2_relu) { 35 | OverFeat_Fast::conv2_relu() 36 | .batchSize(128) 37 | .testOutputInplace(); 38 | } 39 | 40 | /* 41 | * OverFeat (Fast model) conv3 ReLU layer 42 | */ 43 | 44 | TEST(OUT_OF_PLACE, conv3_relu) { 45 | OverFeat_Fast::conv3_relu() 46 | .batchSize(128) 47 | 
.testOutput(); 48 | } 49 | 50 | TEST(IN_PLACE, conv3_relu) { 51 | OverFeat_Fast::conv3_relu() 52 | .batchSize(128) 53 | .testOutputInplace(); 54 | } 55 | 56 | /* 57 | * OverFeat (Fast model) conv4 ReLU layer 58 | */ 59 | 60 | TEST(OUT_OF_PLACE, conv4_relu) { 61 | OverFeat_Fast::conv4_relu() 62 | .batchSize(128) 63 | .testOutput(); 64 | } 65 | 66 | TEST(IN_PLACE, conv4_relu) { 67 | OverFeat_Fast::conv4_relu() 68 | .batchSize(128) 69 | .testOutputInplace(); 70 | } 71 | 72 | /* 73 | * OverFeat (Fast model) fc6 ReLU layer 74 | */ 75 | 76 | TEST(OUT_OF_PLACE, fc6_relu) { 77 | OverFeat_Fast::fc6_relu() 78 | .batchSize(128) 79 | .testOutput(); 80 | } 81 | 82 | TEST(IN_PLACE, fc6_relu) { 83 | OverFeat_Fast::fc6_relu() 84 | .batchSize(128) 85 | .testOutputInplace(); 86 | } 87 | 88 | /* 89 | * OverFeat (Fast model) fc7 ReLU layer 90 | */ 91 | 92 | TEST(OUT_OF_PLACE, fc7_relu) { 93 | OverFeat_Fast::fc7_relu() 94 | .batchSize(128) 95 | .testOutput(); 96 | } 97 | 98 | TEST(IN_PLACE, fc7_relu) { 99 | OverFeat_Fast::fc7_relu() 100 | .batchSize(128) 101 | .testOutputInplace(); 102 | } 103 | 104 | /* 105 | * OverFeat (Fast model) fc8 ReLU layer 106 | */ 107 | 108 | TEST(OUT_OF_PLACE, fc8_relu) { 109 | OverFeat_Fast::fc8_relu() 110 | .batchSize(128) 111 | .testOutput(); 112 | } 113 | 114 | TEST(IN_PLACE, fc8_relu) { 115 | OverFeat_Fast::fc8_relu() 116 | .batchSize(128) 117 | .testOutputInplace(); 118 | } 119 | 120 | int main(int argc, char* argv[]) { 121 | const enum nnp_status init_status = nnp_initialize(); 122 | assert(init_status == nnp_status_success); 123 | setenv("TERM", "xterm-256color", 0); 124 | ::testing::InitGoogleTest(&argc, argv); 125 | return RUN_ALL_TESTS(); 126 | } 127 | -------------------------------------------------------------------------------- /test/relu-output/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | VGG_A::conv1_relu() 14 | .batchSize(64) 15 | .testOutput(); 16 | } 17 | 18 | TEST(IN_PLACE, conv1_relu) { 19 | VGG_A::conv1_relu() 20 | .batchSize(64) 21 | .testOutputInplace(); 22 | } 23 | 24 | /* 25 | * VGG model A conv1 ReLU layer 26 | */ 27 | 28 | TEST(OUT_OF_PLACE, conv2_relu) { 29 | VGG_A::conv2_relu() 30 | .batchSize(64) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, conv2_relu) { 35 | VGG_A::conv2_relu() 36 | .batchSize(64) 37 | .testOutputInplace(); 38 | } 39 | 40 | /* 41 | * VGG model A conv3 ReLU layer 42 | */ 43 | 44 | TEST(OUT_OF_PLACE, conv3_relu) { 45 | VGG_A::conv3_relu() 46 | .batchSize(64) 47 | .testOutput(); 48 | } 49 | 50 | TEST(IN_PLACE, conv3_relu) { 51 | VGG_A::conv3_relu() 52 | .batchSize(64) 53 | .testOutputInplace(); 54 | } 55 | 56 | /* 57 | * VGG model A conv5 ReLU layer 58 | */ 59 | 60 | TEST(OUT_OF_PLACE, conv5_relu) { 61 | VGG_A::conv5_relu() 62 | .batchSize(64) 63 | .testOutput(); 64 | } 65 | 66 | TEST(IN_PLACE, conv5_relu) { 67 | VGG_A::conv5_relu() 68 | .batchSize(64) 69 | .testOutputInplace(); 70 | } 71 | 72 | /* 73 | * VGG model A conv8 ReLU layer 74 | */ 75 | 76 | TEST(OUT_OF_PLACE, conv8_relu) { 77 | VGG_A::conv8_relu() 78 | .batchSize(64) 79 | .testOutput(); 80 | } 81 | 82 | TEST(IN_PLACE, conv8_relu) { 83 | VGG_A::conv8_relu() 84 | .batchSize(64) 85 | .testOutputInplace(); 86 | } 87 | 88 | /* 89 | * VGG model A fc6 ReLU layer 90 | */ 91 | 92 | TEST(OUT_OF_PLACE, fc6_relu) { 93 | VGG_A::fc6_relu() 94 | .batchSize(64) 95 | .testOutput(); 
96 | } 97 | 98 | TEST(IN_PLACE, fc6_relu) { 99 | VGG_A::fc6_relu() 100 | .batchSize(64) 101 | .testOutputInplace(); 102 | } 103 | 104 | /* 105 | * VGG model A fc8 ReLU layer 106 | */ 107 | 108 | TEST(OUT_OF_PLACE, fc8_relu) { 109 | VGG_A::fc8_relu() 110 | .batchSize(64) 111 | .testOutput(); 112 | } 113 | 114 | TEST(IN_PLACE, fc8_relu) { 115 | VGG_A::fc8_relu() 116 | .batchSize(64) 117 | .testOutputInplace(); 118 | } 119 | 120 | int main(int argc, char* argv[]) { 121 | const enum nnp_status init_status = nnp_initialize(); 122 | assert(init_status == nnp_status_success); 123 | setenv("TERM", "xterm-256color", 0); 124 | ::testing::InitGoogleTest(&argc, argv); 125 | return RUN_ALL_TESTS(); 126 | } 127 | -------------------------------------------------------------------------------- /test/sgemm/neon.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | TEST(FAST6x8_NEON, kc1) { 9 | auto tester = GemmMicroKernelTester() 10 | .mr(6) 11 | .nr(8) 12 | .kc(1) 13 | .simdWidth(4) 14 | .errorLimit(1.0e-6f); 15 | tester.accumulateC(false) 16 | .testSGEMM(nnp_sgemm_only_6x8__neon); 17 | tester.accumulateC(true) 18 | .testSGEMM(nnp_sgemm_only_6x8__neon); 19 | } 20 | 21 | TEST(FAST6x8_NEON, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(6) 24 | .nr(8) 25 | .kc(2) 26 | .simdWidth(4) 27 | .errorLimit(1.0e-6f); 28 | tester.accumulateC(false) 29 | .testSGEMM(nnp_sgemm_only_6x8__neon); 30 | tester.accumulateC(true) 31 | .testSGEMM(nnp_sgemm_only_6x8__neon); 32 | } 33 | 34 | TEST(FAST6x8_NEON, kc10) { 35 | auto tester = GemmMicroKernelTester() 36 | .mr(6) 37 | .nr(8) 38 | .kc(10) 39 | .simdWidth(4) 40 | .errorLimit(1.0e-6f); 41 | tester.accumulateC(false) 42 | .testSGEMM(nnp_sgemm_only_6x8__neon); 43 | tester.accumulateC(true) 44 | .testSGEMM(nnp_sgemm_only_6x8__neon); 45 | } 46 | 47 | #if CPUINFO_ARCH_ARM 48 | TEST(FAST6x8_AARCH32_NEON, kc1) { 49 | auto tester = GemmMicroKernelTester() 50 | .mr(6) 51 | .nr(8) 52 | .kc(1) 53 | .simdWidth(4) 54 | .errorLimit(1.0e-6f); 55 | tester 56 | .accumulateC(true) 57 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 58 | tester 59 | .accumulateC(false) 60 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 61 | } 62 | 63 | TEST(FAST6x8_AARCH32_NEON, kc2) { 64 | auto tester = GemmMicroKernelTester() 65 | .mr(6) 66 | .nr(8) 67 | .kc(2) 68 | .simdWidth(4) 69 | .errorLimit(1.0e-6f); 70 | tester 71 | .accumulateC(true) 72 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 73 | tester 74 | .accumulateC(false) 75 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 76 | } 77 | 78 | TEST(FAST6x8_AARCH32_NEON, kc10) { 79 | auto tester = GemmMicroKernelTester() 80 | .mr(6) 81 | .nr(8) 82 | .kc(10) 83 | .simdWidth(4) 84 | .errorLimit(1.0e-6f); 85 | tester 86 | .accumulateC(true) 87 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 88 | tester 89 | .accumulateC(false) 90 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 91 | } 92 | #endif 93 | 94 | TEST(FULL6x8_NEON, kc1) { 95 | auto tester = GemmMicroKernelTester() 96 | .mr(6) 97 | .nr(8) 98 | .kc(1) 99 | .simdWidth(4) 100 | .errorLimit(1.0e-6f); 101 | tester 102 | .accumulateC(true) 103 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 104 | tester 105 | .accumulateC(false) 106 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 107 | } 108 | 109 | TEST(FULL6x8_NEON, kc2) { 110 | auto tester = GemmMicroKernelTester() 111 | .mr(6) 112 | .nr(8) 113 | .kc(2) 114 | .simdWidth(4) 115 | .errorLimit(1.0e-6f); 116 | tester 117 | .accumulateC(true) 118 | 
.testSGEMM(nnp_sgemm_upto_6x8__neon); 119 | tester 120 | .accumulateC(false) 121 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 122 | } 123 | 124 | TEST(FULL6x8_NEON, kc10) { 125 | auto tester = GemmMicroKernelTester() 126 | .mr(6) 127 | .nr(8) 128 | .kc(10) 129 | .simdWidth(4) 130 | .errorLimit(1.0e-6f); 131 | tester 132 | .accumulateC(true) 133 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 134 | tester 135 | .accumulateC(false) 136 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 137 | } 138 | -------------------------------------------------------------------------------- /test/sgemm/psimd.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | TEST(FAST4x8, kc1) { 7 | auto tester = GemmMicroKernelTester() 8 | .mr(4) 9 | .nr(8) 10 | .kc(1) 11 | .simdWidth(4) 12 | .errorLimit(1.0e-6f); 13 | tester 14 | .accumulateC(true) 15 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 16 | tester 17 | .accumulateC(false) 18 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 19 | } 20 | 21 | TEST(FAST4x8, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(4) 24 | .nr(8) 25 | .kc(2) 26 | .simdWidth(4) 27 | .errorLimit(1.0e-6f); 28 | tester 29 | .accumulateC(true) 30 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 31 | tester 32 | .accumulateC(false) 33 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 34 | } 35 | 36 | TEST(FAST4x8, kc10) { 37 | auto tester = GemmMicroKernelTester() 38 | .mr(4) 39 | .nr(8) 40 | .kc(10) 41 | .simdWidth(4) 42 | .errorLimit(1.0e-6f); 43 | tester 44 | .accumulateC(true) 45 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 46 | tester 47 | .accumulateC(false) 48 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 49 | } 50 | 51 | TEST(FULL4x8, kc1) { 52 | auto tester = GemmMicroKernelTester() 53 | .mr(4) 54 | .nr(8) 55 | .kc(1) 56 | .simdWidth(4) 57 | .errorLimit(1.0e-6f); 58 | tester 59 | .accumulateC(true) 60 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 61 | tester 62 | .accumulateC(false) 63 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 64 | } 65 | 66 | TEST(FULL4x8, kc2) { 67 | auto tester = GemmMicroKernelTester() 68 | .mr(4) 69 | .nr(8) 70 | .kc(2) 71 | .simdWidth(4) 72 | .errorLimit(1.0e-6f); 73 | tester 74 | .accumulateC(true) 75 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 76 | tester 77 | .accumulateC(false) 78 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 79 | } 80 | 81 | TEST(FULL4x8, kc10) { 82 | auto tester = GemmMicroKernelTester() 83 | .mr(4) 84 | .nr(8) 85 | .kc(10) 86 | .simdWidth(4) 87 | .errorLimit(1.0e-6f); 88 | tester 89 | .accumulateC(true) 90 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 91 | tester 92 | .accumulateC(false) 93 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 94 | } 95 | -------------------------------------------------------------------------------- /test/sgemm/scalar.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | TEST(FAST4x3, kc1) { 7 | auto tester = GemmMicroKernelTester() 8 | .mr(4) 9 | .nr(3) 10 | .kc(1) 11 | .simdWidth(1) 12 | .errorLimit(1.0e-6f); 13 | tester 14 | .accumulateC(true) 15 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 16 | tester 17 | .accumulateC(false) 18 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 19 | } 20 | 21 | TEST(FAST4x3, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(4) 24 | .nr(3) 25 | .kc(2) 26 | .simdWidth(1) 27 | .errorLimit(1.0e-6f); 28 | tester 29 | .accumulateC(true) 30 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 31 | tester 32 | .accumulateC(false) 33 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 34 | } 35 | 36 | TEST(FAST4x3, 
kc10) { 37 | auto tester = GemmMicroKernelTester() 38 | .mr(4) 39 | .nr(3) 40 | .kc(10) 41 | .simdWidth(1) 42 | .errorLimit(1.0e-6f); 43 | tester 44 | .accumulateC(true) 45 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 46 | tester 47 | .accumulateC(false) 48 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 49 | } 50 | 51 | TEST(FULL4x3, kc1) { 52 | auto tester = GemmMicroKernelTester() 53 | .mr(4) 54 | .nr(3) 55 | .kc(1) 56 | .simdWidth(1) 57 | .errorLimit(1.0e-6f); 58 | tester 59 | .accumulateC(true) 60 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 61 | tester 62 | .accumulateC(false) 63 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 64 | } 65 | 66 | TEST(FULL4x3, kc2) { 67 | auto tester = GemmMicroKernelTester() 68 | .mr(4) 69 | .nr(3) 70 | .kc(2) 71 | .simdWidth(1) 72 | .errorLimit(1.0e-6f); 73 | tester 74 | .accumulateC(true) 75 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 76 | tester 77 | .accumulateC(false) 78 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 79 | } 80 | 81 | TEST(FULL4x3, kc10) { 82 | auto tester = GemmMicroKernelTester() 83 | .mr(4) 84 | .nr(3) 85 | .kc(10) 86 | .simdWidth(1) 87 | .errorLimit(1.0e-6f); 88 | tester 89 | .accumulateC(true) 90 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 91 | tester 92 | .accumulateC(false) 93 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 94 | } 95 | -------------------------------------------------------------------------------- /test/sgemm/x86_64-fma3.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | TEST(FAST4x24, kc1) { 7 | auto tester = GemmMicroKernelTester() 8 | .mr(4) 9 | .nr(24) 10 | .kc(1) 11 | .simdWidth(8) 12 | .errorLimit(1.0e-6f); 13 | tester 14 | .accumulateC(true) 15 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 16 | tester 17 | .accumulateC(false) 18 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 19 | } 20 | 21 | TEST(FAST4x24, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(4) 24 | .nr(24) 25 | .kc(2) 26 | .simdWidth(8) 27 | .errorLimit(1.0e-6f); 28 | tester 29 | .accumulateC(true) 30 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 31 | tester 32 | .accumulateC(false) 33 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 34 | } 35 | 36 | TEST(FAST4x24, kc10) { 37 | auto tester = GemmMicroKernelTester() 38 | .mr(4) 39 | .nr(24) 40 | .kc(10) 41 | .simdWidth(8) 42 | .errorLimit(1.0e-6f); 43 | tester 44 | .accumulateC(true) 45 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 46 | tester 47 | .accumulateC(false) 48 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 49 | } 50 | 51 | TEST(FULL4x24, kc1) { 52 | auto tester = GemmMicroKernelTester() 53 | .mr(4) 54 | .nr(24) 55 | .kc(1) 56 | .simdWidth(8) 57 | .errorLimit(1.0e-6f); 58 | tester 59 | .accumulateC(true) 60 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 61 | tester 62 | .accumulateC(false) 63 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 64 | } 65 | 66 | TEST(FULL4x24, kc2) { 67 | auto tester = GemmMicroKernelTester() 68 | .mr(4) 69 | .nr(24) 70 | .kc(2) 71 | .simdWidth(8) 72 | .errorLimit(1.0e-6f); 73 | tester 74 | .accumulateC(true) 75 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 76 | tester 77 | .accumulateC(false) 78 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 79 | } 80 | 81 | TEST(FULL4x24, kc10) { 82 | auto tester = GemmMicroKernelTester() 83 | .mr(4) 84 | .nr(24) 85 | .kc(10) 86 | .simdWidth(8) 87 | .errorLimit(1.0e-6f); 88 | tester 89 | .accumulateC(true) 90 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 91 | tester 92 | .accumulateC(false) 93 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 94 | } 95 | 
-------------------------------------------------------------------------------- /test/softmax-output/imagenet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | /* 8 | * ImageNet (1000 categories) with batch size = 1 9 | */ 10 | 11 | TEST(OUT_OF_PLACE, batch1) { 12 | SoftmaxTester() 13 | .channels(1000) 14 | .testOutput(); 15 | } 16 | 17 | TEST(IN_PLACE, batch1) { 18 | SoftmaxTester() 19 | .channels(1000) 20 | .testOutputInplace(); 21 | } 22 | 23 | /* 24 | * ImageNet (1000 categories) with batch size = 2 25 | */ 26 | 27 | TEST(OUT_OF_PLACE, batch2) { 28 | SoftmaxTester() 29 | .batchSize(2) 30 | .channels(1000) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, batch2) { 35 | SoftmaxTester() 36 | .batchSize(2) 37 | .channels(1000) 38 | .testOutputInplace(); 39 | } 40 | 41 | /* 42 | * ImageNet (1000 categories) with batch size = 16 43 | */ 44 | 45 | TEST(OUT_OF_PLACE, batch16) { 46 | SoftmaxTester() 47 | .batchSize(16) 48 | .channels(1000) 49 | .testOutput(); 50 | } 51 | 52 | TEST(IN_PLACE, batch16) { 53 | SoftmaxTester() 54 | .batchSize(16) 55 | .channels(1000) 56 | .testOutputInplace(); 57 | } 58 | 59 | /* 60 | * ImageNet (1000 categories) with batch size = 64 61 | */ 62 | 63 | TEST(OUT_OF_PLACE, batch64) { 64 | SoftmaxTester() 65 | .batchSize(64) 66 | .channels(1000) 67 | .testOutput(); 68 | } 69 | 70 | TEST(IN_PLACE, batch64) { 71 | SoftmaxTester() 72 | .batchSize(64) 73 | .channels(1000) 74 | .testOutputInplace(); 75 | } 76 | 77 | /* 78 | * ImageNet (1000 categories) with batch size = 128 79 | */ 80 | 81 | TEST(OUT_OF_PLACE, batch128) { 82 | SoftmaxTester() 83 | .multithreading(true) 84 | .batchSize(128) 85 | .channels(1000) 86 | .testOutput(); 87 | } 88 | 89 | TEST(IN_PLACE, batch128) { 90 | SoftmaxTester() 91 | .multithreading(true) 92 | .batchSize(128) 93 | .channels(1000) 94 | .testOutputInplace(); 95 | } 96 | 97 | /* 98 | * ImageNet (1000 categories) with batch size = 256 99 | */ 100 | 101 | TEST(OUT_OF_PLACE, batch256) { 102 | SoftmaxTester() 103 | .multithreading(true) 104 | .batchSize(256) 105 | .channels(1000) 106 | .testOutput(); 107 | } 108 | 109 | TEST(IN_PLACE, batch256) { 110 | SoftmaxTester() 111 | .multithreading(true) 112 | .batchSize(256) 113 | .channels(1000) 114 | .testOutputInplace(); 115 | } 116 | 117 | int main(int argc, char* argv[]) { 118 | const enum nnp_status init_status = nnp_initialize(); 119 | assert(init_status == nnp_status_success); 120 | setenv("TERM", "xterm-256color", 0); 121 | ::testing::InitGoogleTest(&argc, argv); 122 | return RUN_ALL_TESTS(); 123 | } 124 | -------------------------------------------------------------------------------- /test/softmax-output/smoke.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | /* 8 | * Test that implementation works for a small number of channels 9 | */ 10 | 11 | TEST(OUT_OF_PLACE, few_channels) { 12 | auto tester = SoftmaxTester(); 13 | for (size_t channels = 1; channels <= 96; channels += 1) { 14 | tester.channels(1000) 15 | .testOutput(); 16 | } 17 | } 18 | 19 | TEST(IN_PLACE, few_channels) { 20 | auto tester = SoftmaxTester(); 21 | for (size_t channels = 1; channels <= 96; channels += 1) { 22 | tester.channels(1000) 23 | .testOutputInplace(); 24 | } 25 | } 26 | 27 | /* 28 | * Test that implementation works for a moderate number of channels with small batch 29 | */ 30 | 31 | TEST(OUT_OF_PLACE, small_batch) { 32 | auto tester = 
SoftmaxTester(); 33 | for (size_t channels = 100; channels <= 115; channels += 1) { 34 | for (size_t batch = 2; batch <= 5; batch += 1) { 35 | tester.channels(1000) 36 | .batchSize(batch) 37 | .testOutput(); 38 | } 39 | } 40 | } 41 | 42 | TEST(IN_PLACE, small_batch) { 43 | auto tester = SoftmaxTester(); 44 | for (size_t channels = 100; channels <= 115; channels += 1) { 45 | for (size_t batch = 2; batch <= 5; batch += 1) { 46 | tester.channels(1000) 47 | .batchSize(batch) 48 | .testOutputInplace(); 49 | } 50 | } 51 | } 52 | 53 | int main(int argc, char* argv[]) { 54 | const enum nnp_status init_status = nnp_initialize(); 55 | assert(init_status == nnp_status_success); 56 | setenv("TERM", "xterm-256color", 0); 57 | ::testing::InitGoogleTest(&argc, argv); 58 | return RUN_ALL_TESTS(); 59 | } 60 | -------------------------------------------------------------------------------- /test/sxgemm/neon.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 9 | TEST(FAST_S4GEMM_3x3, neon) { 10 | ASSERT_TRUE(cpuinfo_initialize()); 11 | if (cpuinfo_has_arm_neon_fma()) { 12 | GemmMicroKernelTester tester = GemmMicroKernelTester() 13 | .simdWidth(4) 14 | .mr(3) 15 | .nr(3) 16 | .errorLimit(1.0e-6f); 17 | 18 | for (uint32_t kc = 1; kc < 10; kc++) { 19 | tester 20 | .kc(kc) 21 | .accumulateC(true) 22 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__neon)); 23 | tester 24 | .accumulateC(false) 25 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__neon)); 26 | } 27 | } 28 | } 29 | 30 | TEST(FULL_S4GEMM_3x3, neon) { 31 | ASSERT_TRUE(cpuinfo_initialize()); 32 | if (cpuinfo_has_arm_neon_fma()) { 33 | GemmMicroKernelTester tester = GemmMicroKernelTester() 34 | .simdWidth(4) 35 | .mr(3) 36 | .nr(3) 37 | .errorLimit(1.0e-6f); 38 | 39 | for (uint32_t kc = 1; kc < 10; kc++) { 40 | tester 41 | .kc(kc) 42 | .accumulateC(true) 43 | .testSXGEMM(nnp_full_tuple_gemm_function(nnp_s4gemm_upto_3x3__neon)); 44 | tester 45 | .accumulateC(false) 46 | .testSXGEMM(nnp_full_tuple_gemm_function(nnp_s4gemm_upto_3x3__neon)); 47 | } 48 | } 49 | } 50 | #endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */ 51 | 52 | #if CPUINFO_ARCH_ARM 53 | TEST(FAST_S4GEMM_3x3, aarch32_neon) { 54 | ASSERT_TRUE(cpuinfo_initialize()); 55 | if (cpuinfo_has_arm_neon_fma()) { 56 | GemmMicroKernelTester tester = GemmMicroKernelTester() 57 | .simdWidth(4) 58 | .mr(3) 59 | .nr(3) 60 | .errorLimit(1.0e-6f); 61 | 62 | for (uint32_t kc = 1; kc < 10; kc++) { 63 | tester 64 | .kc(kc) 65 | .accumulateC(true) 66 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon)); 67 | tester 68 | .accumulateC(false) 69 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon)); 70 | } 71 | } 72 | } 73 | 74 | TEST(FAST_S4GEMM_3x3, aarch32_neon2) { 75 | ASSERT_TRUE(cpuinfo_initialize()); 76 | if (cpuinfo_has_arm_neon_fma()) { 77 | GemmMicroKernelTester tester = GemmMicroKernelTester() 78 | .simdWidth(4) 79 | .mr(3) 80 | .nr(3) 81 | .errorLimit(1.0e-6f); 82 | 83 | for (uint32_t kc = 1; kc < 10; kc++) { 84 | tester 85 | .kc(kc) 86 | .accumulateC(true) 87 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon2)); 88 | tester 89 | .accumulateC(false) 90 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon2)); 91 | } 92 | } 93 | } 94 | #endif /* CPUINFO_ARCH_ARM */ 95 | 
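Every ARM test above is gated on cpuinfo at runtime: the test hard-asserts that cpuinfo_initialize() succeeds, then exercises the kernel only when the required ISA extension is reported, so the same binary still passes (vacuously) on CPUs without it. A hedged sketch of that gating pattern, assuming linkage against gtest_main and with a placeholder comment where the tests above invoke the real nnp_* kernels:

// Sketch of the cpuinfo gating used by the hxgemm/sxgemm NEON tests.
// The real tests silently fall through on unsupported CPUs; GTEST_SKIP()
// (GoogleTest >= 1.10) is used here to make the skip visible instead.
#include <gtest/gtest.h>
#include <cpuinfo.h>

TEST(ISA_GATING, neon_fma_sketch) {
  ASSERT_TRUE(cpuinfo_initialize());
  if (!cpuinfo_has_arm_neon_fma()) {
    GTEST_SKIP() << "NEON FMA not available on this CPU";
  }
  // ... exercise the NEON kernel here, e.g. via GemmMicroKernelTester ...
}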
-------------------------------------------------------------------------------- /web/nnpack.nmf: --------------------------------------------------------------------------------
{
    "program": {
        "portable": {
            "pnacl-translate": {
                "url": "webnnpack.pexe"
            }
        }
    }
}
--------------------------------------------------------------------------------