├── .gitignore ├── .travis.yml ├── CMakeLists.txt ├── LICENSE ├── README.md ├── bench ├── conv1x1.cc ├── convolution-inference.cc ├── convolution.c ├── fully-connected.c ├── gemm.c ├── hxgemm.cc ├── median.c ├── memread.c ├── memread.py ├── perf_counter.c ├── perf_counter.h ├── pooling.c ├── relu.c ├── sgemm.cc ├── sxgemm.cc ├── transform.c ├── ugemm.c └── winograd.cc ├── benchmark.py ├── cmake ├── DownloadCpuinfo.cmake ├── DownloadEnum.cmake ├── DownloadFP16.cmake ├── DownloadFXdiv.cmake ├── DownloadGoogleTest.cmake ├── DownloadOpcodes.cmake ├── DownloadPSimd.cmake ├── DownloadPThreadPool.cmake ├── DownloadPeachPy.cmake └── DownloadSix.cmake ├── configure.py ├── confu.yaml ├── include ├── nnpack.h └── nnpack │ ├── AlignedAllocator.h │ ├── activations.h │ ├── arm_neon.h │ ├── assembly.h │ ├── blas.h │ ├── complex.h │ ├── fft-constants.h │ ├── fft.h │ ├── hwinfo.h │ ├── macros.h │ ├── pooling.h │ ├── reference.h │ ├── relu.h │ ├── softmax.h │ ├── system.h │ ├── transform.h │ ├── utils.h │ ├── validation.h │ └── winograd.h ├── logo └── NNPACK.png ├── src ├── convolution-inference.c ├── convolution-input-gradient.c ├── convolution-kernel-gradient.c ├── convolution-output.c ├── fully-connected-inference.c ├── fully-connected-output.c ├── init.c ├── neon │ ├── 2d-winograd-8x8-3x3-fp16.c │ ├── 2d-winograd-8x8-3x3.c │ ├── blas │ │ ├── c4gemm-conjb-transc.c │ │ ├── c4gemm-conjb.c │ │ ├── c4gemm.c │ │ ├── conv1x1.c │ │ ├── h4gemm-aarch32.S │ │ ├── h4gemm.c │ │ ├── s4c2gemm-conjb-transc.c │ │ ├── s4c2gemm-conjb.c │ │ ├── s4c2gemm.c │ │ ├── s4gemm-aarch32.S │ │ ├── s4gemm.c │ │ ├── sdotxf.c │ │ ├── sgemm-aarch32.S │ │ └── sgemm.c │ ├── relu.c │ ├── transpose.h │ ├── winograd-f6k3.c │ └── winograd │ │ └── f6x6k3x3.h ├── pooling-output.c ├── psimd │ ├── 2d-fourier-16x16.c │ ├── 2d-fourier-8x8.c │ ├── 2d-winograd-8x8-3x3.c │ ├── blas │ │ ├── c4gemm-conjb-transc.c │ │ ├── c4gemm-conjb.c │ │ ├── c4gemm.c │ │ ├── conv1x1.c │ │ ├── s4c2gemm-conjb-transc.c │ │ ├── s4c2gemm-conjb.c │ │ ├── s4c2gemm.c │ │ ├── s4gemm.c │ │ ├── sdotxf.c │ │ ├── sgemm.c │ │ └── shdotxf.c │ ├── butterfly.h │ ├── exp.c │ ├── exp.h │ ├── fft-aos.c │ ├── fft-dualreal.c │ ├── fft-real.c │ ├── fft-soa.c │ ├── fft │ │ ├── aos.h │ │ ├── dualreal.h │ │ ├── real.h │ │ └── soa.h │ ├── relu.c │ ├── softmax.c │ ├── transpose.h │ ├── winograd-f6k3.c │ └── winograd │ │ └── f6x6k3x3.h ├── ref │ ├── convolution-input-gradient.c │ ├── convolution-kernel.c │ ├── convolution-output.c │ ├── fft │ │ ├── aos.c │ │ ├── complex.h │ │ ├── forward-dualreal.c │ │ ├── forward-real.c │ │ ├── inverse-dualreal.c │ │ ├── inverse-real.c │ │ └── soa.c │ ├── fully-connected-output.c │ ├── max-pooling-output.c │ ├── relu-input-gradient.c │ ├── relu-output.c │ └── softmax-output.c ├── relu-input-gradient.c ├── relu-output.c ├── scalar │ ├── 2d-fourier-16x16.c │ ├── 2d-fourier-8x8.c │ ├── 2d-winograd-8x8-3x3.c │ ├── blas │ │ ├── cgemm-conjb-transc.c │ │ ├── cgemm-conjb.c │ │ ├── cgemm.c │ │ ├── conv1x1.c │ │ ├── s2gemm-transc.c │ │ ├── s2gemm.c │ │ ├── sdotxf.c │ │ ├── sgemm.c │ │ └── shdotxf.c │ ├── butterfly.h │ ├── fft-aos.c │ ├── fft-dualreal.c │ ├── fft-real.c │ ├── fft-soa.c │ ├── fft │ │ ├── aos.h │ │ ├── dualreal.h │ │ ├── real.h │ │ └── soa.h │ ├── relu.c │ ├── softmax.c │ ├── winograd-f6k3.c │ └── winograd │ │ └── f6x6k3x3.h ├── softmax-output.c └── x86_64-fma │ ├── 2d-fourier-16x16.py │ ├── 2d-fourier-8x8.py │ ├── 2d-winograd-8x8-3x3.py │ ├── __init__.py │ ├── blas │ ├── c8gemm.py │ ├── conv1x1.py │ ├── s4c6gemm.py │ ├── s8gemm.py │ ├── sdotxf.py │ ├── 
sgemm.py │ └── shdotxf.py │ ├── block8x8.py │ ├── common.py │ ├── exp.c │ ├── exp.py │ ├── fft-aos.py │ ├── fft-dualreal.py │ ├── fft-real.py │ ├── fft-soa.py │ ├── fft │ ├── __init__.py │ ├── complex_soa.py │ ├── complex_soa_perm_to_real.py │ ├── real_to_complex_soa_perm.py │ ├── two_complex_soa_perm_to_two_real_planar.py │ └── two_real_to_two_complex_soa_perm_planar.py │ ├── fft16x16.py │ ├── ifft-dualreal.py │ ├── ifft-real.py │ ├── max-pooling.py │ ├── relu.py │ ├── softmax.c │ ├── softmax.py │ ├── vecmath │ ├── __init__.py │ └── exp.py │ ├── winograd-f6k3.py │ └── winograd │ ├── __init__.py │ └── o6x6k3x3.py ├── test ├── convolution-inference │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── convolution-input-gradient │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── convolution-kernel-gradient │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── convolution-output │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── fft-samples.h ├── fourier │ ├── psimd.cc │ ├── reference.cc │ ├── scalar.cc │ └── x86_64-avx2.cc ├── fully-connected-inference │ ├── alexnet.cc │ ├── overfeat-fast.cc │ └── vgg-a.cc ├── fully-connected-output │ ├── alexnet.cc │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── hxgemm │ └── neon.cc ├── max-pooling-output │ ├── overfeat-fast.cc │ ├── smoke.cc │ └── vgg-a.cc ├── models │ ├── alexnet.h │ ├── overfeat-fast.h │ └── vgg-a.h ├── relu-input-gradient │ ├── alexnet.cc │ ├── overfeat-fast.cc │ └── vgg-a.cc ├── relu-output │ ├── alexnet.cc │ ├── overfeat-fast.cc │ └── vgg-a.cc ├── sgemm │ ├── neon.cc │ ├── psimd.cc │ ├── scalar.cc │ └── x86_64-fma3.cc ├── softmax-output │ ├── imagenet.cc │ └── smoke.cc ├── sxgemm │ └── neon.cc ├── testers │ ├── convolution.h │ ├── fourier.h │ ├── fully-connected.h │ ├── gemm-ukernel.h │ ├── padding.h │ ├── pooling.h │ ├── relu.h │ ├── softmax.h │ └── winograd.h └── winograd │ ├── neon.cc │ ├── psimd.cc │ ├── scalar.cc │ └── x86_64-fma3.cc └── web ├── nnpack.html └── nnpack.nmf /.gitignore: -------------------------------------------------------------------------------- 1 | # Ninja files 2 | build.ninja 3 | 4 | # Build objects and artifacts 5 | deps/ 6 | build/ 7 | build-*/ 8 | bin/ 9 | lib/ 10 | out/ 11 | obj/ 12 | libs/ 13 | *.pyc 14 | *.pyo 15 | 16 | # System files 17 | .DS_Store 18 | .DS_Store? 
19 | ._* 20 | .Spotlight-V100 21 | .Trashes 22 | ehthumbs.db 23 | Thumbs.db 24 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: c 2 | compiler: clang 3 | install: 4 | - git clone https://github.com/ninja-build/ninja.git /tmp/ninja 5 | - pushd /tmp/ninja 6 | - git checkout release 7 | - python configure.py --bootstrap 8 | - mkdir -p $HOME/.local/bin 9 | - install -m 755 /tmp/ninja/ninja $HOME/.local/bin/ninja 10 | - popd 11 | - export PATH=$HOME/.local/bin:$PATH 12 | - pip install --user git+https://github.com/Maratyszcza/PeachPy 13 | - pip install --user git+https://github.com/Maratyszcza/confu 14 | before_script: 15 | - confu setup 16 | - python ./configure.py --toolchain=clang --backend=$BACKEND 17 | - ninja 18 | script: 19 | - ninja smoketest 20 | addons: 21 | apt: 22 | packages: 23 | - python-pip 24 | env: 25 | - BACKEND=psimd 26 | - BACKEND=scalar 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2017 Facebook Inc. 2 | Copyright (c) 2015-2017, Georgia Institute of Technology 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright notice, this 9 | list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above copyright notice, 12 | this list of conditions and the following disclaimer in the documentation 13 | and/or other materials provided with the distribution. 14 | 15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 16 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 19 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 20 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 21 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 22 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 23 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 24 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
25 | -------------------------------------------------------------------------------- /bench/conv1x1.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | 17 | template 18 | class CONV1x1 : public benchmark::Fixture { 19 | public: 20 | inline CONV1x1() { 21 | cpuinfo_initialize(); 22 | const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size; 23 | const size_t l1d_reserve = 512; 24 | kc_ = ((l1d_size - l1d_reserve) / sizeof(float) - mr() * nr()) / (mr() + nr()); 25 | } 26 | 27 | virtual void SetUp(const benchmark::State&) override { 28 | const uint_fast32_t seed = std::chrono::system_clock::now().time_since_epoch().count(); 29 | auto rng = std::bind(std::uniform_real_distribution(), std::mt19937(seed)); 30 | 31 | i_.resize(mr() * kc()); 32 | std::generate(i_.begin(), i_.end(), std::ref(rng)); 33 | k_.resize(mr() * kc() + nr()); 34 | std::fill(k_.begin(), k_.end(), std::nanf("")); 35 | o_.resize(nr() * kc()); 36 | std::generate(o_.begin(), o_.end(), std::ref(rng)); 37 | } 38 | 39 | virtual void TearDown(benchmark::State& state) override { 40 | state.SetItemsProcessed(uint64_t(state.iterations()) * 2 * mr() * nr() * kc()); 41 | i_.clear(); 42 | k_.clear(); 43 | o_.clear(); 44 | } 45 | 46 | inline const float* i() const { 47 | return i_.data(); 48 | } 49 | 50 | inline const float* k() const { 51 | return k_.data(); 52 | } 53 | 54 | inline float* o() { 55 | return o_.data(); 56 | } 57 | 58 | inline uint32_t mr() const { 59 | return mr_; 60 | } 61 | 62 | inline uint32_t nr() const { 63 | return nr_; 64 | } 65 | 66 | inline uint32_t kc() const { 67 | return kc_; 68 | } 69 | 70 | private: 71 | std::vector i_; 72 | std::vector k_; 73 | std::vector o_; 74 | uint32_t kc_; 75 | }; 76 | 77 | #if NNP_BACKEND_X86_64 78 | BENCHMARK_TEMPLATE_F(CONV1x1, fast__neon, 2, 4)(benchmark::State& state) { 79 | for (auto _ : state) { 80 | nnp_conv1x1_only_2x4__fma3(mr(), kc(), i(), k(), o()); 81 | } 82 | } 83 | #endif 84 | 85 | #if NNP_BACKEND_ARM 86 | BENCHMARK_TEMPLATE_F(CONV1x1, fast__neon, 4, 4)(benchmark::State& state) { 87 | for (auto _ : state) { 88 | nnp_conv1x1_only_4x4__neon(mr(), kc(), i(), k(), o()); 89 | } 90 | } 91 | #endif 92 | 93 | #if NNP_BACKEND_PSIMD 94 | BENCHMARK_TEMPLATE_F(CONV1x1, psimd, 2, 8)(benchmark::State& state) { 95 | for (auto _ : state) { 96 | nnp_conv1x1_only_2x4__psimd(mr(), kc(), i(), k(), o()); 97 | } 98 | } 99 | #endif 100 | 101 | #if NNP_BACKEND_SCALAR 102 | BENCHMARK_TEMPLATE_F(CONV1x1, scalar, 2, 4)(benchmark::State& state) { 103 | for (auto _ : state) { 104 | nnp_conv1x1_only_2x4__scalar(mr(), kc(), i(), k(), o()); 105 | } 106 | } 107 | #endif 108 | 109 | BENCHMARK_MAIN(); 110 | -------------------------------------------------------------------------------- /bench/median.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | static int compare_ulonglong(const void *a_ptr, const void *b_ptr) { 6 | const unsigned long long a = *((unsigned long long*) a_ptr); 7 | const unsigned long long b = *((unsigned long long*) b_ptr); 8 | if (a < b) { 9 | return -1; 10 | } else if (a > b) { 11 | return 1; 12 | } else { 13 | return 0; 14 | } 15 | } 16 | 17 | static int compare_profile(const void *a_ptr, const void *b_ptr) { 18 | const double a_total = ((const struct nnp_profile*) a_ptr)->total; 19 | const 
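/*
 * (Editorial sketch, not NNPACK source.) compare_ulonglong above returns an
 * explicit -1/0/+1 instead of the tempting `return a - b;`. For unsigned
 * long long the difference can never be negative, and for wide signed types
 * it can overflow the int return type, so the subtraction idiom misorders
 * values. A minimal self-contained illustration (hypothetical helper name):
 */
#include <assert.h>
static void comparator_pitfall_sketch(void) {
	const unsigned long long a = 1ull;
	const unsigned long long b = ~0ull; /* ULLONG_MAX */
	const unsigned long long diff = a - b; /* wraps around to 2 */
	assert(diff == 2ull); /* "positive", wrongly implying a > b */
	assert(a < b);        /* the explicit three-way comparison gets it right */
}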
double b_total = ((const struct nnp_profile*) b_ptr)->total; 20 | if (a_total < b_total) { 21 | return -1; 22 | } else if (a_total > b_total) { 23 | return 1; 24 | } else { 25 | return 0; 26 | } 27 | } 28 | 29 | static inline unsigned long long average(unsigned long long a, unsigned long long b) { 30 | return (a / 2) + (b / 2) + (a & b & 1ull); 31 | } 32 | 33 | static inline struct nnp_profile average_profile(struct nnp_profile a, struct nnp_profile b) { 34 | return (struct nnp_profile) { 35 | .total = 0.5 * (a.total + b.total), 36 | .input_transform = 0.5 * (a.input_transform + b.input_transform), 37 | .kernel_transform = 0.5 * (a.kernel_transform + b.kernel_transform), 38 | .output_transform = 0.5 * (a.output_transform + b.output_transform), 39 | .block_multiplication = 0.5 * (a.block_multiplication + b.block_multiplication) 40 | }; 41 | } 42 | 43 | unsigned long long median(unsigned long long array[], size_t length) { 44 | qsort(array, length, sizeof(unsigned long long), &compare_ulonglong); 45 | if (length % 2 == 0) { 46 | const unsigned long long median_lo = array[length / 2 - 1]; 47 | const unsigned long long median_hi = array[length / 2]; 48 | return average(median_lo, median_hi); 49 | } else { 50 | return array[length / 2]; 51 | } 52 | } 53 | 54 | struct nnp_profile median_profile(struct nnp_profile array[], size_t length) { 55 | qsort(array, length, sizeof(struct nnp_profile), &compare_profile); 56 | if (length % 2 == 0) { 57 | return average_profile(array[length / 2 - 1], array[length / 2]); 58 | } else { 59 | return array[length / 2]; 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /bench/memread.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int read_memory(const void* pointer, size_t bytes) { 4 | int hash = 0; 5 | while (bytes >= 64) { 6 | hash ^= *((const int*) pointer); 7 | pointer += 64; 8 | bytes -= 64; 9 | } 10 | return hash; 11 | } 12 | -------------------------------------------------------------------------------- /bench/memread.py: -------------------------------------------------------------------------------- 1 | arg_mem = Argument(ptr(), "mem") 2 | arg_len = Argument(size_t, "n") 3 | with Function("read_memory", (arg_mem, arg_len)): 4 | reg_mem = GeneralPurposeRegister64() 5 | LOAD.ARGUMENT(reg_mem, arg_mem) 6 | 7 | reg_len = GeneralPurposeRegister64() 8 | LOAD.ARGUMENT(reg_len, arg_len) 9 | 10 | main_loop = Loop() 11 | SUB(reg_len, 64) 12 | JB(main_loop.end) 13 | with main_loop: 14 | MOVAPS(xmm0, [reg_mem]) 15 | ADD(reg_mem, 64) 16 | SUB(reg_len, 64) 17 | JAE(main_loop.begin) 18 | 19 | RETURN() 20 | -------------------------------------------------------------------------------- /bench/perf_counter.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #if defined(__linux__) 7 | #include 8 | #include 9 | #include 10 | #if !defined(__ANDROID__) 11 | #include 12 | #endif 13 | #elif defined(__native_client__) 14 | #include 15 | #elif defined(EMSCRIPTEN) 16 | #include 17 | #else 18 | #if defined(__MACH__) 19 | #include 20 | #include 21 | #endif 22 | #if defined(__x86_64__) 23 | #include 24 | #endif 25 | #endif 26 | 27 | struct performance_counter { 28 | const char* name; 29 | int file_descriptor; 30 | }; 31 | 32 | static inline bool enable_perf_counter(int file_descriptor) { 33 | #if defined(__linux__) && defined(__x86_64__) && !defined(__ANDROID__) 34 | return 
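/*
 * (Editorial sketch, not NNPACK source.) average() in bench/median.c earlier
 * computes the midpoint of two unsigned long long values without the overflow
 * that `(a + b) / 2` would risk near ULLONG_MAX: each operand is halved
 * first, and `a & b & 1` restores the unit lost when both operands are odd.
 * Worked example: average(7, 9) = 7/2 + 9/2 + (7 & 9 & 1) = 3 + 4 + 1 = 8.
 * A self-contained check (hypothetical helper names):
 */
#include <assert.h>
static unsigned long long average_sketch(unsigned long long a, unsigned long long b) {
	return (a / 2) + (b / 2) + (a & b & 1ull); /* same formula as bench/median.c */
}
static void average_sketch_test(void) {
	assert(average_sketch(7ull, 9ull) == 8ull);
	/* no overflow even at the top of the range: average(MAX, MAX-2) == MAX-1 */
	assert(average_sketch(~0ull, ~0ull - 2ull) == ~0ull - 1ull);
}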
ioctl(file_descriptor, PERF_EVENT_IOC_ENABLE, 0) == 0; 35 | #else 36 | return true; 37 | #endif 38 | } 39 | 40 | static inline bool disable_perf_counter(int file_descriptor) { 41 | #if defined(__linux__) && defined(__x86_64__) && !defined(__ANDROID__) 42 | return ioctl(file_descriptor, PERF_EVENT_IOC_DISABLE, 0) == 0; 43 | #else 44 | return true; 45 | #endif 46 | } 47 | 48 | static inline bool read_perf_counter(int file_descriptor, unsigned long long output[restrict static 1]) { 49 | #if defined(__linux__) && defined(__x86_64__) && !defined(__ANDROID__) 50 | return read(file_descriptor, output, sizeof(*output)) == sizeof(*output); 51 | #elif defined(EMSCRIPTEN) || (defined(__native_client__) && !defined(__x86_64__)) 52 | return false; 53 | #elif (defined(__native_client__) || defined(__ANDROID__)) && (defined(__x86_64__) || defined(__i386__)) 54 | unsigned int lo, hi; 55 | asm volatile( 56 | "XORL %%eax, %%eax;" 57 | "CPUID;" 58 | "RDTSC;" 59 | : "=a" (lo), "=d" (hi) 60 | : 61 | : "%rbx", "%rcx" 62 | ); 63 | *output = (((unsigned long long) hi) << 32) | ((unsigned long long) lo); 64 | return true; 65 | #elif defined(__x86_64__) 66 | unsigned int aux; 67 | *output = __rdtscp(&aux); 68 | return true; 69 | #else 70 | return false; 71 | #endif 72 | } 73 | 74 | static inline bool read_timer(unsigned long long output[restrict static 1]) { 75 | #if defined(__linux__) 76 | struct timespec ts; 77 | if (clock_gettime(CLOCK_MONOTONIC, &ts) != 0) { 78 | return false; 79 | } else { 80 | *output = ts.tv_sec * 1000000000ull + ts.tv_nsec; 81 | return true; 82 | } 83 | #elif defined(__MACH__) 84 | static mach_timebase_info_data_t timebase_info; 85 | if (timebase_info.denom == 0) { 86 | mach_timebase_info(&timebase_info); 87 | } 88 | 89 | *output = mach_absolute_time() * timebase_info.numer / timebase_info.denom; 90 | return true; 91 | #elif defined(__native_client__) 92 | struct timeval walltime; 93 | if (gettimeofday(&walltime, NULL) == 0) { 94 | *output = walltime.tv_sec * 1000000000ull + walltime.tv_usec * 1000ull; 95 | return true; 96 | } else { 97 | return false; 98 | } 99 | #elif defined(EMSCRIPTEN) 100 | *output = (unsigned long long) (emscripten_get_now() * 1.0e+6); 101 | return true; 102 | #else 103 | #error No implementation available 104 | #endif 105 | } 106 | 107 | #if defined(__linux__) && defined(__x86_64__) 108 | const struct performance_counter* init_performance_counters(size_t* count_ptr); 109 | #else 110 | static inline const struct performance_counter* init_performance_counters(size_t* count_ptr) { 111 | static const struct performance_counter performance_counter = { 112 | .name = "Cycles" 113 | }; 114 | *count_ptr = 1; 115 | return &performance_counter; 116 | } 117 | #endif 118 | -------------------------------------------------------------------------------- /bench/sgemm.cc: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | #include 15 | 16 | 17 | template 18 | class SGEMM : public benchmark::Fixture { 19 | public: 20 | inline SGEMM() { 21 | cpuinfo_initialize(); 22 | const size_t l1d_size = cpuinfo_get_l1d_cache(0)->size; 23 | const size_t l1d_reserve = 512; 24 | kc_ = ((l1d_size - l1d_reserve) / sizeof(float) - mr() * nr()) / (mr() + nr()); 25 | } 26 | 27 | virtual void SetUp(const benchmark::State&) override { 28 | const uint_fast32_t seed = 
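/*
 * (Editorial note on the benchmark fixtures.) The kc_ computation in the
 * CONV1x1 and SGEMM constructors above sizes the reduction dimension so that
 * one mr-by-kc panel, one nr-by-kc panel, and the mr-by-nr accumulator tile
 * together fit in the L1d cache, minus a 512-byte reserve: solving
 *   mr*kc + nr*kc + mr*nr <= (l1d_size - 512) / sizeof(float)
 * for kc gives kc = ((l1d_size - 512)/sizeof(float) - mr*nr) / (mr + nr).
 * Worked example for the 4x24 FMA3 kernel on a 32 KiB L1d:
 *   ((32768 - 512)/4 - 4*24) / (4 + 24) = (8064 - 96) / 28 = 284 (rounded down).
 * Each iteration then performs 2*mr*nr*kc FLOPs (one multiply and one add per
 * accumulator update), which is what TearDown reports via SetItemsProcessed.
 */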
std::chrono::system_clock::now().time_since_epoch().count(); 29 | auto rng = std::bind(std::uniform_real_distribution(), std::mt19937(seed)); 30 | 31 | a_.resize(mr() * kc()); 32 | std::generate(a_.begin(), a_.end(), std::ref(rng)); 33 | b_.resize(nr() * kc()); 34 | std::generate(b_.begin(), b_.end(), std::ref(rng)); 35 | c_.resize(mr() * nr()); 36 | std::fill(c_.begin(), c_.end(), std::nanf("")); 37 | } 38 | 39 | virtual void TearDown(benchmark::State& state) override { 40 | state.SetItemsProcessed(uint64_t(state.iterations()) * 2 * mr() * nr() * kc()); 41 | a_.clear(); 42 | b_.clear(); 43 | c_.clear(); 44 | } 45 | 46 | inline const float* a() const { 47 | return a_.data(); 48 | } 49 | 50 | inline const float* b() const { 51 | return b_.data(); 52 | } 53 | 54 | inline float* c() { 55 | return c_.data(); 56 | } 57 | 58 | inline uint32_t mr() const { 59 | return mr_; 60 | } 61 | 62 | inline uint32_t nr() const { 63 | return nr_; 64 | } 65 | 66 | inline uint32_t kc() const { 67 | return kc_; 68 | } 69 | 70 | private: 71 | std::vector> a_; 72 | std::vector> b_; 73 | std::vector c_; 74 | uint32_t kc_; 75 | }; 76 | 77 | #if NNP_BACKEND_X86_64 78 | BENCHMARK_TEMPLATE_F(SGEMM, fma3, 4, 24)(benchmark::State& state) { 79 | for (auto _ : state) { 80 | nnp_sgemm_only_4x24__fma3(kc(), 0, a(), b(), c(), nr()); 81 | } 82 | } 83 | #endif 84 | 85 | #if NNP_BACKEND_ARM && CPUINFO_ARCH_ARM 86 | BENCHMARK_TEMPLATE_F(SGEMM, aarch32_neon, 6, 8)(benchmark::State& state) { 87 | for (auto _ : state) { 88 | nnp_sgemm_only_6x8__aarch32_neon(kc(), 0, a(), b(), c(), nr()); 89 | } 90 | } 91 | #endif 92 | 93 | #if NNP_BACKEND_ARM 94 | BENCHMARK_TEMPLATE_F(SGEMM, neon, 6, 8)(benchmark::State& state) { 95 | for (auto _ : state) { 96 | nnp_sgemm_only_6x8__neon(kc(), 0, a(), b(), c(), nr()); 97 | } 98 | } 99 | #endif 100 | 101 | #if NNP_BACKEND_PSIMD 102 | BENCHMARK_TEMPLATE_F(SGEMM, psimd, 4, 8)(benchmark::State& state) { 103 | for (auto _ : state) { 104 | nnp_sgemm_only_4x8__psimd(kc(), 0, a(), b(), c(), nr()); 105 | } 106 | } 107 | #endif 108 | 109 | #if NNP_BACKEND_SCALAR 110 | BENCHMARK_TEMPLATE_F(SGEMM, scalar, 4, 3)(benchmark::State& state) { 111 | for (auto _ : state) { 112 | nnp_sgemm_only_4x3__scalar(kc(), 0, a(), b(), c(), nr()); 113 | } 114 | } 115 | #endif 116 | 117 | BENCHMARK_MAIN(); 118 | -------------------------------------------------------------------------------- /cmake/DownloadCpuinfo.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(cpuinfo-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(cpuinfo 7 | GIT_REPOSITORY https://github.com/Maratyszcza/cpuinfo.git 8 | GIT_TAG main 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/cpuinfo" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/cpuinfo" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadEnum.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(enum-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(enum 7 | URL https://pypi.python.org/packages/bf/3e/31d502c25302814a7c2f1d3959d2a3b3f78e509002ba91aea64993936876/enum34-1.1.6.tar.gz 8 | URL_HASH SHA256=8ad8c4783bf61ded74527bffb48ed9b54166685e4230386a9ed9b1279e2df5b1 9 | SOURCE_DIR 
"${CONFU_DEPENDENCIES_SOURCE_DIR}/enum" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/enum" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadFP16.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(fp16-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(fp16 7 | GIT_REPOSITORY https://github.com/Maratyszcza/FP16.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/fp16" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/fp16" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadFXdiv.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(fxdiv-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(fxdiv 7 | GIT_REPOSITORY https://github.com/Maratyszcza/FXdiv.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/fxdiv" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/fxdiv" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadGoogleTest.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(googletest-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(googletest 7 | URL https://github.com/google/googletest/archive/release-1.8.0.zip 8 | URL_HASH SHA256=f3ed3b58511efd272eb074a3a6d6fb79d7c2e6a0e374323d1e6bcbcc1ef141bf 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/googletest" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/googletest" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadOpcodes.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(opcodes-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(opcodes 7 | URL https://pypi.python.org/packages/e8/59/8c2e293c9c8d60f206fd5d0f6c8236a2e0a97832379ac319077441552c6a/opcodes-0.3.13.tar.gz 8 | URL_HASH SHA256=1859c23143fe20daa4110be87a947cbf3eefa048da71dde642290213f251590c 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/opcodes" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/opcodes" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadPSimd.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(psimd-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(psimd 7 | GIT_REPOSITORY https://github.com/Maratyszcza/psimd.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/psimd" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/psimd" 11 | 
CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadPThreadPool.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(pthreadpool-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(pthreadpool 7 | GIT_REPOSITORY https://github.com/Maratyszcza/pthreadpool.git 8 | GIT_TAG master 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/pthreadpool" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/pthreadpool" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /cmake/DownloadPeachPy.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(peachpy-download NONE) 4 | 5 | FIND_PACKAGE(PythonInterp REQUIRED) 6 | 7 | INCLUDE(ExternalProject) 8 | ExternalProject_Add(peachpy 9 | GIT_REPOSITORY https://github.com/Maratyszcza/PeachPy.git 10 | GIT_TAG master 11 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/peachpy" 12 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/peachpy" 13 | PATCH_COMMAND "PYTHONPATH=${PYTHON_SIX_SOURCE_DIR}:${PYTHON_ENUM_SOURCE_DIR}:${PYTHON_OPCODES_SOURCE_DIR}" ${PYTHON_EXECUTABLE} setup.py generate 14 | CONFIGURE_COMMAND "" 15 | BUILD_COMMAND "" 16 | INSTALL_COMMAND "" 17 | TEST_COMMAND "" 18 | ) 19 | -------------------------------------------------------------------------------- /cmake/DownloadSix.cmake: -------------------------------------------------------------------------------- 1 | CMAKE_MINIMUM_REQUIRED(VERSION 2.8.12 FATAL_ERROR) 2 | 3 | PROJECT(six-download NONE) 4 | 5 | INCLUDE(ExternalProject) 6 | ExternalProject_Add(six 7 | URL https://pypi.python.org/packages/16/d8/bc6316cf98419719bd59c91742194c111b6f2e85abac88e496adefaf7afe/six-1.11.0.tar.gz 8 | URL_HASH SHA256=70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9 9 | SOURCE_DIR "${CONFU_DEPENDENCIES_SOURCE_DIR}/six" 10 | BINARY_DIR "${CONFU_DEPENDENCIES_BINARY_DIR}/six" 11 | CONFIGURE_COMMAND "" 12 | BUILD_COMMAND "" 13 | INSTALL_COMMAND "" 14 | TEST_COMMAND "" 15 | ) 16 | -------------------------------------------------------------------------------- /confu.yaml: -------------------------------------------------------------------------------- 1 | name: nnpack 2 | title: Neural Networks acceleration PACKage 3 | license: Simplified BSD 4 | deps: 5 | - name: pthreadpool 6 | url: https://github.com/Maratyszcza/pthreadpool.git 7 | - name: cpuinfo 8 | url: https://github.com/pytorch/cpuinfo.git 9 | - name: fxdiv 10 | url: https://github.com/Maratyszcza/FXdiv.git 11 | - name: fp16 12 | url: https://github.com/Maratyszcza/FP16.git 13 | - name: psimd 14 | url: https://github.com/Maratyszcza/psimd.git 15 | - name: clog 16 | - name: googletest 17 | - name: googlebenchmark 18 | -------------------------------------------------------------------------------- /include/nnpack/AlignedAllocator.h: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | template 7 | class AlignedAllocator; 8 | 9 | template 10 | class AlignedAllocator 11 | { 12 | public: 13 | typedef void* pointer; 14 | typedef const void* const_pointer; 15 | typedef void value_type; 16 
| 17 | template 18 | struct rebind { 19 | typedef AlignedAllocator other; 20 | }; 21 | }; 22 | 23 | template 24 | class AlignedAllocator 25 | { 26 | public: 27 | typedef T value_type; 28 | typedef T* pointer; 29 | typedef const T* const_pointer; 30 | typedef T& reference; 31 | typedef const T& const_reference; 32 | typedef size_t size_type; 33 | typedef ptrdiff_t difference_type; 34 | 35 | #if __cplusplus >= 201402L 36 | typedef std::true_type propagate_on_container_move_assignment; 37 | #endif 38 | 39 | template 40 | struct rebind { 41 | typedef AlignedAllocator other; 42 | }; 43 | 44 | public: 45 | inline AlignedAllocator() noexcept { 46 | } 47 | 48 | template 49 | inline AlignedAllocator(const AlignedAllocator& other) noexcept { 50 | } 51 | 52 | inline size_type max_size() const noexcept { 53 | return (std::numeric_limits::max() - size_type(Alignment)) / sizeof(T); 54 | } 55 | 56 | inline pointer address(reference x) const noexcept { 57 | return std::addressof(x); 58 | } 59 | 60 | inline const_pointer address(const_reference x) const noexcept { 61 | return std::addressof(x); 62 | } 63 | 64 | inline pointer allocate(size_type n, typename AlignedAllocator::const_pointer hint = 0) { 65 | #if defined(__ANDROID__) 66 | void* memory = memalign(Alignment, n * sizeof(T)); 67 | if (memory == 0) { 68 | #if !defined(__GNUC__) || defined(__EXCEPTIONS) 69 | throw std::bad_alloc(); 70 | #endif 71 | } 72 | #else 73 | void* memory = nullptr; 74 | if (posix_memalign(&memory, Alignment, n * sizeof(T)) != 0) { 75 | #if !defined(__GNUC__) || defined(__EXCEPTIONS) 76 | throw std::bad_alloc(); 77 | #endif 78 | } 79 | #endif 80 | return static_cast(memory); 81 | } 82 | 83 | inline void deallocate(pointer p, size_type n) noexcept { 84 | free(static_cast(p)); 85 | } 86 | 87 | template 88 | inline void construct(U* p, Args&&... args) { 89 | ::new(static_cast(p)) U(std::forward(args)...); 90 | } 91 | 92 | template 93 | inline void destroy(U* p) { 94 | p->~U(); 95 | } 96 | }; 97 | -------------------------------------------------------------------------------- /include/nnpack/activations.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline float relu(float data, float negative_slope) { 7 | return signbit(data) ? data * negative_slope : data; 8 | } 9 | 10 | static inline float grad_relu(float grad_output_data, float input_data, float negative_slope) { 11 | return signbit(input_data) ? 
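/*
 * (Editorial sketch, not NNPACK source.) AlignedAllocator::allocate in
 * nnpack/AlignedAllocator.h above uses memalign() on Android and
 * posix_memalign() elsewhere, so the SIMD kernels can assume buffers start on
 * the requested boundary. The same pattern in plain C, with hypothetical
 * names:
 */
#include <stdlib.h>
static float* allocate_aligned_floats_sketch(size_t count, size_t alignment) {
	void* memory = NULL;
	/* alignment must be a power of two and a multiple of sizeof(void*) */
	if (posix_memalign(&memory, alignment, count * sizeof(float)) != 0) {
		return NULL;
	}
	return (float*) memory; /* release with free() */
}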
grad_output_data * negative_slope : grad_output_data; 12 | } 13 | 14 | #ifdef PSIMD_H 15 | static inline psimd_f32 psimd_relu_f32(psimd_f32 data, psimd_f32 negative_slope) { 16 | return psimd_signblend_f32(data, data * negative_slope, data); 17 | } 18 | 19 | static inline psimd_f32 psimd_grad_relu_f32(psimd_f32 grad_output_data, psimd_f32 input_data, psimd_f32 negative_slope) { 20 | return psimd_signblend_f32(input_data, grad_output_data * negative_slope, grad_output_data); 21 | } 22 | #endif 23 | 24 | #if defined(__ARM_NEON) || defined(__ARM_NEON__) 25 | #include 26 | 27 | static inline float32x4_t neon_reluq_f32(float32x4_t data, float32x4_t negative_slope) { 28 | const uint32x4_t negative_mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(data), 31)); 29 | return vbslq_f32(negative_mask, vmulq_f32(data, negative_slope), data); 30 | } 31 | 32 | static inline float32x4_t neon_grad_reluq_f32(float32x4_t grad_output_data, float32x4_t input_data, float32x4_t negative_slope) { 33 | const uint32x4_t negative_mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_f32(input_data), 31)); 34 | return vbslq_f32(negative_mask, vmulq_f32(grad_output_data, negative_slope), grad_output_data); 35 | } 36 | 37 | static inline float32x2_t neon_relu_f32(float32x2_t data, float32x2_t negative_slope) { 38 | const uint32x2_t negative_mask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(data), 31)); 39 | return vbsl_f32(negative_mask, vmul_f32(data, negative_slope), data); 40 | } 41 | 42 | static inline float32x2_t neon_grad_relu_f32(float32x2_t grad_output_data, float32x2_t input_data, float32x2_t negative_slope) { 43 | const uint32x2_t negative_mask = vreinterpret_u32_s32(vshr_n_s32(vreinterpret_s32_f32(input_data), 31)); 44 | return vbsl_f32(negative_mask, vmul_f32(grad_output_data, negative_slope), grad_output_data); 45 | } 46 | #endif 47 | -------------------------------------------------------------------------------- /include/nnpack/assembly.h: -------------------------------------------------------------------------------- 1 | #ifdef __ELF__ 2 | .macro BEGIN_FUNCTION name 3 | .text 4 | .align 2 5 | .global \name 6 | .type \name, %function 7 | \name: 8 | .endm 9 | 10 | .macro END_FUNCTION name 11 | .size \name, .-\name 12 | .endm 13 | #elif defined(__MACH__) 14 | .macro BEGIN_FUNCTION name 15 | .text 16 | .align 2 17 | .global _\name 18 | _\name: 19 | .endm 20 | 21 | .macro END_FUNCTION name 22 | .endm 23 | #endif 24 | -------------------------------------------------------------------------------- /include/nnpack/complex.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifndef CMPLXF 6 | #define CMPLXF(real, imag) ((real) + _Complex_I * (imag)) 7 | #endif 8 | 9 | #ifdef __ANDROID__ 10 | /* Work-around for pre-API 23 Android, where libc does not provide crealf */ 11 | #if __ANDROID_API__ < 23 12 | static inline float crealf(_Complex float c) { 13 | return __real__ c; 14 | } 15 | 16 | static inline float cimagf(_Complex float c) { 17 | return __imag__ c; 18 | } 19 | #endif 20 | #endif 21 | -------------------------------------------------------------------------------- /include/nnpack/macros.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | #if defined(__GNUC__) 5 | #if defined(__clang__) || ((__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 5))) 6 | #define NNP_UNREACHABLE do { __builtin_unreachable(); } while (0) 7 | #else 8 | #define 
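/*
 * (Editorial sketch, not NNPACK source.) The NEON ReLU helpers above build a
 * lane mask by arithmetically shifting the IEEE sign bit across all 32 bits
 * (vshrq_n_s32(..., 31)) and then bit-select between x and x*negative_slope
 * with vbslq_f32; the scalar relu() gets the same effect from signbit().
 * Unlike an `x < 0.0f` comparison, this also routes -0.0f through the
 * negative-slope branch. A scalar rendition of the bit trick (the arithmetic
 * right shift of a negative value is implementation-defined in ISO C but
 * arithmetic on GCC/Clang, which this assumes):
 */
#include <stdint.h>
#include <string.h>
static float leaky_relu_signbit_sketch(float x, float negative_slope) {
	const float scaled = x * negative_slope;
	uint32_t x_bits, scaled_bits;
	memcpy(&x_bits, &x, sizeof(x_bits));
	memcpy(&scaled_bits, &scaled, sizeof(scaled_bits));
	/* smear the sign bit into an all-ones/all-zeros mask */
	const uint32_t mask = (uint32_t) ((int32_t) x_bits >> 31);
	const uint32_t result_bits = (mask & scaled_bits) | (~mask & x_bits); /* like vbslq_f32 */
	float result;
	memcpy(&result, &result_bits, sizeof(result));
	return result;
}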
NNP_UNREACHABLE do { __builtin_trap(); } while (0) 9 | #endif 10 | #else 11 | #define NNP_UNREACHABLE do { } while (0) 12 | #endif 13 | 14 | 15 | #if defined(NNP_BACKEND_PSIMD) 16 | #if !(NNP_BACKEND_PSIMD) 17 | #error NNP_BACKEND_PSIMD predefined as 0 18 | #endif 19 | #elif defined(NNP_BACKEND_SCALAR) 20 | #if !(NNP_BACKEND_SCALAR) 21 | #error NNP_BACKEND_SCALAR predefined as 0 22 | #endif 23 | #elif defined(__arm__) || defined(__aarch64__) 24 | #define NNP_BACKEND_ARM 1 25 | #elif defined(__ANDROID__) && (defined(__i686__) || defined(__x86_64__)) 26 | #define NNP_BACKEND_PSIMD 1 27 | #elif defined(__x86_64__) 28 | #define NNP_BACKEND_X86_64 1 29 | #elif defined(__ANDROID__) && defined(__mips__) 30 | #define NNP_BACKEND_SCALAR 1 31 | #else 32 | #define NNP_BACKEND_PSIMD 1 33 | #endif 34 | 35 | #ifndef NNP_BACKEND_PSIMD 36 | #define NNP_BACKEND_PSIMD 0 37 | #endif 38 | #ifndef NNP_BACKEND_SCALAR 39 | #define NNP_BACKEND_SCALAR 0 40 | #endif 41 | #ifndef NNP_BACKEND_ARM 42 | #define NNP_BACKEND_ARM 0 43 | #endif 44 | #ifndef NNP_BACKEND_X86_64 45 | #define NNP_BACKEND_X86_64 0 46 | #endif 47 | 48 | #define NNP_ALIGN(alignment) __attribute__((__aligned__(alignment))) 49 | #define NNP_SIMD_ALIGN NNP_ALIGN(64) 50 | #define NNP_CACHE_ALIGN NNP_ALIGN(64) 51 | 52 | #define NNP_COUNT_OF(array) (sizeof(array) / sizeof(0[array])) 53 | 54 | #if defined(__GNUC__) 55 | #define NNP_LIKELY(condition) (__builtin_expect(!!(condition), 1)) 56 | #define NNP_UNLIKELY(condition) (__builtin_expect(!!(condition), 0)) 57 | #else 58 | #define NNP_LIKELY(condition) (!!(condition)) 59 | #define NNP_UNLIKELY(condition) (!!(condition)) 60 | #endif 61 | 62 | #if defined(__GNUC__) 63 | #define NNP_INLINE inline __attribute__((__always_inline__)) 64 | #else 65 | #define NNP_INLINE inline 66 | #endif 67 | -------------------------------------------------------------------------------- /include/nnpack/pooling.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | typedef void (*nnp_pooling_function)(const float*, float*, size_t, size_t, size_t, size_t, size_t, size_t, uint32_t, uint32_t, uint32_t, uint32_t); 12 | 13 | void nnp_maxpool_2x2_2x2__avx2(const float* src_pointer, float* dst_pointer, size_t src_stride, 14 | uint32_t src_row_offset, uint32_t src_row_count, uint32_t src_column_offset, uint32_t src_column_count, uint32_t dst_column_count); 15 | 16 | #ifdef __cplusplus 17 | } /* extern "C" */ 18 | #endif 19 | -------------------------------------------------------------------------------- /include/nnpack/reference.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #include 6 | 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | 11 | void nnp_convolution_output__reference( 12 | size_t batch_size, 13 | size_t input_channels, 14 | size_t output_channels, 15 | struct nnp_size input_size, 16 | struct nnp_padding input_padding, 17 | struct nnp_size kernel_size, 18 | struct nnp_size output_subsampling, 19 | const float input_pointer[], 20 | const float kernel_pointer[], 21 | const float bias[], 22 | float output_pointer[], 23 | pthreadpool_t threadpool); 24 | 25 | void nnp_convolution_input_gradient__reference( 26 | size_t batch_size, 27 | size_t input_channels, 28 | size_t output_channels, 29 | struct nnp_size input_size, 30 | struct nnp_padding input_padding, 31 | struct nnp_size kernel_size, 32 | 
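/*
 * (Editorial sketch, not NNPACK source.) For the reference convolutions
 * declared above, the output spatial size follows the usual convolution
 * arithmetic: with input extent i, total padding p along that axis, kernel
 * extent k, and subsampling (stride) s, each output extent is
 * (i + p - k) / s + 1. Worked example: a 224x224 input with 2+2 padding, a
 * 5x5 kernel, and stride 1 gives (224 + 4 - 5) / 1 + 1 = 224, i.e. "same"
 * size. The padding convention is assumed here, not taken from nnpack.h:
 */
#include <stddef.h>
static size_t conv_output_dim_sketch(size_t input, size_t total_padding, size_t kernel, size_t stride) {
	return (input + total_padding - kernel) / stride + 1;
}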
const float grad_output[], 33 | const float kernel[], 34 | float grad_input[], 35 | pthreadpool_t threadpool); 36 | 37 | void nnp_convolution_kernel_gradient__reference( 38 | size_t batch_size, 39 | size_t input_channels, 40 | size_t output_channels, 41 | struct nnp_size input_size, 42 | struct nnp_padding input_padding, 43 | struct nnp_size kernel_size, 44 | const float input[], 45 | const float grad_output[], 46 | float grad_kernel[], 47 | pthreadpool_t threadpool); 48 | 49 | void nnp_fully_connected_output_f32__reference( 50 | size_t batch_size, 51 | size_t input_channels, 52 | size_t output_channels, 53 | const float* input, 54 | const float* kernel, 55 | float* output, 56 | pthreadpool_t threadpool); 57 | 58 | void nnp_fully_connected_output_f16f32__reference( 59 | size_t batch_size, 60 | size_t input_channels, 61 | size_t output_channels, 62 | const float* input, 63 | const void* kernel, 64 | float* output, 65 | pthreadpool_t threadpool); 66 | 67 | void nnp_max_pooling_output__reference( 68 | size_t batch_size, 69 | size_t channels, 70 | struct nnp_size input_size, 71 | struct nnp_padding input_padding, 72 | struct nnp_size pooling_size, 73 | struct nnp_size pooling_stride, 74 | const float input[], 75 | float output[], 76 | pthreadpool_t threadpool); 77 | 78 | void nnp_relu_output__reference( 79 | size_t batch_size, 80 | size_t channels, 81 | const float input[], 82 | float output[], 83 | float negative_slope, 84 | pthreadpool_t threadpool); 85 | 86 | void nnp_relu_input_gradient__reference( 87 | size_t batch_size, 88 | size_t channels, 89 | const float grad_output[], 90 | const float input[], 91 | float grad_input[], 92 | float negative_slope, 93 | pthreadpool_t threadpool); 94 | 95 | void nnp_softmax_output__reference( 96 | size_t batch_size, 97 | size_t channels, 98 | const float input[], 99 | float output[], 100 | pthreadpool_t threadpool); 101 | 102 | #ifdef __cplusplus 103 | } /* extern "C" */ 104 | #endif 105 | -------------------------------------------------------------------------------- /include/nnpack/relu.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | void nnp_relu__avx2(const float* input, float* output, size_t length, float negative_slope); 11 | void nnp_inplace_relu__avx2(float* data, size_t length, float negative_slope); 12 | void nnp_grad_relu__avx2(const float* output_gradient, const float* input, float* input_gradient, size_t length, float negative_slope); 13 | 14 | void nnp_relu__neon(const float* input, float* output, size_t length, float negative_slope); 15 | void nnp_inplace_relu__neon(float* data, size_t length, float negative_slope); 16 | void nnp_grad_relu__neon(const float* output_gradient, const float* input, float* input_gradient, size_t length, float negative_slope); 17 | 18 | void nnp_relu__psimd(const float* input, float* output, size_t length, float negative_slope); 19 | void nnp_inplace_relu__psimd(float* data, size_t length, float negative_slope); 20 | void nnp_grad_relu__psimd(const float* output_gradient, const float* input, float* input_gradient, size_t length, float negative_slope); 21 | 22 | void nnp_relu__scalar(const float* input, float* output, size_t length, float negative_slope); 23 | void nnp_inplace_relu__scalar(float* data, size_t length, float negative_slope); 24 | void nnp_grad_relu__scalar(const float* output_gradient, const float* input, float* input_gradient, size_t length, float 
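/*
 * (Editorial sketch, not the actual nnp_relu__scalar implementation.) The
 * per-backend ReLU entry points declared in nnpack/relu.h below all share one
 * contract: negative_slope parameterizes leaky ReLU (slope 0 gives plain
 * ReLU), and the SIMD variants additionally require length to be a multiple
 * of the vector width. A reference-style scalar rendition of the forward op:
 */
#include <math.h>
#include <stddef.h>
static void relu_forward_sketch(const float* input, float* output, size_t length, float negative_slope) {
	for (size_t i = 0; i < length; i++) {
		const float x = input[i];
		/* same predicate as relu() in nnpack/activations.h */
		output[i] = signbit(x) ? x * negative_slope : x;
	}
}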
negative_slope); 25 | 26 | #ifdef __cplusplus 27 | } /* extern "C" */ 28 | #endif 29 | -------------------------------------------------------------------------------- /include/nnpack/softmax.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | #ifdef __cplusplus 6 | extern "C" { 7 | #endif 8 | 9 | typedef void (*nnp_exp_function)(size_t, const float*, float*); 10 | 11 | void nnp_vector_exp__psimd(size_t n, const float* x, float* y); 12 | 13 | void nnp_softmax__avx2(size_t n, const float* x, float* y); 14 | void nnp_inplace_softmax__avx2(size_t n, float* v); 15 | 16 | void nnp_softmax__psimd(size_t n, const float* x, float* y); 17 | void nnp_inplace_softmax__psimd(size_t n, float* v); 18 | 19 | void nnp_softmax__scalar(size_t n, const float* x, float* y); 20 | void nnp_inplace_softmax__scalar(size_t n, float* v); 21 | 22 | #ifdef __cplusplus 23 | } /* extern "C" */ 24 | #endif 25 | -------------------------------------------------------------------------------- /include/nnpack/system.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #if defined(__linux__) || defined(__native_client__) 9 | #include 10 | #include 11 | #include 12 | #elif defined(__MACH__) 13 | #include 14 | #include 15 | #elif defined(EMSCRIPTEN) 16 | #include 17 | #endif 18 | 19 | inline static double read_timer() { 20 | #if defined(__linux__) || defined(__native_client__) 21 | struct timespec ts; 22 | int result = clock_gettime(CLOCK_MONOTONIC, &ts); 23 | assert(result == 0); 24 | return ((double) ts.tv_sec) + ((double) ts.tv_nsec) * 1.0e-9; 25 | #elif defined(__MACH__) 26 | static mach_timebase_info_data_t timebase_info; 27 | if (timebase_info.denom == 0) { 28 | mach_timebase_info(&timebase_info); 29 | } 30 | 31 | return ((double) (mach_absolute_time() * timebase_info.numer / timebase_info.denom)) * 1.0e-9; 32 | #elif defined(EMSCRIPTEN) 33 | return emscripten_get_now() * 1.0e-3; 34 | #else 35 | #error No implementation available 36 | #endif 37 | } 38 | 39 | #define NNP_TOTAL_START(profile_ptr) \ 40 | double total_start; \ 41 | if (profile_ptr != NULL) { \ 42 | *profile_ptr = (struct nnp_profile) { 0 }; \ 43 | total_start = read_timer(); \ 44 | } 45 | 46 | #define NNP_KERNEL_TRANSFORM_START(profile_ptr) \ 47 | double kernel_transform_start; \ 48 | if (profile_ptr != NULL) { \ 49 | kernel_transform_start = read_timer(); \ 50 | } 51 | 52 | #define NNP_INPUT_TRANSFORM_START(profile_ptr) \ 53 | double input_transform_start; \ 54 | if (profile_ptr != NULL) { \ 55 | input_transform_start = read_timer(); \ 56 | } 57 | 58 | #define NNP_OUTPUT_TRANSFORM_START(profile_ptr) \ 59 | double output_transform_start; \ 60 | if (profile_ptr != NULL) { \ 61 | output_transform_start = read_timer(); \ 62 | } 63 | 64 | #define NNP_BLOCK_MULTIPLICATION_START(profile_ptr) \ 65 | double block_multiplication_start; \ 66 | if (profile_ptr != NULL) { \ 67 | block_multiplication_start = read_timer(); \ 68 | } 69 | 70 | #define NNP_TOTAL_END(profile_ptr) \ 71 | if (profile_ptr != NULL) { \ 72 | profile_ptr->total = read_timer() - total_start; \ 73 | } 74 | 75 | #define NNP_KERNEL_TRANSFORM_END(profile_ptr) \ 76 | if (profile_ptr != NULL) { \ 77 | profile_ptr->kernel_transform += read_timer() - kernel_transform_start; \ 78 | } 79 | 80 | #define NNP_INPUT_TRANSFORM_END(profile_ptr) \ 81 | if (profile_ptr != NULL) { \ 82 | profile_ptr->input_transform += read_timer() - 
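/*
 * (Editorial sketch, not the actual nnp_softmax__scalar implementation.) The
 * softmax entry points declared in nnpack/softmax.h above compute
 * y[i] = exp(x[i]) / sum_j exp(x[j]). The numerically robust formulation
 * subtracts the maximum first, so every exponent argument is <= 0 and expf
 * cannot overflow. A reference-style scalar rendition (assumes n >= 1):
 */
#include <math.h>
#include <stddef.h>
static void softmax_sketch(size_t n, const float* x, float* y) {
	float max_x = x[0];
	for (size_t i = 1; i < n; i++) {
		if (x[i] > max_x) max_x = x[i];
	}
	float sum = 0.0f;
	for (size_t i = 0; i < n; i++) {
		y[i] = expf(x[i] - max_x); /* arguments are <= 0: no overflow */
		sum += y[i];
	}
	const float scale = 1.0f / sum;
	for (size_t i = 0; i < n; i++) {
		y[i] *= scale;
	}
}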
input_transform_start; \ 83 | } 84 | 85 | #define NNP_OUTPUT_TRANSFORM_END(profile_ptr) \ 86 | if (profile_ptr != NULL) { \ 87 | profile_ptr->output_transform += read_timer() - output_transform_start; \ 88 | } 89 | 90 | #define NNP_BLOCK_MULTIPLICATION_END(profile_ptr) \ 91 | if (profile_ptr != NULL) { \ 92 | profile_ptr->block_multiplication += read_timer() - block_multiplication_start; \ 93 | } 94 | 95 | inline static void* allocate_memory(size_t memory_size) { 96 | #if defined(__linux__) 97 | #if !defined(__ANDROID__) 98 | /* Try to use large page TLB */ 99 | void* memory_block = mmap(NULL, memory_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE | MAP_HUGETLB, -1, 0); 100 | #else 101 | void* memory_block = MAP_FAILED; 102 | #endif 103 | if (memory_block == MAP_FAILED) { 104 | /* Fallback to standard pages */ 105 | memory_block = mmap(NULL, memory_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_POPULATE, -1, 0); 106 | if (memory_block == MAP_FAILED) { 107 | return NULL; 108 | } 109 | } 110 | return memory_block; 111 | #else 112 | void* memory_block = NULL; 113 | int allocation_result = posix_memalign(&memory_block, 64, memory_size); 114 | return (allocation_result == 0) ? memory_block : NULL; 115 | #endif 116 | } 117 | 118 | inline static void release_memory(void* memory_block, size_t memory_size) { 119 | #if defined(__linux__) 120 | if (memory_block != NULL) { 121 | munmap(memory_block, memory_size); 122 | } 123 | #else 124 | free(memory_block); 125 | #endif 126 | } 127 | -------------------------------------------------------------------------------- /include/nnpack/utils.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #ifdef __cplusplus 4 | #include 5 | #else 6 | #include 7 | #endif 8 | 9 | static inline float maxf(float a, float b) { 10 | return a > b ? a : b; 11 | } 12 | 13 | static inline size_t doz(size_t a, size_t b) { 14 | return a > b ? a - b : 0; 15 | } 16 | 17 | static inline size_t max(size_t a, size_t b) { 18 | return a > b ? a : b; 19 | } 20 | 21 | static inline size_t min(size_t a, size_t b) { 22 | return a > b ? 
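/*
 * (Editorial sketch.) allocate_memory() in nnpack/system.h above first tries
 * mmap with MAP_HUGETLB on non-Android Linux, so large scratch buffers can
 * land on huge pages and consume fewer TLB entries, then silently falls back
 * to regular anonymous pages, and to posix_memalign on other systems. Because
 * the Linux path is mmap-based, release_memory() needs the original size for
 * munmap. A usage sketch (assumes nnpack/system.h is included):
 */
#include <stddef.h>
static void scratch_buffer_usage_sketch(void) {
	const size_t scratch_size = 16 * 1024 * 1024; /* e.g. 16 MiB of transform scratch */
	void* scratch = allocate_memory(scratch_size);
	if (scratch != NULL) {
		/* ... use the buffer ... */
		release_memory(scratch, scratch_size); /* size is required by the munmap path */
	}
}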
b : a; 23 | } 24 | 25 | static inline size_t round_up(size_t number, size_t factor) { 26 | return (number + factor - 1) / factor * factor; 27 | } 28 | 29 | static inline size_t round_up_by_power_of_2(size_t number, size_t power_of_2_factor) { 30 | return (number + power_of_2_factor - 1) & ~(power_of_2_factor - 1); 31 | } 32 | 33 | static inline size_t round_down(size_t number, size_t factor) { 34 | return number / factor * factor; 35 | } 36 | 37 | static inline size_t divide_round_up(size_t dividend, size_t divisor) { 38 | if (dividend % divisor == 0) { 39 | return dividend / divisor; 40 | } else { 41 | return dividend / divisor + 1; 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /include/nnpack/winograd.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include 5 | 6 | #ifdef __cplusplus 7 | extern "C" { 8 | #endif 9 | 10 | typedef void (*nnp_wt_function)(const float*, float*); 11 | 12 | void nnp_iwt_f6k3__fma3(const float d[], float w[]); 13 | void nnp_kwt_f6k3__fma3(const float g[], float w[]); 14 | void nnp_owt_f6k3__fma3(const float m[], float s[]); 15 | 16 | void nnp_iwt_f6k3__psimd(const float d[], float w[]); 17 | void nnp_kwt_f6k3__psimd(const float g[], float w[]); 18 | void nnp_owt_f6k3__psimd(const float m[], float s[]); 19 | 20 | void nnp_iwt_f6k3__neon(const float d[], float w[]); 21 | void nnp_kwt_f6k3__neon(const float g[], float w[]); 22 | void nnp_owt_f6k3__neon(const float m[], float s[]); 23 | 24 | void nnp_iwt_f6k3__scalar(const float d[], float w[]); 25 | void nnp_kwt_f6k3__scalar(const float g[], float w[]); 26 | void nnp_owt_f6k3__scalar(const float m[], float s[]); 27 | 28 | #ifdef __cplusplus 29 | } /* extern "C" */ 30 | #endif 31 | -------------------------------------------------------------------------------- /logo/NNPACK.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/logo/NNPACK.png -------------------------------------------------------------------------------- /src/neon/relu.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | 7 | void nnp_relu__neon( 8 | const float input[restrict static 4], 9 | float output[restrict static 4], 10 | size_t length, 11 | float negative_slope) 12 | { 13 | const float32x4_t vec_negative_slope = vdupq_n_f32(negative_slope); 14 | 15 | /* Length is always non-zero and proportional to SIMD width */ 16 | do { 17 | vst1q_f32(output, 18 | neon_reluq_f32(vld1q_f32(input), vec_negative_slope)); 19 | 20 | input += 4; 21 | output += 4; 22 | length -= 4; 23 | } while (length != 0); 24 | } 25 | 26 | void nnp_inplace_relu__neon( 27 | float data[restrict static 4], 28 | size_t length, 29 | float negative_slope) 30 | { 31 | const float32x4_t vec_negative_slope = vdupq_n_f32(negative_slope); 32 | 33 | /* Length is always non-zero and proportional to SIMD width */ 34 | do { 35 | vst1q_f32(data, 36 | neon_reluq_f32(vld1q_f32(data), vec_negative_slope)); 37 | 38 | data += 4; 39 | length -= 4; 40 | } while (length != 0); 41 | } 42 | 43 | void nnp_grad_relu__neon( 44 | const float output_gradient[restrict static 4], 45 | const float input[restrict static 4], 46 | float input_gradient[restrict static 4], 47 | size_t length, 48 | float negative_slope) 49 | { 50 | const float32x4_t vec_negative_slope = 
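/*
 * (Editorial note.) The rounding helpers in nnpack/utils.h above satisfy, for
 * example, round_up(13, 6) = 18, round_up_by_power_of_2(13, 8) = (13+7) & ~7
 * = 16 (the mask form is only valid when the factor is a power of two), and
 * divide_round_up(13, 4) = 4. In the Winograd transforms declared in
 * nnpack/winograd.h, F(6,3) means each 1-D transform produces 6 outputs from
 * a 3-tap kernel, so the input tile is 6 + 3 - 1 = 8 samples wide; that is
 * why nnp_iwt_f6k3 reads 8 SIMD rows (8 x 4 floats = d[32]) while
 * nnp_owt_f6k3 writes only 6 (s[24]).
 */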
vdupq_n_f32(negative_slope); 51 | 52 | /* Length is always non-zero and proportional to SIMD width */ 53 | do { 54 | vst1q_f32(input_gradient, 55 | neon_grad_reluq_f32(vld1q_f32(output_gradient), vld1q_f32(input), vec_negative_slope)); 56 | 57 | output_gradient += 4; 58 | input += 4; 59 | input_gradient += 4; 60 | length -= 4; 61 | } while (length != 0); 62 | } 63 | -------------------------------------------------------------------------------- /src/neon/transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline void neon_transpose4x4_inplace_f32( 7 | float32x4_t row0[static restrict 1], 8 | float32x4_t row1[static restrict 1], 9 | float32x4_t row2[static restrict 1], 10 | float32x4_t row3[static restrict 1]) 11 | { 12 | /* 13 | * row0 = ( x00 x01 x02 x03 ) 14 | * row1 = ( x10 x11 x12 x13 ) 15 | * row2 = ( x20 x21 x22 x23 ) 16 | * row3 = ( x30 x31 x32 x33 ) 17 | */ 18 | 19 | /* 20 | * row01 = ( x00 x10 x02 x12 ), ( x01 x11 x03, x13 ) 21 | * row23 = ( x20 x30 x22 x32 ), ( x21 x31 x23, x33 ) 22 | */ 23 | float32x4x2_t row01 = vtrnq_f32(*row0, *row1); 24 | float32x4x2_t row23 = vtrnq_f32(*row2, *row3); 25 | 26 | /* 27 | * row0 = ( x00 x10 x20 x30 ) 28 | * row1 = ( x01 x11 x21 x31 ) 29 | * row2 = ( x02 x12 x22 x32 ) 30 | * row3 = ( x03 x13 x23 x33 ) 31 | */ 32 | *row0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0])); 33 | *row1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1])); 34 | *row2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0])); 35 | *row3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1])); 36 | } 37 | -------------------------------------------------------------------------------- /src/neon/winograd-f6k3.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_iwt_f6k3__neon( 5 | const float d[restrict static 32], 6 | float w[restrict static 32]) 7 | { 8 | float32x4_t w0 = vld1q_f32(d + 0); 9 | float32x4_t w1 = vld1q_f32(d + 4); 10 | float32x4_t w2 = vld1q_f32(d + 8); 11 | float32x4_t w3 = vld1q_f32(d + 12); 12 | float32x4_t w4 = vld1q_f32(d + 16); 13 | float32x4_t w5 = vld1q_f32(d + 20); 14 | float32x4_t w6 = vld1q_f32(d + 24); 15 | float32x4_t w7 = vld1q_f32(d + 28); 16 | 17 | winograd_f6k3_input_transform( 18 | w0, w1, w2, w3, w4, w5, w6, w7, 19 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7); 20 | 21 | vst1q_f32(w + 0, w0); 22 | vst1q_f32(w + 4, w1); 23 | vst1q_f32(w + 8, w2); 24 | vst1q_f32(w + 12, w3); 25 | vst1q_f32(w + 16, w4); 26 | vst1q_f32(w + 20, w5); 27 | vst1q_f32(w + 24, w6); 28 | vst1q_f32(w + 28, w7); 29 | } 30 | 31 | void nnp_kwt_f6k3__neon( 32 | const float g[restrict static 12], 33 | float w[restrict static 32]) 34 | { 35 | const float32x4_t g0 = vld1q_f32(g + 0); 36 | const float32x4_t g1 = vld1q_f32(g + 4); 37 | const float32x4_t g2 = vld1q_f32(g + 8); 38 | 39 | float32x4_t w0, w1, w2, w3, w4, w5, w6, w7; 40 | winograd_f6k3_kernel_transform( 41 | g0, g1, g2, 42 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, 43 | true /* rescale coefficients */); 44 | 45 | vst1q_f32(w + 0, w0); 46 | vst1q_f32(w + 4, w1); 47 | vst1q_f32(w + 8, w2); 48 | vst1q_f32(w + 12, w3); 49 | vst1q_f32(w + 16, w4); 50 | vst1q_f32(w + 20, w5); 51 | vst1q_f32(w + 24, w6); 52 | vst1q_f32(w + 28, w7); 53 | } 54 | 55 | void nnp_owt_f6k3__neon( 56 | const float m[restrict static 32], 57 | float s[restrict static 24]) 58 | { 59 | float32x4_t w0 = vld1q_f32(m + 0); 60 | 
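/*
 * (Editorial note.) neon_transpose4x4_inplace_f32 in src/neon/transpose.h
 * above transposes a 4x4 tile in two steps: vtrnq_f32 swaps elements within
 * 2x2 sub-blocks, then vcombine_f32/vget_{low,high}_f32 swap the off-diagonal
 * 2x2 blocks. Worked example on element values:
 *
 *   ( 0  1  2  3 )          ( 0  4  8 12 )
 *   ( 4  5  6  7 )   --->   ( 1  5  9 13 )
 *   ( 8  9 10 11 )          ( 2  6 10 14 )
 *   (12 13 14 15 )          ( 3  7 11 15 )
 *
 * After vtrnq_f32: row01 = { (0 4 2 6), (1 5 3 7) } and
 * row23 = { (8 12 10 14), (9 13 11 15) }; combining the low halves gives
 * row0 = (0 4 8 12), and the high halves give row2 = (2 6 10 14), etc.
 */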
float32x4_t w1 = vld1q_f32(m + 4); 61 | float32x4_t w2 = vld1q_f32(m + 8); 62 | float32x4_t w3 = vld1q_f32(m + 12); 63 | float32x4_t w4 = vld1q_f32(m + 16); 64 | float32x4_t w5 = vld1q_f32(m + 20); 65 | float32x4_t w6 = vld1q_f32(m + 24); 66 | float32x4_t w7 = vld1q_f32(m + 28); 67 | 68 | float32x4_t s0, s1, s2, s3, s4, s5; 69 | winograd_f6k3_output_transformq( 70 | w0, w1, w2, w3, w4, w5, w6, w7, 71 | &s0, &s1, &s2, &s3, &s4, &s5); 72 | 73 | vst1q_f32(s + 0, s0); 74 | vst1q_f32(s + 4, s1); 75 | vst1q_f32(s + 8, s2); 76 | vst1q_f32(s + 12, s3); 77 | vst1q_f32(s + 16, s4); 78 | vst1q_f32(s + 20, s5); 79 | } 80 | -------------------------------------------------------------------------------- /src/psimd/butterfly.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline void psimd_butterfly_f32( 7 | psimd_f32 a[restrict static 1], 8 | psimd_f32 b[restrict static 1]) 9 | { 10 | const psimd_f32 new_a = *a + *b; 11 | const psimd_f32 new_b = *a - *b; 12 | *a = new_a; 13 | *b = new_b; 14 | } 15 | 16 | static inline void psimd_butterfly_and_negate_b_f32( 17 | psimd_f32 a[restrict static 1], 18 | psimd_f32 b[restrict static 1]) 19 | { 20 | const psimd_f32 new_a = *a + *b; 21 | const psimd_f32 new_b = *b - *a; 22 | *a = new_a; 23 | *b = new_b; 24 | } 25 | 26 | static inline void psimd_butterfly_with_negated_b_f32( 27 | psimd_f32 a[restrict static 1], 28 | psimd_f32 b[restrict static 1]) 29 | { 30 | const psimd_f32 new_a = *a - *b; 31 | const psimd_f32 new_b = *a + *b; 32 | *a = new_a; 33 | *b = new_b; 34 | } 35 | -------------------------------------------------------------------------------- /src/psimd/exp.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | 8 | void nnp_vector_exp__psimd( 9 | size_t n, 10 | const float x[restrict static n], 11 | float y[restrict static n]) 12 | { 13 | do { 14 | psimd_store_f32(y, 15 | psimd_exp_f32(psimd_load_f32(x))); 16 | 17 | y += 4; 18 | x += 4; 19 | n -= 4; 20 | } while (n >= 4); 21 | if (n != 0) { 22 | psimd_store_f32(y + n - 4, 23 | psimd_exp_f32(psimd_load_f32(x + n - 4))); 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/psimd/exp.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline psimd_f32 psimd_exp_f32(psimd_f32 x) { 7 | const psimd_f32 magic_bias = psimd_splat_f32(0x1.800000p+23f); 8 | const psimd_f32 zero_cutoff = psimd_splat_f32(-0x1.9FE368p+6f); /* The smallest x for which expf(x) is non-zero */ 9 | const psimd_f32 inf_cutoff = psimd_splat_f32(0x1.62E42Ep+6f); /* The largest x for which expf(x) is finite */ 10 | const psimd_f32 log2e = psimd_splat_f32(0x1.715476p+0f); 11 | const psimd_f32 ln2_hi = psimd_splat_f32(0x1.62E400p-1f); /* The lowest 7 bits are zeros */ 12 | const psimd_f32 ln2_lo = psimd_splat_f32(0x1.7F7D1Cp-20f); 13 | const psimd_f32 plus_inf = psimd_splat_f32(__builtin_inff()); 14 | 15 | const psimd_f32 c2 = psimd_splat_f32(0x1.FFFFFCp-2f); 16 | const psimd_f32 c3 = psimd_splat_f32(0x1.55548Cp-3f); 17 | const psimd_f32 c4 = psimd_splat_f32(0x1.555834p-5f); 18 | const psimd_f32 c5 = psimd_splat_f32(0x1.123CFEp-7f); 19 | const psimd_f32 c6 = psimd_splat_f32(0x1.6ADCAEp-10f); 20 | 21 | const psimd_s32 min_exponent = psimd_splat_s32((int32_t)((uint32_t) -126 << 23)); 22 | const psimd_s32 max_exponent = psimd_splat_s32(127 << 
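/*
 * (Editorial sketch, not NNPACK source.) The helpers in src/psimd/butterfly.h
 * above are the radix-2 FFT primitive: (a, b) -> (a + b, a - b). Applying the
 * butterfly twice returns the inputs scaled by 2, since (a+b) + (a-b) = 2a
 * and (a+b) - (a-b) = 2b; this is why a forward/inverse FFT round trip
 * through such kernels picks up a power-of-two scale that the inverse
 * transform must divide out. A scalar rendition:
 */
static void butterfly_sketch(float* a, float* b) {
	const float new_a = *a + *b;
	const float new_b = *a - *b;
	*a = new_a; /* after a second application: 2 * (original a) */
	*b = new_b; /* after a second application: 2 * (original b) */
}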
23); 23 | const psimd_s32 default_exponent = psimd_splat_s32(0x3F800000); 24 | 25 | psimd_f32 t = x * log2e + magic_bias; 26 | psimd_s32 e1 = ((psimd_s32) t) << psimd_splat_s32(23); 27 | psimd_s32 e2 = e1; 28 | e1 = psimd_min_s32(psimd_max_s32(e1, min_exponent), max_exponent); 29 | e2 = e2 - e1; 30 | 31 | const psimd_f32 s1 = (psimd_f32) (e1 + default_exponent); 32 | const psimd_f32 s2 = (psimd_f32) (e2 + default_exponent); 33 | 34 | t = t - magic_bias; 35 | const psimd_f32 rx = (x - t * ln2_hi) - t * ln2_lo; 36 | const psimd_f32 rf = rx + rx * rx * (c2 + rx * (c3 + rx * (c4 + rx * (c5 + rx * c6)))); 37 | psimd_f32 f = s2 * (s1 * rf + s1); 38 | 39 | /* Fixup underflow to zero */ 40 | f = psimd_andmask_f32(x > zero_cutoff, f); 41 | 42 | /* Fixup overflow */ 43 | f = psimd_blend_f32(x > inf_cutoff, plus_inf, f); 44 | return f; 45 | } 46 | -------------------------------------------------------------------------------- /src/psimd/fft-aos.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft4_4aos__psimd( 5 | const float t[restrict static 32], 6 | float f[restrict static 32]) 7 | { 8 | psimd_f32 w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; 9 | psimd_fft4_aos_f32( 10 | t, t + 16, 4, 0, 8, 11 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i); 12 | psimd_store_f32(f + 0, w0r); 13 | psimd_store_f32(f + 4, w0i); 14 | psimd_store_f32(f + 8, w1r); 15 | psimd_store_f32(f + 12, w1i); 16 | psimd_store_f32(f + 16, w2r); 17 | psimd_store_f32(f + 20, w2i); 18 | psimd_store_f32(f + 24, w3r); 19 | psimd_store_f32(f + 28, w3i); 20 | } 21 | 22 | void nnp_fft8_4aos__psimd( 23 | const float t[restrict static 64], 24 | float f[restrict static 64]) 25 | { 26 | psimd_f32 w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i; 27 | psimd_fft8_aos_f32( 28 | t, t + 32, 4, 0, 16, 29 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i, &w4r, &w4i, &w5r, &w5i, &w6r, &w6i, &w7r, &w7i); 30 | psimd_store_f32(f + 0, w0r); 31 | psimd_store_f32(f + 4, w0i); 32 | psimd_store_f32(f + 8, w1r); 33 | psimd_store_f32(f + 12, w1i); 34 | psimd_store_f32(f + 16, w2r); 35 | psimd_store_f32(f + 20, w2i); 36 | psimd_store_f32(f + 24, w3r); 37 | psimd_store_f32(f + 28, w3i); 38 | psimd_store_f32(f + 32, w4r); 39 | psimd_store_f32(f + 36, w4i); 40 | psimd_store_f32(f + 40, w5r); 41 | psimd_store_f32(f + 44, w5i); 42 | psimd_store_f32(f + 48, w6r); 43 | psimd_store_f32(f + 52, w6i); 44 | psimd_store_f32(f + 56, w7r); 45 | psimd_store_f32(f + 60, w7i); 46 | } 47 | 48 | void nnp_ifft4_4aos__psimd( 49 | const float f[restrict static 32], 50 | float t[restrict static 32]) 51 | { 52 | const psimd_f32 w0r = psimd_load_f32(f + 0); 53 | const psimd_f32 w0i = psimd_load_f32(f + 4); 54 | const psimd_f32 w1r = psimd_load_f32(f + 8); 55 | const psimd_f32 w1i = psimd_load_f32(f + 12); 56 | const psimd_f32 w2r = psimd_load_f32(f + 16); 57 | const psimd_f32 w2i = psimd_load_f32(f + 20); 58 | const psimd_f32 w3r = psimd_load_f32(f + 24); 59 | const psimd_f32 w3i = psimd_load_f32(f + 28); 60 | 61 | psimd_ifft4_aos_f32( 62 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, 63 | t, t + 16, 4); 64 | } 65 | 66 | void nnp_ifft8_4aos__psimd( 67 | const float f[restrict static 64], 68 | float t[restrict static 64]) 69 | { 70 | const psimd_f32 w0r = psimd_load_f32(f + 0); 71 | const psimd_f32 w0i = psimd_load_f32(f + 4); 72 | const psimd_f32 w1r = psimd_load_f32(f + 8); 73 | const psimd_f32 w1i = psimd_load_f32(f + 12); 74 | const psimd_f32 w2r = psimd_load_f32(f + 16); 75 | const psimd_f32 
w2i = psimd_load_f32(f + 20); 76 | const psimd_f32 w3r = psimd_load_f32(f + 24); 77 | const psimd_f32 w3i = psimd_load_f32(f + 28); 78 | const psimd_f32 w4r = psimd_load_f32(f + 32); 79 | const psimd_f32 w4i = psimd_load_f32(f + 36); 80 | const psimd_f32 w5r = psimd_load_f32(f + 40); 81 | const psimd_f32 w5i = psimd_load_f32(f + 44); 82 | const psimd_f32 w6r = psimd_load_f32(f + 48); 83 | const psimd_f32 w6i = psimd_load_f32(f + 52); 84 | const psimd_f32 w7r = psimd_load_f32(f + 56); 85 | const psimd_f32 w7i = psimd_load_f32(f + 60); 86 | 87 | psimd_ifft8_aos_f32( 88 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i, 89 | t, t + 32, 4); 90 | } 91 | -------------------------------------------------------------------------------- /src/psimd/fft-dualreal.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_dualreal__psimd( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | psimd_f32 s0123 = psimd_load_f32(t + 0); 9 | psimd_f32 s4567 = psimd_load_f32(t + 4); 10 | psimd_f32 h0123 = psimd_load_f32(t + 8); 11 | psimd_f32 h4567 = psimd_load_f32(t + 12); 12 | 13 | psimd_fft8_dualreal_f32(&s0123, &s4567, &h0123, &h4567); 14 | 15 | psimd_store_f32(f + 0, s0123); 16 | psimd_store_f32(f + 4, s4567); 17 | psimd_store_f32(f + 8, h0123); 18 | psimd_store_f32(f + 12, h4567); 19 | } 20 | 21 | void nnp_fft16_dualreal__psimd( 22 | const float t[restrict static 32], 23 | float f[restrict static 32]) 24 | { 25 | psimd_f32 s0123 = psimd_load_f32(t + 0); 26 | psimd_f32 s4567 = psimd_load_f32(t + 4); 27 | psimd_f32 s89AB = psimd_load_f32(t + 8); 28 | psimd_f32 sCDEF = psimd_load_f32(t + 12); 29 | psimd_f32 h0123 = psimd_load_f32(t + 16); 30 | psimd_f32 h4567 = psimd_load_f32(t + 20); 31 | psimd_f32 h89AB = psimd_load_f32(t + 24); 32 | psimd_f32 hCDEF = psimd_load_f32(t + 28); 33 | 34 | psimd_fft16_dualreal_f32(&s0123, &s4567, &s89AB, &sCDEF, &h0123, &h4567, &h89AB, &hCDEF); 35 | 36 | psimd_store_f32(f + 0, s0123); 37 | psimd_store_f32(f + 4, s4567); 38 | psimd_store_f32(f + 8, s89AB); 39 | psimd_store_f32(f + 12, sCDEF); 40 | psimd_store_f32(f + 16, h0123); 41 | psimd_store_f32(f + 20, h4567); 42 | psimd_store_f32(f + 24, h89AB); 43 | psimd_store_f32(f + 28, hCDEF); 44 | } 45 | 46 | void nnp_ifft8_dualreal__psimd( 47 | const float f[restrict static 16], 48 | float t[restrict static 16]) 49 | { 50 | psimd_f32 s0123 = psimd_load_f32(f + 0); 51 | psimd_f32 s4567 = psimd_load_f32(f + 4); 52 | psimd_f32 h0123 = psimd_load_f32(f + 8); 53 | psimd_f32 h4567 = psimd_load_f32(f + 12); 54 | 55 | psimd_ifft8_dualreal_f32(&s0123, &s4567, &h0123, &h4567); 56 | 57 | psimd_store_f32(t + 0, s0123); 58 | psimd_store_f32(t + 4, s4567); 59 | psimd_store_f32(t + 8, h0123); 60 | psimd_store_f32(t + 12, h4567); 61 | } 62 | 63 | void nnp_ifft16_dualreal__psimd( 64 | const float f[restrict static 32], 65 | float t[restrict static 32]) 66 | { 67 | psimd_f32 s0123 = psimd_load_f32(f + 0); 68 | psimd_f32 s4567 = psimd_load_f32(f + 4); 69 | psimd_f32 s89AB = psimd_load_f32(f + 8); 70 | psimd_f32 sCDEF = psimd_load_f32(f + 12); 71 | psimd_f32 h0123 = psimd_load_f32(f + 16); 72 | psimd_f32 h4567 = psimd_load_f32(f + 20); 73 | psimd_f32 h89AB = psimd_load_f32(f + 24); 74 | psimd_f32 hCDEF = psimd_load_f32(f + 28); 75 | 76 | psimd_ifft16_dualreal_f32(&s0123, &s4567, &s89AB, &sCDEF, &h0123, &h4567, &h89AB, &hCDEF); 77 | 78 | psimd_store_f32(t + 0, s0123); 79 | psimd_store_f32(t + 4, s4567); 80 | psimd_store_f32(t + 8, 
s89AB); 81 | psimd_store_f32(t + 12, sCDEF); 82 | psimd_store_f32(t + 16, h0123); 83 | psimd_store_f32(t + 20, h4567); 84 | psimd_store_f32(t + 24, h89AB); 85 | psimd_store_f32(t + 28, hCDEF); 86 | } 87 | -------------------------------------------------------------------------------- /src/psimd/fft-real.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_4real__psimd( 5 | const float t[restrict static 32], 6 | float f[restrict static 32]) 7 | { 8 | psimd_fft8_real_f32( 9 | t, t + 16, 4, 0, 8, 10 | f, 4); 11 | } 12 | 13 | void nnp_fft16_4real__psimd( 14 | const float t[restrict static 64], 15 | float f[restrict static 64]) 16 | { 17 | psimd_fft16_real_f32( 18 | t, t + 32, 4, 0, 16, 19 | f, 4); 20 | } 21 | 22 | void nnp_ifft8_4real__psimd( 23 | const float f[restrict static 32], 24 | float t[restrict static 32]) 25 | { 26 | const psimd_f32 f0r = psimd_load_f32(f + 0); 27 | const psimd_f32 f4r = psimd_load_f32(f + 4); 28 | const psimd_f32 f1r = psimd_load_f32(f + 8); 29 | const psimd_f32 f1i = psimd_load_f32(f + 12); 30 | const psimd_f32 f2r = psimd_load_f32(f + 16); 31 | const psimd_f32 f2i = psimd_load_f32(f + 20); 32 | const psimd_f32 f3r = psimd_load_f32(f + 24); 33 | const psimd_f32 f3i = psimd_load_f32(f + 28); 34 | psimd_ifft8_real_f32( 35 | f0r, f4r, f1r, f1i, f2r, f2i, f3r, f3i, 36 | t, t + 16, 4); 37 | } 38 | 39 | void nnp_ifft16_4real__psimd( 40 | const float f[restrict static 64], 41 | float t[restrict static 64]) 42 | { 43 | const psimd_f32 f0r = psimd_load_f32(f + 0); 44 | const psimd_f32 f8r = psimd_load_f32(f + 4); 45 | const psimd_f32 f1r = psimd_load_f32(f + 8); 46 | const psimd_f32 f1i = psimd_load_f32(f + 12); 47 | const psimd_f32 f2r = psimd_load_f32(f + 16); 48 | const psimd_f32 f2i = psimd_load_f32(f + 20); 49 | const psimd_f32 f3r = psimd_load_f32(f + 24); 50 | const psimd_f32 f3i = psimd_load_f32(f + 28); 51 | const psimd_f32 f4r = psimd_load_f32(f + 32); 52 | const psimd_f32 f4i = psimd_load_f32(f + 36); 53 | const psimd_f32 f5r = psimd_load_f32(f + 40); 54 | const psimd_f32 f5i = psimd_load_f32(f + 44); 55 | const psimd_f32 f6r = psimd_load_f32(f + 48); 56 | const psimd_f32 f6i = psimd_load_f32(f + 52); 57 | const psimd_f32 f7r = psimd_load_f32(f + 56); 58 | const psimd_f32 f7i = psimd_load_f32(f + 60); 59 | psimd_ifft16_real_f32( 60 | f0r, f8r, f1r, f1i, f2r, f2i, f3r, f3i, f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i, 61 | t, t + 32, 4); 62 | } 63 | -------------------------------------------------------------------------------- /src/psimd/fft-soa.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_soa__psimd( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | psimd_f32 w0123r = psimd_load_f32(t + 0); 9 | psimd_f32 w4567r = psimd_load_f32(t + 4); 10 | psimd_f32 w0123i = psimd_load_f32(t + 8); 11 | psimd_f32 w4567i = psimd_load_f32(t + 12); 12 | 13 | psimd_fft8_soa_f32(&w0123r, &w4567r, &w0123i, &w4567i); 14 | 15 | psimd_store_f32(f + 0, w0123r); 16 | psimd_store_f32(f + 4, w4567r); 17 | psimd_store_f32(f + 8, w0123i); 18 | psimd_store_f32(f + 12, w4567i); 19 | } 20 | 21 | void nnp_fft16_soa__psimd( 22 | const float t[restrict static 32], 23 | float f[restrict static 32]) 24 | { 25 | psimd_f32 w0123r = psimd_load_f32(t + 0); 26 | psimd_f32 w4567r = psimd_load_f32(t + 4); 27 | psimd_f32 w89ABr = psimd_load_f32(t + 8); 28 | psimd_f32 wCDEFr = psimd_load_f32(t + 12); 29 | psimd_f32 w0123i = 
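/*
 * Annotation (not part of the original source): the psimd FFT entry points
 * differ only in data layout. "aos" (array-of-structures) stores the real
 * and imaginary parts of each coefficient in adjacent 4-wide blocks, with
 * the SIMD lanes carrying four independent transforms. "soa"
 * (structure-of-arrays, this file) keeps all real parts together, then all
 * imaginary parts. "dualreal" computes the spectra of two real sequences
 * x and y from a single complex FFT of z = x + i*y, separating them
 * afterwards through conjugate symmetry:
 *   X[k] = (Z[k] + conj(Z[N-k])) / 2,  Y[k] = (Z[k] - conj(Z[N-k])) / (2i).
 */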
psimd_load_f32(t + 16); 30 | psimd_f32 w4567i = psimd_load_f32(t + 20); 31 | psimd_f32 w89ABi = psimd_load_f32(t + 24); 32 | psimd_f32 wCDEFi = psimd_load_f32(t + 28); 33 | 34 | psimd_fft16_soa_f32(&w0123r, &w4567r, &w89ABr, &wCDEFr, &w0123i, &w4567i, &w89ABi, &wCDEFi); 35 | 36 | psimd_store_f32(f + 0, w0123r); 37 | psimd_store_f32(f + 4, w4567r); 38 | psimd_store_f32(f + 8, w89ABr); 39 | psimd_store_f32(f + 12, wCDEFr); 40 | psimd_store_f32(f + 16, w0123i); 41 | psimd_store_f32(f + 20, w4567i); 42 | psimd_store_f32(f + 24, w89ABi); 43 | psimd_store_f32(f + 28, wCDEFi); 44 | } 45 | 46 | void nnp_ifft8_soa__psimd( 47 | const float f[restrict static 16], 48 | float t[restrict static 16]) 49 | { 50 | psimd_f32 w0123r = psimd_load_f32(f + 0); 51 | psimd_f32 w4567r = psimd_load_f32(f + 4); 52 | psimd_f32 w0123i = psimd_load_f32(f + 8); 53 | psimd_f32 w4567i = psimd_load_f32(f + 12); 54 | 55 | psimd_ifft8_soa_f32(&w0123r, &w4567r, &w0123i, &w4567i); 56 | 57 | psimd_store_f32(t + 0, w0123r); 58 | psimd_store_f32(t + 4, w4567r); 59 | psimd_store_f32(t + 8, w0123i); 60 | psimd_store_f32(t + 12, w4567i); 61 | } 62 | 63 | void nnp_ifft16_soa__psimd( 64 | const float f[restrict static 32], 65 | float t[restrict static 32]) 66 | { 67 | psimd_f32 w0123r = psimd_load_f32(f + 0); 68 | psimd_f32 w4567r = psimd_load_f32(f + 4); 69 | psimd_f32 w89ABr = psimd_load_f32(f + 8); 70 | psimd_f32 wCDEFr = psimd_load_f32(f + 12); 71 | psimd_f32 w0123i = psimd_load_f32(f + 16); 72 | psimd_f32 w4567i = psimd_load_f32(f + 20); 73 | psimd_f32 w89ABi = psimd_load_f32(f + 24); 74 | psimd_f32 wCDEFi = psimd_load_f32(f + 28); 75 | 76 | psimd_ifft16_soa_f32(&w0123r, &w4567r, &w89ABr, &wCDEFr, &w0123i, &w4567i, &w89ABi, &wCDEFi); 77 | 78 | psimd_store_f32(t + 0, w0123r); 79 | psimd_store_f32(t + 4, w4567r); 80 | psimd_store_f32(t + 8, w89ABr); 81 | psimd_store_f32(t + 12, wCDEFr); 82 | psimd_store_f32(t + 16, w0123i); 83 | psimd_store_f32(t + 20, w4567i); 84 | psimd_store_f32(t + 24, w89ABi); 85 | psimd_store_f32(t + 28, wCDEFi); 86 | } 87 | -------------------------------------------------------------------------------- /src/psimd/relu.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | 8 | void nnp_relu__psimd( 9 | const float input[restrict static 4], 10 | float output[restrict static 4], 11 | size_t length, 12 | float negative_slope) 13 | { 14 | const psimd_f32 vec_negative_slope = psimd_splat_f32(negative_slope); 15 | 16 | /* Length is always non-zero and proportional to SIMD width */ 17 | do { 18 | psimd_store_f32(output, 19 | psimd_relu_f32(psimd_load_f32(input), vec_negative_slope)); 20 | 21 | input += 4; 22 | output += 4; 23 | length -= 4; 24 | } while (length != 0); 25 | } 26 | 27 | void nnp_inplace_relu__psimd( 28 | float data[restrict static 4], 29 | size_t length, 30 | float negative_slope) 31 | { 32 | const psimd_f32 vec_negative_slope = psimd_splat_f32(negative_slope); 33 | 34 | /* Length is always non-zero and proportional to SIMD width */ 35 | do { 36 | psimd_store_f32(data, 37 | psimd_relu_f32(psimd_load_f32(data), vec_negative_slope)); 38 | 39 | data += 4; 40 | length -= 4; 41 | } while (length != 0); 42 | } 43 | 44 | void nnp_grad_relu__psimd( 45 | const float output_gradient[restrict static 4], 46 | const float input[restrict static 4], 47 | float input_gradient[restrict static 4], 48 | size_t length, 49 | float negative_slope) 50 | { 51 | const psimd_f32 vec_negative_slope = psimd_splat_f32(negative_slope); 52 | 53 | 
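/*
 * Annotation (not part of the original source): for leaky ReLU with slope
 * s, the backward pass below computes, lane-wise,
 *   grad_input = (input > 0) ? grad_output : s * grad_output,
 * i.e. grad_output scaled by the derivative of the activation. Ragged
 * buffer lengths are not handled here: the scalar prologue/epilogue lives
 * in the caller (src/relu-input-gradient.c), so this kernel may assume the
 * length is a positive multiple of the 4-lane SIMD width.
 */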
/* Length is always non-zero and proportional to SIMD width */ 54 | do { 55 | psimd_store_f32(input_gradient, 56 | psimd_grad_relu_f32(psimd_load_f32(output_gradient), psimd_load_f32(input), vec_negative_slope)); 57 | 58 | output_gradient += 4; 59 | input += 4; 60 | input_gradient += 4; 61 | length -= 4; 62 | } while (length != 0); 63 | } 64 | -------------------------------------------------------------------------------- /src/psimd/transpose.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | 6 | static inline void psimd_transpose4x4_f32( 7 | const psimd_f32 row0, const psimd_f32 row1, const psimd_f32 row2, const psimd_f32 row3, 8 | psimd_f32 col0[restrict static 1], 9 | psimd_f32 col1[restrict static 1], 10 | psimd_f32 col2[restrict static 1], 11 | psimd_f32 col3[restrict static 1]) 12 | { 13 | /* 14 | * row0 = ( x00 x01 x02 x03 ) 15 | * row1 = ( x10 x11 x12 x13 ) 16 | * row2 = ( x20 x21 x22 x23 ) 17 | * row3 = ( x30 x31 x32 x33 ) 18 | */ 19 | 20 | /* 21 | * row01lo = ( x00 x10 x01 x11 ) 22 | * row01hi = ( x02 x12 x03 x13 ) 23 | * row23lo = ( x20 x30 x21 x31 ) 24 | * row23hi = ( x22 x32 x23 x33 ) 25 | */ 26 | const psimd_f32 row01lo = psimd_interleave_lo_f32(row0, row1); 27 | const psimd_f32 row01hi = psimd_interleave_hi_f32(row0, row1); 28 | const psimd_f32 row23lo = psimd_interleave_lo_f32(row2, row3); 29 | const psimd_f32 row23hi = psimd_interleave_hi_f32(row2, row3); 30 | 31 | /* 32 | * col0 = ( x00 x10 x20 x30 ) 33 | * col1 = ( x01 x11 x21 x31 ) 34 | * col2 = ( x02 x12 x22 x32 ) 35 | * col3 = ( x03 x13 x23 x33 ) 36 | */ 37 | *col0 = psimd_concat_lo_f32(row01lo, row23lo); 38 | *col1 = psimd_concat_hi_f32(row01lo, row23lo); 39 | *col2 = psimd_concat_lo_f32(row01hi, row23hi); 40 | *col3 = psimd_concat_hi_f32(row01hi, row23hi); 41 | } 42 | -------------------------------------------------------------------------------- /src/psimd/winograd-f6k3.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_iwt_f6k3__psimd( 5 | const float d[restrict static 32], 6 | float w[restrict static 32]) 7 | { 8 | const psimd_f32 d0 = psimd_load_f32(d + 0); 9 | const psimd_f32 d1 = psimd_load_f32(d + 4); 10 | const psimd_f32 d2 = psimd_load_f32(d + 8); 11 | const psimd_f32 d3 = psimd_load_f32(d + 12); 12 | const psimd_f32 d4 = psimd_load_f32(d + 16); 13 | const psimd_f32 d5 = psimd_load_f32(d + 20); 14 | const psimd_f32 d6 = psimd_load_f32(d + 24); 15 | const psimd_f32 d7 = psimd_load_f32(d + 28); 16 | 17 | psimd_f32 w0, w1, w2, w3, w4, w5, w6, w7; 18 | winograd_f6k3_input_transform( 19 | d0, d1, d2, d3, d4, d5, d6, d7, 20 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7); 21 | 22 | psimd_store_f32(w + 0, w0); 23 | psimd_store_f32(w + 4, w1); 24 | psimd_store_f32(w + 8, w2); 25 | psimd_store_f32(w + 12, w3); 26 | psimd_store_f32(w + 16, w4); 27 | psimd_store_f32(w + 20, w5); 28 | psimd_store_f32(w + 24, w6); 29 | psimd_store_f32(w + 28, w7); 30 | } 31 | 32 | void nnp_kwt_f6k3__psimd( 33 | const float g[restrict static 12], 34 | float w[restrict static 32]) 35 | { 36 | const psimd_f32 g0 = psimd_load_f32(g + 0); 37 | const psimd_f32 g1 = psimd_load_f32(g + 4); 38 | const psimd_f32 g2 = psimd_load_f32(g + 8); 39 | 40 | psimd_f32 w0, w1, w2, w3, w4, w5, w6, w7; 41 | winograd_f6k3_kernel_transform( 42 | g0, g1, g2, 43 | &w0, &w1, &w2, &w3, &w4, &w5, &w6, &w7, 44 | true /* rescale coefficients */); 45 | 46 | psimd_store_f32(w + 0, w0); 47 | psimd_store_f32(w + 4, w1); 48 | 
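/*
 * Annotation (not part of the original source): these functions implement
 * the Winograd F(6,3) fast convolution. The input transform maps an
 * 8-sample tile to 8 values, the kernel transform (this function) maps the
 * 3 filter taps to 8 matching coefficients, and after an element-wise
 * multiply the output transform reduces the 8 products to 6 outputs. That
 * is 8 multiplications per tile column where direct convolution needs
 * 6*3 = 18, traded against extra additions and some numerical headroom.
 * The "rescale coefficients" flag folds the transform's constant scale
 * factors into the kernel side, where they are computed once per kernel
 * rather than once per output tile.
 */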
psimd_store_f32(w + 8, w2); 49 | psimd_store_f32(w + 12, w3); 50 | psimd_store_f32(w + 16, w4); 51 | psimd_store_f32(w + 20, w5); 52 | psimd_store_f32(w + 24, w6); 53 | psimd_store_f32(w + 28, w7); 54 | } 55 | 56 | void nnp_owt_f6k3__psimd( 57 | const float m[restrict static 32], 58 | float s[restrict static 24]) 59 | { 60 | const psimd_f32 m0 = psimd_load_f32(m + 0); 61 | const psimd_f32 m1 = psimd_load_f32(m + 4); 62 | const psimd_f32 m2 = psimd_load_f32(m + 8); 63 | const psimd_f32 m3 = psimd_load_f32(m + 12); 64 | const psimd_f32 m4 = psimd_load_f32(m + 16); 65 | const psimd_f32 m5 = psimd_load_f32(m + 20); 66 | const psimd_f32 m6 = psimd_load_f32(m + 24); 67 | const psimd_f32 m7 = psimd_load_f32(m + 28); 68 | 69 | psimd_f32 s0, s1, s2, s3, s4, s5; 70 | winograd_f6k3_output_transform( 71 | m0, m1, m2, m3, m4, m5, m6, m7, 72 | &s0, &s1, &s2, &s3, &s4, &s5); 73 | 74 | psimd_store_f32(s + 0, s0); 75 | psimd_store_f32(s + 4, s1); 76 | psimd_store_f32(s + 8, s2); 77 | psimd_store_f32(s + 12, s3); 78 | psimd_store_f32(s + 16, s4); 79 | psimd_store_f32(s + 20, s5); 80 | } 81 | -------------------------------------------------------------------------------- /src/ref/convolution-input-gradient.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct convolution_input_gradient_context { 5 | size_t input_channels; 6 | size_t output_channels; 7 | struct nnp_size input_size; 8 | struct nnp_padding input_padding; 9 | struct nnp_size kernel_size; 10 | struct nnp_size output_size; 11 | const float* grad_output_pointer; 12 | const float* kernel_pointer; 13 | float* grad_input_pointer; 14 | }; 15 | 16 | static void compute_convolution_input_gradient( 17 | const struct convolution_input_gradient_context context[restrict static 1], 18 | size_t sample, size_t input_channel) 19 | { 20 | const size_t input_channels = context->input_channels; 21 | const size_t output_channels = context->output_channels; 22 | const struct nnp_size input_size = context->input_size; 23 | const struct nnp_padding input_padding = context->input_padding; 24 | const struct nnp_size kernel_size = context->kernel_size; 25 | const struct nnp_size output_size = context->output_size; 26 | 27 | const float (*grad_output)[output_channels][output_size.height][output_size.width] = 28 | (const float(*)[output_channels][output_size.height][output_size.width]) context->grad_output_pointer; 29 | const float (*kernel)[input_channels][kernel_size.height][kernel_size.width] = 30 | (const float(*)[input_channels][kernel_size.height][kernel_size.width]) context->kernel_pointer; 31 | 32 | float (*grad_input)[input_channels][input_size.height][input_size.width] = 33 | (float(*)[input_channels][input_size.height][input_size.width]) context->grad_input_pointer; 34 | 35 | for (size_t y = 0; y < input_size.height; y++) { 36 | for (size_t x = 0; x < input_size.width; x++) { 37 | double v = 0.0; 38 | for (size_t output_channel = 0; output_channel < output_channels; output_channel++) { 39 | for (size_t i = 0; i < kernel_size.height; i++) { 40 | const size_t s = y - i + input_padding.top; 41 | if (s < output_size.height) { 42 | for (size_t j = 0; j < kernel_size.width; j++) { 43 | const size_t t = x - j + input_padding.left; 44 | if (t < output_size.width) { 45 | v += grad_output[sample][output_channel][s][t] * kernel[output_channel][input_channel][i][j]; 46 | } 47 | } 48 | } 49 | } 50 | } 51 | grad_input[sample][input_channel][y][x] = v; 52 | } 53 | } 54 | } 55 | 56 | void 
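/*
 * Annotation (not part of the original source):
 * compute_convolution_input_gradient() above is the transposed
 * convolution: for each input pixel (y, x) it gathers the output-gradient
 * positions s = y - i + padding.top, t = x - j + padding.left that the
 * forward pass computed from this pixel, which amounts to a full
 * correlation with the kernel flipped in both spatial dimensions.
 * Accumulation is done in double so the reference result is accurate
 * enough to serve as ground truth for the optimized kernels.
 */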
nnp_convolution_input_gradient__reference( 57 | size_t batch_size, 58 | size_t input_channels, 59 | size_t output_channels, 60 | struct nnp_size input_size, 61 | struct nnp_padding input_padding, 62 | struct nnp_size kernel_size, 63 | const float grad_output_pointer[], 64 | const float kernel_pointer[], 65 | float grad_input_pointer[], 66 | pthreadpool_t threadpool) 67 | { 68 | const struct nnp_size output_size = { 69 | .width = input_padding.left + input_size.width + input_padding.right - kernel_size.width + 1, 70 | .height = input_padding.top + input_size.height + input_padding.bottom - kernel_size.height + 1 71 | }; 72 | struct convolution_input_gradient_context convolution_input_gradient_context = { 73 | .input_channels = input_channels, 74 | .output_channels = output_channels, 75 | .input_size = input_size, 76 | .input_padding = input_padding, 77 | .kernel_size = kernel_size, 78 | .output_size = output_size, 79 | .grad_output_pointer = grad_output_pointer, 80 | .kernel_pointer = kernel_pointer, 81 | .grad_input_pointer = grad_input_pointer, 82 | }; 83 | 84 | pthreadpool_parallelize_2d(threadpool, 85 | (pthreadpool_function_2d_t) compute_convolution_input_gradient, 86 | &convolution_input_gradient_context, 87 | batch_size, input_channels, 88 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 89 | } 90 | -------------------------------------------------------------------------------- /src/ref/convolution-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | struct convolution_output_context { 5 | size_t input_channels; 6 | size_t output_channels; 7 | struct nnp_size input_size; 8 | struct nnp_padding input_padding; 9 | struct nnp_size kernel_size; 10 | struct nnp_size output_size; 11 | struct nnp_size output_subsampling; 12 | const float* input_pointer; 13 | const float* kernel_pointer; 14 | const float* bias; 15 | float* output_pointer; 16 | }; 17 | 18 | static void compute_convolution_output( 19 | const struct convolution_output_context context[restrict static 1], 20 | size_t sample, size_t output_channel) 21 | { 22 | const size_t input_channels = context->input_channels; 23 | const size_t output_channels = context->output_channels; 24 | const struct nnp_size input_size = context->input_size; 25 | const struct nnp_padding input_padding = context->input_padding; 26 | const struct nnp_size kernel_size = context->kernel_size; 27 | const struct nnp_size output_size = context->output_size; 28 | const struct nnp_size output_subsampling = context->output_subsampling; 29 | 30 | const float (*input)[input_channels][input_size.height][input_size.width] = 31 | (const float(*)[input_channels][input_size.height][input_size.width]) context->input_pointer; 32 | const float (*kernel)[input_channels][kernel_size.height][kernel_size.width] = 33 | (const float(*)[input_channels][kernel_size.height][kernel_size.width]) context->kernel_pointer; 34 | float (*output)[output_channels][output_size.height][output_size.width] = 35 | (float(*)[output_channels][output_size.height][output_size.width]) context->output_pointer; 36 | 37 | for (size_t y = 0; y < output_size.height; y++) { 38 | for (size_t x = 0; x < output_size.width; x++) { 39 | double v = 0.0; 40 | for (size_t input_channel = 0; input_channel < input_channels; input_channel++) { 41 | for (size_t i = 0; i < kernel_size.height; i++) { 42 | const size_t s = y * output_subsampling.height + i - input_padding.top; 43 | if (s < input_size.height) { 44 | for (size_t j = 0; j < kernel_size.width; 
j++) { 45 | const size_t t = x * output_subsampling.width + j - input_padding.left; 46 | if (t < input_size.width) { 47 | v += input[sample][input_channel][s][t] * kernel[output_channel][input_channel][i][j]; 48 | } 49 | } 50 | } 51 | } 52 | } 53 | output[sample][output_channel][y][x] = v + context->bias[output_channel]; 54 | } 55 | } 56 | } 57 | 58 | void nnp_convolution_output__reference( 59 | size_t batch_size, 60 | size_t input_channels, 61 | size_t output_channels, 62 | struct nnp_size input_size, 63 | struct nnp_padding input_padding, 64 | struct nnp_size kernel_size, 65 | struct nnp_size output_subsampling, 66 | const float input_pointer[], 67 | const float kernel_pointer[], 68 | const float bias[], 69 | float output_pointer[], 70 | pthreadpool_t threadpool) 71 | { 72 | const struct nnp_size output_size = { 73 | .width = (input_padding.left + input_size.width + input_padding.right - kernel_size.width) / output_subsampling.width + 1, 74 | .height = (input_padding.top + input_size.height + input_padding.bottom - kernel_size.height) / output_subsampling.height + 1 75 | }; 76 | struct convolution_output_context convolution_output_context = { 77 | .input_channels = input_channels, 78 | .output_channels = output_channels, 79 | .input_size = input_size, 80 | .input_padding = input_padding, 81 | .kernel_size = kernel_size, 82 | .output_size = output_size, 83 | .output_subsampling = output_subsampling, 84 | .input_pointer = input_pointer, 85 | .kernel_pointer = kernel_pointer, 86 | .bias = bias, 87 | .output_pointer = output_pointer 88 | }; 89 | 90 | pthreadpool_parallelize_2d(threadpool, 91 | (pthreadpool_task_2d_t) compute_convolution_output, 92 | &convolution_output_context, 93 | batch_size, output_channels, 94 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 95 | } 96 | -------------------------------------------------------------------------------- /src/ref/fully-connected-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | struct fully_connected_output_context { 7 | size_t input_channels; 8 | size_t output_channels; 9 | const void* input_pointer; 10 | const void* kernel_pointer; 11 | void* output_pointer; 12 | }; 13 | 14 | static void compute_fully_connected_output_f32( 15 | const struct fully_connected_output_context* context, 16 | size_t sample, size_t output_channel) 17 | { 18 | const size_t input_channels = context->input_channels; 19 | const size_t output_channels = context->output_channels; 20 | 21 | const float (*input)[input_channels] = (const float(*)[input_channels]) context->input_pointer; 22 | const float (*kernel)[input_channels] = (const float(*)[input_channels]) context->kernel_pointer; 23 | float (*output)[output_channels] = (float(*)[output_channels]) context->output_pointer; 24 | 25 | double v = 0.0; 26 | for (size_t input_channel = 0; input_channel < input_channels; input_channel++) { 27 | v += (double) input[sample][input_channel] * (double) kernel[output_channel][input_channel]; 28 | } 29 | output[sample][output_channel] = v; 30 | } 31 | 32 | static void compute_fully_connected_output_f16f32( 33 | const struct fully_connected_output_context* context, 34 | size_t sample, size_t output_channel) 35 | { 36 | const size_t input_channels = context->input_channels; 37 | const size_t output_channels = context->output_channels; 38 | 39 | const float (*input)[input_channels] = (const float(*)[input_channels]) context->input_pointer; 40 | const uint16_t (*kernel)[input_channels] = (const 
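/*
 * Annotation (not part of the original source): in the f16f32 variant the
 * fully-connected weights are stored as 16-bit values in the alternative
 * (ALT) half-precision format of the FP16 library and expanded to fp32 one
 * element at a time via fp16_alt_to_fp32_value(), halving the memory
 * footprint of the kernel matrix relative to the f32 variant above.
 */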
uint16_t(*)[input_channels]) context->kernel_pointer; 41 | float (*output)[output_channels] = (float(*)[output_channels]) context->output_pointer; 42 | 43 | double v = 0.0; 44 | for (size_t input_channel = 0; input_channel < input_channels; input_channel++) { 45 | v += (double) input[sample][input_channel] * 46 | (double) fp16_alt_to_fp32_value(kernel[output_channel][input_channel]); 47 | } 48 | output[sample][output_channel] = v; 49 | } 50 | 51 | void nnp_fully_connected_output_f32__reference( 52 | size_t batch_size, 53 | size_t input_channels, 54 | size_t output_channels, 55 | const float* input, 56 | const float* kernel, 57 | float* output, 58 | pthreadpool_t threadpool) 59 | { 60 | struct fully_connected_output_context fully_connected_output_context = { 61 | .input_channels = input_channels, 62 | .output_channels = output_channels, 63 | .input_pointer = input, 64 | .kernel_pointer = kernel, 65 | .output_pointer = output 66 | }; 67 | 68 | pthreadpool_parallelize_2d(threadpool, 69 | (pthreadpool_function_2d_t) compute_fully_connected_output_f32, 70 | &fully_connected_output_context, 71 | batch_size, output_channels, 72 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 73 | } 74 | 75 | void nnp_fully_connected_output_f16f32__reference( 76 | size_t batch_size, 77 | size_t input_channels, 78 | size_t output_channels, 79 | const float* input, 80 | const void* kernel, 81 | float* output, 82 | pthreadpool_t threadpool) 83 | { 84 | struct fully_connected_output_context fully_connected_output_context = { 85 | .input_channels = input_channels, 86 | .output_channels = output_channels, 87 | .input_pointer = input, 88 | .kernel_pointer = kernel, 89 | .output_pointer = output 90 | }; 91 | 92 | pthreadpool_parallelize_2d(threadpool, 93 | (pthreadpool_function_2d_t) compute_fully_connected_output_f16f32, 94 | &fully_connected_output_context, 95 | batch_size, output_channels, 96 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 97 | } 98 | -------------------------------------------------------------------------------- /src/ref/max-pooling-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | struct max_pooling_output_context { 6 | size_t channels; 7 | struct nnp_size input_size; 8 | struct nnp_padding input_padding; 9 | struct nnp_size pooling_size; 10 | struct nnp_size pooling_stride; 11 | struct nnp_size output_size; 12 | const float* input; 13 | float* output; 14 | }; 15 | 16 | static void compute_max_pooling_output( 17 | const struct max_pooling_output_context context[restrict static 1], 18 | size_t sample, size_t channel) 19 | { 20 | const size_t channels = context->channels; 21 | const struct nnp_size input_size = context->input_size; 22 | const struct nnp_padding input_padding = context->input_padding; 23 | const struct nnp_size pooling_size = context->pooling_size; 24 | const struct nnp_size pooling_stride = context->pooling_stride; 25 | const struct nnp_size output_size = context->output_size; 26 | 27 | const float (*input)[channels][input_size.height][input_size.width] = 28 | (const float(*)[channels][input_size.height][input_size.width]) context->input; 29 | float (*output)[channels][output_size.height][output_size.width] = 30 | (float(*)[channels][output_size.height][output_size.width]) context->output; 31 | 32 | for (size_t y = 0; y < output_size.height; y++) { 33 | for (size_t x = 0; x < output_size.width; x++) { 34 | float v = -__builtin_inff(); 35 | for (size_t i = 0; i < pooling_size.height; i++) { 36 | const size_t s = y * 
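/*
 * Annotation (not part of the original source): s and t are size_t, so a
 * logically negative index (a pooling window overhanging the top/left
 * padding) wraps to a huge unsigned value, and the single check
 * s < input_size.height rejects both out-of-range directions at once. The
 * reference convolution kernels above use the same idiom. Example: y = 0,
 * i = 0, input_padding.top = 2 yields s = (size_t) -2, which fails the
 * bound just as an explicit signed check would.
 */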
pooling_stride.height + i - input_padding.top; 37 | if (s < input_size.height) { 38 | for (size_t j = 0; j < pooling_size.width; j++) { 39 | const size_t t = x * pooling_stride.width + j - input_padding.left; 40 | if (t < input_size.width) { 41 | v = maxf(input[sample][channel][s][t], v); 42 | } 43 | } 44 | } 45 | } 46 | output[sample][channel][y][x] = v; 47 | } 48 | } 49 | } 50 | 51 | void nnp_max_pooling_output__reference( 52 | size_t batch_size, 53 | size_t channels, 54 | struct nnp_size input_size, 55 | struct nnp_padding input_padding, 56 | struct nnp_size pooling_size, 57 | struct nnp_size pooling_stride, 58 | const float* input, 59 | float* output, 60 | pthreadpool_t threadpool) 61 | { 62 | const struct nnp_size output_size = { 63 | .height = divide_round_up(doz(input_padding.top + input_size.height + input_padding.bottom, pooling_size.height), pooling_stride.height) + 1, 64 | .width = divide_round_up(doz(input_padding.left + input_size.width + input_padding.right, pooling_size.width), pooling_stride.width) + 1, 65 | }; 66 | 67 | struct max_pooling_output_context max_pooling_output_context = { 68 | .channels = channels, 69 | .input_size = input_size, 70 | .input_padding = input_padding, 71 | .pooling_size = pooling_size, 72 | .pooling_stride = pooling_stride, 73 | .output_size = output_size, 74 | .input = input, 75 | .output = output 76 | }; 77 | 78 | pthreadpool_parallelize_2d(threadpool, 79 | (pthreadpool_function_2d_t) compute_max_pooling_output, 80 | &max_pooling_output_context, 81 | batch_size, channels, 82 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 83 | } 84 | -------------------------------------------------------------------------------- /src/ref/relu-input-gradient.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | 6 | struct relu_input_gradient_context { 7 | size_t channels; 8 | const float* grad_output; 9 | const float* input; 10 | float* grad_input; 11 | float negative_slope; 12 | }; 13 | 14 | static void compute_relu_input_gradient( 15 | const struct relu_input_gradient_context context[restrict static 1], 16 | size_t sample) 17 | { 18 | const size_t channels = context->channels; 19 | const float* grad_output = context->grad_output + sample * channels; 20 | const float* input = context->input + sample * channels; 21 | float* grad_input = context->grad_input + sample * channels; 22 | float negative_slope = context->negative_slope; 23 | 24 | for (size_t channel = 0; channel < channels; channel++) { 25 | grad_input[channel] = grad_relu(grad_output[channel], input[channel], negative_slope); 26 | } 27 | } 28 | 29 | void nnp_relu_input_gradient__reference( 30 | size_t batch_size, 31 | size_t channels, 32 | const float grad_output[], 33 | const float input[], 34 | float grad_input[], 35 | float negative_slope, 36 | pthreadpool_t threadpool) 37 | { 38 | struct relu_input_gradient_context relu_input_gradient_context = { 39 | .channels = channels, 40 | .grad_output = grad_output, 41 | .input = input, 42 | .grad_input = grad_input, 43 | .negative_slope = negative_slope, 44 | }; 45 | 46 | pthreadpool_parallelize_1d(threadpool, 47 | (pthreadpool_function_1d_t) compute_relu_input_gradient, 48 | &relu_input_gradient_context, 49 | batch_size, 50 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 51 | } 52 | -------------------------------------------------------------------------------- /src/ref/relu-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 
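/*
 * Illustrative sketch (not part of the original source): every kernel in
 * src/ref follows the same template -- capture the arguments in a context
 * struct, then let pthreadpool fan a compute_*() callback out across the
 * batch (and, for the 2-D variants, across channels). The struct and
 * function names below (example_context, compute_example_copy) are made up
 * for illustration:
 */
struct example_context {
	size_t channels;
	const float* input;
	float* output;
};

static void compute_example_copy(
	const struct example_context context[restrict static 1],
	size_t sample)
{
	/* One task per sample; tasks write disjoint slices, so no locking. */
	for (size_t channel = 0; channel < context->channels; channel++) {
		context->output[sample * context->channels + channel] =
			context->input[sample * context->channels + channel];
	}
}
/*
 * It would be launched the same way as the kernels in this file:
 *   pthreadpool_parallelize_1d(threadpool,
 *       (pthreadpool_function_1d_t) compute_example_copy,
 *       &context, batch_size, PTHREADPOOL_FLAG_DISABLE_DENORMALS);
 */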
#include 4 | 5 | struct relu_output_context { 6 | size_t channels; 7 | const float* input; 8 | float* output; 9 | float negative_slope; 10 | }; 11 | 12 | static void compute_relu_output( 13 | const struct relu_output_context context[restrict static 1], 14 | size_t sample) 15 | { 16 | const size_t channels = context->channels; 17 | const float* input = context->input + sample * channels; 18 | float* output = context->output + sample * channels; 19 | float negative_slope = context->negative_slope; 20 | 21 | for (size_t channel = 0; channel < channels; channel++) { 22 | output[channel] = relu(input[channel], negative_slope); 23 | } 24 | } 25 | 26 | void nnp_relu_output__reference( 27 | size_t batch_size, 28 | size_t channels, 29 | const float input[], 30 | float output[], 31 | float negative_slope, 32 | pthreadpool_t threadpool) 33 | { 34 | struct relu_output_context relu_output_context = { 35 | .channels = channels, 36 | .input = input, 37 | .output = output, 38 | .negative_slope = negative_slope, 39 | }; 40 | 41 | pthreadpool_parallelize_1d(threadpool, 42 | (pthreadpool_function_1d_t) compute_relu_output, 43 | &relu_output_context, 44 | batch_size, 45 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 46 | } 47 | -------------------------------------------------------------------------------- /src/ref/softmax-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | static inline float vector_maxf(size_t length, const float array[restrict static length]) { 9 | float max_element = -FLT_MAX; 10 | for (size_t i = 0; i < length; i++) { 11 | max_element = maxf(max_element, array[i]); 12 | } 13 | return max_element; 14 | } 15 | 16 | static inline float vector_sum_expf_minus_c(size_t length, const float array[restrict static length], float c) { 17 | float sum = 0.0f; 18 | for (size_t i = 0; i < length; i++) { 19 | sum += expf(array[i] - c); 20 | } 21 | return sum; 22 | } 23 | 24 | struct softmax_output_context { 25 | size_t channels; 26 | const float* input; 27 | float* output; 28 | }; 29 | 30 | static void compute_softmax_output( 31 | const struct softmax_output_context context[restrict static 1], 32 | size_t sample) 33 | { 34 | const size_t channels = context->channels; 35 | 36 | const float (*input)[channels] = 37 | (const float(*)[channels]) context->input; 38 | float (*output)[channels] = 39 | (float(*)[channels]) context->output; 40 | 41 | const float max_element = vector_maxf(channels, input[sample]); 42 | const float sum_exp = vector_sum_expf_minus_c(channels, input[sample], max_element); 43 | const float norm_factor = 1.0f / sum_exp; 44 | for (size_t channel = 0; channel < channels; channel++) { 45 | output[sample][channel] = norm_factor * expf(input[sample][channel] - max_element); 46 | } 47 | } 48 | 49 | void nnp_softmax_output__reference( 50 | size_t batch_size, 51 | size_t channels, 52 | const float* input, 53 | float* output, 54 | pthreadpool_t threadpool) 55 | { 56 | struct softmax_output_context softmax_output_context = { 57 | .channels = channels, 58 | .input = input, 59 | .output = output, 60 | }; 61 | pthreadpool_parallelize_1d(threadpool, 62 | (pthreadpool_function_1d_t) compute_softmax_output, 63 | &softmax_output_context, 64 | batch_size, 65 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 66 | } 67 | -------------------------------------------------------------------------------- /src/relu-input-gradient.c: -------------------------------------------------------------------------------- 1 
| #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | struct NNP_CACHE_ALIGN relu_context { 15 | nnp_grad_relu_function grad_relu_function; 16 | const float* grad_output; 17 | const float* input; 18 | float* grad_input; 19 | float negative_slope; 20 | }; 21 | 22 | static void compute_grad_relu( 23 | const struct relu_context context[restrict static 1], 24 | size_t block_start, size_t block_size) 25 | { 26 | nnp_grad_relu_function grad_relu = context->grad_relu_function; 27 | const float* grad_output = context->grad_output; 28 | const float* input = context->input; 29 | float* grad_input = context->grad_input; 30 | float negative_slope = context->negative_slope; 31 | 32 | grad_relu(grad_output + block_start, input + block_start, grad_input + block_start, block_size, negative_slope); 33 | } 34 | 35 | enum nnp_status nnp_relu_input_gradient( 36 | size_t batch_size, 37 | size_t channels, 38 | const float grad_output[], 39 | const float input[], 40 | float grad_input[], 41 | float negative_slope, 42 | pthreadpool_t threadpool) 43 | { 44 | enum nnp_status status = validate_relu_arguments(batch_size, channels); 45 | if (status != nnp_status_success) { 46 | return status; 47 | } 48 | 49 | size_t elements = batch_size * channels; 50 | const size_t simd_width = nnp_hwinfo.simd_width; 51 | 52 | assert(((uintptr_t) grad_output) % sizeof(float) == 0); 53 | assert(((uintptr_t) input) % sizeof(float) == 0); 54 | assert(((uintptr_t) grad_input) % sizeof(float) == 0); 55 | 56 | const size_t prologue_elements = min((size_t) (-(((uintptr_t) grad_input) / sizeof(float)) % simd_width), elements); 57 | for (size_t i = 0; i < prologue_elements; i++) { 58 | grad_input[i] = grad_relu(grad_output[i], input[i], negative_slope); 59 | } 60 | elements -= prologue_elements; 61 | grad_output += prologue_elements; 62 | input += prologue_elements; 63 | grad_input += prologue_elements; 64 | 65 | const size_t epilogue_elements = elements % simd_width; 66 | for (size_t i = 0; i < epilogue_elements; i++) { 67 | grad_input[elements - epilogue_elements + i] = grad_relu( 68 | grad_output[elements - epilogue_elements + i], 69 | input[elements - epilogue_elements + i], 70 | negative_slope); 71 | } 72 | elements -= epilogue_elements; 73 | 74 | struct relu_context relu_context = { 75 | .grad_relu_function = nnp_hwinfo.activations.grad_relu, 76 | .grad_output = grad_output, 77 | .input = input, 78 | .grad_input = grad_input, 79 | .negative_slope = negative_slope, 80 | }; 81 | pthreadpool_parallelize_1d_tile_1d(threadpool, 82 | (pthreadpool_function_1d_tiled_t) compute_grad_relu, 83 | &relu_context, 84 | elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width), 85 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 86 | 87 | return nnp_status_success; 88 | } 89 | -------------------------------------------------------------------------------- /src/relu-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | #include 8 | 9 | #include 10 | #include 11 | #include 12 | 13 | 14 | struct NNP_CACHE_ALIGN relu_context { 15 | nnp_relu_function relu_function; 16 | const float* input; 17 | float* output; 18 | float negative_slope; 19 | }; 20 | 21 | static void compute_relu_output( 22 | const struct relu_context context[restrict static 1], 23 | size_t block_start, size_t block_size) 24 | { 25 | nnp_relu_function relu = context->relu_function; 26 | const 
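/*
 * Annotation (not part of the original source): nnp_relu_input_gradient()
 * above and nnp_relu_output() below split the work three ways: a scalar
 * prologue that runs until the destination pointer reaches SIMD alignment,
 * a scalar epilogue for the remainder that does not fill a whole vector,
 * and a vectorized middle section that pthreadpool tiles into blocks of
 * about one L1 cache worth of floats. With simd_width = 4, a destination
 * starting 2 floats past an aligned boundary gets a 2-element prologue, so
 * the parallel section touches vector-aligned data only.
 */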
float* input = context->input; 27 | float* output = context->output; 28 | float negative_slope = context->negative_slope; 29 | 30 | relu(input + block_start, output + block_start, block_size, negative_slope); 31 | } 32 | 33 | struct NNP_CACHE_ALIGN inplace_relu_context { 34 | nnp_inplace_relu_function relu_function; 35 | float* data; 36 | float negative_slope; 37 | }; 38 | 39 | static void compute_inplace_relu_output( 40 | const struct inplace_relu_context context[restrict static 1], 41 | size_t block_start, size_t block_size) 42 | { 43 | nnp_inplace_relu_function relu = context->relu_function; 44 | float* data = context->data; 45 | float negative_slope = context->negative_slope; 46 | 47 | relu(data + block_start, block_size, negative_slope); 48 | } 49 | 50 | enum nnp_status nnp_relu_output( 51 | size_t batch_size, 52 | size_t channels, 53 | const float input[], 54 | float output[], 55 | float negative_slope, 56 | pthreadpool_t threadpool) 57 | { 58 | enum nnp_status status = validate_relu_arguments(batch_size, channels); 59 | if (status != nnp_status_success) { 60 | return status; 61 | } 62 | 63 | size_t elements = batch_size * channels; 64 | const size_t simd_width = nnp_hwinfo.simd_width; 65 | 66 | assert(((uintptr_t) input) % sizeof(float) == 0); 67 | assert(((uintptr_t) output) % sizeof(float) == 0); 68 | 69 | const size_t prologue_elements = min((size_t) (-(((uintptr_t) output) / sizeof(float)) % simd_width), elements); 70 | for (size_t i = 0; i < prologue_elements; i++) { 71 | output[i] = relu(input[i], negative_slope); 72 | } 73 | elements -= prologue_elements; 74 | input += prologue_elements; 75 | output += prologue_elements; 76 | 77 | const size_t epilogue_elements = elements % simd_width; 78 | for (size_t i = 0; i < epilogue_elements; i++) { 79 | output[elements - epilogue_elements + i] = 80 | relu(input[elements - epilogue_elements + i], negative_slope); 81 | } 82 | elements -= epilogue_elements; 83 | 84 | if (input != output) { 85 | /* Out-of-place transformation */ 86 | struct relu_context relu_context = { 87 | .relu_function = nnp_hwinfo.activations.relu, 88 | .input = input, 89 | .output = output, 90 | .negative_slope = negative_slope, 91 | }; 92 | pthreadpool_parallelize_1d_tile_1d(threadpool, 93 | (pthreadpool_function_1d_tiled_t) compute_relu_output, 94 | &relu_context, 95 | elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width), 96 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 97 | } else { 98 | /* In-place transformation */ 99 | struct inplace_relu_context inplace_relu_context = { 100 | .relu_function = nnp_hwinfo.activations.inplace_relu, 101 | .data = output, 102 | .negative_slope = negative_slope, 103 | }; 104 | pthreadpool_parallelize_1d_tile_1d(threadpool, 105 | (pthreadpool_function_1d_tiled_t) compute_inplace_relu_output, 106 | &inplace_relu_context, 107 | elements, round_down(nnp_hwinfo.blocking.l1 / sizeof(float), simd_width), 108 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 109 | } 110 | 111 | return nnp_status_success; 112 | } 113 | -------------------------------------------------------------------------------- /src/scalar/blas/cgemm-conjb-transc.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_cgemm_conjb_transc_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, 
acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float a0i = a[1]; 20 | const float a1i = a[3]; 21 | a += 4; 22 | 23 | const float b0r = b[0]; 24 | const float b1r = b[2]; 25 | acc00r += a0r * b0r; 26 | acc01r += a0r * b1r; 27 | acc10r += a1r * b0r; 28 | acc11r += a1r * b1r; 29 | acc00i += a0i * b0r; 30 | acc01i += a0i * b1r; 31 | acc10i += a1i * b0r; 32 | acc11i += a1i * b1r; 33 | 34 | const float b0i = b[1]; 35 | const float b1i = b[3]; 36 | b += 4; 37 | 38 | acc00r += a0i * b0i; 39 | acc01r += a0i * b1i; 40 | acc10r += a1i * b0i; 41 | acc11r += a1i * b1i; 42 | acc00i -= a0r * b0i; 43 | acc01i -= a0r * b1i; 44 | acc10i -= a1r * b0i; 45 | acc11i -= a1r * b1i; 46 | } while (--k); 47 | 48 | if (update != 0) { 49 | c[0] += acc00r; 50 | c[1] += acc00i; 51 | c[2] += acc10r; 52 | c[3] += acc10i; 53 | c += row_stride_c; 54 | c[0] += acc01r; 55 | c[1] += acc01i; 56 | c[2] += acc11r; 57 | c[3] += acc11i; 58 | } else { 59 | c[0] = acc00r; 60 | c[1] = acc00i; 61 | c[2] = acc10r; 62 | c[3] = acc10i; 63 | c += row_stride_c; 64 | c[0] = acc01r; 65 | c[1] = acc01i; 66 | c[2] = acc11r; 67 | c[3] = acc11i; 68 | } 69 | } 70 | 71 | void nnp_cgemm_conjb_transc_upto_2x2__scalar( 72 | uint32_t mr, uint32_t nr, 73 | size_t k, size_t update, 74 | const float a[restrict static 1], 75 | const float b[restrict static 1], 76 | float c[restrict static 1], 77 | size_t row_stride_c) 78 | { 79 | float acc00r, acc01r, acc10r, acc11r; 80 | float acc00i, acc01i, acc10i, acc11i; 81 | acc00r = acc01r = acc10r = acc11r = 0.0f; 82 | acc00i = acc01i = acc10i = acc11i = 0.0f; 83 | do { 84 | const float a0r = a[0]; 85 | const float a0i = a[1]; 86 | a += 2; 87 | 88 | float a1r, a1i; 89 | if (mr > 1) { 90 | a1r = a[0]; 91 | a1i = a[1]; 92 | a += 2; 93 | } 94 | 95 | const float b0r = b[0]; 96 | const float b0i = b[1]; 97 | b += 2; 98 | 99 | acc00r += a0r * b0r; 100 | acc10r += a1r * b0r; 101 | acc00i += a0i * b0r; 102 | acc10i += a1i * b0r; 103 | 104 | acc00r += a0i * b0i; 105 | acc10r += a1i * b0i; 106 | acc00i -= a0r * b0i; 107 | acc10i -= a1r * b0i; 108 | 109 | if (nr > 1) { 110 | const float b1r = b[0]; 111 | const float b1i = b[1]; 112 | b += 2; 113 | 114 | acc01r += a0r * b1r; 115 | acc11r += a1r * b1r; 116 | acc01i += a0i * b1r; 117 | acc11i += a1i * b1r; 118 | 119 | acc01r += a0i * b1i; 120 | acc11r += a1i * b1i; 121 | acc01i -= a0r * b1i; 122 | acc11i -= a1r * b1i; 123 | } 124 | } while (--k); 125 | 126 | if (update != 0) { 127 | c[0] += acc00r; 128 | c[1] += acc00i; 129 | if (mr > 1) { 130 | c[2] += acc10r; 131 | c[3] += acc10i; 132 | } 133 | if (nr > 1) { 134 | c += row_stride_c; 135 | c[0] += acc01r; 136 | c[1] += acc01i; 137 | if (mr > 1) { 138 | c[2] += acc11r; 139 | c[3] += acc11i; 140 | } 141 | } 142 | } else { 143 | c[0] = acc00r; 144 | c[1] = acc00i; 145 | if (mr > 1) { 146 | c[2] = acc10r; 147 | c[3] = acc10i; 148 | } 149 | if (nr > 1) { 150 | c += row_stride_c; 151 | c[0] = acc01r; 152 | c[1] = acc01i; 153 | if (mr > 1) { 154 | c[2] = acc11r; 155 | c[3] = acc11i; 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/scalar/blas/cgemm-conjb.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_cgemm_conjb_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | 
float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float a0i = a[1]; 20 | const float a1i = a[3]; 21 | a += 4; 22 | 23 | const float b0r = b[0]; 24 | const float b1r = b[2]; 25 | acc00r += a0r * b0r; 26 | acc01r += a0r * b1r; 27 | acc10r += a1r * b0r; 28 | acc11r += a1r * b1r; 29 | acc00i += a0i * b0r; 30 | acc01i += a0i * b1r; 31 | acc10i += a1i * b0r; 32 | acc11i += a1i * b1r; 33 | 34 | const float b0i = b[1]; 35 | const float b1i = b[3]; 36 | b += 4; 37 | 38 | acc00r += a0i * b0i; 39 | acc01r += a0i * b1i; 40 | acc10r += a1i * b0i; 41 | acc11r += a1i * b1i; 42 | acc00i -= a0r * b0i; 43 | acc01i -= a0r * b1i; 44 | acc10i -= a1r * b0i; 45 | acc11i -= a1r * b1i; 46 | } while (--k); 47 | 48 | if (update != 0) { 49 | c[0] += acc00r; 50 | c[1] += acc00i; 51 | c[2] += acc01r; 52 | c[3] += acc01i; 53 | c += row_stride_c; 54 | c[0] += acc10r; 55 | c[1] += acc10i; 56 | c[2] += acc11r; 57 | c[3] += acc11i; 58 | } else { 59 | c[0] = acc00r; 60 | c[1] = acc00i; 61 | c[2] = acc01r; 62 | c[3] = acc01i; 63 | c += row_stride_c; 64 | c[0] = acc10r; 65 | c[1] = acc10i; 66 | c[2] = acc11r; 67 | c[3] = acc11i; 68 | } 69 | } 70 | 71 | void nnp_cgemm_conjb_upto_2x2__scalar( 72 | uint32_t mr, uint32_t nr, 73 | size_t k, size_t update, 74 | const float a[restrict static 1], 75 | const float b[restrict static 1], 76 | float c[restrict static 1], 77 | size_t row_stride_c) 78 | { 79 | float acc00r, acc01r, acc10r, acc11r; 80 | float acc00i, acc01i, acc10i, acc11i; 81 | acc00r = acc01r = acc10r = acc11r = 0.0f; 82 | acc00i = acc01i = acc10i = acc11i = 0.0f; 83 | do { 84 | const float a0r = a[0]; 85 | const float a0i = a[1]; 86 | a += 2; 87 | 88 | float a1r, a1i; 89 | if (mr > 1) { 90 | a1r = a[0]; 91 | a1i = a[1]; 92 | a += 2; 93 | } 94 | 95 | const float b0r = b[0]; 96 | const float b0i = b[1]; 97 | b += 2; 98 | 99 | acc00r += a0r * b0r; 100 | acc10r += a1r * b0r; 101 | acc00i += a0i * b0r; 102 | acc10i += a1i * b0r; 103 | 104 | acc00r += a0i * b0i; 105 | acc10r += a1i * b0i; 106 | acc00i -= a0r * b0i; 107 | acc10i -= a1r * b0i; 108 | 109 | if (nr > 1) { 110 | const float b1r = b[0]; 111 | const float b1i = b[1]; 112 | b += 2; 113 | 114 | acc01r += a0r * b1r; 115 | acc11r += a1r * b1r; 116 | acc01i += a0i * b1r; 117 | acc11i += a1i * b1r; 118 | 119 | acc01r += a0i * b1i; 120 | acc11r += a1i * b1i; 121 | acc01i -= a0r * b1i; 122 | acc11i -= a1r * b1i; 123 | } 124 | } while (--k); 125 | 126 | if (update != 0) { 127 | c[0] += acc00r; 128 | c[1] += acc00i; 129 | if (nr > 1) { 130 | c[2] += acc01r; 131 | c[3] += acc01i; 132 | } 133 | if (mr > 1) { 134 | c += row_stride_c; 135 | c[0] += acc10r; 136 | c[1] += acc10i; 137 | if (nr > 1) { 138 | c[2] += acc11r; 139 | c[3] += acc11i; 140 | } 141 | } 142 | } else { 143 | c[0] = acc00r; 144 | c[1] = acc00i; 145 | if (nr > 1) { 146 | c[2] = acc01r; 147 | c[3] = acc01i; 148 | } 149 | if (mr > 1) { 150 | c += row_stride_c; 151 | c[0] = acc10r; 152 | c[1] = acc10i; 153 | if (nr > 1) { 154 | c[2] = acc11r; 155 | c[3] = acc11i; 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/scalar/blas/cgemm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_cgemm_only_2x2__scalar( 
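/*
 * Annotation (not part of the original source): the scalar complex GEMM
 * micro-kernels differ only in which product they accumulate. With
 * a = ar + i*ai and b = br + i*bi:
 *   cgemm (this file):    a * b       = (ar*br - ai*bi) + i*(ar*bi + ai*br)
 *   cgemm-conjb (above):  a * conj(b) = (ar*br + ai*bi) + i*(ai*br - ar*bi)
 * The -transc variants store the 2x2 result tile transposed. "only_2x2"
 * kernels assume a full tile, while "upto_2x2" take mr/nr and guard the
 * second-row/second-column accesses so edge tiles reuse the same code;
 * note that in the upto kernels the a1r/a1i accumulations still execute
 * when mr == 1 -- the results are never stored, but the reads are of
 * indeterminate values.
 */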
6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float a0i = a[1]; 20 | const float a1i = a[3]; 21 | a += 4; 22 | 23 | const float b0r = b[0]; 24 | const float b1r = b[2]; 25 | acc00r += a0r * b0r; 26 | acc01r += a0r * b1r; 27 | acc10r += a1r * b0r; 28 | acc11r += a1r * b1r; 29 | acc00i += a0i * b0r; 30 | acc01i += a0i * b1r; 31 | acc10i += a1i * b0r; 32 | acc11i += a1i * b1r; 33 | 34 | const float b0i = b[1]; 35 | const float b1i = b[3]; 36 | b += 4; 37 | 38 | acc00r -= a0i * b0i; 39 | acc01r -= a0i * b1i; 40 | acc10r -= a1i * b0i; 41 | acc11r -= a1i * b1i; 42 | acc00i += a0r * b0i; 43 | acc01i += a0r * b1i; 44 | acc10i += a1r * b0i; 45 | acc11i += a1r * b1i; 46 | } while (--k); 47 | 48 | if (update != 0) { 49 | c[0] += acc00r; 50 | c[1] += acc00i; 51 | c[2] += acc01r; 52 | c[3] += acc01i; 53 | c += row_stride_c; 54 | c[0] += acc10r; 55 | c[1] += acc10i; 56 | c[2] += acc11r; 57 | c[3] += acc11i; 58 | } else { 59 | c[0] = acc00r; 60 | c[1] = acc00i; 61 | c[2] = acc01r; 62 | c[3] = acc01i; 63 | c += row_stride_c; 64 | c[0] = acc10r; 65 | c[1] = acc10i; 66 | c[2] = acc11r; 67 | c[3] = acc11i; 68 | } 69 | } 70 | 71 | void nnp_cgemm_upto_2x2__scalar( 72 | uint32_t mr, uint32_t nr, 73 | size_t k, size_t update, 74 | const float a[restrict static 1], 75 | const float b[restrict static 1], 76 | float c[restrict static 1], 77 | size_t row_stride_c) 78 | { 79 | float acc00r, acc01r, acc10r, acc11r; 80 | float acc00i, acc01i, acc10i, acc11i; 81 | acc00r = acc01r = acc10r = acc11r = 0.0f; 82 | acc00i = acc01i = acc10i = acc11i = 0.0f; 83 | do { 84 | const float a0r = a[0]; 85 | const float a0i = a[1]; 86 | a += 2; 87 | 88 | float a1r, a1i; 89 | if (mr > 1) { 90 | a1r = a[0]; 91 | a1i = a[1]; 92 | a += 2; 93 | } 94 | 95 | const float b0r = b[0]; 96 | const float b0i = b[1]; 97 | b += 2; 98 | 99 | acc00r += a0r * b0r; 100 | acc10r += a1r * b0r; 101 | acc00i += a0i * b0r; 102 | acc10i += a1i * b0r; 103 | 104 | acc00r -= a0i * b0i; 105 | acc10r -= a1i * b0i; 106 | acc00i += a0r * b0i; 107 | acc10i += a1r * b0i; 108 | 109 | if (nr > 1) { 110 | const float b1r = b[0]; 111 | const float b1i = b[1]; 112 | b += 2; 113 | 114 | acc01r += a0r * b1r; 115 | acc11r += a1r * b1r; 116 | acc01i += a0i * b1r; 117 | acc11i += a1i * b1r; 118 | 119 | acc01r -= a0i * b1i; 120 | acc11r -= a1i * b1i; 121 | acc01i += a0r * b1i; 122 | acc11i += a1r * b1i; 123 | } 124 | } while (--k); 125 | 126 | if (update != 0) { 127 | c[0] += acc00r; 128 | c[1] += acc00i; 129 | if (nr > 1) { 130 | c[2] += acc01r; 131 | c[3] += acc01i; 132 | } 133 | if (mr > 1) { 134 | c += row_stride_c; 135 | c[0] += acc10r; 136 | c[1] += acc10i; 137 | if (nr > 1) { 138 | c[2] += acc11r; 139 | c[3] += acc11i; 140 | } 141 | } 142 | } else { 143 | c[0] = acc00r; 144 | c[1] = acc00i; 145 | if (nr > 1) { 146 | c[2] = acc01r; 147 | c[3] = acc01i; 148 | } 149 | if (mr > 1) { 150 | c += row_stride_c; 151 | c[0] = acc10r; 152 | c[1] = acc10i; 153 | if (nr > 1) { 154 | c[2] = acc11r; 155 | c[3] = acc11i; 156 | } 157 | } 158 | } 159 | } 160 | -------------------------------------------------------------------------------- /src/scalar/blas/s2gemm-transc.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_s2gemm_transc_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float b0r = b[0]; 20 | const float b1r = b[2]; 21 | acc00r += a0r * b0r; 22 | acc01r += a0r * b1r; 23 | acc10r += a1r * b0r; 24 | acc11r += a1r * b1r; 25 | 26 | const float a0i = a[1]; 27 | const float a1i = a[3]; 28 | const float b0i = b[1]; 29 | const float b1i = b[3]; 30 | acc00i += a0i * b0i; 31 | acc01i += a0i * b1i; 32 | acc10i += a1i * b0i; 33 | acc11i += a1i * b1i; 34 | 35 | a += 4; 36 | b += 4; 37 | } while (--k); 38 | 39 | if (update != 0) { 40 | c[0] += acc00r; 41 | c[1] += acc00i; 42 | c[2] += acc10r; 43 | c[3] += acc10i; 44 | c += row_stride_c; 45 | c[0] += acc01r; 46 | c[1] += acc01i; 47 | c[2] += acc11r; 48 | c[3] += acc11i; 49 | } else { 50 | c[0] = acc00r; 51 | c[1] = acc00i; 52 | c[2] = acc10r; 53 | c[3] = acc10i; 54 | c += row_stride_c; 55 | c[0] = acc01r; 56 | c[1] = acc01i; 57 | c[2] = acc11r; 58 | c[3] = acc11i; 59 | } 60 | } 61 | 62 | void nnp_s2gemm_transc_upto_2x2__scalar( 63 | uint32_t mr, uint32_t nr, 64 | size_t k, size_t update, 65 | const float a[restrict static 1], 66 | const float b[restrict static 1], 67 | float c[restrict static 1], 68 | size_t row_stride_c) 69 | { 70 | float acc00r, acc01r, acc10r, acc11r; 71 | float acc00i, acc01i, acc10i, acc11i; 72 | acc00r = acc01r = acc10r = acc11r = 0.0f; 73 | acc00i = acc01i = acc10i = acc11i = 0.0f; 74 | do { 75 | const float a0r = a[0]; 76 | const float a0i = a[1]; 77 | a += 2; 78 | 79 | float a1r, a1i; 80 | if (mr > 1) { 81 | a1r = a[0]; 82 | a1i = a[1]; 83 | a += 2; 84 | } 85 | 86 | const float b0r = b[0]; 87 | const float b0i = b[1]; 88 | b += 2; 89 | 90 | acc00r += a0r * b0r; 91 | acc10r += a1r * b0r; 92 | acc00i += a0i * b0i; 93 | acc10i += a1i * b0i; 94 | 95 | if (nr > 1) { 96 | const float b1r = b[0]; 97 | const float b1i = b[1]; 98 | b += 2; 99 | 100 | acc01r += a0r * b1r; 101 | acc11r += a1r * b1r; 102 | acc01i += a0i * b1i; 103 | acc11i += a1i * b1i; 104 | } 105 | } while (--k); 106 | 107 | if (update != 0) { 108 | c[0] += acc00r; 109 | c[1] += acc00i; 110 | if (mr > 1) { 111 | c[2] += acc10r; 112 | c[3] += acc10i; 113 | } 114 | if (nr > 1) { 115 | c += row_stride_c; 116 | c[0] += acc01r; 117 | c[1] += acc01i; 118 | if (mr > 1) { 119 | c[2] += acc11r; 120 | c[3] += acc11i; 121 | } 122 | } 123 | } else { 124 | c[0] = acc00r; 125 | c[1] = acc00i; 126 | if (mr > 1) { 127 | c[2] = acc10r; 128 | c[3] = acc10i; 129 | } 130 | if (nr > 1) { 131 | c += row_stride_c; 132 | c[0] = acc01r; 133 | c[1] = acc01i; 134 | if (mr > 1) { 135 | c[2] = acc11r; 136 | c[3] = acc11i; 137 | } 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/scalar/blas/s2gemm.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | 5 | void nnp_s2gemm_only_2x2__scalar( 6 | size_t k, size_t update, 7 | const float a[restrict static 1], 8 | const float b[restrict static 1], 9 | float c[restrict static 1], 10 | size_t row_stride_c) 11 | { 12 | 
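/*
 * Annotation (not part of the original source): despite the r/i naming,
 * s2gemm is two independent real GEMMs on interleaved even/odd elements --
 * the acc*r accumulators only ever combine even ("r") inputs and the
 * acc*i accumulators only odd ("i") inputs, with no cross terms. This
 * matches the packed real-FFT spectrum seen earlier (e.g. f0r/f4r in
 * nnp_ifft8_4real), where the purely real DC and Nyquist bins share a
 * tuple and must not be mixed by a complex multiply.
 */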
float acc00r, acc01r, acc10r, acc11r; 13 | float acc00i, acc01i, acc10i, acc11i; 14 | acc00r = acc01r = acc10r = acc11r = 0.0f; 15 | acc00i = acc01i = acc10i = acc11i = 0.0f; 16 | do { 17 | const float a0r = a[0]; 18 | const float a1r = a[2]; 19 | const float b0r = b[0]; 20 | const float b1r = b[2]; 21 | acc00r += a0r * b0r; 22 | acc01r += a0r * b1r; 23 | acc10r += a1r * b0r; 24 | acc11r += a1r * b1r; 25 | 26 | const float a0i = a[1]; 27 | const float a1i = a[3]; 28 | const float b0i = b[1]; 29 | const float b1i = b[3]; 30 | acc00i += a0i * b0i; 31 | acc01i += a0i * b1i; 32 | acc10i += a1i * b0i; 33 | acc11i += a1i * b1i; 34 | 35 | a += 4; 36 | b += 4; 37 | } while (--k); 38 | 39 | if (update != 0) { 40 | c[0] += acc00r; 41 | c[1] += acc00i; 42 | c[2] += acc01r; 43 | c[3] += acc01i; 44 | c += row_stride_c; 45 | c[0] += acc10r; 46 | c[1] += acc10i; 47 | c[2] += acc11r; 48 | c[3] += acc11i; 49 | } else { 50 | c[0] = acc00r; 51 | c[1] = acc00i; 52 | c[2] = acc01r; 53 | c[3] = acc01i; 54 | c += row_stride_c; 55 | c[0] = acc10r; 56 | c[1] = acc10i; 57 | c[2] = acc11r; 58 | c[3] = acc11i; 59 | } 60 | } 61 | 62 | void nnp_s2gemm_upto_2x2__scalar( 63 | uint32_t mr, uint32_t nr, 64 | size_t k, size_t update, 65 | const float a[restrict static 1], 66 | const float b[restrict static 1], 67 | float c[restrict static 1], 68 | size_t row_stride_c) 69 | { 70 | float acc00r, acc01r, acc10r, acc11r; 71 | float acc00i, acc01i, acc10i, acc11i; 72 | acc00r = acc01r = acc10r = acc11r = 0.0f; 73 | acc00i = acc01i = acc10i = acc11i = 0.0f; 74 | do { 75 | const float a0r = a[0]; 76 | const float a0i = a[1]; 77 | a += 2; 78 | 79 | float a1r, a1i; 80 | if (mr > 1) { 81 | a1r = a[0]; 82 | a1i = a[1]; 83 | a += 2; 84 | } 85 | 86 | const float b0r = b[0]; 87 | const float b0i = b[1]; 88 | b += 2; 89 | 90 | acc00r += a0r * b0r; 91 | acc10r += a1r * b0r; 92 | acc00i += a0i * b0i; 93 | acc10i += a1i * b0i; 94 | 95 | if (nr > 1) { 96 | const float b1r = b[0]; 97 | const float b1i = b[1]; 98 | b += 2; 99 | 100 | acc01r += a0r * b1r; 101 | acc11r += a1r * b1r; 102 | acc01i += a0i * b1i; 103 | acc11i += a1i * b1i; 104 | } 105 | } while (--k); 106 | 107 | if (update != 0) { 108 | c[0] += acc00r; 109 | c[1] += acc00i; 110 | if (nr > 1) { 111 | c[2] += acc01r; 112 | c[3] += acc01i; 113 | } 114 | if (mr > 1) { 115 | c += row_stride_c; 116 | c[0] += acc10r; 117 | c[1] += acc10i; 118 | if (nr > 1) { 119 | c[2] += acc11r; 120 | c[3] += acc11i; 121 | } 122 | } 123 | } else { 124 | c[0] = acc00r; 125 | c[1] = acc00i; 126 | if (nr > 1) { 127 | c[2] = acc01r; 128 | c[3] = acc01i; 129 | } 130 | if (mr > 1) { 131 | c += row_stride_c; 132 | c[0] = acc10r; 133 | c[1] = acc10i; 134 | if (nr > 1) { 135 | c[2] = acc11r; 136 | c[3] = acc11i; 137 | } 138 | } 139 | } 140 | } 141 | -------------------------------------------------------------------------------- /src/scalar/butterfly.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | 4 | static inline void scalar_swap(float a[restrict static 1], float b[restrict static 1]) { 5 | const float new_a = *b; 6 | const float new_b = *a; 7 | *a = new_a; 8 | *b = new_b; 9 | } 10 | 11 | static inline void scalar_butterfly(float a[restrict static 1], float b[restrict static 1]) { 12 | const float new_a = *a + *b; 13 | const float new_b = *a - *b; 14 | *a = new_a; 15 | *b = new_b; 16 | } 17 | 18 | static inline void scalar_butterfly_and_negate_b(float a[restrict static 1], float b[restrict static 1]) { 19 | const float new_a = *a + *b; 20 | const float 
new_b = *b - *a; 21 | *a = new_a; 22 | *b = new_b; 23 | } 24 | 25 | static inline void scalar_butterfly_with_negated_b(float a[restrict static 1], float b[restrict static 1]) { 26 | const float new_a = *a - *b; 27 | const float new_b = *a + *b; 28 | *a = new_a; 29 | *b = new_b; 30 | } 31 | -------------------------------------------------------------------------------- /src/scalar/fft-aos.c: -------------------------------------------------------------------------------- 1 | #include <scalar/fft/aos.h> 2 | 3 | 4 | void nnp_fft4_aos__scalar( 5 | const float t[restrict static 8], 6 | float f[restrict static 8]) 7 | { 8 | float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i; 9 | scalar_fft4_aos( 10 | t, t + 4, 1, 0, 8, 11 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i); 12 | f[0] = w0r; 13 | f[1] = w0i; 14 | f[2] = w1r; 15 | f[3] = w1i; 16 | f[4] = w2r; 17 | f[5] = w2i; 18 | f[6] = w3r; 19 | f[7] = w3i; 20 | } 21 | 22 | void nnp_fft8_aos__scalar( 23 | const float t[restrict static 16], 24 | float f[restrict static 16]) 25 | { 26 | float w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i; 27 | scalar_fft8_aos( 28 | t, t + 8, 1, 0, 16, 29 | &w0r, &w0i, &w1r, &w1i, &w2r, &w2i, &w3r, &w3i, &w4r, &w4i, &w5r, &w5i, &w6r, &w6i, &w7r, &w7i); 30 | f[ 0] = w0r; 31 | f[ 1] = w0i; 32 | f[ 2] = w1r; 33 | f[ 3] = w1i; 34 | f[ 4] = w2r; 35 | f[ 5] = w2i; 36 | f[ 6] = w3r; 37 | f[ 7] = w3i; 38 | f[ 8] = w4r; 39 | f[ 9] = w4i; 40 | f[10] = w5r; 41 | f[11] = w5i; 42 | f[12] = w6r; 43 | f[13] = w6i; 44 | f[14] = w7r; 45 | f[15] = w7i; 46 | } 47 | 48 | void nnp_ifft4_aos__scalar( 49 | const float f[restrict static 8], 50 | float t[restrict static 8]) 51 | { 52 | const float w0r = f[0]; 53 | const float w0i = f[1]; 54 | const float w1r = f[2]; 55 | const float w1i = f[3]; 56 | const float w2r = f[4]; 57 | const float w2i = f[5]; 58 | const float w3r = f[6]; 59 | const float w3i = f[7]; 60 | 61 | scalar_ifft4_aos( 62 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, 63 | t, t + 4, 1); 64 | } 65 | 66 | void nnp_ifft8_aos__scalar( 67 | const float f[restrict static 16], 68 | float t[restrict static 16]) 69 | { 70 | const float w0r = f[ 0]; 71 | const float w0i = f[ 1]; 72 | const float w1r = f[ 2]; 73 | const float w1i = f[ 3]; 74 | const float w2r = f[ 4]; 75 | const float w2i = f[ 5]; 76 | const float w3r = f[ 6]; 77 | const float w3i = f[ 7]; 78 | const float w4r = f[ 8]; 79 | const float w4i = f[ 9]; 80 | const float w5r = f[10]; 81 | const float w5i = f[11]; 82 | const float w6r = f[12]; 83 | const float w6i = f[13]; 84 | const float w7r = f[14]; 85 | const float w7i = f[15]; 86 | 87 | scalar_ifft8_aos( 88 | w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i, 89 | t, t + 8, 1); 90 | } 91 | -------------------------------------------------------------------------------- /src/scalar/fft-dualreal.c: -------------------------------------------------------------------------------- 1 | #include <scalar/fft/dualreal.h> 2 | 3 | 4 | void nnp_fft8_dualreal__scalar( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | float x0, y0, x1r, y1r, x2r, y2r, x3r, y3r; 9 | float x4, y4, x1i, y1i, x2i, y2i, x3i, y3i; 10 | scalar_fft8_dualreal(t, 11 | &x0, &y0, &x1r, &y1r, &x2r, &y2r, &x3r, &y3r, 12 | &x4, &y4, &x1i, &y1i, &x2i, &y2i, &x3i, &y3i); 13 | 14 | f[0] = x0; 15 | f[1] = y0; 16 | f[2] = x1r; 17 | f[3] = y1r; 18 | f[4] = x2r; 19 | f[5] = y2r; 20 | f[6] = x3r; 21 | f[7] = y3r; 22 | 23 | f[ 8] = x4; 24 | f[ 9] = y4; 25 | f[10] = x1i; 26 | f[11] = y1i; 27 | f[12] = x2i; 28 | f[13] = y2i; 29 | f[14]
= x3i; 30 | f[15] = y3i; 31 | } 32 | 33 | void nnp_fft16_dualreal__scalar( 34 | const float t[restrict static 32], 35 | float f[restrict static 32]) 36 | { 37 | float x0, y0, x1r, y1r, x2r, y2r, x3r, y3r, x4r, y4r, x5r, y5r, x6r, y6r, x7r, y7r; 38 | float x8, y8, x1i, y1i, x2i, y2i, x3i, y3i, x4i, y4i, x5i, y5i, x6i, y6i, x7i, y7i; 39 | scalar_fft16_dualreal(t, 40 | &x0, &y0, &x1r, &y1r, &x2r, &y2r, &x3r, &y3r, &x4r, &y4r, &x5r, &y5r, &x6r, &y6r, &x7r, &y7r, 41 | &x8, &y8, &x1i, &y1i, &x2i, &y2i, &x3i, &y3i, &x4i, &y4i, &x5i, &y5i, &x6i, &y6i, &x7i, &y7i); 42 | 43 | f[ 0] = x0; 44 | f[ 1] = y0; 45 | f[ 2] = x1r; 46 | f[ 3] = y1r; 47 | f[ 4] = x2r; 48 | f[ 5] = y2r; 49 | f[ 6] = x3r; 50 | f[ 7] = y3r; 51 | f[ 8] = x4r; 52 | f[ 9] = y4r; 53 | f[10] = x5r; 54 | f[11] = y5r; 55 | f[12] = x6r; 56 | f[13] = y6r; 57 | f[14] = x7r; 58 | f[15] = y7r; 59 | 60 | f[16] = x8; 61 | f[17] = y8; 62 | f[18] = x1i; 63 | f[19] = y1i; 64 | f[20] = x2i; 65 | f[21] = y2i; 66 | f[22] = x3i; 67 | f[23] = y3i; 68 | f[24] = x4i; 69 | f[25] = y4i; 70 | f[26] = x5i; 71 | f[27] = y5i; 72 | f[28] = x6i; 73 | f[29] = y6i; 74 | f[30] = x7i; 75 | f[31] = y7i; 76 | } 77 | 78 | void nnp_ifft8_dualreal__scalar( 79 | const float f[restrict static 16], 80 | float t[restrict static 16]) 81 | { 82 | const float x0 = f[ 0]; 83 | const float y0 = f[ 1]; 84 | const float x1r = f[ 2]; 85 | const float y1r = f[ 3]; 86 | const float x2r = f[ 4]; 87 | const float y2r = f[ 5]; 88 | const float x3r = f[ 6]; 89 | const float y3r = f[ 7]; 90 | const float x4 = f[ 8]; 91 | const float y4 = f[ 9]; 92 | const float x1i = f[10]; 93 | const float y1i = f[11]; 94 | const float x2i = f[12]; 95 | const float y2i = f[13]; 96 | const float x3i = f[14]; 97 | const float y3i = f[15]; 98 | 99 | scalar_ifft8_dualreal( 100 | x0, y0, x1r, y1r, x2r, y2r, x3r, y3r, 101 | x4, y4, x1i, y1i, x2i, y2i, x3i, y3i, 102 | t); 103 | } 104 | 105 | void nnp_ifft16_dualreal__scalar( 106 | const float f[restrict static 32], 107 | float t[restrict static 32]) 108 | { 109 | const float x0 = f[ 0]; 110 | const float y0 = f[ 1]; 111 | const float x1r = f[ 2]; 112 | const float y1r = f[ 3]; 113 | const float x2r = f[ 4]; 114 | const float y2r = f[ 5]; 115 | const float x3r = f[ 6]; 116 | const float y3r = f[ 7]; 117 | const float x4r = f[ 8]; 118 | const float y4r = f[ 9]; 119 | const float x5r = f[10]; 120 | const float y5r = f[11]; 121 | const float x6r = f[12]; 122 | const float y6r = f[13]; 123 | const float x7r = f[14]; 124 | const float y7r = f[15]; 125 | 126 | const float x8 = f[16]; 127 | const float y8 = f[17]; 128 | const float x1i = f[18]; 129 | const float y1i = f[19]; 130 | const float x2i = f[20]; 131 | const float y2i = f[21]; 132 | const float x3i = f[22]; 133 | const float y3i = f[23]; 134 | const float x4i = f[24]; 135 | const float y4i = f[25]; 136 | const float x5i = f[26]; 137 | const float y5i = f[27]; 138 | const float x6i = f[28]; 139 | const float y6i = f[29]; 140 | const float x7i = f[30]; 141 | const float y7i = f[31]; 142 | 143 | scalar_ifft16_dualreal( 144 | x0, y0, x1r, y1r, x2r, y2r, x3r, y3r, x4r, y4r, x5r, y5r, x6r, y6r, x7r, y7r, 145 | x8, y8, x1i, y1i, x2i, y2i, x3i, y3i, x4i, y4i, x5i, y5i, x6i, y6i, x7i, y7i, 146 | t); 147 | } 148 | -------------------------------------------------------------------------------- /src/scalar/fft-real.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_real__scalar( 5 | const float t[restrict static 8], 6 | float f[restrict static 8]) 7 
| { 8 | scalar_fft8_real( 9 | t, t + 4, 1, 0, 8, 10 | f, 1); 11 | } 12 | 13 | void nnp_fft16_real__scalar( 14 | const float t[restrict static 16], 15 | float f[restrict static 16]) 16 | { 17 | float w0r, w8r, w1r, w1i, w2r, w2i, w3r, w3i, w4r, w4i, w5r, w5i, w6r, w6i, w7r, w7i; 18 | scalar_fft16_real( 19 | t, t + 8, 1, 0, 16, 20 | f, 1); 21 | } 22 | 23 | void nnp_ifft8_real__scalar( 24 | const float f[restrict static 8], 25 | float t[restrict static 8]) 26 | { 27 | const float f0r = f[0]; 28 | const float f4r = f[1]; 29 | const float f1r = f[2]; 30 | const float f1i = f[3]; 31 | const float f2r = f[4]; 32 | const float f2i = f[5]; 33 | const float f3r = f[6]; 34 | const float f3i = f[7]; 35 | scalar_ifft8_real( 36 | f0r, f4r, f1r, f1i, f2r, f2i, f3r, f3i, 37 | t, t + 4, 1); 38 | } 39 | 40 | void nnp_ifft16_real__scalar( 41 | const float f[restrict static 16], 42 | float t[restrict static 16]) 43 | { 44 | const float f0r = f[ 0]; 45 | const float f8r = f[ 1]; 46 | const float f1r = f[ 2]; 47 | const float f1i = f[ 3]; 48 | const float f2r = f[ 4]; 49 | const float f2i = f[ 5]; 50 | const float f3r = f[ 6]; 51 | const float f3i = f[ 7]; 52 | const float f4r = f[ 8]; 53 | const float f4i = f[ 9]; 54 | const float f5r = f[10]; 55 | const float f5i = f[11]; 56 | const float f6r = f[12]; 57 | const float f6i = f[13]; 58 | const float f7r = f[14]; 59 | const float f7i = f[15]; 60 | scalar_ifft16_real( 61 | f0r, f8r, f1r, f1i, f2r, f2i, f3r, f3i, f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i, 62 | t, t + 8, 1); 63 | } 64 | -------------------------------------------------------------------------------- /src/scalar/fft-soa.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_fft8_soa__scalar( 5 | const float t[restrict static 16], 6 | float f[restrict static 16]) 7 | { 8 | float f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r; 9 | float f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i; 10 | scalar_fft8_soa(t, 11 | &f0r, &f1r, &f2r, &f3r, &f4r, &f5r, &f6r, &f7r, 12 | &f0i, &f1i, &f2i, &f3i, &f4i, &f5i, &f6i, &f7i); 13 | 14 | f[0] = f0r; 15 | f[1] = f1r; 16 | f[2] = f2r; 17 | f[3] = f3r; 18 | f[4] = f4r; 19 | f[5] = f5r; 20 | f[6] = f6r; 21 | f[7] = f7r; 22 | 23 | f[ 8] = f0i; 24 | f[ 9] = f1i; 25 | f[10] = f2i; 26 | f[11] = f3i; 27 | f[12] = f4i; 28 | f[13] = f5i; 29 | f[14] = f6i; 30 | f[15] = f7i; 31 | } 32 | 33 | void nnp_fft16_soa__scalar( 34 | const float t[restrict static 32], 35 | float f[restrict static 32]) 36 | { 37 | float f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r, f8r, f9r, f10r, f11r, f12r, f13r, f14r, f15r; 38 | float f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i, f8i, f9i, f10i, f11i, f12i, f13i, f14i, f15i; 39 | scalar_fft16_soa(t, 40 | &f0r, &f1r, &f2r, &f3r, &f4r, &f5r, &f6r, &f7r, &f8r, &f9r, &f10r, &f11r, &f12r, &f13r, &f14r, &f15r, 41 | &f0i, &f1i, &f2i, &f3i, &f4i, &f5i, &f6i, &f7i, &f8i, &f9i, &f10i, &f11i, &f12i, &f13i, &f14i, &f15i); 42 | 43 | f[ 0] = f0r; 44 | f[ 1] = f1r; 45 | f[ 2] = f2r; 46 | f[ 3] = f3r; 47 | f[ 4] = f4r; 48 | f[ 5] = f5r; 49 | f[ 6] = f6r; 50 | f[ 7] = f7r; 51 | f[ 8] = f8r; 52 | f[ 9] = f9r; 53 | f[10] = f10r; 54 | f[11] = f11r; 55 | f[12] = f12r; 56 | f[13] = f13r; 57 | f[14] = f14r; 58 | f[15] = f15r; 59 | 60 | f[16] = f0i; 61 | f[17] = f1i; 62 | f[18] = f2i; 63 | f[19] = f3i; 64 | f[20] = f4i; 65 | f[21] = f5i; 66 | f[22] = f6i; 67 | f[23] = f7i; 68 | f[24] = f8i; 69 | f[25] = f9i; 70 | f[26] = f10i; 71 | f[27] = f11i; 72 | f[28] = f12i; 73 | f[29] = f13i; 74 | f[30] = f14i; 75 | f[31] = f15i; 76 | } 77 | 78 | void 
nnp_ifft8_soa__scalar( 79 | const float f[restrict static 16], 80 | float t[restrict static 16]) 81 | { 82 | const float f0r = f[0]; 83 | const float f1r = f[1]; 84 | const float f2r = f[2]; 85 | const float f3r = f[3]; 86 | const float f4r = f[4]; 87 | const float f5r = f[5]; 88 | const float f6r = f[6]; 89 | const float f7r = f[7]; 90 | 91 | const float f0i = f[ 8]; 92 | const float f1i = f[ 9]; 93 | const float f2i = f[10]; 94 | const float f3i = f[11]; 95 | const float f4i = f[12]; 96 | const float f5i = f[13]; 97 | const float f6i = f[14]; 98 | const float f7i = f[15]; 99 | 100 | scalar_ifft8_soa( 101 | f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r, 102 | f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i, 103 | t); 104 | } 105 | 106 | void nnp_ifft16_soa__scalar( 107 | const float f[restrict static 32], 108 | float t[restrict static 32]) 109 | { 110 | const float f0r = f[ 0]; 111 | const float f1r = f[ 1]; 112 | const float f2r = f[ 2]; 113 | const float f3r = f[ 3]; 114 | const float f4r = f[ 4]; 115 | const float f5r = f[ 5]; 116 | const float f6r = f[ 6]; 117 | const float f7r = f[ 7]; 118 | const float f8r = f[ 8]; 119 | const float f9r = f[ 9]; 120 | const float f10r = f[10]; 121 | const float f11r = f[11]; 122 | const float f12r = f[12]; 123 | const float f13r = f[13]; 124 | const float f14r = f[14]; 125 | const float f15r = f[15]; 126 | 127 | const float f0i = f[16]; 128 | const float f1i = f[17]; 129 | const float f2i = f[18]; 130 | const float f3i = f[19]; 131 | const float f4i = f[20]; 132 | const float f5i = f[21]; 133 | const float f6i = f[22]; 134 | const float f7i = f[23]; 135 | const float f8i = f[24]; 136 | const float f9i = f[25]; 137 | const float f10i = f[26]; 138 | const float f11i = f[27]; 139 | const float f12i = f[28]; 140 | const float f13i = f[29]; 141 | const float f14i = f[30]; 142 | const float f15i = f[31]; 143 | 144 | scalar_ifft16_soa( 145 | f0r, f1r, f2r, f3r, f4r, f5r, f6r, f7r, f8r, f9r, f10r, f11r, f12r, f13r, f14r, f15r, 146 | f0i, f1i, f2i, f3i, f4i, f5i, f6i, f7i, f8i, f9i, f10i, f11i, f12i, f13i, f14i, f15i, 147 | t); 148 | } 149 | -------------------------------------------------------------------------------- /src/scalar/relu.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | 7 | void nnp_relu__scalar( 8 | const float input[restrict static 1], 9 | float output[restrict static 1], 10 | size_t length, 11 | float negative_slope) 12 | { 13 | while (length >= 4) { 14 | const float data0 = input[0]; 15 | const float data1 = input[1]; 16 | const float data2 = input[2]; 17 | const float data3 = input[3]; 18 | input += 4; 19 | 20 | output[0] = relu(data0, negative_slope); 21 | output[1] = relu(data1, negative_slope); 22 | output[2] = relu(data2, negative_slope); 23 | output[3] = relu(data3, negative_slope); 24 | output += 4; 25 | 26 | length -= 4; 27 | } 28 | while (length != 0) { 29 | *output++ = relu(*input++, negative_slope); 30 | length -= 1; 31 | } 32 | } 33 | 34 | void nnp_inplace_relu__scalar( 35 | float data[restrict static 1], 36 | size_t length, 37 | float negative_slope) 38 | { 39 | while (length >= 4) { 40 | const float data0 = data[0]; 41 | const float data1 = data[1]; 42 | const float data2 = data[2]; 43 | const float data3 = data[3]; 44 | 45 | data[0] = relu(data0, negative_slope); 46 | data[1] = relu(data1, negative_slope); 47 | data[2] = relu(data2, negative_slope); 48 | data[3] = relu(data3, negative_slope); 49 | data += 4; 50 | 51 | length -= 4; 52 | } 53 | while 
(length != 0) { 54 | *data = relu(*data, negative_slope); 55 | 56 | data += 1; 57 | length -= 1; 58 | } 59 | } 60 | 61 | void nnp_grad_relu__scalar( 62 | const float output_gradient[restrict static 4], 63 | const float input[restrict static 4], 64 | float input_gradient[restrict static 4], 65 | size_t length, 66 | float negative_slope) 67 | { 68 | while (length >= 4) { 69 | const float data0 = input[0]; 70 | const float data1 = input[1]; 71 | const float data2 = input[2]; 72 | const float data3 = input[3]; 73 | input += 4; 74 | 75 | const float grad0 = output_gradient[0]; 76 | const float grad1 = output_gradient[1]; 77 | const float grad2 = output_gradient[2]; 78 | const float grad3 = output_gradient[3]; 79 | output_gradient += 4; 80 | 81 | input_gradient[0] = grad_relu(grad0, data0, negative_slope); 82 | input_gradient[1] = grad_relu(grad1, data1, negative_slope); 83 | input_gradient[2] = grad_relu(grad2, data2, negative_slope); 84 | input_gradient[3] = grad_relu(grad3, data3, negative_slope); 85 | input_gradient += 4; 86 | 87 | length -= 4; 88 | } 89 | while (length != 0) { 90 | *input_gradient++ = grad_relu(*output_gradient++, *input++, negative_slope); 91 | length -= 1; 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/scalar/softmax.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | 9 | static float max__scalar(size_t n, const float v[restrict static n]) { 10 | float max_v = *v++; 11 | while (--n) { 12 | max_v = maxf(max_v, *v++); 13 | } 14 | return max_v; 15 | } 16 | 17 | static float sum_exp_minus_c__scalar(size_t n, const float v[restrict static n], float c) { 18 | float sum = 0.0f; 19 | do { 20 | sum += expf(*v++ - c); 21 | } while (--n); 22 | return sum; 23 | } 24 | 25 | static void scaled_exp_minus_c__scalar(size_t n, const float x[static n], float y[static n], float scale, float c) { 26 | do { 27 | *y++ = scale * expf(*x++ - c); 28 | } while (--n); 29 | } 30 | 31 | void nnp_softmax__scalar( 32 | size_t n, 33 | const float x[restrict static n], 34 | float y[restrict static n]) 35 | { 36 | const float c = max__scalar(n, x); 37 | const float sum = sum_exp_minus_c__scalar(n, x, c); 38 | const float scale = 1.0f / sum; 39 | scaled_exp_minus_c__scalar(n, x, y, scale, c); 40 | } 41 | 42 | void nnp_inplace_softmax__scalar( 43 | size_t n, 44 | float v[restrict static n]) 45 | { 46 | const float c = max__scalar(n, v); 47 | const float sum = sum_exp_minus_c__scalar(n, v, c); 48 | const float scale = 1.0f / sum; 49 | scaled_exp_minus_c__scalar(n, v, v, scale, c); 50 | } 51 | -------------------------------------------------------------------------------- /src/scalar/winograd-f6k3.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | void nnp_iwt_f6k3__scalar( 5 | const float d[restrict static 8], 6 | float w[restrict static 8]) 7 | { 8 | winograd_f6k3_input_transform( 9 | d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7], 10 | &w[0], &w[1], &w[2], &w[3], &w[4], &w[5], &w[6], &w[7]); 11 | } 12 | 13 | void nnp_kwt_f6k3__scalar( 14 | const float g[restrict static 3], 15 | float w[restrict static 8]) 16 | { 17 | winograd_f6k3_kernel_transform( 18 | g[0], g[1], g[2], 19 | &w[0], &w[1], &w[2], &w[3], &w[4], &w[5], &w[6], &w[7], 20 | true /* rescale coefficients */); 21 | } 22 | 23 | void nnp_owt_f6k3__scalar( 24 | const float m[restrict static 8], 25 | float s[restrict static 6]) 26 
| { 27 | winograd_f6k3_output_transform( 28 | m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], 29 | &s[0], &s[1], &s[2], &s[3], &s[4], &s[5]); 30 | } 31 | -------------------------------------------------------------------------------- /src/softmax-output.c: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | 12 | struct NNP_CACHE_ALIGN softmax_context { 13 | nnp_softmax_function softmax_function; 14 | size_t channels; 15 | const float* input; 16 | float* output; 17 | }; 18 | 19 | static void compute_softmax_output( 20 | const struct softmax_context context[restrict static 1], 21 | size_t sample) 22 | { 23 | const nnp_softmax_function softmax = context->softmax_function; 24 | const size_t channels = context->channels; 25 | 26 | const float (*input)[channels] = (const float(*)[channels]) context->input; 27 | float (*output)[channels] = (float(*)[channels]) context->output; 28 | 29 | softmax(channels, input[sample], output[sample]); 30 | } 31 | 32 | struct NNP_CACHE_ALIGN inplace_softmax_context { 33 | nnp_inplace_softmax_function softmax_function; 34 | size_t channels; 35 | float* data; 36 | }; 37 | 38 | static void compute_inplace_softmax_output( 39 | const struct inplace_softmax_context context[restrict static 1], 40 | size_t sample) 41 | { 42 | const nnp_inplace_softmax_function softmax = context->softmax_function; 43 | const size_t channels = context->channels; 44 | 45 | float (*data)[channels] = (float(*)[channels]) context->data; 46 | 47 | softmax(channels, data[sample]); 48 | } 49 | 50 | enum nnp_status nnp_softmax_output( 51 | size_t batch_size, 52 | size_t channels, 53 | const float* input, 54 | float* output, 55 | pthreadpool_t threadpool) 56 | { 57 | enum nnp_status status = validate_softmax_arguments(batch_size, channels); 58 | if (status != nnp_status_success) { 59 | return status; 60 | } 61 | 62 | if (input != output) { 63 | /* Out-of-place softmax */ 64 | struct softmax_context softmax_context = { 65 | .softmax_function = nnp_hwinfo.activations.softmax, 66 | .channels = channels, 67 | .input = input, 68 | .output = output, 69 | }; 70 | pthreadpool_parallelize_1d(threadpool, 71 | (pthreadpool_function_1d_t) compute_softmax_output, 72 | &softmax_context, 73 | batch_size, 74 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 75 | } else { 76 | /* In-place softmax */ 77 | struct inplace_softmax_context inplace_softmax_context = { 78 | .softmax_function = nnp_hwinfo.activations.inplace_softmax, 79 | .channels = channels, 80 | .data = output, 81 | }; 82 | pthreadpool_parallelize_1d(threadpool, 83 | (pthreadpool_function_1d_t) compute_inplace_softmax_output, 84 | &inplace_softmax_context, 85 | batch_size, 86 | PTHREADPOOL_FLAG_DISABLE_DENORMALS); 87 | } 88 | 89 | return nnp_status_success; 90 | } 91 | -------------------------------------------------------------------------------- /src/x86_64-fma/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/__init__.py -------------------------------------------------------------------------------- /src/x86_64-fma/blas/sdotxf.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | simd_width = YMMRegister.size // float_.size 5 | 6 | for fusion_factor in 
range(1, 8 + 1): 7 | arg_x = Argument(ptr(const_float_), "x") 8 | arg_y = Argument(ptr(const_float_), "y") 9 | arg_stride_y = Argument(size_t, "stride_y") 10 | arg_sum = Argument(ptr(float_), "sum") 11 | arg_n = Argument(size_t, "n") 12 | with Function("nnp_sdotxf{fusion_factor}__avx2".format(fusion_factor=fusion_factor), 13 | (arg_x, arg_y, arg_stride_y, arg_sum, arg_n), 14 | target=uarch.default + isa.fma3 + isa.avx2): 15 | 16 | reg_x = GeneralPurposeRegister64() 17 | LOAD.ARGUMENT(reg_x, arg_x) 18 | 19 | reg_ys = [GeneralPurposeRegister64() for m in range(fusion_factor)] 20 | LOAD.ARGUMENT(reg_ys[0], arg_y) 21 | 22 | reg_stride_y = GeneralPurposeRegister64() 23 | LOAD.ARGUMENT(reg_stride_y, arg_stride_y) 24 | SHL(reg_stride_y, 2) 25 | 26 | reg_sum = GeneralPurposeRegister64() 27 | LOAD.ARGUMENT(reg_sum, arg_sum) 28 | 29 | reg_n = GeneralPurposeRegister64() 30 | LOAD.ARGUMENT(reg_n, arg_n) 31 | 32 | ymm_accs = [YMMRegister() for m in range(fusion_factor)] 33 | VZEROALL() 34 | 35 | for m in range(1, fusion_factor): 36 | LEA(reg_ys[m], [reg_ys[m - 1] + reg_stride_y * 1]) 37 | 38 | main_loop = Loop() 39 | end_block = Block() 40 | 41 | SUB(reg_n, YMMRegister.size // float_.size) 42 | JB(main_loop.end) 43 | 44 | with main_loop: 45 | ymm_x = YMMRegister() 46 | VMOVUPS(ymm_x, [reg_x]) 47 | ADD(reg_x, YMMRegister.size) 48 | 49 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 50 | VFMADD231PS(ymm_acc, ymm_x, [reg_y]) 51 | ADD(reg_y, YMMRegister.size) 52 | 53 | SUB(reg_n, YMMRegister.size // float_.size) 54 | JAE(main_loop.begin) 55 | 56 | ADD(reg_n, YMMRegister.size // float_.size) 57 | JE(end_block.end) 58 | 59 | with end_block: 60 | ymm_mask = YMMRegister() 61 | VMOVD(ymm_mask.as_xmm, reg_n.as_dword) 62 | VPBROADCASTD(ymm_mask, ymm_mask.as_xmm) 63 | VPCMPGTD(ymm_mask, ymm_mask, Constant.uint32x8(0, 1, 2, 3, 4, 5, 6, 7)) 64 | 65 | ymm_x = YMMRegister() 66 | VMASKMOVPS(ymm_x, ymm_mask, [reg_x]) 67 | 68 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 69 | ymm_y = YMMRegister() 70 | VMASKMOVPS(ymm_y, ymm_mask, [reg_y]) 71 | VFMADD231PS(ymm_acc, ymm_x, ymm_y) 72 | 73 | # Reduce the SIMD registers into a single elements 74 | xmm_tmp = XMMRegister() 75 | for i, ymm_acc in enumerate(ymm_accs): 76 | VEXTRACTF128(xmm_tmp, ymm_acc, 1) 77 | VADDPS(ymm_acc.as_xmm, ymm_acc.as_xmm, xmm_tmp) 78 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 79 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 80 | VMOVSS([reg_sum + i * float_.size], ymm_acc.as_xmm) 81 | 82 | RETURN() 83 | 84 | -------------------------------------------------------------------------------- /src/x86_64-fma/blas/shdotxf.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from fp16.avx import fp16_alt_xmm_to_fp32_xmm 5 | from fp16.avx2 import fp16_alt_xmm_to_fp32_ymm 6 | 7 | simd_width = YMMRegister.size // float_.size 8 | 9 | for fusion_factor in range(1, 8 + 1): 10 | arg_x = Argument(ptr(const_float_), "x") 11 | arg_y = Argument(ptr(const_float_), "y") 12 | arg_stride_y = Argument(size_t, "stride_y") 13 | arg_sum = Argument(ptr(float_), "sum") 14 | arg_n = Argument(size_t, "n") 15 | with Function("nnp_shdotxf{fusion_factor}__avx2".format(fusion_factor=fusion_factor), 16 | (arg_x, arg_y, arg_stride_y, arg_sum, arg_n), 17 | target=uarch.default + isa.fma3 + isa.avx2): 18 | 19 | reg_x = GeneralPurposeRegister64() 20 | LOAD.ARGUMENT(reg_x, arg_x) 21 | 22 | reg_ys = [GeneralPurposeRegister64() for m in range(fusion_factor)] 23 | LOAD.ARGUMENT(reg_ys[0], 
arg_y) 24 | 25 | reg_stride_y = GeneralPurposeRegister64() 26 | LOAD.ARGUMENT(reg_stride_y, arg_stride_y) 27 | ADD(reg_stride_y, reg_stride_y) 28 | 29 | reg_sum = GeneralPurposeRegister64() 30 | LOAD.ARGUMENT(reg_sum, arg_sum) 31 | 32 | reg_n = GeneralPurposeRegister64() 33 | LOAD.ARGUMENT(reg_n, arg_n) 34 | 35 | ymm_accs = [YMMRegister() for m in range(fusion_factor)] 36 | VZEROALL() 37 | 38 | for m in range(1, fusion_factor): 39 | LEA(reg_ys[m], [reg_ys[m - 1] + reg_stride_y * 1]) 40 | 41 | main_loop = Loop() 42 | edge_loop = Loop() 43 | 44 | SUB(reg_n, XMMRegister.size // uint16_t.size) 45 | JB(main_loop.end) 46 | 47 | with main_loop: 48 | ymm_x = YMMRegister() 49 | VMOVUPS(ymm_x, [reg_x]) 50 | ADD(reg_x, YMMRegister.size) 51 | 52 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 53 | xmm_half = XMMRegister() 54 | VMOVUPS(xmm_half, [reg_y]) 55 | ADD(reg_y, XMMRegister.size) 56 | 57 | ymm_y = fp16_alt_xmm_to_fp32_ymm(xmm_half) 58 | VFMADD231PS(ymm_acc, ymm_x, ymm_y) 59 | 60 | SUB(reg_n, YMMRegister.size // float_.size) 61 | JAE(main_loop.begin) 62 | 63 | ADD(reg_n, XMMRegister.size // uint16_t.size) 64 | JE(edge_loop.end) 65 | 66 | with edge_loop: 67 | xmm_x = XMMRegister() 68 | VMOVSS(xmm_x, [reg_x]) 69 | ADD(reg_x, YMMRegister.size) 70 | 71 | for reg_y, ymm_acc in zip(reg_ys, ymm_accs): 72 | reg_half = GeneralPurposeRegister32() 73 | MOVZX(reg_half, word[reg_y]) 74 | 75 | xmm_half = XMMRegister() 76 | VMOVD(xmm_half, reg_half) 77 | ADD(reg_y, uint16_t.size) 78 | 79 | ymm_y = fp16_alt_xmm_to_fp32_ymm(xmm_half) 80 | VFMADD231PS(ymm_acc, xmm_x.as_ymm, ymm_y) 81 | 82 | SUB(reg_n, 1) 83 | JAE(edge_loop.begin) 84 | 85 | # Reduce the SIMD registers into a single elements 86 | xmm_tmp = XMMRegister() 87 | for i, ymm_acc in enumerate(ymm_accs): 88 | VEXTRACTF128(xmm_tmp, ymm_acc, 1) 89 | VADDPS(ymm_acc.as_xmm, ymm_acc.as_xmm, xmm_tmp) 90 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 91 | VHADDPS(ymm_acc, ymm_acc, ymm_acc) 92 | VMOVSS([reg_sum + i * float_.size], ymm_acc.as_xmm) 93 | 94 | RETURN() 95 | 96 | -------------------------------------------------------------------------------- /src/x86_64-fma/exp.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | __m256 _mm256_exp_ps(__m256 x) { 8 | const __m256 magic_bias = _mm256_set1_ps(0x1.800000p+23f); 9 | const __m256 zero_cutoff = _mm256_set1_ps(-0x1.9FE368p+6f); /* The smallest x for which expf(x) is non-zero */ 10 | const __m256 inf_cutoff = _mm256_set1_ps(0x1.62E42Ep+6f); /* The largest x for which expf(x) is finite */ 11 | const __m256 log2e = _mm256_set1_ps(0x1.715476p+3f); 12 | const __m256 minus_ln2_hi = _mm256_set1_ps(-0x1.62E430p-4f); 13 | const __m256 minus_ln2_lo = _mm256_set1_ps( 0x1.05C610p-32f); 14 | const __m256 plus_inf = _mm256_set1_ps(__builtin_inff()); 15 | 16 | const __m256 c2 = _mm256_set1_ps(0x1.00088Ap-1f); 17 | const __m256 c3 = _mm256_set1_ps(0x1.555A86p-3f); 18 | const __m256 table = _mm256_set_ps(0x1.D5818Ep+0f, 0x1.AE89FAp+0f, 0x1.8ACE54p+0f, 0x1.6A09E6p+0f, 0x1.4BFDAEp+0f, 0x1.306FE0p+0f, 0x1.172B84p+0f, 0x1.000000p+0f); 19 | 20 | const __m256i min_exponent = _mm256_set1_epi32(-126 << 23); 21 | const __m256i max_exponent = _mm256_set1_epi32(127 << 23); 22 | const __m256i default_exponent = _mm256_set1_epi32(0x3F800000u); 23 | const __m256i mantissa_mask = _mm256_set1_epi32(0x007FFFF8); 24 | 25 | __m256 t = _mm256_fmadd_ps(x, log2e, magic_bias); 26 | __m256i e1 = _mm256_slli_epi32(_mm256_and_si256(_mm256_castps_si256(t), 
mantissa_mask), 20); 27 | __m256i e2 = e1; 28 | e1 = _mm256_min_epi32(_mm256_max_epi32(e1, min_exponent), max_exponent); 29 | e2 = _mm256_sub_epi32(e2, e1); 30 | const __m256 s1 = _mm256_castsi256_ps(_mm256_add_epi32(e1, default_exponent)); 31 | const __m256 s2 = _mm256_castsi256_ps(_mm256_add_epi32(e2, default_exponent)); 32 | const __m256 tf = _mm256_permutevar8x32_ps(table, _mm256_castps_si256(t)); 33 | t = _mm256_sub_ps(t, magic_bias); 34 | const __m256 rx = _mm256_fmadd_ps(t, minus_ln2_lo, _mm256_fmadd_ps(t, minus_ln2_hi, x)); 35 | const __m256 rf = _mm256_fmadd_ps(rx, _mm256_mul_ps(rx, _mm256_fmadd_ps(rx, c3, c2)), rx); 36 | __m256 f = _mm256_fmadd_ps(tf, rf, tf); 37 | f = _mm256_mul_ps(s2, _mm256_mul_ps(s1, f)); 38 | /* Fixup underflow to zero */ 39 | f = _mm256_andnot_ps(_mm256_cmp_ps(x, zero_cutoff, _CMP_LT_OS), f); 40 | /* Fixup overflow */ 41 | f = _mm256_blendv_ps(f, plus_inf, _mm256_cmp_ps(x, inf_cutoff, _CMP_GT_OS)); 42 | /* Fixup NaN */ 43 | f = _mm256_blendv_ps(x, f, _mm256_cmp_ps(x, x, _CMP_EQ_OS)); 44 | return f; 45 | } 46 | 47 | static inline uint32_t as_uint32(float x) { 48 | union { 49 | float x; 50 | uint32_t n; 51 | } data = { 52 | .x = x 53 | }; 54 | return data.n; 55 | } 56 | 57 | static inline float as_float(uint32_t n) { 58 | union { 59 | float x; 60 | uint32_t n; 61 | } data = { 62 | .n = n 63 | }; 64 | return data.x; 65 | } 66 | 67 | static inline float ulpf(float x) { 68 | const float absx = fabsf(x); 69 | if (absx < __builtin_inff()) { 70 | return as_float(as_uint32(absx) + 1) - absx; 71 | } else { 72 | return absx; 73 | } 74 | } 75 | 76 | int main() { 77 | float max_error = 0.0f; 78 | for (uint32_t n = INT32_MIN; n < as_uint32(-0x1.9FE368p+6f); n++) { 79 | const float x = as_float(n); 80 | const float ref_y = expf(x); 81 | const float opt_y = _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_exp_ps(_mm256_set1_ps(x)))); 82 | const float error = fabsf(ref_y - opt_y) / ulpf(ref_y); 83 | if (error > max_error) 84 | max_error = error; 85 | } 86 | printf("Max error: %.2f ULP\n", max_error); 87 | 88 | max_error = 0.0f; 89 | for (uint32_t n = 0; n < as_uint32(0x1.62E42Ep+6f); n++) { 90 | const float x = as_float(n); 91 | const float ref_y = expf(x); 92 | const float opt_y = _mm_cvtss_f32(_mm256_castps256_ps128(_mm256_exp_ps(_mm256_set1_ps(x)))); 93 | const float error = fabsf(ref_y - opt_y) / ulpf(ref_y); 94 | if (error > max_error) 95 | max_error = error; 96 | } 97 | printf("Max error: %.2f ULP\n", max_error); 98 | } 99 | -------------------------------------------------------------------------------- /src/x86_64-fma/exp.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from peachpy import * 5 | from peachpy.x86_64 import * 6 | 7 | log2e = float.fromhex("+0x1.715476p+3") 8 | magic_bias = float.fromhex("+0x1.800000p+23") 9 | zero_cutoff = float.fromhex("-0x1.9FE368p+6") 10 | inf_cutoff = float.fromhex("+0x1.62E42Ep+6") 11 | minus_ln2_hi = float.fromhex("-0x1.62E430p-4") 12 | minus_ln2_lo = float.fromhex("+0x1.05C610p-32") 13 | plus_inf = float("inf") 14 | 15 | c2 = float.fromhex("0x1.00088Ap-1") 16 | c3 = float.fromhex("0x1.555A86p-3") 17 | t0 = float.fromhex("0x1.000000p+0") 18 | t1 = float.fromhex("0x1.172B84p+0") 19 | t2 = float.fromhex("0x1.306FE0p+0") 20 | t3 = float.fromhex("0x1.4BFDAEp+0") 21 | t4 = float.fromhex("0x1.6A09E6p+0") 22 | t5 = float.fromhex("0x1.8ACE54p+0") 23 | t6 = float.fromhex("0x1.AE89FAp+0") 24 | t7 = 
float.fromhex("0x1.D5818Ep+0") 25 | 26 | min_exponent = (-126 << 23) & 0xFFFFFFFF 27 | max_exponent = 127 << 23 28 | default_exponent = 0x3F800000 29 | mantissa_mask = 0x007FFFF8 30 | 31 | x_arg = Argument(m256, "x") 32 | with Function("_mm256_exp_ps", (x_arg,), m256, 33 | target=uarch.default + isa.fma3 + isa.avx2): 34 | ymm_x = YMMRegister() 35 | LOAD.ARGUMENT(ymm_x, x_arg) 36 | 37 | ymm_magic_bias = YMMRegister() 38 | VMOVAPS(ymm_magic_bias, Constant.float32x8(magic_bias)) 39 | 40 | ymm_t = YMMRegister() 41 | VMOVAPS(ymm_t, ymm_x) 42 | VFMADD132PS(ymm_t, ymm_magic_bias, Constant.float32x8(log2e)) 43 | 44 | ymm_e1, ymm_e2 = YMMRegister(), YMMRegister() 45 | VPAND(ymm_e2, ymm_t, Constant.uint32x8(mantissa_mask)) 46 | VPSLLD(ymm_e2, ymm_e2, 20) 47 | 48 | ymm_tf = YMMRegister() 49 | VPERMPS(ymm_tf, ymm_t, Constant.float32x8(t0, t1, t2, t3, t4, t5, t6, t7)) 50 | VSUBPS(ymm_t, ymm_t, ymm_magic_bias) 51 | 52 | # rx = fma(t, minus_ln2_lo, fma(t, minus_ln2_hi, x)) 53 | # rx := t * minus_ln2_hi + x 54 | # rx := t * minus_ln2_lo + rx 55 | ymm_rx = YMMRegister() 56 | VMOVAPS(ymm_rx, ymm_x) 57 | VFMADD231PS(ymm_rx, ymm_t, Constant.float32x8(minus_ln2_hi)) 58 | VFMADD231PS(ymm_rx, ymm_t, Constant.float32x8(minus_ln2_lo)) 59 | 60 | VPMAXSD(ymm_e1, ymm_e2, Constant.uint32x8(min_exponent)) 61 | VPMINSD(ymm_e1, ymm_e1, Constant.uint32x8(max_exponent)) 62 | 63 | ymm_default_exponent = YMMRegister() 64 | VMOVDQA(ymm_default_exponent, Constant.uint32x8(default_exponent)) 65 | VPSUBD(ymm_e2, ymm_e2, ymm_e1) 66 | 67 | VPADDD(ymm_e1, ymm_e1, ymm_default_exponent) 68 | VPADDD(ymm_e2, ymm_e2, ymm_default_exponent) 69 | 70 | # rf = fma(rx, rx * fma(rx, c3, c2), rx) 71 | # rf := rx * c3 + c2 72 | # rf := rx * rf 73 | # rf := rx * rf + rx 74 | ymm_rf = YMMRegister() 75 | VMOVAPS(ymm_rf, Constant.float32x8(c2)) 76 | VFMADD231PS(ymm_rf, ymm_rx, Constant.float32x8(c3)) 77 | VMULPS(ymm_rf, ymm_rf, ymm_rx) 78 | VFMADD213PS(ymm_rf, ymm_rx, ymm_rx) 79 | 80 | # f = fma(tf, rf, tf) 81 | VFMADD231PS(ymm_tf, ymm_tf, ymm_rf) 82 | ymm_f = ymm_tf 83 | 84 | VMULPS(ymm_f, ymm_f, ymm_e1) 85 | VMULPS(ymm_f, ymm_f, ymm_e2) 86 | 87 | RETURN(ymm_f) 88 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft-dualreal.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa 2 | import fft.two_real_to_two_complex_soa_perm_planar 3 | 4 | 5 | arg_t = Argument(ptr(const_float_), name="t") 6 | arg_f = Argument(ptr(float_), name="f") 7 | 8 | 9 | with Function("nnp_fft8_dualreal__avx2", 10 | (arg_t, arg_f), 11 | target=uarch.default + isa.fma3 + isa.avx2): 12 | 13 | reg_t = GeneralPurposeRegister64() 14 | LOAD.ARGUMENT(reg_t, arg_t) 15 | 16 | reg_f = GeneralPurposeRegister64() 17 | LOAD.ARGUMENT(reg_f, arg_f) 18 | 19 | ymm_seq_a, ymm_seq_b = YMMRegister(), YMMRegister() 20 | 21 | VMOVUPS(ymm_seq_a, [reg_t]) 22 | VMOVUPS(ymm_seq_b, [reg_t + YMMRegister.size]) 23 | 24 | fft.complex_soa.fft8_within_rows(ymm_seq_a, ymm_seq_b) 25 | ymm_wr, ymm_wi = ymm_seq_a, ymm_seq_b 26 | 27 | fft.two_real_to_two_complex_soa_perm_planar.fft8_within_rows_postprocess(ymm_wr, ymm_wi) 28 | ymm_xhr, ymm_xhi = ymm_wr, ymm_wi 29 | 30 | VMOVUPS([reg_f], ymm_xhr) 31 | VMOVUPS([reg_f + YMMRegister.size], ymm_xhi) 32 | 33 | RETURN() 34 | 35 | 36 | with Function("nnp_fft16_dualreal__avx2", 37 | (arg_t, arg_f), 38 | target=uarch.default + isa.fma3 + isa.avx2): 39 | 40 | reg_t = GeneralPurposeRegister64() 41 | LOAD.ARGUMENT(reg_t, arg_t) 42 | 43 | reg_f = 
GeneralPurposeRegister64() 44 | LOAD.ARGUMENT(reg_f, arg_f) 45 | 46 | ymm_seq_a = YMMRegister(), YMMRegister() 47 | ymm_seq_b = YMMRegister(), YMMRegister() 48 | for i, ymm_a in enumerate(ymm_seq_a + ymm_seq_b): 49 | VMOVUPS(ymm_a, [reg_t + i * YMMRegister.size]) 50 | 51 | fft.complex_soa.fft16_within_rows(ymm_seq_a, ymm_seq_b) 52 | ymm_wr, ymm_wi = ymm_seq_a, ymm_seq_b 53 | 54 | fft.two_real_to_two_complex_soa_perm_planar.fft16_within_rows_postprocess(ymm_wr, ymm_wi) 55 | 56 | for i, ymm_w in enumerate(ymm_wr + ymm_wi): 57 | VMOVUPS([reg_f + i * YMMRegister.size], ymm_w) 58 | 59 | RETURN() 60 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft-real.py: -------------------------------------------------------------------------------- 1 | import fft.real_to_complex_soa_perm 2 | 3 | arg_t = Argument(ptr(const_float_), name="t") 4 | arg_f = Argument(ptr(float_), name="f") 5 | 6 | 7 | with Function("nnp_fft8_8real__fma3", 8 | (arg_t, arg_f), 9 | target=uarch.default + isa.fma3): 10 | 11 | reg_t = GeneralPurposeRegister64() 12 | LOAD.ARGUMENT(reg_t, arg_t) 13 | 14 | reg_f = GeneralPurposeRegister64() 15 | LOAD.ARGUMENT(reg_f, arg_f) 16 | 17 | ymm_data = [YMMRegister() for _ in range(8)] 18 | 19 | for i, ymm_i in enumerate(ymm_data): 20 | VMOVUPS(ymm_i, [reg_t + i * YMMRegister.size]) 21 | 22 | fft.real_to_complex_soa_perm.fft8_across_rows(ymm_data) 23 | 24 | for i, ymm_i in enumerate(ymm_data): 25 | VMOVUPS([reg_f + i * YMMRegister.size], ymm_i) 26 | 27 | RETURN() 28 | 29 | 30 | import fft16x16 31 | 32 | 33 | with Function("nnp_fft16_8real__fma3", 34 | (arg_t, arg_f), 35 | target=uarch.default + isa.fma3): 36 | 37 | reg_t0 = GeneralPurposeRegister64() 38 | LOAD.ARGUMENT(reg_t0, arg_t) 39 | 40 | reg_f = GeneralPurposeRegister64() 41 | LOAD.ARGUMENT(reg_f, arg_f) 42 | 43 | reg_stride = GeneralPurposeRegister64() 44 | MOV(reg_stride, YMMRegister.size) 45 | 46 | reg_t8 = GeneralPurposeRegister64() 47 | LEA(reg_t8, [reg_t0 + 8 * YMMRegister.size]) 48 | 49 | fft16x16.forward_vfft(reg_t0, reg_t8, reg_stride, 50 | data_out=[yword[reg_f + YMMRegister.size * i] for i in range(16)]) 51 | 52 | RETURN() 53 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft-soa.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa 2 | 3 | arg_t = Argument(ptr(const_float_), name="t") 4 | arg_f = Argument(ptr(float_), name="f") 5 | 6 | 7 | with Function("nnp_fft16_soa__avx2", 8 | (arg_t, arg_f), 9 | target=uarch.default + isa.fma3 + isa.avx2): 10 | 11 | reg_t = GeneralPurposeRegister64() 12 | LOAD.ARGUMENT(reg_t, arg_t) 13 | 14 | reg_f = GeneralPurposeRegister64() 15 | LOAD.ARGUMENT(reg_f, arg_f) 16 | 17 | ymm_real = YMMRegister(), YMMRegister() 18 | ymm_imag = YMMRegister(), YMMRegister() 19 | 20 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 21 | VMOVUPS(ymm_data, [reg_t + i * YMMRegister.size]) 22 | 23 | fft.complex_soa.fft16_within_rows(ymm_real, ymm_imag) 24 | 25 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 26 | VMOVUPS([reg_f + i * YMMRegister.size], ymm_data) 27 | 28 | RETURN() 29 | 30 | 31 | with Function("nnp_fft8_soa__avx2", 32 | (arg_t, arg_f), 33 | target=uarch.default + isa.fma3 + isa.avx2): 34 | 35 | reg_t = GeneralPurposeRegister64() 36 | LOAD.ARGUMENT(reg_t, arg_t) 37 | 38 | reg_f = GeneralPurposeRegister64() 39 | LOAD.ARGUMENT(reg_f, arg_f) 40 | 41 | ymm_real, ymm_imag = YMMRegister(), YMMRegister() 42 | 43 | VMOVUPS(ymm_real, [reg_t]) 44 
| VMOVUPS(ymm_imag, [reg_t + YMMRegister.size]) 45 | 46 | fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag) 47 | 48 | VMOVUPS([reg_f], ymm_real) 49 | VMOVUPS([reg_f + YMMRegister.size], ymm_imag) 50 | 51 | RETURN() 52 | 53 | 54 | with Function("nnp_ifft8_soa__avx2", 55 | (arg_t, arg_f), 56 | target=uarch.default + isa.fma3 + isa.avx2): 57 | 58 | reg_t = GeneralPurposeRegister64() 59 | LOAD.ARGUMENT(reg_t, arg_t) 60 | 61 | reg_f = GeneralPurposeRegister64() 62 | LOAD.ARGUMENT(reg_f, arg_f) 63 | 64 | ymm_real, ymm_imag = YMMRegister(), YMMRegister() 65 | 66 | VMOVUPS(ymm_real, [reg_t]) 67 | VMOVUPS(ymm_imag, [reg_t + YMMRegister.size]) 68 | 69 | fft.complex_soa.fft8_within_rows(ymm_real, ymm_imag, transformation="inverse") 70 | 71 | VMOVUPS([reg_f], ymm_real) 72 | VMOVUPS([reg_f + YMMRegister.size], ymm_imag) 73 | 74 | RETURN() 75 | 76 | 77 | with Function("nnp_ifft16_soa__avx2", 78 | (arg_f, arg_t), 79 | target=uarch.default + isa.fma3 + isa.avx2): 80 | 81 | reg_f = GeneralPurposeRegister64() 82 | LOAD.ARGUMENT(reg_f, arg_f) 83 | 84 | reg_t = GeneralPurposeRegister64() 85 | LOAD.ARGUMENT(reg_t, arg_t) 86 | 87 | ymm_real = YMMRegister(), YMMRegister() 88 | ymm_imag = YMMRegister(), YMMRegister() 89 | 90 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 91 | VMOVUPS(ymm_data, [reg_f + i * YMMRegister.size]) 92 | 93 | fft.complex_soa.ifft16_within_rows(ymm_real, ymm_imag) 94 | 95 | for i, ymm_data in enumerate(ymm_real + ymm_imag): 96 | VMOVUPS([reg_t + i * YMMRegister.size], ymm_data) 97 | 98 | RETURN() 99 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/fft/__init__.py -------------------------------------------------------------------------------- /src/x86_64-fma/fft/complex_soa_perm_to_real.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from peachpy import * 5 | from peachpy.x86_64 import * 6 | 7 | from common import sqrt2_over_2 8 | from common import butterfly 9 | 10 | import fft.complex_soa 11 | 12 | def ifft8_across_rows(ymm_data, bias=None): 13 | assert isinstance(ymm_data, list) and len(ymm_data) == 8 14 | ymm_real = ymm_data[0::2] 15 | ymm_imag = ymm_data[1::2] 16 | 17 | if bias is None: 18 | # Do 1/N scaling before IFFT 19 | ymm_one_eighth = YMMRegister() 20 | VMOVAPS(ymm_one_eighth, Constant.float32x8(0.125)) 21 | for ymm_row in ymm_data: 22 | if ymm_row is ymm_real[2]: 23 | VMULPS(ymm_row, ymm_row, Constant.float32x8(0.25)) 24 | elif ymm_row is ymm_imag[2]: 25 | VMULPS(ymm_row, ymm_row, Constant.float32x8(-0.25)) 26 | else: 27 | VMULPS(ymm_row, ymm_row, ymm_one_eighth) 28 | else: 29 | # Do 1/N scaling after FFT (merge with bias addition) 30 | VMULPS(ymm_real[2], ymm_real[2], Constant.float32x8(2.0)) 31 | VMULPS(ymm_imag[2], ymm_imag[2], Constant.float32x8(-2.0)) 32 | 33 | butterfly(ymm_real[0], ymm_imag[0]) 34 | 35 | # H1.real, H1.imag = W1.real - W3.real, W1.imag + W3.imag 36 | ymm_h1_real, ymm_h1_imag = YMMRegister(), YMMRegister() 37 | VSUBPS(ymm_h1_real, ymm_real[1], ymm_real[3]) 38 | VADDPS(ymm_h1_imag, ymm_imag[1], ymm_imag[3]) 39 | 40 | # G1.real, G1.imag = W1.real + W3.real, W1.imag - W3.imag 41 | ymm_g1_real, ymm_g1_imag = YMMRegister(), YMMRegister() 42 | VADDPS(ymm_g1_real, ymm_real[1], 
ymm_real[3]) 43 | VSUBPS(ymm_g1_imag, ymm_imag[1], ymm_imag[3]) 44 | 45 | # H1+, H1- = H1.real + H1.imag, H1.real - H1.imag 46 | ymm_h1_plus, ymm_h1_minus = YMMRegister(), YMMRegister() 47 | VADDPS(ymm_h1_plus, ymm_h1_real, ymm_h1_imag) 48 | VSUBPS(ymm_h1_minus, ymm_h1_real, ymm_h1_imag) 49 | 50 | ymm_sqrt2_over_2 = YMMRegister() 51 | VMOVAPS(ymm_sqrt2_over_2, Constant.float32x8(sqrt2_over_2)) 52 | 53 | # w1.real = G1.real - SQRT2_OVER_2 * H1.plus; 54 | # w3.real = G1.real + SQRT2_OVER_2 * H1.plus; 55 | VMOVAPS(ymm_real[1], ymm_g1_real) 56 | VFNMADD231PS(ymm_real[1], ymm_h1_plus, ymm_sqrt2_over_2) 57 | VFMADD231PS(ymm_g1_real, ymm_h1_plus, ymm_sqrt2_over_2) 58 | SWAP.REGISTERS(ymm_real[3], ymm_g1_real) 59 | 60 | # w1.imag = G1.imag + SQRT2_OVER_2 * H1.minus; 61 | # w3.imag = -G1.imag + SQRT2_OVER_2 * H1.minus; 62 | VMOVAPS(ymm_imag[1], ymm_g1_imag) 63 | VFMADD231PS(ymm_imag[1], ymm_h1_minus, ymm_sqrt2_over_2) 64 | VFMSUB231PS(ymm_g1_imag, ymm_h1_minus, ymm_sqrt2_over_2) 65 | SWAP.REGISTERS(ymm_imag[3], ymm_g1_imag) 66 | 67 | fft.complex_soa.fft4_across_rows(ymm_real, ymm_imag, transformation="inverse") 68 | 69 | if bias is not None: 70 | ymm_bias = bias 71 | if not isinstance(bias, YMMRegister): 72 | ymm_bias = YMMRegister() 73 | VMOVAPS(ymm_bias, bias) 74 | 75 | ymm_one_eighth = YMMRegister() 76 | VMOVAPS(ymm_one_eighth, Constant.float32x8(0.125)) 77 | 78 | # 1/N scaling 79 | for ymm_row in ymm_data: 80 | VFMADD132PS(ymm_row, ymm_bias, ymm_one_eighth) 81 | -------------------------------------------------------------------------------- /src/x86_64-fma/fft/real_to_complex_soa_perm.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from __future__ import division 3 | 4 | from peachpy import * 5 | from peachpy.x86_64 import * 6 | 7 | from common import sqrt2_over_2 8 | from common import butterfly 9 | 10 | import fft.complex_soa 11 | 12 | 13 | def fft8_across_rows(ymm_data): 14 | assert isinstance(ymm_data, list) and len(ymm_data) == 8 15 | ymm_real = ymm_data[0::2] 16 | ymm_imag = ymm_data[1::2] 17 | 18 | fft.complex_soa.fft4_across_rows(ymm_real, ymm_imag) 19 | 20 | butterfly(ymm_real[0], ymm_imag[0]) 21 | 22 | # const float two_gdata1_real = crealf(data1) + crealf(data3); 23 | # const float two_gdata1_imag = cimagf(data1) - cimagf(data3); 24 | ymm_two_gdata1_real, ymm_two_gdata1_imag = YMMRegister(), YMMRegister() 25 | VADDPS(ymm_two_gdata1_real, ymm_real[1], ymm_real[3]) 26 | VSUBPS(ymm_two_gdata1_imag, ymm_imag[1], ymm_imag[3]) 27 | 28 | # const float two_hdata1_real = cimagf(data1) + cimagf(data3); 29 | # const float two_hdata1_imag = crealf(data3) - crealf(data1); 30 | ymm_two_hdata1_real, ymm_two_hdata1_imag = YMMRegister(), YMMRegister() 31 | VADDPS(ymm_two_hdata1_real, ymm_imag[1], ymm_imag[3]) 32 | VSUBPS(ymm_two_hdata1_imag, ymm_real[3], ymm_real[1]) 33 | 34 | # const float two_hdata1_real_plus_imag = two_hdata1_real + two_hdata1_imag; 35 | # const float two_hdata1_real_minus_imag = two_hdata1_real - two_hdata1_imag; 36 | ymm_two_hdata1_plus, ymm_two_hdata1_minus = YMMRegister(), YMMRegister() 37 | VADDPS(ymm_two_hdata1_plus, ymm_two_hdata1_real, ymm_two_hdata1_imag) 38 | VSUBPS(ymm_two_hdata1_minus, ymm_two_hdata1_real, ymm_two_hdata1_imag) 39 | 40 | ymm_sqrt2_over_2 = YMMRegister() 41 | VMOVAPS(ymm_sqrt2_over_2, Constant.float32x8(sqrt2_over_2)) 42 | 43 | # const float two_data1_real = two_gdata1_real + SQRT2_OVER_2 * two_hdata1_real_plus_imag; 44 | # const float two_data1_imag = two_gdata1_imag - 
SQRT2_OVER_2 * two_hdata1_real_minus_imag; 45 | # const float two_data3_real = two_gdata1_real - SQRT2_OVER_2 * two_hdata1_real_plus_imag; 46 | # const float two_data3_imag = -two_gdata1_imag - SQRT2_OVER_2 * two_hdata1_real_minus_imag; 47 | ymm_two_data1_real, ymm_two_data1_imag = YMMRegister(), YMMRegister() 48 | ymm_two_data3_real, ymm_two_data3_imag = YMMRegister(), YMMRegister() 49 | VMOVAPS(ymm_two_data3_real, ymm_two_gdata1_real) 50 | VMOVAPS(ymm_two_data3_imag, ymm_two_gdata1_imag) 51 | VFMADD231PS(ymm_two_gdata1_real, ymm_two_hdata1_plus, ymm_sqrt2_over_2) 52 | VFNMADD231PS(ymm_two_gdata1_imag, ymm_two_hdata1_minus, ymm_sqrt2_over_2) 53 | SWAP.REGISTERS(ymm_two_data1_real, ymm_two_gdata1_real) 54 | SWAP.REGISTERS(ymm_two_data1_imag, ymm_two_gdata1_imag) 55 | VFNMADD231PS(ymm_two_data3_real, ymm_two_hdata1_plus, ymm_sqrt2_over_2) 56 | VFNMSUB231PS(ymm_two_data3_imag, ymm_two_hdata1_minus, ymm_sqrt2_over_2) 57 | 58 | # /* Store outputs */ 59 | # fdata[0] = crealf(data0) + cimagf(data0); 60 | # fdata[1] = crealf(data0) - cimagf(data0); 61 | # fdata[2] = 0.5f * two_data1_real; 62 | # fdata[3] = 0.5f * two_data1_imag; 63 | # fdata[4] = crealf(data2); 64 | # fdata[5] = -cimagf(data2); 65 | # fdata[6] = 0.5f * two_data3_real; 66 | # fdata[7] = 0.5f * two_data3_imag; 67 | 68 | ymm_half = YMMRegister() 69 | VMOVAPS(ymm_half, Constant.float32x8(0.5)) 70 | VMULPS(ymm_real[1], ymm_two_data1_real, ymm_half) 71 | VMULPS(ymm_imag[1], ymm_two_data1_imag, ymm_half) 72 | VXORPS(ymm_imag[2], ymm_imag[2], Constant.float32x8(-0.0)) 73 | VMULPS(ymm_real[3], ymm_two_data3_real, ymm_half) 74 | VMULPS(ymm_imag[3], ymm_two_data3_imag, ymm_half) 75 | -------------------------------------------------------------------------------- /src/x86_64-fma/ifft-dualreal.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa 2 | import fft.two_complex_soa_perm_to_two_real_planar 3 | 4 | 5 | arg_f = Argument(ptr(const_float_), name="f") 6 | arg_t = Argument(ptr(float_), name="t") 7 | 8 | 9 | with Function("nnp_ifft8_dualreal__avx2", 10 | (arg_f, arg_t), 11 | target=uarch.default + isa.fma3 + isa.avx2): 12 | 13 | reg_f = GeneralPurposeRegister64() 14 | LOAD.ARGUMENT(reg_f, arg_f) 15 | 16 | reg_t = GeneralPurposeRegister64() 17 | LOAD.ARGUMENT(reg_t, arg_t) 18 | 19 | ymm_xhr, ymm_xhi = YMMRegister(), YMMRegister() 20 | VMOVUPS(ymm_xhr, [reg_f]) 21 | VMOVUPS(ymm_xhi, [reg_f + YMMRegister.size]) 22 | 23 | fft.two_complex_soa_perm_to_two_real_planar.ifft8_within_rows_preprocess(ymm_xhr, ymm_xhi) 24 | ymm_wr, ymm_wi = ymm_xhr, ymm_xhi 25 | 26 | fft.complex_soa.fft8_within_rows(ymm_wr, ymm_wi, transformation="inverse") 27 | ymm_seq_a, ymm_seq_b = ymm_wr, ymm_wi 28 | 29 | VMOVUPS([reg_t], ymm_seq_a) 30 | VMOVUPS([reg_t + YMMRegister.size], ymm_seq_b) 31 | 32 | RETURN() 33 | 34 | 35 | with Function("nnp_ifft16_dualreal__avx2", 36 | (arg_f, arg_t), 37 | target=uarch.default + isa.fma3 + isa.avx2): 38 | 39 | reg_f = GeneralPurposeRegister64() 40 | LOAD.ARGUMENT(reg_f, arg_f) 41 | 42 | reg_t = GeneralPurposeRegister64() 43 | LOAD.ARGUMENT(reg_t, arg_t) 44 | 45 | ymm_wr = YMMRegister(), YMMRegister() 46 | ymm_wi = YMMRegister(), YMMRegister() 47 | 48 | for i, ymm_w in enumerate(ymm_wr + ymm_wi): 49 | VMOVUPS(ymm_w, [reg_f + i * YMMRegister.size]) 50 | 51 | fft.two_complex_soa_perm_to_two_real_planar.ifft16_within_rows_preprocess(ymm_wr, ymm_wi) 52 | 53 | fft.complex_soa.ifft16_within_rows(ymm_wr, ymm_wi) 54 | 55 | for i, ymm_w in enumerate(ymm_wr + ymm_wi): 56 | VMOVUPS([reg_t 
+ i * YMMRegister.size], ymm_w) 57 | 58 | RETURN() 59 | -------------------------------------------------------------------------------- /src/x86_64-fma/ifft-real.py: -------------------------------------------------------------------------------- 1 | import fft.complex_soa_perm_to_real 2 | from common import butterfly, cos_npi_over_8, sqrt2_over_2 3 | 4 | 5 | def fft8_bitreverse(n): 6 | return int(format(n, "03b")[::-1], 2) 7 | 8 | 9 | arg_f = Argument(ptr(const_float_), name="f") 10 | arg_t = Argument(ptr(float_), name="t") 11 | 12 | 13 | with Function("nnp_ifft8_8real__fma3", 14 | (arg_f, arg_t), 15 | target=uarch.default + isa.fma3): 16 | 17 | reg_f = GeneralPurposeRegister64() 18 | LOAD.ARGUMENT(reg_f, arg_f) 19 | 20 | reg_t = GeneralPurposeRegister64() 21 | LOAD.ARGUMENT(reg_t, arg_t) 22 | 23 | ymm_data = [YMMRegister() for _ in range(8)] 24 | ymm_real, ymm_imag = ymm_data[0::2], ymm_data[1::2] 25 | 26 | for i, ymm_i in enumerate(ymm_data): 27 | VMOVUPS(ymm_i, [reg_f + i * YMMRegister.size]) 28 | 29 | fft.complex_soa_perm_to_real.ifft8_across_rows(ymm_data) 30 | 31 | for i, ymm_i in enumerate(ymm_data): 32 | VMOVUPS([reg_t + i * YMMRegister.size], ymm_i) 33 | 34 | RETURN() 35 | 36 | 37 | import fft16x16 38 | 39 | 40 | with Function("nnp_ifft16_8real__fma3", 41 | (arg_f, arg_t), 42 | target=uarch.default + isa.fma3): 43 | 44 | reg_f = GeneralPurposeRegister64() 45 | LOAD.ARGUMENT(reg_f, arg_f) 46 | 47 | reg_t0 = GeneralPurposeRegister64() 48 | LOAD.ARGUMENT(reg_t0, arg_t) 49 | 50 | reg_stride = GeneralPurposeRegister64() 51 | MOV(reg_stride, YMMRegister.size) 52 | 53 | reg_t8 = GeneralPurposeRegister64() 54 | LEA(reg_t8, [reg_t0 + 8 * YMMRegister.size]) 55 | 56 | fft16x16.inverse_vfft(reg_t0, reg_t8, reg_stride, 57 | data_in=[yword[reg_f + YMMRegister.size * i] for i in range(16)]) 58 | 59 | RETURN() 60 | -------------------------------------------------------------------------------- /src/x86_64-fma/softmax.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | float max__avx(size_t n, const float v[restrict static n]); 9 | float sum_exp_minus_c__avx2(size_t n, const float v[restrict static n], float c); 10 | void scaled_exp_minus_c__avx2(size_t n, const float x[restrict static n], float y[restrict static n], float scale, float c); 11 | void inplace_scaled_exp_minus_c__avx2(size_t n, const float v[restrict static n], float scale, float c); 12 | 13 | void nnp_softmax__avx2( 14 | size_t n, 15 | const float x[restrict static n], 16 | float y[restrict static n]) 17 | { 18 | const float c = max__avx(n, x); 19 | const float sum = sum_exp_minus_c__avx2(n, x, c); 20 | const float scale = 1.0f / sum; 21 | scaled_exp_minus_c__avx2(n, x, y, scale, c); 22 | } 23 | 24 | void nnp_inplace_softmax__avx2( 25 | size_t n, 26 | float v[restrict static n]) 27 | { 28 | const float c = max__avx(n, v); 29 | const float sum = sum_exp_minus_c__avx2(n, v, c); 30 | const float scale = 1.0f / sum; 31 | inplace_scaled_exp_minus_c__avx2(n, v, scale, c); 32 | } 33 | -------------------------------------------------------------------------------- /src/x86_64-fma/vecmath/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/vecmath/__init__.py -------------------------------------------------------------------------------- /src/x86_64-fma/winograd-f6k3.py: 
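The generator below emits FMA versions of the F(6x6, 3x3) Winograd transforms whose scalar forms appeared earlier. With m = 6 outputs and r = 3 taps per 1-D transform, a tile spans m + r - 1 = 8 points, which is why the input and kernel transforms produce 8 rows while the output transform reduces 8 rows to 6, and why F(6,3) spends 8 multiplies per 6 outputs where direct convolution spends 18. The minimal member of the same family, F(2,3), shows the three-stage structure in plain C (textbook Toom-Cook/Winograd coefficients, illustrative only; NNPACK's F(6,3) uses different, rescaled constants, as the "rescale coefficients" flag in the scalar kernel transform hints):

#include <stdio.h>

/* F(2,3): 2 outputs of a 3-tap correlation from 4 inputs with 4 products
 * instead of 6. Same input/kernel/output transform structure as F(6,3). */
static void winograd_f2k3(const float d[4], const float g[3], float y[2])
{
	/* input transform: t = B^T d */
	const float t0 = d[0] - d[2], t1 = d[1] + d[2];
	const float t2 = d[2] - d[1], t3 = d[1] - d[3];
	/* kernel transform: u = G g */
	const float u0 = g[0];
	const float u1 = 0.5f * (g[0] + g[1] + g[2]);
	const float u2 = 0.5f * (g[0] - g[1] + g[2]);
	const float u3 = g[2];
	/* elementwise product, then output transform: y = A^T (t .* u) */
	const float m0 = t0 * u0, m1 = t1 * u1, m2 = t2 * u2, m3 = t3 * u3;
	y[0] = m0 + m1 + m2;
	y[1] = m1 - m2 - m3;
}

int main(void)
{
	const float d[4] = { 1, 2, 3, 4 }, g[3] = { 1, 1, 1 };
	float y[2];
	winograd_f2k3(d, g, y);
	printf("%g %g\n", y[0], y[1]); /* prints 6 9: sliding sums of three inputs */
	return 0;
}

The 2-D kernels apply the same 1-D transforms along rows and then columns of each 8x8 tile, so the multiply count per tile drops from 6*6*3*3 = 324 for direct convolution to the 8*8 = 64 elementwise products computed between the transforms.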
-------------------------------------------------------------------------------- 1 | import winograd.o6x6k3x3 2 | 3 | 4 | arg_d_pointer = Argument(ptr(const_float_), name="d") 5 | arg_w_pointer = Argument(ptr(float_), name="w") 6 | with Function("nnp_iwt_f6k3__fma3", (arg_d_pointer, arg_w_pointer), 7 | target=uarch.default + isa.fma3): 8 | 9 | reg_d = GeneralPurposeRegister64() 10 | LOAD.ARGUMENT(reg_d, arg_d_pointer) 11 | 12 | reg_w = GeneralPurposeRegister64() 13 | LOAD.ARGUMENT(reg_w, arg_w_pointer) 14 | 15 | ymm_data = [YMMRegister() for _ in range(8)] 16 | for i, ymm_row in enumerate(ymm_data): 17 | VMOVUPS(ymm_row, [reg_d + i * YMMRegister.size]) 18 | 19 | ymm_data = winograd.o6x6k3x3.input_transform(ymm_data) 20 | 21 | for i, ymm_row in enumerate(ymm_data): 22 | VMOVUPS([reg_w + i * YMMRegister.size], ymm_row) 23 | 24 | RETURN() 25 | 26 | 27 | arg_g_pointer = Argument(ptr(const_float_), name="g") 28 | arg_w_pointer = Argument(ptr(float_), name="w") 29 | with Function("nnp_kwt_f6k3__fma3", (arg_g_pointer, arg_w_pointer), 30 | target=uarch.default + isa.fma3): 31 | 32 | reg_g = GeneralPurposeRegister64() 33 | LOAD.ARGUMENT(reg_g, arg_g_pointer) 34 | 35 | reg_w = GeneralPurposeRegister64() 36 | LOAD.ARGUMENT(reg_w, arg_w_pointer) 37 | 38 | ymm_data = [YMMRegister() for _ in range(3)] 39 | for i, ymm_row in enumerate(ymm_data): 40 | VMOVUPS(ymm_row, [reg_g + i * YMMRegister.size]) 41 | 42 | ymm_data = winograd.o6x6k3x3.kernel_transform(ymm_data) 43 | 44 | for i, ymm_row in enumerate(ymm_data): 45 | VMOVUPS([reg_w + i * YMMRegister.size], ymm_row) 46 | 47 | RETURN() 48 | 49 | 50 | arg_m_pointer = Argument(ptr(const_float_), name="m") 51 | arg_s_pointer = Argument(ptr(float_), name="s") 52 | with Function("nnp_owt_f6k3__fma3", (arg_m_pointer, arg_s_pointer), 53 | target=uarch.default + isa.fma3): 54 | 55 | reg_m = GeneralPurposeRegister64() 56 | LOAD.ARGUMENT(reg_m, arg_m_pointer) 57 | 58 | reg_s = GeneralPurposeRegister64() 59 | LOAD.ARGUMENT(reg_s, arg_s_pointer) 60 | 61 | ymm_m = [YMMRegister() for _ in range(8)] 62 | for i, ymm_row in enumerate(ymm_m): 63 | VMOVUPS(ymm_row, [reg_m + i * YMMRegister.size]) 64 | 65 | ymm_s = winograd.o6x6k3x3.output_transform(ymm_m) 66 | 67 | for i, ymm_row in enumerate(ymm_s): 68 | VMOVUPS([reg_s + i * YMMRegister.size], ymm_row) 69 | 70 | RETURN() 71 | -------------------------------------------------------------------------------- /src/x86_64-fma/winograd/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Maratyszcza/NNPACK/70a77f485e8b934224f3a79efd8edcd84cd377b8/src/x86_64-fma/winograd/__init__.py -------------------------------------------------------------------------------- /test/convolution-input-gradient/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | AlexNet::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | AlexNet::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * AlexNet conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | AlexNet::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-5) 34 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 
| AlexNet::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, conv3) { 45 | AlexNet::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-5) 48 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * AlexNet conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | AlexNet::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-5) 59 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | AlexNet::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, conv4) { 70 | AlexNet::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-5) 73 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * AlexNet conv5 layer 78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | AlexNet::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-5) 84 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | AlexNet::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, conv5) { 95 | AlexNet::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-5) 98 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/convolution-input-gradient/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | OverFeat_Fast::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | OverFeat_Fast::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * OverFeat (Fast model) conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | OverFeat_Fast::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-5) 34 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 | OverFeat_Fast::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, conv3) { 45 | OverFeat_Fast::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-5) 48 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * OverFeat (Fast model) conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | OverFeat_Fast::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-5) 59 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | OverFeat_Fast::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, conv4) { 70 | OverFeat_Fast::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-5) 73 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * OverFeat (Fast model) conv5 layer 
78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | OverFeat_Fast::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-5) 84 | .testInputGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | OverFeat_Fast::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testInputGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, conv5) { 95 | OverFeat_Fast::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-5) 98 | .testInputGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/convolution-kernel-gradient/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | AlexNet::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-6) 16 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | AlexNet::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * AlexNet conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | AlexNet::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-6) 34 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 | AlexNet::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, DISABLED_conv3) { 45 | AlexNet::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-3) 48 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * AlexNet conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | AlexNet::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-6) 59 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | AlexNet::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, DISABLED_conv4) { 70 | AlexNet::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-3) 73 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * AlexNet conv5 layer 78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | AlexNet::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-6) 84 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | AlexNet::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, DISABLED_conv5) { 95 | AlexNet::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-3) 98 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/convolution-kernel-gradient/overfeat-fast.cc: 
-------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv2 layer 10 | */ 11 | 12 | TEST(FT8x8, conv2) { 13 | OverFeat_Fast::conv2() 14 | .batchSize(128) 15 | .errorLimit(1.0e-6) 16 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 17 | } 18 | 19 | TEST(FT16x16, conv2) { 20 | OverFeat_Fast::conv2() 21 | .batchSize(128) 22 | .errorLimit(1.0e-5) 23 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 24 | } 25 | 26 | /* 27 | * OverFeat (Fast model) conv3 layer 28 | */ 29 | 30 | TEST(FT8x8, conv3) { 31 | OverFeat_Fast::conv3() 32 | .batchSize(128) 33 | .errorLimit(1.0e-6) 34 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 35 | } 36 | 37 | TEST(FT16x16, conv3) { 38 | OverFeat_Fast::conv3() 39 | .batchSize(128) 40 | .errorLimit(1.0e-5) 41 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 42 | } 43 | 44 | TEST(WT8x8, DISABLED_conv3) { 45 | OverFeat_Fast::conv3() 46 | .batchSize(128) 47 | .errorLimit(1.0e-3) 48 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 49 | } 50 | 51 | /* 52 | * OverFeat (Fast model) conv4 layer 53 | */ 54 | 55 | TEST(FT8x8, conv4) { 56 | OverFeat_Fast::conv4() 57 | .batchSize(128) 58 | .errorLimit(1.0e-6) 59 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 60 | } 61 | 62 | TEST(FT16x16, conv4) { 63 | OverFeat_Fast::conv4() 64 | .batchSize(128) 65 | .errorLimit(1.0e-5) 66 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 67 | } 68 | 69 | TEST(WT8x8, DISABLED_conv4) { 70 | OverFeat_Fast::conv4() 71 | .batchSize(128) 72 | .errorLimit(1.0e-3) 73 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 74 | } 75 | 76 | /* 77 | * OverFeat (Fast model) conv5 layer 78 | */ 79 | 80 | TEST(FT8x8, conv5) { 81 | OverFeat_Fast::conv5() 82 | .batchSize(128) 83 | .errorLimit(1.0e-6) 84 | .testKernelGradient(nnp_convolution_algorithm_ft8x8); 85 | } 86 | 87 | TEST(FT16x16, conv5) { 88 | OverFeat_Fast::conv5() 89 | .batchSize(128) 90 | .errorLimit(1.0e-5) 91 | .testKernelGradient(nnp_convolution_algorithm_ft16x16); 92 | } 93 | 94 | TEST(WT8x8, DISABLED_conv5) { 95 | OverFeat_Fast::conv5() 96 | .batchSize(128) 97 | .errorLimit(1.0e-3) 98 | .testKernelGradient(nnp_convolution_algorithm_wt8x8); 99 | } 100 | 101 | int main(int argc, char* argv[]) { 102 | const enum nnp_status init_status = nnp_initialize(); 103 | assert(init_status == nnp_status_success); 104 | setenv("TERM", "xterm-256color", 0); 105 | ::testing::InitGoogleTest(&argc, argv); 106 | return RUN_ALL_TESTS(); 107 | } 108 | -------------------------------------------------------------------------------- /test/fully-connected-inference/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet fc6 layer 10 | */ 11 | 12 | TEST(F32, fc6) { 13 | AlexNet::fc6() 14 | .errorLimit(2.0e-5) 15 | .testInferenceF32(); 16 | } 17 | 18 | TEST(F16F32, fc6) { 19 | AlexNet::fc6() 20 | .errorLimit(2.0e-5) 21 | .testInferenceF16F32(); 22 | } 23 | 24 | /* 25 | * AlexNet fc7 layer 26 | */ 27 | 28 | TEST(F32, fc7) { 29 | AlexNet::fc7() 30 | .errorLimit(1.0e-5) 31 | .testInferenceF32(); 32 | } 33 | 34 | TEST(F16F32, fc7) { 35 | AlexNet::fc7() 36 | .errorLimit(1.0e-5) 37 | .testInferenceF16F32(); 38 | } 39 | 40 | /* 41 | * AlexNet fc8 layer 42 | */ 43 | 44 | TEST(F32, fc8) { 45 | AlexNet::fc8() 46 | .errorLimit(1.0e-5) 47 | .testInferenceF32(); 48 | } 49 | 50 | 
TEST(F16F32, fc8) { 51 | AlexNet::fc8() 52 | .errorLimit(1.0e-5) 53 | .testInferenceF16F32(); 54 | } 55 | 56 | int main(int argc, char* argv[]) { 57 | const enum nnp_status init_status = nnp_initialize(); 58 | assert(init_status == nnp_status_success); 59 | setenv("TERM", "xterm-256color", 0); 60 | ::testing::InitGoogleTest(&argc, argv); 61 | return RUN_ALL_TESTS(); 62 | } 63 | -------------------------------------------------------------------------------- /test/fully-connected-inference/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) fc6 layer 10 | */ 11 | 12 | TEST(F32, fc6) { 13 | OverFeat_Fast::fc6() 14 | .errorLimit(2.0e-5) 15 | .testInferenceF32(); 16 | } 17 | 18 | TEST(F16F32, fc6) { 19 | OverFeat_Fast::fc6() 20 | .errorLimit(2.0e-5) 21 | .testInferenceF16F32(); 22 | } 23 | 24 | /* 25 | * OverFeat (Fast model) fc7 layer 26 | */ 27 | 28 | TEST(F32, fc7) { 29 | OverFeat_Fast::fc7() 30 | .errorLimit(1.0e-5) 31 | .testInferenceF32(); 32 | } 33 | 34 | TEST(F16F32, fc7) { 35 | OverFeat_Fast::fc7() 36 | .errorLimit(1.0e-5) 37 | .testInferenceF16F32(); 38 | } 39 | 40 | /* 41 | * OverFeat (Fast model) fc8 layer 42 | */ 43 | 44 | TEST(F32, fc8) { 45 | OverFeat_Fast::fc8() 46 | .errorLimit(1.0e-5) 47 | .testInferenceF32(); 48 | } 49 | 50 | TEST(F16F32, fc8) { 51 | OverFeat_Fast::fc8() 52 | .errorLimit(1.0e-5) 53 | .testInferenceF16F32(); 54 | } 55 | 56 | int main(int argc, char* argv[]) { 57 | const enum nnp_status init_status = nnp_initialize(); 58 | assert(init_status == nnp_status_success); 59 | setenv("TERM", "xterm-256color", 0); 60 | ::testing::InitGoogleTest(&argc, argv); 61 | return RUN_ALL_TESTS(); 62 | } 63 | -------------------------------------------------------------------------------- /test/fully-connected-inference/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A fc6 layer 10 | */ 11 | 12 | TEST(F32, fc6) { 13 | VGG_A::fc6() 14 | .errorLimit(2.0e-5) 15 | .testInferenceF32(); 16 | } 17 | 18 | TEST(F16F32, fc6) { 19 | VGG_A::fc6() 20 | .errorLimit(2.0e-5) 21 | .testInferenceF16F32(); 22 | } 23 | 24 | /* 25 | * VGG model A fc7 layer 26 | */ 27 | 28 | TEST(F32, fc7) { 29 | VGG_A::fc7() 30 | .errorLimit(1.0e-5) 31 | .testInferenceF32(); 32 | } 33 | 34 | TEST(F16F32, fc7) { 35 | VGG_A::fc7() 36 | .errorLimit(1.0e-5) 37 | .testInferenceF16F32(); 38 | } 39 | 40 | /* 41 | * VGG model A fc8 layer 42 | */ 43 | 44 | TEST(F32, fc8) { 45 | VGG_A::fc8() 46 | .errorLimit(1.0e-5) 47 | .testInferenceF32(); 48 | } 49 | 50 | TEST(F16F32, fc8) { 51 | VGG_A::fc8() 52 | .errorLimit(1.0e-5) 53 | .testInferenceF16F32(); 54 | } 55 | 56 | int main(int argc, char* argv[]) { 57 | const enum nnp_status init_status = nnp_initialize(); 58 | assert(init_status == nnp_status_success); 59 | setenv("TERM", "xterm-256color", 0); 60 | ::testing::InitGoogleTest(&argc, argv); 61 | return RUN_ALL_TESTS(); 62 | } 63 | -------------------------------------------------------------------------------- /test/fully-connected-output/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet fc6 layer 10 | */ 11 | 12 | TEST(FC, fc6) { 13 | AlexNet::fc6() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testOutput(); 17 | } 18 | 19 
| /* 20 | * AlexNet fc7 layer 21 | */ 22 | 23 | TEST(FC, fc7) { 24 | AlexNet::fc7() 25 | .batchSize(128) 26 | .errorLimit(1.0e-5) 27 | .testOutput(); 28 | } 29 | 30 | /* 31 | * AlexNet fc8 layer 32 | */ 33 | 34 | TEST(FC, fc8) { 35 | AlexNet::fc8() 36 | .batchSize(128) 37 | .errorLimit(1.0e-5) 38 | .testOutput(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | const enum nnp_status init_status = nnp_initialize(); 43 | assert(init_status == nnp_status_success); 44 | setenv("TERM", "xterm-256color", 0); 45 | ::testing::InitGoogleTest(&argc, argv); 46 | return RUN_ALL_TESTS(); 47 | } 48 | -------------------------------------------------------------------------------- /test/fully-connected-output/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) fc6 layer 10 | */ 11 | 12 | TEST(FC, fc6) { 13 | OverFeat_Fast::fc6() 14 | .batchSize(128) 15 | .errorLimit(1.0e-5) 16 | .testOutput(); 17 | } 18 | 19 | /* 20 | * OverFeat (Fast model) fc7 layer 21 | */ 22 | 23 | TEST(FC, fc7) { 24 | OverFeat_Fast::fc7() 25 | .batchSize(128) 26 | .errorLimit(1.0e-5) 27 | .testOutput(); 28 | } 29 | 30 | /* 31 | * OverFeat (Fast model) fc8 layer 32 | */ 33 | 34 | TEST(FC, fc8) { 35 | OverFeat_Fast::fc8() 36 | .batchSize(128) 37 | .errorLimit(1.0e-5) 38 | .testOutput(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | const enum nnp_status init_status = nnp_initialize(); 43 | assert(init_status == nnp_status_success); 44 | setenv("TERM", "xterm-256color", 0); 45 | ::testing::InitGoogleTest(&argc, argv); 46 | return RUN_ALL_TESTS(); 47 | } 48 | -------------------------------------------------------------------------------- /test/fully-connected-output/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A fc6 layer 10 | */ 11 | 12 | TEST(FC, fc6) { 13 | VGG_A::fc6() 14 | .batchSize(64) 15 | .errorLimit(1.0e-5) 16 | .testOutput(); 17 | } 18 | 19 | /* 20 | * VGG model A fc7 layer 21 | */ 22 | 23 | TEST(FC, fc7) { 24 | VGG_A::fc7() 25 | .batchSize(64) 26 | .errorLimit(1.0e-5) 27 | .testOutput(); 28 | } 29 | 30 | /* 31 | * VGG model A fc8 layer 32 | */ 33 | 34 | TEST(FC, fc8) { 35 | VGG_A::fc8() 36 | .batchSize(64) 37 | .errorLimit(1.0e-5) 38 | .testOutput(); 39 | } 40 | 41 | int main(int argc, char* argv[]) { 42 | const enum nnp_status init_status = nnp_initialize(); 43 | assert(init_status == nnp_status_success); 44 | setenv("TERM", "xterm-256color", 0); 45 | ::testing::InitGoogleTest(&argc, argv); 46 | return RUN_ALL_TESTS(); 47 | } 48 | -------------------------------------------------------------------------------- /test/hxgemm/neon.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 9 | TEST(FAST_H4GEMM_3x3, neonhp) { 10 | ASSERT_TRUE(cpuinfo_initialize()); 11 | if (cpuinfo_has_arm_neon_fma()) { 12 | GemmMicroKernelTester tester = GemmMicroKernelTester() 13 | .simdWidth(4) 14 | .mr(3) 15 | .nr(3) 16 | .errorLimit(1.0e-3f); 17 | 18 | for (uint32_t kc = 1; kc < 10; kc++) { 19 | tester 20 | .kc(kc) 21 | .accumulateC(true) 22 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__neonhp)); 23 | tester 24 | .accumulateC(false) 25 | 
.testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__neonhp)); 26 | } 27 | } 28 | } 29 | 30 | TEST(FULL_H4GEMM_3x3, neon) { 31 | ASSERT_TRUE(cpuinfo_initialize()); 32 | if (cpuinfo_has_arm_neon_fma()) { 33 | GemmMicroKernelTester tester = GemmMicroKernelTester() 34 | .simdWidth(4) 35 | .mr(3) 36 | .nr(3) 37 | .errorLimit(1.0e-3f); 38 | 39 | for (uint32_t kc = 1; kc < 10; kc++) { 40 | tester 41 | .kc(kc) 42 | .accumulateC(true) 43 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__neonhp)); 44 | tester 45 | .accumulateC(false) 46 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__neonhp)); 47 | } 48 | } 49 | } 50 | #endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */ 51 | 52 | #if CPUINFO_ARCH_ARM 53 | TEST(FAST_H4GEMM_3x3, aarch32_neonhp) { 54 | ASSERT_TRUE(cpuinfo_initialize()); 55 | if (cpuinfo_has_arm_neon_fma()) { 56 | GemmMicroKernelTester tester = GemmMicroKernelTester() 57 | .simdWidth(4) 58 | .mr(3) 59 | .nr(3) 60 | .errorLimit(1.0e-3f); 61 | 62 | for (uint32_t kc = 1; kc < 10; kc++) { 63 | tester 64 | .kc(kc) 65 | .accumulateC(true) 66 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhp)); 67 | tester 68 | .accumulateC(false) 69 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhp)); 70 | } 71 | } 72 | } 73 | 74 | TEST(FAST_H4GEMM_3x3, aarch32_neon2) { 75 | ASSERT_TRUE(cpuinfo_initialize()); 76 | if (cpuinfo_has_arm_neon_fma()) { 77 | GemmMicroKernelTester tester = GemmMicroKernelTester() 78 | .simdWidth(4) 79 | .mr(3) 80 | .nr(3) 81 | .errorLimit(1.0e-3f); 82 | 83 | for (uint32_t kc = 1; kc < 10; kc++) { 84 | tester 85 | .kc(kc) 86 | .accumulateC(true) 87 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neon2)); 88 | tester 89 | .accumulateC(false) 90 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neon2)); 91 | } 92 | } 93 | } 94 | 95 | TEST(FULL_H4GEMM_3x3, aarch32_neon2) { 96 | ASSERT_TRUE(cpuinfo_initialize()); 97 | if (cpuinfo_has_arm_neon_fma()) { 98 | GemmMicroKernelTester tester = GemmMicroKernelTester() 99 | .simdWidth(4) 100 | .mr(3) 101 | .nr(3) 102 | .errorLimit(1.0e-3f); 103 | 104 | for (uint32_t kc = 1; kc < 10; kc++) { 105 | tester 106 | .kc(kc) 107 | .accumulateC(true) 108 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__aarch32_neon2)); 109 | tester 110 | .accumulateC(false) 111 | .testHXGEMM(nnp_full_tuple_gemm_function(nnp_h4gemm_upto_3x3__aarch32_neon2)); 112 | } 113 | } 114 | } 115 | 116 | TEST(FAST_H4GEMM_3x3, aarch32_neonhparith) { 117 | ASSERT_TRUE(cpuinfo_initialize()); 118 | if (cpuinfo_has_arm_neon_fp16_arith()) { 119 | GemmMicroKernelTester tester = GemmMicroKernelTester() 120 | .simdWidth(4) 121 | .mr(3) 122 | .nr(3) 123 | .errorLimit(1.0e-3f); 124 | 125 | for (uint32_t kc = 1; kc < 10; kc++) { 126 | tester 127 | .kc(kc) 128 | .accumulateC(true) 129 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhparith)); 130 | tester 131 | .accumulateC(false) 132 | .testHXGEMM(nnp_fast_tuple_gemm_function(nnp_h4gemm_only_3x3__aarch32_neonhparith)); 133 | } 134 | } 135 | } 136 | #endif /* CPUINFO_ARCH_ARM */ 137 | -------------------------------------------------------------------------------- /test/max-pooling-output/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) pool1 layer 10 | */ 11 | 12 | TEST(MaxPooling2x2, pool1) { 13 | 
OverFeat_Fast::pool1() 14 | .batchSize(128) 15 | .testOutput(); 16 | } 17 | 18 | /* 19 | * OverFeat (Fast model) pool2 layer 20 | */ 21 | 22 | TEST(MaxPooling2x2, pool2) { 23 | OverFeat_Fast::pool2() 24 | .batchSize(128) 25 | .testOutput(); 26 | } 27 | 28 | /* 29 | * OverFeat (Fast model) pool3 layer 30 | */ 31 | 32 | TEST(MaxPooling2x2, pool3) { 33 | OverFeat_Fast::pool3() 34 | .batchSize(128) 35 | .testOutput(); 36 | } 37 | 38 | int main(int argc, char* argv[]) { 39 | const enum nnp_status init_status = nnp_initialize(); 40 | assert(init_status == nnp_status_success); 41 | setenv("TERM", "xterm-256color", 0); 42 | ::testing::InitGoogleTest(&argc, argv); 43 | return RUN_ALL_TESTS(); 44 | } 45 | -------------------------------------------------------------------------------- /test/max-pooling-output/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A pool1 layer 10 | */ 11 | 12 | TEST(MaxPooling2x2, pool1) { 13 | VGG_A::pool1() 14 | .batchSize(64) 15 | .testOutput(); 16 | } 17 | 18 | /* 19 | * VGG model A pool2 layer 20 | */ 21 | 22 | TEST(MaxPooling2x2, pool2) { 23 | VGG_A::pool2() 24 | .batchSize(64) 25 | .testOutput(); 26 | } 27 | 28 | /* 29 | * VGG model A pool3 layer 30 | */ 31 | 32 | TEST(MaxPooling2x2, pool3) { 33 | VGG_A::pool3() 34 | .batchSize(64) 35 | .testOutput(); 36 | } 37 | 38 | /* 39 | * VGG model A pool4 layer 40 | */ 41 | 42 | TEST(MaxPooling2x2, pool4) { 43 | VGG_A::pool4() 44 | .batchSize(64) 45 | .testOutput(); 46 | } 47 | 48 | /* 49 | * VGG model A pool5 layer 50 | */ 51 | 52 | TEST(MaxPooling2x2, pool5) { 53 | VGG_A::pool5() 54 | .batchSize(64) 55 | .testOutput(); 56 | } 57 | 58 | int main(int argc, char* argv[]) { 59 | const enum nnp_status init_status = nnp_initialize(); 60 | assert(init_status == nnp_status_success); 61 | setenv("TERM", "xterm-256color", 0); 62 | ::testing::InitGoogleTest(&argc, argv); 63 | return RUN_ALL_TESTS(); 64 | } 65 | -------------------------------------------------------------------------------- /test/relu-input-gradient/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | AlexNet::conv1_relu() 14 | .batchSize(128) 15 | .testInputGradient(); 16 | } 17 | 18 | /* 19 | * AlexNet conv1 ReLU layer 20 | */ 21 | 22 | TEST(OUT_OF_PLACE, conv2_relu) { 23 | AlexNet::conv2_relu() 24 | .batchSize(128) 25 | .testInputGradient(); 26 | } 27 | 28 | /* 29 | * AlexNet conv3 ReLU layer 30 | */ 31 | 32 | TEST(OUT_OF_PLACE, conv3_relu) { 33 | AlexNet::conv3_relu() 34 | .batchSize(128) 35 | .testInputGradient(); 36 | } 37 | 38 | /* 39 | * AlexNet conv4 ReLU layer 40 | */ 41 | 42 | TEST(OUT_OF_PLACE, conv4_relu) { 43 | AlexNet::conv4_relu() 44 | .batchSize(128) 45 | .testInputGradient(); 46 | } 47 | 48 | /* 49 | * AlexNet fc6 ReLU layer 50 | */ 51 | 52 | TEST(OUT_OF_PLACE, fc6_relu) { 53 | AlexNet::fc6_relu() 54 | .batchSize(128) 55 | .testInputGradient(); 56 | } 57 | 58 | /* 59 | * AlexNet fc8 ReLU layer 60 | */ 61 | 62 | TEST(OUT_OF_PLACE, fc8_relu) { 63 | AlexNet::fc8_relu() 64 | .batchSize(128) 65 | .testInputGradient(); 66 | } 67 | 68 | int main(int argc, char* argv[]) { 69 | const enum nnp_status init_status = nnp_initialize(); 70 | assert(init_status == nnp_status_success); 71 | setenv("TERM", "xterm-256color", 0); 72 | 
::testing::InitGoogleTest(&argc, argv); 73 | return RUN_ALL_TESTS(); 74 | } 75 | -------------------------------------------------------------------------------- /test/relu-input-gradient/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | OverFeat_Fast::conv1_relu() 14 | .batchSize(128) 15 | .testInputGradient(); 16 | } 17 | 18 | /* 19 | * OverFeat (Fast model) conv1 ReLU layer 20 | */ 21 | 22 | TEST(OUT_OF_PLACE, conv2_relu) { 23 | OverFeat_Fast::conv2_relu() 24 | .batchSize(128) 25 | .testInputGradient(); 26 | } 27 | 28 | /* 29 | * OverFeat (Fast model) conv3 ReLU layer 30 | */ 31 | 32 | TEST(OUT_OF_PLACE, conv3_relu) { 33 | OverFeat_Fast::conv3_relu() 34 | .batchSize(128) 35 | .testInputGradient(); 36 | } 37 | 38 | /* 39 | * OverFeat (Fast model) conv4 ReLU layer 40 | */ 41 | 42 | TEST(OUT_OF_PLACE, conv4_relu) { 43 | OverFeat_Fast::conv4_relu() 44 | .batchSize(128) 45 | .testInputGradient(); 46 | } 47 | 48 | /* 49 | * OverFeat (Fast model) fc6 ReLU layer 50 | */ 51 | 52 | TEST(OUT_OF_PLACE, fc6_relu) { 53 | OverFeat_Fast::fc6_relu() 54 | .batchSize(128) 55 | .testInputGradient(); 56 | } 57 | 58 | /* 59 | * OverFeat (Fast model) fc7 ReLU layer 60 | */ 61 | 62 | TEST(OUT_OF_PLACE, fc7_relu) { 63 | OverFeat_Fast::fc7_relu() 64 | .batchSize(128) 65 | .testInputGradient(); 66 | } 67 | 68 | /* 69 | * OverFeat (Fast model) fc8 ReLU layer 70 | */ 71 | 72 | TEST(OUT_OF_PLACE, fc8_relu) { 73 | OverFeat_Fast::fc8_relu() 74 | .batchSize(128) 75 | .testInputGradient(); 76 | } 77 | 78 | int main(int argc, char* argv[]) { 79 | const enum nnp_status init_status = nnp_initialize(); 80 | assert(init_status == nnp_status_success); 81 | setenv("TERM", "xterm-256color", 0); 82 | ::testing::InitGoogleTest(&argc, argv); 83 | return RUN_ALL_TESTS(); 84 | } 85 | -------------------------------------------------------------------------------- /test/relu-input-gradient/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | VGG_A::conv1_relu() 14 | .batchSize(64) 15 | .testInputGradient(); 16 | } 17 | 18 | /* 19 | * VGG model A conv1 ReLU layer 20 | */ 21 | 22 | TEST(OUT_OF_PLACE, conv2_relu) { 23 | VGG_A::conv2_relu() 24 | .batchSize(64) 25 | .testInputGradient(); 26 | } 27 | 28 | /* 29 | * VGG model A conv3 ReLU layer 30 | */ 31 | 32 | TEST(OUT_OF_PLACE, conv3_relu) { 33 | VGG_A::conv3_relu() 34 | .batchSize(64) 35 | .testInputGradient(); 36 | } 37 | 38 | /* 39 | * VGG model A conv5 ReLU layer 40 | */ 41 | 42 | TEST(OUT_OF_PLACE, conv5_relu) { 43 | VGG_A::conv5_relu() 44 | .batchSize(64) 45 | .testInputGradient(); 46 | } 47 | 48 | /* 49 | * VGG model A conv8 ReLU layer 50 | */ 51 | 52 | TEST(OUT_OF_PLACE, conv8_relu) { 53 | VGG_A::conv8_relu() 54 | .batchSize(64) 55 | .testInputGradient(); 56 | } 57 | 58 | /* 59 | * VGG model A fc6 ReLU layer 60 | */ 61 | 62 | TEST(OUT_OF_PLACE, fc6_relu) { 63 | VGG_A::fc6_relu() 64 | .batchSize(64) 65 | .testInputGradient(); 66 | } 67 | 68 | /* 69 | * VGG model A fc8 ReLU layer 70 | */ 71 | 72 | TEST(OUT_OF_PLACE, fc8_relu) { 73 | VGG_A::fc8_relu() 74 | .batchSize(64) 75 | .testInputGradient(); 76 | } 77 | 78 | int main(int argc, char* argv[]) { 79 | const enum 
nnp_status init_status = nnp_initialize(); 80 | assert(init_status == nnp_status_success); 81 | setenv("TERM", "xterm-256color", 0); 82 | ::testing::InitGoogleTest(&argc, argv); 83 | return RUN_ALL_TESTS(); 84 | } 85 | -------------------------------------------------------------------------------- /test/relu-output/alexnet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * AlexNet conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | AlexNet::conv1_relu() 14 | .batchSize(128) 15 | .testOutput(); 16 | } 17 | 18 | TEST(IN_PLACE, conv1_relu) { 19 | AlexNet::conv1_relu() 20 | .batchSize(128) 21 | .testOutputInplace(); 22 | } 23 | 24 | /* 25 | * AlexNet conv1 ReLU layer 26 | */ 27 | 28 | TEST(OUT_OF_PLACE, conv2_relu) { 29 | AlexNet::conv2_relu() 30 | .batchSize(128) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, conv2_relu) { 35 | AlexNet::conv2_relu() 36 | .batchSize(128) 37 | .testOutputInplace(); 38 | } 39 | 40 | /* 41 | * AlexNet conv3 ReLU layer 42 | */ 43 | 44 | TEST(OUT_OF_PLACE, conv3_relu) { 45 | AlexNet::conv3_relu() 46 | .batchSize(128) 47 | .testOutput(); 48 | } 49 | 50 | TEST(IN_PLACE, conv3_relu) { 51 | AlexNet::conv3_relu() 52 | .batchSize(128) 53 | .testOutputInplace(); 54 | } 55 | 56 | /* 57 | * AlexNet conv4 ReLU layer 58 | */ 59 | 60 | TEST(OUT_OF_PLACE, conv4_relu) { 61 | AlexNet::conv4_relu() 62 | .batchSize(128) 63 | .testOutput(); 64 | } 65 | 66 | TEST(IN_PLACE, conv4_relu) { 67 | AlexNet::conv4_relu() 68 | .batchSize(128) 69 | .testOutputInplace(); 70 | } 71 | 72 | /* 73 | * AlexNet fc6 ReLU layer 74 | */ 75 | 76 | TEST(OUT_OF_PLACE, fc6_relu) { 77 | AlexNet::fc6_relu() 78 | .batchSize(128) 79 | .testOutput(); 80 | } 81 | 82 | TEST(IN_PLACE, fc6_relu) { 83 | AlexNet::fc6_relu() 84 | .batchSize(128) 85 | .testOutputInplace(); 86 | } 87 | 88 | /* 89 | * AlexNet fc8 ReLU layer 90 | */ 91 | 92 | TEST(OUT_OF_PLACE, fc8_relu) { 93 | AlexNet::fc8_relu() 94 | .batchSize(128) 95 | .testOutput(); 96 | } 97 | 98 | TEST(IN_PLACE, fc8_relu) { 99 | AlexNet::fc8_relu() 100 | .batchSize(128) 101 | .testOutputInplace(); 102 | } 103 | 104 | int main(int argc, char* argv[]) { 105 | const enum nnp_status init_status = nnp_initialize(); 106 | assert(init_status == nnp_status_success); 107 | setenv("TERM", "xterm-256color", 0); 108 | ::testing::InitGoogleTest(&argc, argv); 109 | return RUN_ALL_TESTS(); 110 | } 111 | -------------------------------------------------------------------------------- /test/relu-output/overfeat-fast.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * OverFeat (Fast model) conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | OverFeat_Fast::conv1_relu() 14 | .batchSize(128) 15 | .testOutput(); 16 | } 17 | 18 | TEST(IN_PLACE, conv1_relu) { 19 | OverFeat_Fast::conv1_relu() 20 | .batchSize(128) 21 | .testOutputInplace(); 22 | } 23 | 24 | /* 25 | * OverFeat (Fast model) conv1 ReLU layer 26 | */ 27 | 28 | TEST(OUT_OF_PLACE, conv2_relu) { 29 | OverFeat_Fast::conv2_relu() 30 | .batchSize(128) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, conv2_relu) { 35 | OverFeat_Fast::conv2_relu() 36 | .batchSize(128) 37 | .testOutputInplace(); 38 | } 39 | 40 | /* 41 | * OverFeat (Fast model) conv3 ReLU layer 42 | */ 43 | 44 | TEST(OUT_OF_PLACE, conv3_relu) { 45 | OverFeat_Fast::conv3_relu() 46 | .batchSize(128) 47 | 
.testOutput(); 48 | } 49 | 50 | TEST(IN_PLACE, conv3_relu) { 51 | OverFeat_Fast::conv3_relu() 52 | .batchSize(128) 53 | .testOutputInplace(); 54 | } 55 | 56 | /* 57 | * OverFeat (Fast model) conv4 ReLU layer 58 | */ 59 | 60 | TEST(OUT_OF_PLACE, conv4_relu) { 61 | OverFeat_Fast::conv4_relu() 62 | .batchSize(128) 63 | .testOutput(); 64 | } 65 | 66 | TEST(IN_PLACE, conv4_relu) { 67 | OverFeat_Fast::conv4_relu() 68 | .batchSize(128) 69 | .testOutputInplace(); 70 | } 71 | 72 | /* 73 | * OverFeat (Fast model) fc6 ReLU layer 74 | */ 75 | 76 | TEST(OUT_OF_PLACE, fc6_relu) { 77 | OverFeat_Fast::fc6_relu() 78 | .batchSize(128) 79 | .testOutput(); 80 | } 81 | 82 | TEST(IN_PLACE, fc6_relu) { 83 | OverFeat_Fast::fc6_relu() 84 | .batchSize(128) 85 | .testOutputInplace(); 86 | } 87 | 88 | /* 89 | * OverFeat (Fast model) fc7 ReLU layer 90 | */ 91 | 92 | TEST(OUT_OF_PLACE, fc7_relu) { 93 | OverFeat_Fast::fc7_relu() 94 | .batchSize(128) 95 | .testOutput(); 96 | } 97 | 98 | TEST(IN_PLACE, fc7_relu) { 99 | OverFeat_Fast::fc7_relu() 100 | .batchSize(128) 101 | .testOutputInplace(); 102 | } 103 | 104 | /* 105 | * OverFeat (Fast model) fc8 ReLU layer 106 | */ 107 | 108 | TEST(OUT_OF_PLACE, fc8_relu) { 109 | OverFeat_Fast::fc8_relu() 110 | .batchSize(128) 111 | .testOutput(); 112 | } 113 | 114 | TEST(IN_PLACE, fc8_relu) { 115 | OverFeat_Fast::fc8_relu() 116 | .batchSize(128) 117 | .testOutputInplace(); 118 | } 119 | 120 | int main(int argc, char* argv[]) { 121 | const enum nnp_status init_status = nnp_initialize(); 122 | assert(init_status == nnp_status_success); 123 | setenv("TERM", "xterm-256color", 0); 124 | ::testing::InitGoogleTest(&argc, argv); 125 | return RUN_ALL_TESTS(); 126 | } 127 | -------------------------------------------------------------------------------- /test/relu-output/vgg-a.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | /* 9 | * VGG model A conv1 ReLU layer 10 | */ 11 | 12 | TEST(OUT_OF_PLACE, conv1_relu) { 13 | VGG_A::conv1_relu() 14 | .batchSize(64) 15 | .testOutput(); 16 | } 17 | 18 | TEST(IN_PLACE, conv1_relu) { 19 | VGG_A::conv1_relu() 20 | .batchSize(64) 21 | .testOutputInplace(); 22 | } 23 | 24 | /* 25 | * VGG model A conv1 ReLU layer 26 | */ 27 | 28 | TEST(OUT_OF_PLACE, conv2_relu) { 29 | VGG_A::conv2_relu() 30 | .batchSize(64) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, conv2_relu) { 35 | VGG_A::conv2_relu() 36 | .batchSize(64) 37 | .testOutputInplace(); 38 | } 39 | 40 | /* 41 | * VGG model A conv3 ReLU layer 42 | */ 43 | 44 | TEST(OUT_OF_PLACE, conv3_relu) { 45 | VGG_A::conv3_relu() 46 | .batchSize(64) 47 | .testOutput(); 48 | } 49 | 50 | TEST(IN_PLACE, conv3_relu) { 51 | VGG_A::conv3_relu() 52 | .batchSize(64) 53 | .testOutputInplace(); 54 | } 55 | 56 | /* 57 | * VGG model A conv5 ReLU layer 58 | */ 59 | 60 | TEST(OUT_OF_PLACE, conv5_relu) { 61 | VGG_A::conv5_relu() 62 | .batchSize(64) 63 | .testOutput(); 64 | } 65 | 66 | TEST(IN_PLACE, conv5_relu) { 67 | VGG_A::conv5_relu() 68 | .batchSize(64) 69 | .testOutputInplace(); 70 | } 71 | 72 | /* 73 | * VGG model A conv8 ReLU layer 74 | */ 75 | 76 | TEST(OUT_OF_PLACE, conv8_relu) { 77 | VGG_A::conv8_relu() 78 | .batchSize(64) 79 | .testOutput(); 80 | } 81 | 82 | TEST(IN_PLACE, conv8_relu) { 83 | VGG_A::conv8_relu() 84 | .batchSize(64) 85 | .testOutputInplace(); 86 | } 87 | 88 | /* 89 | * VGG model A fc6 ReLU layer 90 | */ 91 | 92 | TEST(OUT_OF_PLACE, fc6_relu) { 93 | VGG_A::fc6_relu() 94 | .batchSize(64) 95 | .testOutput(); 
96 | } 97 | 98 | TEST(IN_PLACE, fc6_relu) { 99 | VGG_A::fc6_relu() 100 | .batchSize(64) 101 | .testOutputInplace(); 102 | } 103 | 104 | /* 105 | * VGG model A fc8 ReLU layer 106 | */ 107 | 108 | TEST(OUT_OF_PLACE, fc8_relu) { 109 | VGG_A::fc8_relu() 110 | .batchSize(64) 111 | .testOutput(); 112 | } 113 | 114 | TEST(IN_PLACE, fc8_relu) { 115 | VGG_A::fc8_relu() 116 | .batchSize(64) 117 | .testOutputInplace(); 118 | } 119 | 120 | int main(int argc, char* argv[]) { 121 | const enum nnp_status init_status = nnp_initialize(); 122 | assert(init_status == nnp_status_success); 123 | setenv("TERM", "xterm-256color", 0); 124 | ::testing::InitGoogleTest(&argc, argv); 125 | return RUN_ALL_TESTS(); 126 | } 127 | -------------------------------------------------------------------------------- /test/sgemm/neon.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | TEST(FAST6x8_NEON, kc1) { 9 | auto tester = GemmMicroKernelTester() 10 | .mr(6) 11 | .nr(8) 12 | .kc(1) 13 | .simdWidth(4) 14 | .errorLimit(1.0e-6f); 15 | tester.accumulateC(false) 16 | .testSGEMM(nnp_sgemm_only_6x8__neon); 17 | tester.accumulateC(true) 18 | .testSGEMM(nnp_sgemm_only_6x8__neon); 19 | } 20 | 21 | TEST(FAST6x8_NEON, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(6) 24 | .nr(8) 25 | .kc(2) 26 | .simdWidth(4) 27 | .errorLimit(1.0e-6f); 28 | tester.accumulateC(false) 29 | .testSGEMM(nnp_sgemm_only_6x8__neon); 30 | tester.accumulateC(true) 31 | .testSGEMM(nnp_sgemm_only_6x8__neon); 32 | } 33 | 34 | TEST(FAST6x8_NEON, kc10) { 35 | auto tester = GemmMicroKernelTester() 36 | .mr(6) 37 | .nr(8) 38 | .kc(10) 39 | .simdWidth(4) 40 | .errorLimit(1.0e-6f); 41 | tester.accumulateC(false) 42 | .testSGEMM(nnp_sgemm_only_6x8__neon); 43 | tester.accumulateC(true) 44 | .testSGEMM(nnp_sgemm_only_6x8__neon); 45 | } 46 | 47 | #if CPUINFO_ARCH_ARM 48 | TEST(FAST6x8_AARCH32_NEON, kc1) { 49 | auto tester = GemmMicroKernelTester() 50 | .mr(6) 51 | .nr(8) 52 | .kc(1) 53 | .simdWidth(4) 54 | .errorLimit(1.0e-6f); 55 | tester 56 | .accumulateC(true) 57 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 58 | tester 59 | .accumulateC(false) 60 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 61 | } 62 | 63 | TEST(FAST6x8_AARCH32_NEON, kc2) { 64 | auto tester = GemmMicroKernelTester() 65 | .mr(6) 66 | .nr(8) 67 | .kc(2) 68 | .simdWidth(4) 69 | .errorLimit(1.0e-6f); 70 | tester 71 | .accumulateC(true) 72 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 73 | tester 74 | .accumulateC(false) 75 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 76 | } 77 | 78 | TEST(FAST6x8_AARCH32_NEON, kc10) { 79 | auto tester = GemmMicroKernelTester() 80 | .mr(6) 81 | .nr(8) 82 | .kc(10) 83 | .simdWidth(4) 84 | .errorLimit(1.0e-6f); 85 | tester 86 | .accumulateC(true) 87 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 88 | tester 89 | .accumulateC(false) 90 | .testSGEMM(nnp_sgemm_only_6x8__aarch32_neon); 91 | } 92 | #endif 93 | 94 | TEST(FULL6x8_NEON, kc1) { 95 | auto tester = GemmMicroKernelTester() 96 | .mr(6) 97 | .nr(8) 98 | .kc(1) 99 | .simdWidth(4) 100 | .errorLimit(1.0e-6f); 101 | tester 102 | .accumulateC(true) 103 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 104 | tester 105 | .accumulateC(false) 106 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 107 | } 108 | 109 | TEST(FULL6x8_NEON, kc2) { 110 | auto tester = GemmMicroKernelTester() 111 | .mr(6) 112 | .nr(8) 113 | .kc(2) 114 | .simdWidth(4) 115 | .errorLimit(1.0e-6f); 116 | tester 117 | .accumulateC(true) 118 | 
.testSGEMM(nnp_sgemm_upto_6x8__neon); 119 | tester 120 | .accumulateC(false) 121 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 122 | } 123 | 124 | TEST(FULL6x8_NEON, kc10) { 125 | auto tester = GemmMicroKernelTester() 126 | .mr(6) 127 | .nr(8) 128 | .kc(10) 129 | .simdWidth(4) 130 | .errorLimit(1.0e-6f); 131 | tester 132 | .accumulateC(true) 133 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 134 | tester 135 | .accumulateC(false) 136 | .testSGEMM(nnp_sgemm_upto_6x8__neon); 137 | } 138 | -------------------------------------------------------------------------------- /test/sgemm/psimd.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | TEST(FAST4x8, kc1) { 7 | auto tester = GemmMicroKernelTester() 8 | .mr(4) 9 | .nr(8) 10 | .kc(1) 11 | .simdWidth(4) 12 | .errorLimit(1.0e-6f); 13 | tester 14 | .accumulateC(true) 15 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 16 | tester 17 | .accumulateC(false) 18 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 19 | } 20 | 21 | TEST(FAST4x8, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(4) 24 | .nr(8) 25 | .kc(2) 26 | .simdWidth(4) 27 | .errorLimit(1.0e-6f); 28 | tester 29 | .accumulateC(true) 30 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 31 | tester 32 | .accumulateC(false) 33 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 34 | } 35 | 36 | TEST(FAST4x8, kc10) { 37 | auto tester = GemmMicroKernelTester() 38 | .mr(4) 39 | .nr(8) 40 | .kc(10) 41 | .simdWidth(4) 42 | .errorLimit(1.0e-6f); 43 | tester 44 | .accumulateC(true) 45 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 46 | tester 47 | .accumulateC(false) 48 | .testSGEMM(nnp_sgemm_only_4x8__psimd); 49 | } 50 | 51 | TEST(FULL4x8, kc1) { 52 | auto tester = GemmMicroKernelTester() 53 | .mr(4) 54 | .nr(8) 55 | .kc(1) 56 | .simdWidth(4) 57 | .errorLimit(1.0e-6f); 58 | tester 59 | .accumulateC(true) 60 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 61 | tester 62 | .accumulateC(false) 63 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 64 | } 65 | 66 | TEST(FULL4x8, kc2) { 67 | auto tester = GemmMicroKernelTester() 68 | .mr(4) 69 | .nr(8) 70 | .kc(2) 71 | .simdWidth(4) 72 | .errorLimit(1.0e-6f); 73 | tester 74 | .accumulateC(true) 75 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 76 | tester 77 | .accumulateC(false) 78 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 79 | } 80 | 81 | TEST(FULL4x8, kc10) { 82 | auto tester = GemmMicroKernelTester() 83 | .mr(4) 84 | .nr(8) 85 | .kc(10) 86 | .simdWidth(4) 87 | .errorLimit(1.0e-6f); 88 | tester 89 | .accumulateC(true) 90 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 91 | tester 92 | .accumulateC(false) 93 | .testSGEMM(nnp_sgemm_upto_4x8__psimd); 94 | } 95 | -------------------------------------------------------------------------------- /test/sgemm/scalar.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | TEST(FAST4x3, kc1) { 7 | auto tester = GemmMicroKernelTester() 8 | .mr(4) 9 | .nr(3) 10 | .kc(1) 11 | .simdWidth(1) 12 | .errorLimit(1.0e-6f); 13 | tester 14 | .accumulateC(true) 15 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 16 | tester 17 | .accumulateC(false) 18 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 19 | } 20 | 21 | TEST(FAST4x3, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(4) 24 | .nr(3) 25 | .kc(2) 26 | .simdWidth(1) 27 | .errorLimit(1.0e-6f); 28 | tester 29 | .accumulateC(true) 30 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 31 | tester 32 | .accumulateC(false) 33 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 34 | } 35 | 36 | TEST(FAST4x3, 
kc10) { 37 | auto tester = GemmMicroKernelTester() 38 | .mr(4) 39 | .nr(3) 40 | .kc(10) 41 | .simdWidth(1) 42 | .errorLimit(1.0e-6f); 43 | tester 44 | .accumulateC(true) 45 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 46 | tester 47 | .accumulateC(false) 48 | .testSGEMM(nnp_sgemm_only_4x3__scalar); 49 | } 50 | 51 | TEST(FULL4x3, kc1) { 52 | auto tester = GemmMicroKernelTester() 53 | .mr(4) 54 | .nr(3) 55 | .kc(1) 56 | .simdWidth(1) 57 | .errorLimit(1.0e-6f); 58 | tester 59 | .accumulateC(true) 60 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 61 | tester 62 | .accumulateC(false) 63 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 64 | } 65 | 66 | TEST(FULL4x3, kc2) { 67 | auto tester = GemmMicroKernelTester() 68 | .mr(4) 69 | .nr(3) 70 | .kc(2) 71 | .simdWidth(1) 72 | .errorLimit(1.0e-6f); 73 | tester 74 | .accumulateC(true) 75 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 76 | tester 77 | .accumulateC(false) 78 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 79 | } 80 | 81 | TEST(FULL4x3, kc10) { 82 | auto tester = GemmMicroKernelTester() 83 | .mr(4) 84 | .nr(3) 85 | .kc(10) 86 | .simdWidth(1) 87 | .errorLimit(1.0e-6f); 88 | tester 89 | .accumulateC(true) 90 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 91 | tester 92 | .accumulateC(false) 93 | .testSGEMM(nnp_sgemm_upto_4x3__scalar); 94 | } 95 | -------------------------------------------------------------------------------- /test/sgemm/x86_64-fma3.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | TEST(FAST4x24, kc1) { 7 | auto tester = GemmMicroKernelTester() 8 | .mr(4) 9 | .nr(24) 10 | .kc(1) 11 | .simdWidth(8) 12 | .errorLimit(1.0e-6f); 13 | tester 14 | .accumulateC(true) 15 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 16 | tester 17 | .accumulateC(false) 18 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 19 | } 20 | 21 | TEST(FAST4x24, kc2) { 22 | auto tester = GemmMicroKernelTester() 23 | .mr(4) 24 | .nr(24) 25 | .kc(2) 26 | .simdWidth(8) 27 | .errorLimit(1.0e-6f); 28 | tester 29 | .accumulateC(true) 30 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 31 | tester 32 | .accumulateC(false) 33 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 34 | } 35 | 36 | TEST(FAST4x24, kc10) { 37 | auto tester = GemmMicroKernelTester() 38 | .mr(4) 39 | .nr(24) 40 | .kc(10) 41 | .simdWidth(8) 42 | .errorLimit(1.0e-6f); 43 | tester 44 | .accumulateC(true) 45 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 46 | tester 47 | .accumulateC(false) 48 | .testSGEMM(nnp_sgemm_only_4x24__fma3); 49 | } 50 | 51 | TEST(FULL4x24, kc1) { 52 | auto tester = GemmMicroKernelTester() 53 | .mr(4) 54 | .nr(24) 55 | .kc(1) 56 | .simdWidth(8) 57 | .errorLimit(1.0e-6f); 58 | tester 59 | .accumulateC(true) 60 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 61 | tester 62 | .accumulateC(false) 63 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 64 | } 65 | 66 | TEST(FULL4x24, kc2) { 67 | auto tester = GemmMicroKernelTester() 68 | .mr(4) 69 | .nr(24) 70 | .kc(2) 71 | .simdWidth(8) 72 | .errorLimit(1.0e-6f); 73 | tester 74 | .accumulateC(true) 75 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 76 | tester 77 | .accumulateC(false) 78 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 79 | } 80 | 81 | TEST(FULL4x24, kc10) { 82 | auto tester = GemmMicroKernelTester() 83 | .mr(4) 84 | .nr(24) 85 | .kc(10) 86 | .simdWidth(8) 87 | .errorLimit(1.0e-6f); 88 | tester 89 | .accumulateC(true) 90 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 91 | tester 92 | .accumulateC(false) 93 | .testSGEMM(nnp_sgemm_upto_4x24__fma3); 94 | } 95 | 
-------------------------------------------------------------------------------- /test/softmax-output/imagenet.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | /* 8 | * ImageNet (1000 categories) with batch size = 1 9 | */ 10 | 11 | TEST(OUT_OF_PLACE, batch1) { 12 | SoftmaxTester() 13 | .channels(1000) 14 | .testOutput(); 15 | } 16 | 17 | TEST(IN_PLACE, batch1) { 18 | SoftmaxTester() 19 | .channels(1000) 20 | .testOutputInplace(); 21 | } 22 | 23 | /* 24 | * ImageNet (1000 categories) with batch size = 2 25 | */ 26 | 27 | TEST(OUT_OF_PLACE, batch2) { 28 | SoftmaxTester() 29 | .batchSize(2) 30 | .channels(1000) 31 | .testOutput(); 32 | } 33 | 34 | TEST(IN_PLACE, batch2) { 35 | SoftmaxTester() 36 | .batchSize(2) 37 | .channels(1000) 38 | .testOutputInplace(); 39 | } 40 | 41 | /* 42 | * ImageNet (1000 categories) with batch size = 16 43 | */ 44 | 45 | TEST(OUT_OF_PLACE, batch16) { 46 | SoftmaxTester() 47 | .batchSize(16) 48 | .channels(1000) 49 | .testOutput(); 50 | } 51 | 52 | TEST(IN_PLACE, batch16) { 53 | SoftmaxTester() 54 | .batchSize(16) 55 | .channels(1000) 56 | .testOutputInplace(); 57 | } 58 | 59 | /* 60 | * ImageNet (1000 categories) with batch size = 64 61 | */ 62 | 63 | TEST(OUT_OF_PLACE, batch64) { 64 | SoftmaxTester() 65 | .batchSize(64) 66 | .channels(1000) 67 | .testOutput(); 68 | } 69 | 70 | TEST(IN_PLACE, batch64) { 71 | SoftmaxTester() 72 | .batchSize(64) 73 | .channels(1000) 74 | .testOutputInplace(); 75 | } 76 | 77 | /* 78 | * ImageNet (1000 categories) with batch size = 128 79 | */ 80 | 81 | TEST(OUT_OF_PLACE, batch128) { 82 | SoftmaxTester() 83 | .multithreading(true) 84 | .batchSize(128) 85 | .channels(1000) 86 | .testOutput(); 87 | } 88 | 89 | TEST(IN_PLACE, batch128) { 90 | SoftmaxTester() 91 | .multithreading(true) 92 | .batchSize(128) 93 | .channels(1000) 94 | .testOutputInplace(); 95 | } 96 | 97 | /* 98 | * ImageNet (1000 categories) with batch size = 256 99 | */ 100 | 101 | TEST(OUT_OF_PLACE, batch256) { 102 | SoftmaxTester() 103 | .multithreading(true) 104 | .batchSize(256) 105 | .channels(1000) 106 | .testOutput(); 107 | } 108 | 109 | TEST(IN_PLACE, batch256) { 110 | SoftmaxTester() 111 | .multithreading(true) 112 | .batchSize(256) 113 | .channels(1000) 114 | .testOutputInplace(); 115 | } 116 | 117 | int main(int argc, char* argv[]) { 118 | const enum nnp_status init_status = nnp_initialize(); 119 | assert(init_status == nnp_status_success); 120 | setenv("TERM", "xterm-256color", 0); 121 | ::testing::InitGoogleTest(&argc, argv); 122 | return RUN_ALL_TESTS(); 123 | } 124 | -------------------------------------------------------------------------------- /test/softmax-output/smoke.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | 7 | /* 8 | * Test that implementation works for a small number of channels 9 | */ 10 | 11 | TEST(OUT_OF_PLACE, few_channels) { 12 | auto tester = SoftmaxTester(); 13 | for (size_t channels = 1; channels <= 96; channels += 1) { 14 | tester.channels(1000) 15 | .testOutput(); 16 | } 17 | } 18 | 19 | TEST(IN_PLACE, few_channels) { 20 | auto tester = SoftmaxTester(); 21 | for (size_t channels = 1; channels <= 96; channels += 1) { 22 | tester.channels(1000) 23 | .testOutputInplace(); 24 | } 25 | } 26 | 27 | /* 28 | * Test that implementation works for a moderate number of channels with small batch 29 | */ 30 | 31 | TEST(OUT_OF_PLACE, small_batch) { 32 | auto tester = 
SoftmaxTester(); 33 | for (size_t channels = 100; channels <= 115; channels += 1) { 34 | for (size_t batch = 2; batch <= 5; batch += 1) { 35 | tester.channels(1000) 36 | .batchSize(batch) 37 | .testOutput(); 38 | } 39 | } 40 | } 41 | 42 | TEST(IN_PLACE, small_batch) { 43 | auto tester = SoftmaxTester(); 44 | for (size_t channels = 100; channels <= 115; channels += 1) { 45 | for (size_t batch = 2; batch <= 5; batch += 1) { 46 | tester.channels(1000) 47 | .batchSize(batch) 48 | .testOutputInplace(); 49 | } 50 | } 51 | } 52 | 53 | int main(int argc, char* argv[]) { 54 | const enum nnp_status init_status = nnp_initialize(); 55 | assert(init_status == nnp_status_success); 56 | setenv("TERM", "xterm-256color", 0); 57 | ::testing::InitGoogleTest(&argc, argv); 58 | return RUN_ALL_TESTS(); 59 | } 60 | -------------------------------------------------------------------------------- /test/sxgemm/neon.cc: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | 5 | #include 6 | #include 7 | 8 | #if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 9 | TEST(FAST_S4GEMM_3x3, neon) { 10 | ASSERT_TRUE(cpuinfo_initialize()); 11 | if (cpuinfo_has_arm_neon_fma()) { 12 | GemmMicroKernelTester tester = GemmMicroKernelTester() 13 | .simdWidth(4) 14 | .mr(3) 15 | .nr(3) 16 | .errorLimit(1.0e-6f); 17 | 18 | for (uint32_t kc = 1; kc < 10; kc++) { 19 | tester 20 | .kc(kc) 21 | .accumulateC(true) 22 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__neon)); 23 | tester 24 | .accumulateC(false) 25 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__neon)); 26 | } 27 | } 28 | } 29 | 30 | TEST(FULL_S4GEMM_3x3, neon) { 31 | ASSERT_TRUE(cpuinfo_initialize()); 32 | if (cpuinfo_has_arm_neon_fma()) { 33 | GemmMicroKernelTester tester = GemmMicroKernelTester() 34 | .simdWidth(4) 35 | .mr(3) 36 | .nr(3) 37 | .errorLimit(1.0e-6f); 38 | 39 | for (uint32_t kc = 1; kc < 10; kc++) { 40 | tester 41 | .kc(kc) 42 | .accumulateC(true) 43 | .testSXGEMM(nnp_full_tuple_gemm_function(nnp_s4gemm_upto_3x3__neon)); 44 | tester 45 | .accumulateC(false) 46 | .testSXGEMM(nnp_full_tuple_gemm_function(nnp_s4gemm_upto_3x3__neon)); 47 | } 48 | } 49 | } 50 | #endif /* CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64 */ 51 | 52 | #if CPUINFO_ARCH_ARM 53 | TEST(FAST_S4GEMM_3x3, aarch32_neon) { 54 | ASSERT_TRUE(cpuinfo_initialize()); 55 | if (cpuinfo_has_arm_neon_fma()) { 56 | GemmMicroKernelTester tester = GemmMicroKernelTester() 57 | .simdWidth(4) 58 | .mr(3) 59 | .nr(3) 60 | .errorLimit(1.0e-6f); 61 | 62 | for (uint32_t kc = 1; kc < 10; kc++) { 63 | tester 64 | .kc(kc) 65 | .accumulateC(true) 66 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon)); 67 | tester 68 | .accumulateC(false) 69 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon)); 70 | } 71 | } 72 | } 73 | 74 | TEST(FAST_S4GEMM_3x3, aarch32_neon2) { 75 | ASSERT_TRUE(cpuinfo_initialize()); 76 | if (cpuinfo_has_arm_neon_fma()) { 77 | GemmMicroKernelTester tester = GemmMicroKernelTester() 78 | .simdWidth(4) 79 | .mr(3) 80 | .nr(3) 81 | .errorLimit(1.0e-6f); 82 | 83 | for (uint32_t kc = 1; kc < 10; kc++) { 84 | tester 85 | .kc(kc) 86 | .accumulateC(true) 87 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon2)); 88 | tester 89 | .accumulateC(false) 90 | .testSXGEMM(nnp_fast_tuple_gemm_function(nnp_s4gemm_only_3x3__aarch32_neon2)); 91 | } 92 | } 93 | } 94 | #endif /* CPUINFO_ARCH_ARM */ 95 | 
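Every ARM test above is gated on cpuinfo at runtime: the test hard-asserts that cpuinfo_initialize() succeeds, then exercises the kernel only when the required ISA extension is reported, so the same binary still passes (vacuously) on CPUs without it. A hedged sketch of that gating pattern, assuming linkage against gtest_main and with a placeholder comment where the tests above invoke the real nnp_* kernels:

// Sketch of the cpuinfo gating used by the hxgemm/sxgemm NEON tests.
// The real tests silently fall through on unsupported CPUs; GTEST_SKIP()
// (GoogleTest >= 1.10) is used here to make the skip visible instead.
#include <gtest/gtest.h>
#include <cpuinfo.h>

TEST(ISA_GATING, neon_fma_sketch) {
  ASSERT_TRUE(cpuinfo_initialize());
  if (!cpuinfo_has_arm_neon_fma()) {
    GTEST_SKIP() << "NEON FMA not available on this CPU";
  }
  // ... exercise the NEON kernel here, e.g. via GemmMicroKernelTester ...
}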
-------------------------------------------------------------------------------- /web/nnpack.nmf: --------------------------------------------------------------------------------
{
    "program": {
        "portable": {
            "pnacl-translate": {
                "url": "webnnpack.pexe"
            }
        }
    }
}
--------------------------------------------------------------------------------