├── CMakeLists.txt ├── README.md ├── build_scripts ├── Info.plist ├── build_android.sh ├── build_ios.sh ├── build_linux_aarch64.sh ├── build_linux_avx.sh ├── build_macos_avx.sh ├── ios.toolchain.cmake ├── linux-aarch64.toolchain.cmake └── pack_ios_framework.sh └── src ├── CMakeLists.txt ├── blob.cpp ├── blob.h ├── booster ├── CMakeLists.txt ├── arm │ ├── CMakeLists.txt │ ├── booster.cpp │ ├── caffe_interp.cpp │ ├── depthwise.cpp │ ├── generic_kernels.cpp │ ├── helper.cpp │ ├── sgeconv.cpp │ ├── sgemm.cpp │ ├── sgemm_legacy.cpp │ ├── sgemm_legacy.h │ ├── sgemv.cpp │ ├── winograd_kernels.cpp │ └── winograd_kernels_F63.cpp ├── avx │ ├── CMakeLists.txt │ ├── booster.cpp │ ├── caffe_interp.cpp │ ├── depthwise.cpp │ ├── generic_kernels.cpp │ ├── helper.cpp │ ├── sgeconv.cpp │ ├── sgemm.cpp │ ├── sgemv.cpp │ ├── winograd_kernels_F63.cpp │ └── winograd_kernels_F63_fused.cpp └── include │ └── booster │ ├── booster.h │ ├── caffe_interp.h │ ├── depthwise.h │ ├── generic_kernels.h │ ├── helper.h │ ├── power.h │ ├── sgeconv.h │ ├── sgemm.h │ ├── sgemv.h │ ├── thpool.h │ └── winograd_kernels.h ├── layer.cpp ├── layer.h ├── layer_factory.cpp ├── layer_factory.h ├── layers ├── batchnorm_layer.h ├── concat_layer.h ├── conv_layer.h ├── dropout_layer.h ├── eltwise_layer.h ├── inner_product_layer.h ├── input_layer.h ├── pooling_layer.h ├── relu_layer.h ├── scale_layer.h ├── softmax_layer.h └── split_layer.h ├── mempool.cpp ├── mempool.h ├── ncnn ├── allocator.cpp ├── allocator.h ├── mat.cpp ├── mat.h ├── mat_pixel.cpp ├── mat_pixel_resize.cpp ├── modelbin.cpp ├── modelbin.h ├── paramdict.cpp ├── paramdict.h └── platform.h ├── net.cpp ├── net.h ├── rt_param.h ├── utils.cpp └── utils.h /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | if(CMAKE_TOOLCHAIN_FILE) 2 | set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to") 3 | # get absolute path, but 
get_filename_component ABSOLUTE only refer with source dir, so find_file here :( 4 | get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME) 5 | find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH) 6 | message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}") 7 | endif() 8 | 9 | if(NOT DEFINED CMAKE_INSTALL_PREFIX) 10 | set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory") 11 | endif() 12 | message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}") 13 | 14 | project(feather) 15 | 16 | cmake_minimum_required(VERSION 2.8) 17 | 18 | #set(CMAKE_BUILD_TYPE Debug) 19 | #set(CMAKE_BUILD_TYPE Release) 20 | 21 | option(FEATHER_OPENMP "openmp support" ON) 22 | 23 | if(FEATHER_OPENMP) 24 | if(CMAKE_HOST_APPLE) 25 | #if(1) 26 | if(IOS) 27 | #if(0) 28 | message(STATUS "iOS doesn't support OpenMP, use GCD instead.") 29 | set(OPENMP_FOUND false) 30 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fembed-bitcode") 31 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fembed-bitcode") 32 | else() 33 | set(OpenMP_C_FLAGS) 34 | set(OpenMP_CXX_FLAGS) 35 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 36 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 37 | message(STATUS ${OpenMP_C_FLAGS}) 38 | message(STATUS ${OpenMP_CXX_FLAGS}) 39 | endif() 40 | else() 41 | #find_package(OpenMP) 42 | include(FindOpenMP) 43 | if(OPENMP_FOUND) 44 | #if(OpenMP_CXX_FOUND OR OPENMP_FOUND) 45 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 46 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 47 | message(STATUS ${OpenMP_C_FLAGS}) 48 | message(STATUS ${OpenMP_CXX_FLAGS}) 49 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") 50 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") 51 | endif() 52 | message(STATUS "OpenMP flags ${CMAKE_CXX_FLAGS}") 53 | endif() 54 | endif() 55 | 56 | #add_definitions(-Wall -Wextra -Wno-unused-function) 
57 | add_definitions(-fPIC) 58 | add_definitions(-Ofast) 59 | add_definitions(-ffast-math) 60 | # add_definitions(-march=native) 61 | 62 | # add_definitions(-flto) 63 | 64 | add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) 65 | 66 | if(ANDROID) 67 | # disable shared library on android 68 | #set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE) 69 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti ") 70 | add_definitions("-DFEATHER_ANDROID_LOG") 71 | add_definitions("-D_NDK_MATH_NO_SOFTFP=1") 72 | if(${ANDROID_ABI} STREQUAL "armeabi-v7a") 73 | add_definitions("-mfpu=neon-vfpv4") 74 | endif() 75 | elseif(IOS) 76 | # disable shared library on xcode ios 77 | add_definitions(-isysroot ${IOS_SDK_PATH} -arch ${IOS_ARCH}) 78 | set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE) 79 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti ") 80 | endif() 81 | 82 | ############################################## 83 | 84 | # add_subdirectory(examples) 85 | # add_subdirectory(benchmark) 86 | add_subdirectory(src) 87 | #if(NOT ANDROID AND NOT IOS) 88 | # add_subdirectory(tools) 89 | #endif() 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | [![license](http://img.shields.io/badge/license-BSD3-blue.svg?style=flat)](https://github.com/Tencent/FeatherCNN/blob/master/LICENSE) 4 | [![Release Version](https://img.shields.io/badge/release-0.1.0-red.svg)](https://github.com/Tencent/FeatherCNN/releases) 5 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/Tencent/FeatherCNN/pulls) 6 | 7 | ## Introduction 8 | 9 | FeatherCNN is a high-performance lightweight CNN inference library, developed by Tencent AI Platform Department. 
10 | FeatherCNN originates from our game AI project for King of Glory (Chinese: 王者荣耀), in which we aim to build a neural model for MOBA game AI and run it on mobile devices. 11 | FeatherCNN currently targets ARM CPUs. 12 | We will extend it to cover other architectures in the near future. 13 | 14 | Compared with other libraries, FeatherCNN has the following features: 15 | 16 | - **High Performance** FeatherCNN delivers state-of-the-art inference computing performance on a wide range of devices, including mobile phones (iOS/Android), embedded devices (Linux) as well as ARM-based servers (Linux). 17 | 18 | - **Easy Deployment** FeatherCNN packs everything in a single code base to get rid of third-party dependencies. Hence, it facilitates deployment on mobile platforms. 19 | 20 | - **Featherweight** The compiled FeatherCNN library is small-sized (hundreds of KBs). 21 | 22 | Please kindly open an issue in this repo for bug reports and enhancement suggestions. We are grateful for user feedback and will actively polish this library. 23 | 24 | ## Citation 25 | 26 | FeatherCNN: Fast Inference Computation with TensorGEMM on ARM Architectures (TPDS September 2019, In press, DOI:10.1109/TPDS.2019.2939785) 27 | 28 | ## Clone hints 29 | The FeatherCNN repository has a heavy development history, please only clone the master branch as follows: 30 | ``` 31 | git clone -b master --single-branch https://github.com/tencent/FeatherCNN.git 32 | ``` 33 | 34 | ## Detailed Instructions for iOS/Android/Linux 35 | 36 | [**Build From Source**](https://github.com/Tencent/FeatherCNN/wikis/Build-From-Source) 37 | 38 | [**iOS Guide**](https://github.com/Tencent/FeatherCNN/wikis/iOS-Guide) 39 | 40 | [**Android Guide**](https://github.com/Tencent/FeatherCNN/wiki/Android-Guide) 41 | 42 | [**Android ADB Guide**](https://github.com/Tencent/FeatherCNN/wiki/Android-ADB-Guide) 43 | 44 | ## Usage 45 | 46 | ### Model Format Conversion 47 | 48 | FeatherCNN accepts Caffemodels. 
It merges the structure file (.prototxt) and the weight file (.caffemodel) into a single binary model (.feathermodel). The convert tool requires protobuf, but you don't need them for the library. 49 | 50 | [**Model Convert Guide**](https://github.com/Tencent/FeatherCNN/wikis/Model-Convert-Guide). 51 | 52 | ### Runtime Interfaces 53 | 54 | The basic user interfaces are listed in feather/net.h. Currently we are using raw pointers to reference data. 55 | We may provide more convenient interfaces in the near future. 56 | 57 | Before inference, FeatherCNN requires two steps to initialize the network. 58 | ```cpp 59 | feather::Net forward_net(num_threads); 60 | forward_net.InitFromPath(FILE_PATH_TO_FEATHERMODEL); 61 | ``` 62 | The net can also be initialized with raw buffers and FILE pointers. 63 | We can perform forward computation with raw `float*` buffer consequently. 64 | ```cpp 65 | forward_net.Forward(PTR_TO_YOUR_INPUT_DATA); 66 | ``` 67 | The output can be extracted from the net by the name of blobs. The blob names are kept consistent with caffe prototxt. 68 | ```cpp 69 | forward_net.ExtractBlob(PTR_TO_YOUR_OUTPUT_BUFFER, BLOB_NAME); 70 | ``` 71 | BTW, you can also get the blob's data size by calling 72 | ```cpp 73 | size_t data_size = 0; 74 | forward_net.GetBlobDataSize(&data_size, BLOB_NAME); 75 | ``` 76 | 77 | ## Performance Benchmarks 78 | We have tested FeatherCNN on a bunch of devices, see [**this page**](https://github.com/Tencent/FeatherCNN/wikis/Benchmarks) for details. 79 | 80 | ## User Groups 81 | 82 | Telegram: https://t.me/FeatherCNN 83 | 84 | QQ: 728147343 85 | -------------------------------------------------------------------------------- /build_scripts/Info.plist: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | CFBundleName 6 | feather 7 | CFBundleIdentifier 8 | com.tencent.feather 9 | CFBundleVersion 10 | 0.1 11 | CFBundleShortVersionString 12 | 0.1 13 | CFBundleSignature 14 | ???? 
15 | CFBundlePackageType 16 | FMWK 17 | 18 | 19 | -------------------------------------------------------------------------------- /build_scripts/build_android.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p build-android 4 | pushd build-android 5 | mkdir -p arm64-v8a 6 | pushd arm64-v8a 7 | cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DBOOSTER_ARM=1 -DCOMPILE_OPENCL=1 ../.. 8 | make -j6 9 | make install 10 | popd 11 | 12 | mkdir -p armeabi-v7a 13 | pushd armeabi-v7a 14 | cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-16 -DBOOSTER_ARM=1 -DCOMPILE_OPENCL=1 ../.. 15 | make -j6 16 | make install 17 | popd 18 | 19 | #mkdir -p armeabi 20 | #pushd armeabi 21 | #cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi" -DANDROID_PLATFORM=android-16 -DFEATHER_ARM=0 ../.. 
22 | #make -j4 23 | #make install 24 | #popd 25 | 26 | mkdir -p feather 27 | mkdir -p booster 28 | pushd feather 29 | mkdir -p include 30 | mkdir -p include/feather 31 | cp -r ../arm64-v8a/install/feather/include/* ./include/feather/ 32 | mkdir -p arm64-v8a 33 | cp ../arm64-v8a/install/feather/lib/* ./arm64-v8a/ 34 | mkdir -p armeabi-v7a 35 | cp ../armeabi-v7a/install/feather/lib/* ./armeabi-v7a/ 36 | #mkdir -p armeabi 37 | #cp ../armeabi/install/feather/lib/* ./armeabi/ 38 | #popd 39 | popd 40 | pushd booster 41 | mkdir -p include/booster 42 | cp -r ../arm64-v8a/install/booster/include/* ./include/ 43 | mkdir -p arm64-v8a 44 | cp ../arm64-v8a/install/booster/lib/* ./arm64-v8a/ 45 | mkdir -p armeabi-v7a 46 | cp ../armeabi-v7a/install/booster/lib/* ./armeabi-v7a/ 47 | popd 48 | -------------------------------------------------------------------------------- /build_scripts/build_ios.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo $(xcrun --sdk iphoneos --show-sdk-path) 4 | mkdir -p build-ios 5 | pushd build-ios 6 | mkdir -p arm64 7 | pushd arm64 8 | cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=arm64 -DBOOSTER_ARM=1 ../.. 9 | make -j4 10 | make install 11 | popd 12 | 13 | mkdir -p armv7s 14 | pushd armv7s 15 | cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=armv7s -DBOOSTER_ARM=1 ../.. 16 | make -j4 17 | make install 18 | popd 19 | 20 | #mkdir -p armv7 21 | #pushd armv7 22 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=armv7 ../.. 
23 | #make -j4 24 | #make install 25 | #popd 26 | 27 | #mkdir -p x86_64 28 | #pushd x86_64 29 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphonesimulator --show-sdk-path) -DIOS_ARCH=x86_64 ../.. 30 | #make -j4 31 | #make install 32 | #popd 33 | 34 | #mkdir -p i386 35 | #pushd i386 36 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphonesimulator --show-sdk-path) -DIOS_ARCH=i386 ../.. 37 | #make -j4 38 | #make install 39 | #popd 40 | 41 | popd 42 | bash ./build_scripts/pack_ios_framework.sh 43 | -------------------------------------------------------------------------------- /build_scripts/build_linux_aarch64.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p build-linux-aarch64 4 | pushd build-linux-aarch64 5 | #cmake -DCMAKE_TOOLCHAIN_FILE=../build_scripts/linux-aarch64.toolchain.cmake .. -DFEATHER_ARM=true -DCOMPILE_OPENCL=false 6 | cmake .. -DBOOSTER_ARM=true -DCOMPILE_OPENCL=false -DCMAKE_BUILD_TYPE=Release 7 | make -j4 8 | make install 9 | popd 10 | -------------------------------------------------------------------------------- /build_scripts/build_linux_avx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p build-linux-avx 4 | pushd build-linux-avx 5 | cmake .. -DBOOSTER_AVX=1 -DCMAKE_BUILD_TYPE=Release 6 | make VERBOSE=1 7 | make install 8 | popd -------------------------------------------------------------------------------- /build_scripts/build_macos_avx.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p build-macos-avx 4 | pushd build-macos-avx 5 | cmake .. 
 -DBOOSTER_AVX=1 -DCMAKE_BUILD_TYPE=Release 6 | make -j4 7 | make install 8 | popd 9 | -------------------------------------------------------------------------------- /build_scripts/ios.toolchain.cmake: -------------------------------------------------------------------------------- 1 | # Toolchain file for cross-compiling FeatherCNN for iOS using clang. 2 | # Usage (see build_scripts/build_ios.sh): 3 | # $ cmake -DCMAKE_TOOLCHAIN_FILE=build_scripts/ios.toolchain.cmake \ 4 | # -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) \ 5 | # -DIOS_ARCH=arm64 .. 6 | # CMAKE_FIND_ROOT_PATH is pointed at the iOS SDK (IOS_SDK_PATH) so that 7 | # libraries and headers are resolved from the target environment. 8 | set(CMAKE_SYSTEM_NAME Darwin) 9 | set(CMAKE_SYSTEM_VERSION 1) 10 | set(UNIX True) 11 | set(APPLE True) 12 | set(IOS True) 13 | 14 | # specify the cross compiler as clang. 15 | set(CMAKE_C_COMPILER clang) 16 | set(CMAKE_CXX_COMPILER clang++) 17 | 18 | # To build the tests, we need to set where the target environment containing 19 | # the required library is. 20 | set(CMAKE_FIND_ROOT_PATH ${IOS_SDK_PATH}) 21 | # search for programs in the build host directories 22 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 23 | # for libraries and headers in the target directories 24 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 25 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 26 | 27 | 28 | # Set additional variables. 29 | # If we don't set some of these, CMake will end up using the host version. 30 | # We want the full path, however, so we can pass EXISTS and other checks in 31 | # our CMake code. 
32 | find_program(CC_FULL_PATH clang) 33 | if (NOT CC_FULL_PATH) 34 | message(FATAL_ERROR "Cross-compiler clang not found") 35 | endif () 36 | get_filename_component(CC_DIR ${CC_FULL_PATH} PATH) 37 | message(STATUS "CC path is ${CC_FULL_PATH}") 38 | #set(IOS_ARCH arm64) 39 | 40 | #SET(CMAKE_LINKER ${CC_DIR}/aarch64-${TARGET_ABI}-ld CACHE FILEPATH "linker") 41 | #SET(CMAKE_ASM_COMPILER ${CC_DIR}/aarch64-${TARGET_ABI}-as CACHE FILEPATH "assembler") 42 | #SET(CMAKE_OBJCOPY ${CC_DIR}/aarch64-${TARGET_ABI}-objcopy CACHE FILEPATH "objcopy") 43 | #SET(CMAKE_STRIP ${CC_DIR}/aarch64-${TARGET_ABI}-strip CACHE FILEPATH "strip") 44 | #SET(CMAKE_CPP ${CC_DIR}/aarch64-${TARGET_ABI}-cpp CACHE FILEPATH "cpp") 45 | 46 | set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE 1) 47 | # Without this, Xcode adds -fembed-bitcode-marker compile options instead of -fembed-bitcode set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}") 48 | set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") 49 | set(BITCODE_FLAGS "-fembed-bitcode") 50 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${BITCODE_FLAGS}" CACHE INTERNAL "ios c compiler flags" FORCE) 51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BITCODE_FLAGS}" CACHE INTERNAL "ios c compiler flags" FORCE) 52 | 53 | 54 | 55 | -------------------------------------------------------------------------------- /build_scripts/linux-aarch64.toolchain.cmake: -------------------------------------------------------------------------------- 1 | # ********************************************************** 2 | # Copyright (c) 2014-2017 Google, Inc. All rights reserved. 3 | # ********************************************************** 4 | 5 | # Redistribution and use in source and binary forms, with or without 6 | # modification, are permitted provided that the following conditions are met: 7 | # 8 | # * Redistributions of source code must retain the above copyright notice, 9 | # this list of conditions and the following disclaimer. 
10 | # 11 | # * Redistributions in binary form must reproduce the above copyright notice, 12 | # this list of conditions and the following disclaimer in the documentation 13 | # and/or other materials provided with the distribution. 14 | # 15 | # * Neither the name of Google, Inc. nor the names of its contributors may be 16 | # used to endorse or promote products derived from this software without 17 | # specific prior written permission. 18 | # 19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 | # ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE 23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 | # DAMAGE. 30 | 31 | # For cross-compiling on arm64 Linux using gcc-aarch64-linux-gnu package: 32 | # - install AArch64 tool chain: 33 | # $ sudo apt-get install g++-aarch64-linux-gnu 34 | # - cross-compiling config 35 | # $ cmake -DCMAKE_TOOLCHAIN_FILE=../dynamorio/make/toolchain-arm64.cmake ../dynamorio 36 | # You may have to set CMAKE_FIND_ROOT_PATH to point to the target enviroment, e.g. 37 | # by passing -DCMAKE_FIND_ROOT_PATH=/usr/aarch64-linux-gnu on Debian-like systems. 
38 | set(CMAKE_SYSTEM_NAME Linux) 39 | set(CMAKE_SYSTEM_PROCESSOR aarch64) 40 | set(TARGET_ABI "linux-gnu") 41 | # specify the cross compiler 42 | SET(CMAKE_C_COMPILER aarch64-${TARGET_ABI}-gcc) 43 | SET(CMAKE_CXX_COMPILER aarch64-${TARGET_ABI}-g++) 44 | 45 | # To build the tests, we need to set where the target environment containing 46 | # the required library is. On Debian-like systems, this is 47 | # /usr/aarch64-linux-gnu. 48 | SET(CMAKE_FIND_ROOT_PATH "/usr/aarch64-${TARGET_ABI}") 49 | # search for programs in the build host directories 50 | SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) 51 | # for libraries and headers in the target directories 52 | SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) 53 | SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) 54 | 55 | # Set additional variables. 56 | # If we don't set some of these, CMake will end up using the host version. 57 | # We want the full path, however, so we can pass EXISTS and other checks in 58 | # the our CMake code. 59 | find_program(GCC_FULL_PATH aarch64-${TARGET_ABI}-gcc) 60 | if (NOT GCC_FULL_PATH) 61 | message(FATAL_ERROR "Cross-compiler aarch64-${TARGET_ABI}-gcc not found") 62 | endif () 63 | get_filename_component(GCC_DIR ${GCC_FULL_PATH} PATH) 64 | SET(CMAKE_LINKER ${GCC_DIR}/aarch64-${TARGET_ABI}-ld CACHE FILEPATH "linker") 65 | SET(CMAKE_ASM_COMPILER ${GCC_DIR}/aarch64-${TARGET_ABI}-as CACHE FILEPATH "assembler") 66 | SET(CMAKE_OBJCOPY ${GCC_DIR}/aarch64-${TARGET_ABI}-objcopy CACHE FILEPATH "objcopy") 67 | SET(CMAKE_STRIP ${GCC_DIR}/aarch64-${TARGET_ABI}-strip CACHE FILEPATH "strip") 68 | SET(CMAKE_CPP ${GCC_DIR}/aarch64-${TARGET_ABI}-cpp CACHE FILEPATH "cpp") 69 | -------------------------------------------------------------------------------- /build_scripts/pack_ios_framework.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | NAME=feather 4 | 5 | ##### package android lib 6 | #ANDROIDPKGNAME=${NAME}-android-lib 7 | #rm -rf $ANDROIDPKGNAME 8 | 
#mkdir -p $ANDROIDPKGNAME 9 | #mkdir -p $ANDROIDPKGNAME/armeabi-v7a 10 | #mkdir -p $ANDROIDPKGNAME/arm64-v8a 11 | #mkdir -p $ANDROIDPKGNAME/include 12 | #cp build-android-armv7/install/lib/lib${NAME}.a $ANDROIDPKGNAME/armeabi-v7a/ 13 | #cp build-android-aarch64/install/lib/lib${NAME}.a $ANDROIDPKGNAME/arm64-v8a/ 14 | #cp build-android-aarch64/install/include/* $ANDROIDPKGNAME/include/ 15 | #rm -f $ANDROIDPKGNAME.zip 16 | #zip -9 -r $ANDROIDPKGNAME.zip $ANDROIDPKGNAME 17 | 18 | ##### package ios framework 19 | IOSPKGNAME=./build-ios/${NAME}.framework 20 | rm -rf $IOSPKGNAME 21 | mkdir -p $IOSPKGNAME/Versions/A/Headers 22 | mkdir -p $IOSPKGNAME/Versions/A/Resources 23 | ln -s A $IOSPKGNAME/Versions/Current 24 | ln -s Versions/Current/Headers $IOSPKGNAME/Headers 25 | ln -s Versions/Current/Resources $IOSPKGNAME/Resources 26 | ln -s Versions/Current/${NAME} $IOSPKGNAME/${NAME} 27 | lipo -create \ 28 | build-ios/arm64/install/${NAME}/lib/lib${NAME}.a \ 29 | build-ios/armv7/install/${NAME}/lib/lib${NAME}.a \ 30 | build-ios/x86_64/install/${NAME}/lib/lib${NAME}.a \ 31 | build-ios/i386/install/${NAME}/lib/lib${NAME}.a \ 32 | -o $IOSPKGNAME/Versions/A/${NAME} 33 | #build-ios-sim/install/${NAME}/lib/lib${NAME}.a \ 34 | cp -r build-ios/arm64/install/${NAME}/include/* $IOSPKGNAME/Versions/A/Headers/ 35 | 36 | #HEADER_PATH=$IOSPKGNAME/Versions/A/Headers 37 | #HEADERS_TO_EDIT=$HEADER_PATH/feather_simple_generated.h\ $HEADER_PATH/flatbuffers/flatbuffers.h\ $HEADER_PATH/flatbuffers/base.h 38 | #HEADERS_TO_EDIT=$HEADER_PATH/flatbuffers/flatbuffers.h 39 | #HEADERS_TO_EDIT=$HEADER_PATH/flatbuffers/base.h 40 | 41 | # Fix the relative path for the framework package. 
42 | #for FILE in $HEADERS_TO_EDIT 43 | #do 44 | # echo $FILE 45 | # sed -i.bak 's/flatbuffers\//feather\/flatbuffers\//' $FILE 46 | # echo $FILE.bak 47 | # rm $FILE.bak 48 | #done 49 | 50 | cp ./build_scripts/Info.plist ${IOSPKGNAME}/Versions/A/Resources/ 51 | rm -f $IOSPKGNAME.zip 52 | zip -9 -y -r $IOSPKGNAME.zip $IOSPKGNAME 53 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB LIB_HEADERS *.h) 2 | file(GLOB LIB_SRC *.cpp) 3 | file(GLOB LAYER_HEADERS layers/*.h) 4 | file(GLOB LAYER_SRC layers/*.cpp) 5 | file(GLOB FLATBUFFERS_HEADERS flatbuffers/*.h) 6 | 7 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format") 8 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -std=c++11 -Wall") 9 | 10 | include_directories("${PROJECT_SOURCE_DIR}/src") 11 | include_directories("booster/include") 12 | 13 | message(STATUS "Using Booster backend.") 14 | add_subdirectory(./booster) 15 | if(BOOSTER_ARM) 16 | message(STATUS "Compiling for Arm backend.") 17 | add_library(feather STATIC ${LIB_SRC} ${LIB_HEADERS} ${LAYER_SRC} ${LAYER_HEADERS} $) 18 | elseif(BOOSTER_AVX) 19 | message(STATUS "Compiling for AVX backend.") 20 | add_library(feather STATIC ${LIB_SRC} ${LIB_HEADERS} ${LAYER_SRC} ${LAYER_HEADERS} $) 21 | else() 22 | error("You have to specify a backend, either FEATHER_ARM or FEATHER_AVX") 23 | endif() 24 | 25 | set(FEATHER_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/feather") 26 | 27 | message(Library headers: ${LIB_HEADERS}) 28 | list(REMOVE_ITEM LIB_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/feather_simple_generated.h) 29 | message(Library headers: ${LIB_HEADERS}) 30 | install(TARGETS feather DESTINATION "${FEATHER_INSTALL_DIR}/lib") 31 | install(FILES ${LIB_HEADERS} DESTINATION "${FEATHER_INSTALL_DIR}/include") 32 | -------------------------------------------------------------------------------- /src/blob.cpp: 
-------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #include "blob.h" 16 | 17 | #include 18 | 19 | namespace feather 20 | { 21 | template 22 | void Blob::Alloc() 23 | { 24 | size_t dim_byte = _num * _channels * _height * _width * sizeof(Dtype); 25 | _data = (Dtype*) _mm_malloc(dim_byte, 32); 26 | } 27 | template 28 | void Blob::Free() 29 | { 30 | if (this->_data) 31 | { 32 | free(this->_data); 33 | this->_data = NULL; 34 | } 35 | } 36 | 37 | template 38 | void Blob::ReshapeWithRealloc(const Blob *p_blob) 39 | { 40 | int num = p_blob->num(); 41 | int channels = p_blob->channels(); 42 | int height = p_blob->height(); 43 | int width = p_blob->width(); 44 | 45 | ReshapeWithRealloc(num, channels, height, width); 46 | } 47 | 48 | template 49 | void Blob::ReshapeWithRealloc(int num, int channels, int height, int width) 50 | { 51 | // LOGI("Reallc: (%d %d %d %d) to (%d %d %d %d)", _num, _channels, _height, _width, num, channels, height, width); 52 | int elem_size = num * channels * height * width; 53 | Realloc(elem_size); 54 | this->_num = num; 55 | this->_channels = channels; 56 | this->_height = height; 57 | this->_width = width; 58 | } 59 | 60 | template 61 | void Blob::Realloc(size_t 
elem_size) 62 | { 63 | if (elem_size > this->data_size()) 64 | { 65 | Free(); 66 | _data = (Dtype*) _mm_malloc(elem_size * sizeof(Dtype), 32); 67 | } 68 | } 69 | 70 | template 71 | int Blob::CopyFromMat(const ncnn::Mat& mat) 72 | { 73 | this->ReshapeWithRealloc(1, mat.c, mat.h, mat.w); 74 | this->CopyDataFromMat(mat); 75 | return 0; 76 | } 77 | 78 | template 79 | int Blob::CopyDataFromMat(const ncnn::Mat& mat) 80 | { 81 | if (this->data_size() != mat.c * mat.h * mat.w) 82 | { 83 | LOGE("In Blob %s: Mat and target blob shape mismatch. blob shape (%zu %zu %zu %zu), mat shape (%d %d %d)\n", this->name.c_str(), num(), channels(), height(), width(), mat.c, mat.h, mat.w); 84 | return -500; // BAD DATA DIMENSION 85 | } 86 | Dtype* dst_p = (Dtype *) this->_data; 87 | size_t copy_stride = mat.h * mat.w; 88 | for (int c = 0; c < mat.c; ++c ) 89 | { 90 | ncnn::Mat channel_mat = mat.channel(c); 91 | memcpy(dst_p, channel_mat.data, copy_stride * sizeof(Dtype)); 92 | dst_p += copy_stride; 93 | } 94 | return 0; 95 | } 96 | 97 | template class Blob; 98 | template class Blob; 99 | template class Blob; 100 | }; 101 | -------------------------------------------------------------------------------- /src/blob.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. 
See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "utils.h" 18 | 19 | #include "ncnn/mat.h" 20 | 21 | #include 22 | 23 | namespace feather 24 | { 25 | template 26 | class Blob 27 | { 28 | public: 29 | Blob() 30 | : name(), _num(0), _channels(0), _height(0), _width(0), _data(NULL) 31 | {} 32 | 33 | explicit Blob(std::string name) 34 | : name(name), _num(0), _channels(0), _height(0), _width(0), _data(NULL) 35 | {} 36 | 37 | explicit Blob(const size_t num, const size_t channels, const size_t height, const size_t width) 38 | : name(), _data(NULL), _num(num), _channels(channels), _height(height), _width(width) 39 | {} 40 | 41 | explicit Blob(Dtype* data, const size_t num, const size_t channels, const size_t height, const size_t width) 42 | : name(), _data(data), _num(num), _channels(channels), _height(height), _width(width) 43 | {} 44 | 45 | ~Blob() 46 | { 47 | Free(); 48 | } 49 | 50 | void Free(); 51 | void Alloc(); 52 | 53 | void ReshapeWithRealloc(const Blob *p_blob); 54 | void ReshapeWithRealloc(int num, int channels, int height, int width); 55 | void Realloc(size_t elem_size); 56 | 57 | int CopyFromMat(const ncnn::Mat &src_mat); 58 | int CopyDataFromMat(const ncnn::Mat &src_mat); 59 | 60 | void CopyData(const Dtype* data) 61 | { 62 | size_t size = _num * _channels * _height * _width; 63 | memcpy(_data, data, sizeof(Dtype) * size); 64 | } 65 | void CopyShape(const Blob* p_blob) 66 | { 67 | this->_num = p_blob->num(); 68 | this->_channels = p_blob->channels(); 69 | this->_width = p_blob->width(); 70 | this->_height = p_blob->height(); 71 | } 72 | void Copy(const Blob* p_blob) 73 | { 74 | this->Free(); 75 | CopyShape(p_blob); 76 | this->Alloc(); 77 | CopyData(p_blob->data()); 78 | } 79 | 80 | Dtype* data() const 81 | { 82 | return (Dtype*) _data; 83 | } 84 | 85 | size_t data_size() const 86 | { 87 | return _num * _channels * _height * _width; 88 | } 89 | size_t num() 
const 90 | { 91 | return _num; 92 | } 93 | size_t channels() const 94 | { 95 | return _channels; 96 | } 97 | size_t height() const 98 | { 99 | return _height; 100 | } 101 | size_t width() const 102 | { 103 | return _width; 104 | } 105 | void PrintBlobInfo() const 106 | { 107 | printf("----BlobShape----\n"); 108 | printf("NCHW=(%zu %zu %zu %zu)\n", _num, _channels, _height, _width); 109 | printf("----------------\n"); 110 | } 111 | 112 | std::string name; 113 | 114 | void* _data; 115 | size_t _elemsize; 116 | 117 | size_t _num; 118 | size_t _channels; 119 | size_t _height; 120 | size_t _width; 121 | }; 122 | }; 123 | -------------------------------------------------------------------------------- /src/booster/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8.10) 2 | 3 | file(GLOB LIB_HEADERS ./include/booster/*.h) 4 | 5 | if(CMAKE_SYSTEM_NAME MATCHES "Windows") 6 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O2 -std=c++11") 7 | else() 8 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format") 9 | endif() 10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -std=c++11 -Wall") 11 | 12 | include_directories("./include/") 13 | 14 | if(BOOSTER_AVX) 15 | message(STATUS "Compiling booster AVX version.") 16 | add_subdirectory(./avx) 17 | if(COMPILE_OPENCL) 18 | add_subdirectory(./cl) 19 | add_library(booster STATIC $ $) 20 | else() 21 | add_library(booster STATIC $) 22 | endif() 23 | elseif(BOOSTER_ARM) 24 | add_subdirectory(./arm) 25 | if(COMPILE_OPENCL) 26 | add_subdirectory(./cl) 27 | add_library(booster STATIC $ $) 28 | else() 29 | add_library(booster STATIC $) 30 | endif() 31 | else() 32 | error("Unkown booster configuration.") 33 | endif() 34 | 35 | 36 | set(BOOSTER_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/booster") 37 | 38 | message(Library headers: ${LIB_HEADERS}) 39 | install(TARGETS booster DESTINATION "${BOOSTER_INSTALL_DIR}/lib") 40 | install(FILES 
${LIB_HEADERS} DESTINATION "${BOOSTER_INSTALL_DIR}/include/booster")
--------------------------------------------------------------------------------
/src/booster/arm/CMakeLists.txt:
--------------------------------------------------------------------------------
# Builds the ARM backend as an OBJECT library (booster_arm_obj) that the parent
# src/booster/CMakeLists.txt aggregates into the static `booster` library.
file(GLOB ARM_SRC ./*.cpp)
file(GLOB ARM_HEADERS ../include/*.h)
# sgemm_legacy.cpp is deliberately excluded from the build (superseded implementation).
list(REMOVE_ITEM ARM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/./sgemm_legacy.cpp")

#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a -fopenmp")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -Wall")
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")

add_library(booster_arm_obj OBJECT ${ARM_SRC} ${ARM_HEADERS})
#add_library(arm_backend STATIC ${ARM_SRC} ${ARM_HEADERS})

#target_include_directories(arm_backend PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
#set(ARM_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/booster_arm/")
#install(TARGETS arm_backend DESTINATION ${ARM_INSTALL_DIR}/lib)
#install(FILES ${ARM_HEADERS} DESTINATION "${ARM_INSTALL_DIR}/include")
--------------------------------------------------------------------------------
/src/booster/arm/caffe_interp.cpp:
--------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.

#include // NOTE(review): include target lost in this listing (angle brackets stripped); presumably <booster/caffe_interp.h> — confirm.

// Bi-linear interpolation
// IN : [channels height1 width1] cropped from a bigger [Height1 Width1] image
// OUT: [channels height2 width2] cropped from a bigger [Height2 Width2] image
//
// `packed` selects the memory layout: when true, channels are interleaved (HWC, pixel
// stride = channels); when false, planes are separate (CHW, channel stride = Width*Height).
// (x1,y1)/(x2,y2) are the crop origins inside the larger source/destination images.
template // NOTE(review): template parameter list stripped in this listing; the `packed` flag and Dtype are used below — confirm original as template <typename Dtype, bool packed>.
void caffe_cpu_interp2(const int channels,
                       const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
                       Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2)
{
    // CHECK(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0);
    // CHECK(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2);
    // special case: just copy
    if (height1 == height2 && width1 == width2)
    {
        // Same size: straight element copy, honoring the packed/planar stride difference.
        for (int h2 = 0; h2 < height2; ++h2)
        {
            const int h1 = h2;
            for (int w2 = 0; w2 < width2; ++w2)
            {
                const int w1 = w2;
                if (packed)
                {
                    const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
                    Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
                    for (int c = 0; c < channels; ++c)
                    {
                        pos2[0] = pos1[0];
                        pos1++;
                        pos2++;
                    }
                }
                else
                {
                    const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
                    Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
                    for (int c = 0; c < channels; ++c)
                    {
                        pos2[0] = pos1[0];
                        pos1 += Width1 * Height1; // planar: hop to the same pixel in the next channel plane
                        pos2 += Width2 * Height2;
                    }
                }
            }
        }
        return;
    }
    // Scale ratios between source and destination grids (0 when the output axis is degenerate).
    const float rheight = (height2 > 1) ? static_cast<float>(height1) / (height2) : 0.f;
    const float rwidth = (width2 > 1) ? static_cast<float>(width1) / (width2) : 0.f;
    for (int h2 = 0; h2 < height2; ++h2)
    {
        const float h1r = rheight * h2;
        const int h1 = h1r; // floor by int truncation (h1r >= 0)
        const int h1p = (h1 < height1 - 1) ? 1 : 0; // row step, clamped at the bottom edge
        const Dtype h1lambda = h1r - h1;            // weight of the lower row
        const Dtype h0lambda = Dtype(1.) - h1lambda; // weight of the upper row
        for (int w2 = 0; w2 < width2; ++w2)
        {
            const float w1r = rwidth * w2;
            const int w1 = w1r;
            const int w1p = (w1 < width1 - 1) ? 1 : 0; // column step, clamped at the right edge
            const Dtype w1lambda = w1r - w1;
            const Dtype w0lambda = Dtype(1.) - w1lambda;
            if (packed)
            {
                // Interleaved layout: the 2x2 neighborhood is addressed with pixel stride = channels.
                const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
                Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
                for (int c = 0; c < channels; ++c)
                {
                    pos2[0] =
                        h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) +
                        h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]);
                    pos1++;
                    pos2++;
                }
            }
            else
            {
                // Planar layout: same 2x2 weighted sum, per channel plane.
                const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
                Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
                for (int c = 0; c < channels; ++c)
                {
                    pos2[0] =
                        h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
                        h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]);
                    pos1 += Width1 * Height1;
                    pos2 += Width2 * Height2;
                }
            }
        }
    }
}

// Explicit instantiations. NOTE(review): the template argument lists (<float,false> etc.)
// were stripped from this listing — presumably float/double x packed true/false; confirm.
template void caffe_cpu_interp2(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
template void caffe_cpu_interp2(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
template void caffe_cpu_interp2(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
template void caffe_cpu_interp2(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
--------------------------------------------------------------------------------
/src/booster/arm/helper.cpp:
--------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
// Debug/print helpers and a wall-clock Timer for the ARM backend.
// NOTE(review): all #include targets below were lost in this listing (angle brackets
// stripped); presumably <booster/helper.h>, <arm_neon.h>, <stdio.h>, <stdlib.h>, <math.h> — confirm.
#include

#include
#include
#include
#include

// Prints a NEON vector's four lanes with an "input" prefix.
void print_vec2(float32x4_t* vp)
{
    float* ep = (float *) vp;
    printf("input %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
}

// Prints a NEON vector's four lanes with a "transformed" prefix.
void print_vec3(float32x4_t* vp)
{
    float* ep = (float *) vp;
    printf("transformed %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
}

// Prints a NEON vector's four lanes with a caller-supplied prefix.
void print_vec(float32x4_t* vp, const char* comment)
{
    float* ep = (float *) vp;
    printf("%s %.3f, %.3f, %.3f, %.3f\n", comment, *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
}


void print_vec(float32x4_t* vp)
{
    float* ep = (float *) vp;
    printf("vec %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
}

// Prints the first four floats of an array (no bounds information available).
void print_arr(float* vp)
{
    float* ep = (float *) vp;
    printf("arr %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
}

void print_floats(const float* arr, const int len)
{
    for (int i = 0; i < len; ++i)
    {
        printf("%.2f ", arr[i]);
    }
    printf("\n\n");
}

// Prints a dimX x dimY row-major matrix.
void print_floats(const float* arr, const int dimX, const int dimY)
{
    for (int i = 0; i < dimX; ++i)
    {
        for (int j = 0; j < dimY; ++j)
            printf("%.2f ", arr[i * dimY + j]);
        printf("\n");
    }
    printf("\n\n");
}


// Accumulates and logs the sum of element-wise absolute differences,
// counting only differences greater than 1.0 (coarse mismatch detector).
void diff(float* arr1, float* arr2, int len)
{
    float dif = 0.0f;
    for (int i = 0; i < len; ++i)
    {
        float err = fabsf(arr1[i] - arr2[i]);
        if (err > 1.0f)
        {
            dif += err;
        }
    }
    LOGD("The difference is %.2f\n", dif);
}
// Matrix variant of diff(); additionally logs the position of each mismatch > 1.0.
void diff(float* arr1, float* arr2, int M, int N)
{
    float dif = 0.0f;
    for (int i = 0; i < M; ++i)
    {
        for (int j = 0; j < N; ++j)
        {
            float err = fabsf(arr1[i * N + j] - arr2[i * N + j]);
            if (err > 1.0f)
            {
                dif += err;
                LOGD("Error position (%d, %d), value %.2f, %.2f\n", i, j, arr1[i * N + j], arr2[i * N + j]);
            }
        }
    }
    LOGD("The difference is %.2f\n", dif);
}

#include // NOTE(review): include target stripped in this listing; clock_gettime below suggests <time.h> — confirm.

// Millisecond wall-clock timer based on CLOCK_MONOTONIC (immune to system clock changes).
void Timer::startBench()
{
    clock_gettime(CLOCK_MONOTONIC, &start);
}

// Returns elapsed time since startBench() in milliseconds.
double Timer::endBench()
{
    clock_gettime(CLOCK_MONOTONIC, &stop);
    return (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
}

// Logs elapsed milliseconds with a caller-supplied label.
void Timer::endBench(const char* comment)
{
    clock_gettime(CLOCK_MONOTONIC, &stop);
    double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
    LOGD("%s %lfms\n", comment, elapsedTime);
}

// Prints elapsed milliseconds divided by `fold` (e.g. per-iteration time for a loop of `fold` runs).
void Timer::endBench(const char* comment, double fold)
{
    clock_gettime(CLOCK_MONOTONIC, &stop);
    double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
    printf("%s %lfms\n", comment, elapsedTime / fold);
}
--------------------------------------------------------------------------------
/src/booster/arm/sgemm_legacy.h:
--------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
#ifndef TCNN_SGEMM_H_
#define TCNN_SGEMM_H_


// Legacy blocked SGEMM interface (C = A * B, M x L times L x N), excluded from the
// build by the arm/CMakeLists.txt REMOVE_ITEM rule; kept for reference.
// externalPackA* repack the A matrix (lda = leading dimension of `a`) into `packA`,
// which the caller must have allocated; the block_sgemm_* entry points then multiply
// using the packed A across `num_threads` threads.
void externalPackA(int M, int L, float* packA, float* a, int lda);//External packing for A, requires space allocation for packA
void block_sgemm_external_pack_threading(int M, int N, int L, float *A, float *B, float *C, int num_threads);


// 8x8-kernel variant of the packing/multiply pair above.
void externalPackA8(int M, int L, float* packA, float* a, int lda);//External packing for A, requires space allocation for packA
void block_sgemm_external_pack_threading_8x8(int M, int N, int L, float *A, float *B, float *C, int num_threads);


#endif
--------------------------------------------------------------------------------
/src/booster/arm/sgemv.cpp:
--------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
// Fully-connected (matrix-vector) inference kernels for the ARM backend.
// NOTE(review): all #include targets below were lost in this listing (angle brackets
// stripped); presumably <booster/sgemv.h>, <arm_neon.h>, <assert.h>, plus one more — confirm.
#include

#include
#include
#include

// Reference scalar kernel: z = W*x (+bias) (+ReLU), W stored row-major as
// y[output][input]. Bias add and ReLU are compile-time fused via the template flags.
// NOTE(review): the template parameter list (<bool fuseBias, bool fuseRelu>) was
// stripped from this listing — confirm against the explicit instantiations below.
template
void fully_connected_inference_direct(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr)
{
    #pragma omp parallel for schedule(static) num_threads(num_threads)
    for (int i = 0; i < output_size; i++)
    {
        float sum = 0;
        for (int j = 0; j < input_size; j++)
            sum += x[j] * y[i * input_size + j];
        if (fuseBias)
            sum += bias_arr[i];
        if (fuseRelu)
            sum = (sum > 0.f) ? sum : 0.f;
        z[i] = sum;
    }
}

// NEON kernel computing 8 outputs per iteration. Requires input_size % 8 == 0 and
// output_size % 8 == 0 (asserted). The weight matrix `y` must be pre-packed so that
// each group of 8 output rows is interleaved in 8x4 tiles (see the yPtr += 32 walk).
template
void fully_connected_transpose_inference(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr)
{
    assert(input_size % 8 == 0);
    assert(output_size % 8 == 0);
    #pragma omp parallel for schedule(static) num_threads(num_threads)
    for (int k = 0; k < output_size / 8; k++)
    {
        float32x4_t vBias = vld1q_f32(bias_arr + k * 8);
        float32x4_t vBias1 = vld1q_f32(bias_arr + k * 8 + 4);
        float32x4_t vZero = vdupq_n_f32(0.f);
        const float *yPtr = y + k * 8 * input_size;
        float32x4_t res = {0.0, 0.0, 0.0, 0.0};  // accumulators for outputs 8k..8k+3
        float32x4_t res1 = {0.0, 0.0, 0.0, 0.0}; // accumulators for outputs 8k+4..8k+7
        float32x4_t va, vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7;
        for (int i = 0; i < input_size; i += 4)
        {
            // float32x4_t v1, v2;
            va = vld1q_f32(x + i); // 4 input activations

            // 8 weight vectors = one 8-output x 4-input tile of the packed matrix.
            vb0 = vld1q_f32(yPtr);
            vb1 = vld1q_f32(yPtr + 4);
            vb2 = vld1q_f32(yPtr + 8);
            vb3 = vld1q_f32(yPtr + 12);
            vb4 = vld1q_f32(yPtr + 16);
            vb5 = vld1q_f32(yPtr + 20);
            vb6 = vld1q_f32(yPtr + 24);
            vb7 = vld1q_f32(yPtr + 28);

#if __aarch64__
            // AArch64: fused multiply-add broadcasting one input lane per step.
            res = vfmaq_laneq_f32(res, vb0, va, 0);
            res1 = vfmaq_laneq_f32(res1, vb1, va, 0);
            res = vfmaq_laneq_f32(res, vb2, va, 1);
            res1 = vfmaq_laneq_f32(res1, vb3, va, 1);
            res = vfmaq_laneq_f32(res, vb4, va, 2);
            res1 = vfmaq_laneq_f32(res1, vb5, va, 2);
            res = vfmaq_laneq_f32(res, vb6, va, 3);
            res1 = vfmaq_laneq_f32(res1, vb7, va, 3);
#else
            // ARMv7 fallback: multiply-accumulate against a dup-loaded scalar lane.
            res = vmlaq_f32(res, vb0, vld1q_dup_f32(x + i + 0));
            res1 = vmlaq_f32(res1, vb1, vld1q_dup_f32(x + i + 0));
            res = vmlaq_f32(res, vb2, vld1q_dup_f32(x + i + 1));
            res1 = vmlaq_f32(res1, vb3, vld1q_dup_f32(x + i + 1));
            res = vmlaq_f32(res, vb4, vld1q_dup_f32(x + i + 2));
            res1 = vmlaq_f32(res1, vb5, vld1q_dup_f32(x + i + 2));
            res = vmlaq_f32(res, vb6, vld1q_dup_f32(x + i + 3));
            res1 = vmlaq_f32(res1, vb7, vld1q_dup_f32(x + i + 3));
#endif
            yPtr += 32;
        }

        if (fuseBias)
        {
            res = vaddq_f32(res, vBias);
            res1 = vaddq_f32(res1, vBias1);
        }
        if (fuseRelu)
        {
            res = vmaxq_f32(res, vZero);
            res1 = vmaxq_f32(res1, vZero);
        }
        vst1q_f32((float32_t *)(z + 8 * k), res);
        vst1q_f32((float32_t *)(z + 8 * k + 4), res1);
    }
}

// Explicit instantiations. NOTE(review): the template argument lists (the four
// fuseBias/fuseRelu combinations) were stripped from this listing — confirm.
template void fully_connected_inference_direct(const int, const int, const float *, const float *, float *, const int, float*);
template void fully_connected_inference_direct(const int, const int, const float *, const float *, float *, const int, float*);
template void fully_connected_inference_direct(const int, const int, const float *, const float *, float *, const int, float*);
template void fully_connected_inference_direct(const int, const int, const float *, const float *, float *, const int, float*);

template void fully_connected_transpose_inference(const int, const int, const float *, const float *, float *, const int, float*);
template void fully_connected_transpose_inference(const int, const int, const float *, const float *, float *, const int, float*);
template void fully_connected_transpose_inference(const int, const int, const float *, const float *, float *, const int, float*);
template void
fully_connected_transpose_inference(const int, const int, const float *, const float *, float *, const int, float*); 113 | 114 | #if 0 115 | void fully_connected_inference_direct_BiasReLU(int input_size, int output_size, float *x, float *y, float *z, float* biasArr, int num_threads) 116 | { 117 | #pragma omp parallel for schedule(static) num_threads(num_threads) 118 | for (int i = 0; i < output_size; i++) 119 | { 120 | float sum = 0.f; 121 | for (int j = 0; j < input_size; j++) 122 | sum += x[j] * y[i * input_size + j]; 123 | 124 | sum += biasArr[i]; 125 | if (sum < 0.f) sum = 0.f; 126 | z[i] = sum; 127 | } 128 | } 129 | 130 | void fully_connected_transpose_inference_neon8_BiasReLU(int input_size, int output_size, float *x, float *y, float *z, float* biasArr, int num_threads) 131 | { 132 | assert(input_size % 8 == 0); 133 | assert(output_size % 8 == 0); 134 | #pragma omp parallel for schedule(static) num_threads(num_threads) 135 | for (int k = 0; k < output_size / 8; k++) 136 | { 137 | float *yPtr = y + k * 8 * input_size; 138 | const float32x4_t vzero = vdupq_n_f32(0.f); 139 | 140 | float32x4_t res = vld1q_f32(biasArr + k * 8); 141 | float32x4_t res1 = vld1q_f32(biasArr + k * 8 + 4); 142 | 143 | float32x4_t va, vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7; 144 | for (int i = 0; i < input_size; i += 4) 145 | { 146 | va = vld1q_f32(x + i); 147 | 148 | vb0 = vld1q_f32(yPtr); 149 | vb1 = vld1q_f32(yPtr + 4); 150 | vb2 = vld1q_f32(yPtr + 8); 151 | vb3 = vld1q_f32(yPtr + 12); 152 | vb4 = vld1q_f32(yPtr + 16); 153 | vb5 = vld1q_f32(yPtr + 20); 154 | vb6 = vld1q_f32(yPtr + 24); 155 | vb7 = vld1q_f32(yPtr + 28); 156 | 157 | #if __aarch64__ 158 | res = vfmaq_laneq_f32(res, vb0, va, 0); 159 | res1 = vfmaq_laneq_f32(res1, vb1, va, 0); 160 | res = vfmaq_laneq_f32(res, vb2, va, 1); 161 | res1 = vfmaq_laneq_f32(res1, vb3, va, 1); 162 | res = vfmaq_laneq_f32(res, vb4, va, 2); 163 | res1 = vfmaq_laneq_f32(res1, vb5, va, 2); 164 | res = vfmaq_laneq_f32(res, vb6, va, 3); 165 | res1 = 
vfmaq_laneq_f32(res1, vb7, va, 3); 166 | #else 167 | res = vmlaq_f32(res, vb0, vld1q_dup_f32(x + i + 0)); 168 | res1 = vmlaq_f32(res1, vb1, vld1q_dup_f32(x + i + 0)); 169 | res = vmlaq_f32(res, vb2, vld1q_dup_f32(x + i + 1)); 170 | res1 = vmlaq_f32(res1, vb3, vld1q_dup_f32(x + i + 1)); 171 | res = vmlaq_f32(res, vb4, vld1q_dup_f32(x + i + 2)); 172 | res1 = vmlaq_f32(res1, vb5, vld1q_dup_f32(x + i + 2)); 173 | res = vmlaq_f32(res, vb6, vld1q_dup_f32(x + i + 3)); 174 | res1 = vmlaq_f32(res1, vb7, vld1q_dup_f32(x + i + 3)); 175 | #endif 176 | yPtr += 32; 177 | } 178 | 179 | //res = vaddq_f32(res, vBias); 180 | //res1 = vaddq_f32(res, vBias1); 181 | 182 | res = vmaxq_f32(res, vzero); 183 | res1 = vmaxq_f32(res1, vzero); 184 | 185 | vst1q_f32((float32_t *)(z + 8 * k), res); 186 | vst1q_f32((float32_t *)(z + 8 * k + 4), res1); 187 | } 188 | } 189 | /* 190 | void fully_connected_transpose_inference_neon(int input_size, int output_size, float *x, float *y, float *z) 191 | { 192 | assert(input_size %4==0); 193 | assert(output_size%4==0); 194 | //#pragma omp parallel for num_threads(32) schedule(static) 195 | for(int k=0; k A[n][m] 238 | { 239 | for (int i = 0; i < m; i++) for (int j = 0; j < n; j++) 240 | buffer[j * m + i] = array[i * n + j]; 241 | memcpy(array, buffer, m * n * sizeof(float)); 242 | } 243 | -------------------------------------------------------------------------------- /src/booster/avx/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | file(GLOB AVX_SRC ./*.cpp) 2 | file(GLOB AVX_HEADERS ./*.h) 3 | 4 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a -fopenmp") 5 | 6 | if(CMAKE_SYSTEM_NAME MATCHES "Windows") 7 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++11 -Wall") 8 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -std=c++11 -O2") 9 | else() 10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++11 -march=core-avx2 -g -Wall") 11 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} 
-std=c++11 -march=core-avx2 -O3 -Wno-format -Wno-unused-parameter") 12 | endif() 13 | 14 | add_library(booster_avx_obj OBJECT ${AVX_SRC} ${AVX_HEADERS}) 15 | #add_library(arm_backend STATIC ${AVX_SRC} ${AVX_HEADERS}) 16 | 17 | #target_include_directories(arm_backend PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 18 | #set(AVX_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/feather_backend_avx/") 19 | #install(TARGETS arm_backend DESTINATION ${AVX_INSTALL_DIR}/lib) 20 | #install(FILES ${AVX_HEADERS} DESTINATION "${AVX_INSTALL_DIR}/include") 21 | -------------------------------------------------------------------------------- /src/booster/avx/caffe_interp.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #include 16 | 17 | // Bi-linear interpolation 18 | // IN : [channels height1 width1] cropped from a bigger [Height1 Width1] image 19 | // OUT: [channels height2 width2] cropped from a bigger [Height2 Width2] image 20 | template 21 | void caffe_cpu_interp2(const int channels, 22 | const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1, 23 | Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2) 24 | { 25 | // CHECK(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0); 26 | // CHECK(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2); 27 | // special case: just copy 28 | if (height1 == height2 && width1 == width2) 29 | { 30 | for (int h2 = 0; h2 < height2; ++h2) 31 | { 32 | const int h1 = h2; 33 | for (int w2 = 0; w2 < width2; ++w2) 34 | { 35 | const int w1 = w2; 36 | if (packed) 37 | { 38 | const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))]; 39 | Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))]; 40 | for (int c = 0; c < channels; ++c) 41 | { 42 | pos2[0] = pos1[0]; 43 | pos1++; 44 | pos2++; 45 | } 46 | } 47 | else 48 | { 49 | const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)]; 50 | Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)]; 51 | for (int c = 0; c < channels; ++c) 52 | { 53 | pos2[0] = pos1[0]; 54 | pos1 += Width1 * Height1; 55 | pos2 += Width2 * Height2; 56 | } 57 | } 58 | } 59 | } 60 | return; 61 | } 62 | const float rheight = (height2 > 1) ? static_cast(height1) / (height2) : 0.f; 63 | const float rwidth = (width2 > 1) ? static_cast(width1) / (width2) : 0.f; 64 | for (int h2 = 0; h2 < height2; ++h2) 65 | { 66 | const float h1r = rheight * h2; 67 | const int h1 = h1r; 68 | const int h1p = (h1 < height1 - 1) ? 
1 : 0; 69 | const Dtype h1lambda = h1r - h1; 70 | const Dtype h0lambda = Dtype(1.) - h1lambda; 71 | for (int w2 = 0; w2 < width2; ++w2) 72 | { 73 | const float w1r = rwidth * w2; 74 | const int w1 = w1r; 75 | const int w1p = (w1 < width1 - 1) ? 1 : 0; 76 | const Dtype w1lambda = w1r - w1; 77 | const Dtype w0lambda = Dtype(1.) - w1lambda; 78 | if (packed) 79 | { 80 | const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))]; 81 | Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))]; 82 | for (int c = 0; c < channels; ++c) 83 | { 84 | pos2[0] = 85 | h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) + 86 | h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]); 87 | pos1++; 88 | pos2++; 89 | } 90 | } 91 | else 92 | { 93 | const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)]; 94 | Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)]; 95 | for (int c = 0; c < channels; ++c) 96 | { 97 | pos2[0] = 98 | h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) + 99 | h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]); 100 | pos1 += Width1 * Height1; 101 | pos2 += Width2 * Height2; 102 | } 103 | } 104 | } 105 | } 106 | } 107 | 108 | template void caffe_cpu_interp2(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int); 109 | template void caffe_cpu_interp2(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int); 110 | template void caffe_cpu_interp2(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int); 111 | template void caffe_cpu_interp2(const int, const double *, const int, const int, const int, const int, const 
int, const int, double *, const int, const int, const int, const int, const int, const int); 112 | -------------------------------------------------------------------------------- /src/booster/avx/depthwise.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | 22 | //#include 23 | 24 | #ifdef __APPLE__ 25 | #else 26 | #include 27 | #endif 28 | 29 | 30 | template 31 | void globalDwConv(float *output, const float *input, int input_channels, int inw, int inh, float *kernel, int group, int nThreads, float *bias_arr) 32 | { 33 | assert(group > 0 || input_channels % group == 0); 34 | int step = inw * inh; 35 | int block = input_channels / group; 36 | int groupKernelSize = inw * inh * group; 37 | 38 | for (int i = 0; i < input_channels; i++) 39 | { 40 | int k = i / group, u = i % group; 41 | output[i] = 0; 42 | for (int j = 0; j < step; j++) 43 | { 44 | output[i] += input[i * step + j] * kernel[k * groupKernelSize + u * step + j]; 45 | } 46 | if (fuseBias) 47 | { 48 | output[i] += bias_arr[i]; 49 | } 50 | if (fuseRelu) 51 | { 52 | output[i] = (output[i] > 0.f) ? 
output[i] : 0.f; 53 | } 54 | } 55 | 56 | /* 57 | int kw = inw, kh = inh; 58 | int width = kw * kh; 59 | int widthAligned = width & 0xFFFFFFFC; 60 | int widthRem = width & 0x03; // int widthRem = width & 0x11; 61 | int height = group; 62 | int heightAligned = group & 0xFFFFFFFC; 63 | int heightRem = height & 0x03; // int heightRem = height & 0x11; 64 | float ext[8]; 65 | for(int i = 0; i < heightAligned; i += 4) 66 | { 67 | float32x4_t sum = vdupq_n_f32(0.f); 68 | float* p0 = const_cast(input) + width * i; 69 | float* p1 = p0 + width; 70 | float* p2 = p1 + width; 71 | float* p3 = p2 + width; 72 | float* k0 = kernel + width * i; 73 | float* k1 = k0 + width; 74 | float* k2 = k1 + width; 75 | float* k3 = k2 + width; 76 | 77 | for(int j = 0; j < widthAligned; j += 4) 78 | { 79 | float32x4_t v0 = vld1q_f32(p0); 80 | p0 += 4; 81 | float32x4_t v1 = vld1q_f32(p1); 82 | p1 += 4; 83 | float32x4_t v2 = vld1q_f32(p2); 84 | p2 += 4; 85 | float32x4_t v3 = vld1q_f32(p3); 86 | p3 += 4; 87 | 88 | float32x4_t r0 = vld1q_f32(k0); 89 | k0 += 4; 90 | float32x4_t r1 = vld1q_f32(k1); 91 | k1 += 4; 92 | float32x4_t r2 = vld1q_f32(k2); 93 | k2 += 4; 94 | float32x4_t r3 = vld1q_f32(k3); 95 | k3 += 4; 96 | 97 | float32x4x2_t row01 = vtrnq_f32(v0, v1); 98 | float32x4x2_t row23 = vtrnq_f32(v2, v3); 99 | 100 | // * row0 = ( x00 x10 x20 x30 ) 101 | // * row1 = ( x01 x11 x21 x31 ) 102 | // * row2 = ( x02 x12 x22 x32 ) 103 | // * row3 = ( x03 x13 x23 x33 ) 104 | 105 | v0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0])); 106 | v1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1])); 107 | v2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0])); 108 | v3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1])); 109 | row01 = vtrnq_f32(r0, r1); 110 | row23 = vtrnq_f32(r2, r3); 111 | r0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0])); 112 | r1 = vcombine_f32(vget_low_f32(row01.val[1]), 
vget_low_f32(row23.val[1])); 113 | r2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0])); 114 | r3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1])); 115 | #ifdef __aarch64__ 116 | sum = vfmaq_f32(sum, v0, r0); 117 | sum = vfmaq_f32(sum, v1, r1); 118 | sum = vfmaq_f32(sum, v2, r2); 119 | sum = vfmaq_f32(sum, v3, r3); 120 | #else 121 | sum = vmlaq_f32(sum, v0, r0); 122 | sum = vmlaq_f32(sum, v1, r1); 123 | sum = vmlaq_f32(sum, v2, r2); 124 | sum = vmlaq_f32(sum, v3, r3); 125 | #endif 126 | } 127 | if(widthRem){ 128 | for(int j = 0; j < widthRem; ++j) 129 | { 130 | ext[0] = p0[j]; 131 | ext[1] = p1[j]; 132 | ext[2] = p2[j]; 133 | ext[3] = p3[j]; 134 | ext[4] = k0[j]; 135 | ext[5] = k1[j]; 136 | ext[6] = k2[j]; 137 | ext[7] = k3[j]; 138 | #ifdef __aarch64__ 139 | sum = vfmaq_f32(sum, vld1q_f32(ext + 4), vld1q_f32(ext)); 140 | #else 141 | sum = vmlaq_f32(sum, vld1q_f32(ext + 4), vld1q_f32(ext)); 142 | #endif 143 | } 144 | } 145 | vst1q_f32(output + i, sum); 146 | } 147 | for(int i = heightAligned; i < height; ++i) 148 | { 149 | float* p = const_cast(input) + i * width; 150 | float* k = kernel + i * width; 151 | float sum = 0.f; 152 | for(int j = 0; j < width; ++j) 153 | { 154 | sum += p[j] * k[j]; 155 | } 156 | output[i] = sum; // output[heightAligned + i] = sum; 157 | } 158 | */ 159 | } 160 | 161 | template 162 | void dwConv_template(float *output, float *input, int input_channels, int inw, int inh, int stridew, int strideh, float *kernel, int kw, int kh, int group, int nThreads, float *bias_arr) 163 | { 164 | if ((kw == inw) && (kh == inh)) 165 | { 166 | globalDwConv(output, input, input_channels, inw, inh, kernel, group, nThreads, bias_arr); 167 | } 168 | else 169 | { 170 | int outw = (inw - kw) / stridew + 1; //for strided case in odd dimensions, should take the floor value as output dim. 
171 | int outh = (inh - kh) / strideh + 1; 172 | 173 | // #pragma omp parallel for num_threads(nThreads) schedule(static) 174 | //printf("dw param %d kernel %d %d stride %d %d input %d %d %d output %d %d\n", group, kh, kw, strideh, stridew, input_channels, inh, inw, outh, outw); 175 | for (int g = 0; g < group; ++g) 176 | { 177 | float *kp = kernel + kw * kh * g; 178 | float *outg = output + g * outw * outh; 179 | float *ing = input + g * inw * inh; 180 | for (int i = 0; i < outh; ++i) 181 | { 182 | for (int j = 0; j < outw; ++j) 183 | { 184 | float *inp = ing + inw * (i * stridew) + (j * strideh); 185 | float convSum = 0.f; 186 | for (int m = 0; m < kh; m++) 187 | { 188 | for (int n = 0; n < kw; n++) 189 | { 190 | convSum += inp[m * inw + n] * kp[m * kw + n]; 191 | } 192 | } 193 | if (fuseBias) 194 | { 195 | convSum += bias_arr[g]; 196 | } 197 | if (fuseRelu) 198 | { 199 | convSum = (convSum > 0.f) ? convSum : 0.f; 200 | } 201 | outg[j] = convSum; 202 | } 203 | outg += outw; 204 | } 205 | } 206 | } 207 | } 208 | 209 | template void dwConv_template(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *); 210 | template void dwConv_template(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *); 211 | template void dwConv_template(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *); 212 | template void dwConv_template(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *); 213 | -------------------------------------------------------------------------------- /src/booster/avx/helper.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | #include 20 | 21 | void print_arr(float* vp) 22 | { 23 | float* ep = (float *) vp; 24 | printf("arr %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3)); 25 | } 26 | 27 | void print_floats(const float* arr, const int len) 28 | { 29 | for (int i = 0; i < len; ++i) 30 | { 31 | printf("%.2f ", arr[i]); 32 | } 33 | printf("\n\n"); 34 | } 35 | 36 | void print_floats(const float* arr, const int dimX, const int dimY) 37 | { 38 | for (int i = 0; i < dimX; ++i) 39 | { 40 | for (int j = 0; j < dimY; ++j) 41 | printf("%.2f ", arr[i * dimY + j]); 42 | printf("\n"); 43 | } 44 | printf("\n\n"); 45 | } 46 | 47 | 48 | void diff(float* arr1, float* arr2, int len) 49 | { 50 | float dif = 0.0f; 51 | for (int i = 0; i < len; ++i) 52 | { 53 | float err = fabsf(arr1[i] - arr2[i]); 54 | if (err > 1.0f) 55 | { 56 | dif += err; 57 | } 58 | } 59 | printf("The difference is %.2f\n", dif); 60 | } 61 | void diff(float* arr1, float* arr2, int M, int N) 62 | { 63 | float dif = 0.0f; 64 | for (int i = 0; i < M; ++i) 65 | { 66 | for (int j = 0; j < N; ++j) 67 | { 68 | float err = fabsf(arr1[i * N + j] - arr2[i * N + j]); 69 | if (err > 1.0f) 70 | { 71 | dif += err; 72 | printf("Error position (%d, %d), value %.2f, %.2f\n", i, j, arr1[i * N + j], arr2[i * N + j]); 73 | } 74 | } 75 | } 76 | printf("The difference is %.2f\n", dif); 77 | } 78 | 79 | #include 80 | 81 
| #ifdef _WIN32 82 | #define CLOCK_MONOTONIC 0 83 | int clock_gettime(int no_use, struct timespec *spec) 84 | { 85 | return timespec_get(spec, TIME_UTC); 86 | } 87 | #endif 88 | 89 | void Timer::startBench() 90 | { 91 | clock_gettime(CLOCK_MONOTONIC, &start); 92 | } 93 | 94 | double Timer::endBench() 95 | { 96 | clock_gettime(CLOCK_MONOTONIC, &stop); 97 | return (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0; 98 | } 99 | 100 | void Timer::endBench(const char* comment) 101 | { 102 | clock_gettime(CLOCK_MONOTONIC, &stop); 103 | double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0; 104 | printf("%s %lfms\n", comment, elapsedTime); 105 | } 106 | 107 | void Timer::endBench(const char* comment, double fold) 108 | { 109 | clock_gettime(CLOCK_MONOTONIC, &stop); 110 | double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0; 111 | printf("%s %lfms\n", comment, elapsedTime / fold); 112 | } 113 | -------------------------------------------------------------------------------- /src/booster/include/booster/booster.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | // Booster is the standalone backend of FeatherCNN, in order to facilitate unit testing 16 | // and multi-purpose deployment. I am currently focusing on the fast convolution kernels, 17 | // and will pack other operators as well. This backend library is now supporting 18 | // AVX and Neon, and is going to supoort OpenCL/GLES in the future. 19 | // Booster won't grow up into a hugh and abstract lib. I'll keep it simple and stupid. 20 | // -- Haidong Lan @ Tencent AI Platform, 08/30/2018 21 | 22 | #pragma once 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include 29 | #ifdef FEATHER_OPENCL 30 | #include "CLHPP/clhpp_runtime.hpp" 31 | #endif 32 | 33 | #ifdef _WIN32 34 | #define FEATHER_MEN_ALIGN(alignment) __declspec(align(alignment)) 35 | #else 36 | #define FEATHER_MEN_ALIGN(alignment) __attribute__((aligned(alignment))) 37 | #endif 38 | 39 | namespace booster 40 | { 41 | 42 | enum ConvAlgo 43 | { 44 | NAIVE, 45 | IM2COL, 46 | SGECONV, 47 | DEPTHWISE, 48 | WINOGRADF63, 49 | WINOGRADF63FUSED, 50 | WINOGRADF23, 51 | }; 52 | 53 | enum ActivationType 54 | { 55 | None, 56 | ReLU, 57 | }; 58 | 59 | struct ConvParam 60 | { 61 | int output_channels; 62 | int input_channels; 63 | int input_h; 64 | int input_w; 65 | int kernel_h; 66 | int kernel_w; 67 | int output_h; 68 | int output_w; 69 | int stride_h; 70 | int stride_w; 71 | int pad_left; 72 | int pad_bottom; 73 | int pad_right; 74 | int pad_top; 75 | int group; 76 | bool bias_term; 77 | ActivationType activation; 78 | #ifdef FEATHER_OPENCL 79 | int channel_block_size; 80 | int padded_input_channels; 81 | int padded_output_channels; 82 | int height_block_size; 83 | int width_block_size; 84 | int padded_input_h; 85 | int padded_input_w; 86 | int padded_output_h; 87 | int padded_output_w; 88 | bool padding_needed; 89 | 90 | void AssignCLPaddedDim() 91 | { 92 | channel_block_size = 4; 93 | if (padded_input_channels % 8 == 0 && padded_output_channels % 8 == 0) 94 | { 95 | channel_block_size = 8; 
96 | } 97 | 98 | height_block_size = 1; 99 | width_block_size = 1; 100 | padding_needed = false; 101 | if (input_w >= 12) 102 | { 103 | width_block_size = 2; 104 | padded_output_h = (output_h + height_block_size - 1) / height_block_size * height_block_size; 105 | padded_output_w = (output_w + width_block_size - 1) / width_block_size * width_block_size; 106 | padded_input_h = (padded_output_h - 1) * stride_h + kernel_h; 107 | padded_input_w = (padded_output_w - 1) * stride_w + kernel_w; 108 | padding_needed = (padded_input_h != input_h) || (padded_input_w != input_w); 109 | } 110 | } 111 | #endif 112 | 113 | void AssignOutputDim() 114 | { 115 | //Validate default values 116 | if (group == 0) group = 1; 117 | if (stride_h == 0) stride_h = 1; 118 | if (stride_w == 0) stride_w = 1; 119 | output_h = (input_h + pad_top + pad_bottom - kernel_h) / stride_h + 1; 120 | output_w = (input_w + pad_left + pad_right - kernel_w) / stride_w + 1; 121 | if (group == input_channels) 122 | { 123 | output_channels = input_channels; 124 | } 125 | } 126 | void AssignPaddedDim() 127 | { 128 | input_h = input_h + pad_top + pad_bottom; 129 | input_w = input_w + pad_left + pad_right; 130 | pad_left = 0; 131 | pad_bottom = 0; 132 | pad_right = 0; 133 | pad_top = 0; 134 | } 135 | void LogParams(const char* layer_name) 136 | { 137 | printf("-----Layer %s ConvParam----\n", layer_name); 138 | printf("Input CxHxW=(%d, %d, %d)\n", input_channels, input_h, input_w); 139 | printf("Output CxHxW=(%d, %d, %d)\n", output_channels, output_h, output_w); 140 | printf("Group = %d\n", group); 141 | printf("Kernel HxW=(%d, %d)\n", kernel_h, kernel_w); 142 | printf("Stride HxW=(%d, %d)\n", stride_h, stride_w); 143 | printf("Paddings (%d %d %d %d)\n", pad_left, pad_bottom, pad_right, pad_top); 144 | } 145 | double GetFLOPS() 146 | { 147 | return 2.0 * this->output_channels * this->input_channels * this->output_h * this->output_w * this->kernel_h * this->kernel_w / this->group; 148 | } 149 | }; 150 | 151 | typedef 
int (*GET_BUFFER_SIZE_FUNC)(ConvParam *param, int* buffer_size, int* processed_kernel_size); 152 | typedef int (*INIT_FUNC)(ConvParam *param, float* processed_kernel, float* kernel); 153 | typedef int (*FORWARD_FUNC)(ConvParam *param, float* output, float* input, float* kernel, float* buffer, float* bias_arr, int num_threads); 154 | 155 | //ConvBooster doesn't allocate any memory. 156 | class ConvBooster 157 | { 158 | public: 159 | ConvBooster(); 160 | ~ConvBooster() {} 161 | int SelectAlgo(ConvParam* param); 162 | int ForceSelectAlgo(ConvAlgo algo); 163 | int SetFuncs(); 164 | GET_BUFFER_SIZE_FUNC GetBufferSize; 165 | INIT_FUNC Init; 166 | FORWARD_FUNC Forward; 167 | 168 | private: 169 | ConvAlgo algo; 170 | }; 171 | 172 | #ifdef FEATHER_OPENCL 173 | 174 | struct CLBuffers 175 | { 176 | cl::Buffer* input_mem; 177 | cl::Buffer* padded_input_mem; 178 | cl::Buffer* output_mem; 179 | cl::Buffer* weight_mem; 180 | cl::Buffer* bias_mem; 181 | cl::Buffer* input_trans_mem; 182 | cl::Buffer* out_trans_mem; 183 | }; 184 | 185 | 186 | template 187 | class ConvBoosterCL 188 | { 189 | public: 190 | 191 | typedef int (*INIT_FUNC_CL)(const std::vector& program_names, 192 | const std::vector& kernel_names, 193 | std::map& cl_kernel_info_map); 194 | typedef int (*FORWARD_FUNC_CL)(cl::CommandQueue cmd_q, 195 | std::vector kernel_names, 196 | std::map& cl_kernel_info_map, 197 | const ConvParam& param, 198 | clhpp_feather::OpenCLRuntime* cl_runtime, 199 | std::string layer_name); 200 | typedef int (*WEIGHT_REFORM_FUNC_CL)(const ConvParam& param, 201 | size_t n_grp_size, 202 | size_t c_grp_size, 203 | const Dtype* weight, 204 | Dtype* weight_reformed); 205 | typedef int (*SET_CONV_KERNEL_PARAMS_CL)(const ConvParam& param, 206 | const CLBuffers& buffers, 207 | const std::vector& kernel_names, 208 | std::map& cl_kernel_info_map, 209 | clhpp_feather::OpenCLRuntime* cl_runtime, 210 | bool is_reshape); 211 | typedef int (*SET_CONV_WORK_SIZE_CL)(const ConvParam& param, 212 | std::map& 
cl_kernel_info_map, 213 | const std::vector& kernel_names, 214 | clhpp_feather::OpenCLRuntime* cl_runtime); 215 | typedef int (*SET_BUILD_OPTS_CL)(const ConvParam& param, 216 | bool is_fp16, 217 | const std::vector& kernel_names, 218 | std::map& cl_kernel_info_map); 219 | 220 | ConvBoosterCL(); 221 | ~ConvBoosterCL() {} 222 | int SelectAlgo(ConvParam* param); 223 | int ForceSelectAlgo(ConvAlgo algo); 224 | int SetFuncs(); 225 | size_t GetWeightSize(); 226 | const std::vector& GetProgramNames(); 227 | const std::vector& GetKernelNames(); 228 | INIT_FUNC_CL Init; 229 | FORWARD_FUNC_CL Forward; 230 | WEIGHT_REFORM_FUNC_CL WeightReform; 231 | SET_CONV_KERNEL_PARAMS_CL SetConvKernelParams; 232 | SET_CONV_WORK_SIZE_CL SetConvWorkSize; 233 | SET_BUILD_OPTS_CL SetBuildOpts; 234 | private: 235 | ConvAlgo algo; 236 | size_t weight_size; 237 | std::vector program_names; 238 | std::vector kernel_names; 239 | 240 | }; 241 | #endif 242 | 243 | }; 244 | -------------------------------------------------------------------------------- /src/booster/include/booster/caffe_interp.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #pragma once 16 | 17 | template 18 | void caffe_cpu_interp2(const int channels, 19 | const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1, 20 | Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2); -------------------------------------------------------------------------------- /src/booster/include/booster/depthwise.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | 19 | template 20 | void dwConv_template(float* output, float* input, int input_channels, int inw, int inh, int stridew, int strideh, float* kernel, int kw, int kh, int group, int nThreads, float* bias_arr); -------------------------------------------------------------------------------- /src/booster/include/booster/generic_kernels.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 
4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | 20 | namespace booster 21 | { 22 | void pad_input(float* padded, const float* input, const size_t input_channels, const size_t input_width, const size_t input_height, const size_t padding_left, const size_t padding_top, const size_t padding_right, const size_t padding_bottom); 23 | void im2col(ConvParam *conv_param, float *img_buffer, float *input); 24 | void naive_sgemm(int M, int N, int L, float *A, float *B, float *C); 25 | 26 | template 27 | void add_relu(float* dst, const float* A, const float* B, const size_t len, const size_t num_threads); 28 | 29 | template 30 | void scale(const size_t channels, const size_t stride, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads); 31 | 32 | template 33 | void batchnorm(const size_t channels, const size_t stride, const float* alpha, const float* beta, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads); 34 | 35 | //void dwConv(float* output, const float* input, const int inw, const int inh, const int stridew, const int strideh, const float* kernel, const int kw, const int kh, const int group, const int nThreads); 36 | void softmax(float* input, float n); 37 | bool pooling(float *A, float *B, const char *type, int input_channels, size_t kernelw, size_t kernelh, size_t outputw, 
size_t outputh, int output_channels); 38 | 39 | void naive_gemm(int M, int N, int L, float *A, float *B, float *C); 40 | 41 | void relu(float* arr, int len); 42 | void biasRelu(float* arr, int len, float bias); 43 | void reluVec(float* arr, int len); 44 | void biasVec(float* arr, int len, float bias); 45 | void biasReluVec(float* arr, int len, float bias); 46 | void reluVecOpenmp(float* arr, int len, int nThreads); 47 | void biasVecOpenmp(float* arr, int len, float bias, int nThreads); 48 | void biasReluVecOpenmp(float* arr, int len, float bias, int nThreads); 49 | }; -------------------------------------------------------------------------------- /src/booster/include/booster/helper.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | // #include "common.h" 17 | 18 | #if 0 19 | #include 20 | #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "FeatherLib", __VA_ARGS__) 21 | #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "FeatherLib", __VA_ARGS__) 22 | #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "FeatherLib", __VA_ARGS__) 23 | #else 24 | #include 25 | #define LOGI(...) 
fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n"); 26 | #define LOGD(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n"); 27 | #define LOGE(...) fprintf(stderr, __VA_ARGS__);fprintf(stderr,"\n"); 28 | #endif 29 | 30 | void print_floats(const float* arr, const int len); 31 | void print_floats(const float* arr, const int dimX, const int dimY); 32 | void diff(float* arr1, float* arr2, int len); 33 | void diff(float* arr1, float* arr2, int M, int N); 34 | 35 | #if __ARM_NEON 36 | #include 37 | 38 | void print_vec2(float32x4_t* vp); 39 | void print_vec3(float32x4_t* vp); 40 | void print_vec(float32x4_t* vp, const char* comment); 41 | void print_vec(float32x4_t* vp); 42 | void print_arr(float* vp); 43 | 44 | //Thanks nihui for this code snippet! 45 | #ifndef __aarch64__ 46 | 47 | #ifndef __APPLE__ 48 | //static inline float32x4_t vfmaq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b) 49 | //{ 50 | // return vmlaq_f32(_s, _a, _b); 51 | //} 52 | #endif 53 | static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane) 54 | { 55 | if (lane == 0) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0); 56 | else if (lane == 1) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 1); 57 | else if (lane == 2) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 0); 58 | else if (lane == 3) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 1); 59 | else return vdupq_n_f32(0.f); 60 | } 61 | #endif 62 | #endif 63 | 64 | #include 65 | class Timer 66 | { 67 | public: 68 | Timer() {} 69 | virtual ~Timer() {} 70 | void startBench(); 71 | double endBench(); 72 | void endBench(const char *commets); 73 | void endBench(const char *commets, double fold); 74 | private: 75 | timespec start, stop; 76 | }; 77 | -------------------------------------------------------------------------------- /src/booster/include/booster/sgeconv.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | /* 5 | * Performs 
single-float matrix multiply C = A * B in row-major fashion, 6 | * where C is MxN, A is MxK and B is KxN. 7 | * Allocation requirement: packA: M * K 8 | */ 9 | 10 | namespace booster 11 | { 12 | void pad_input_neon(booster::ConvParam *conv_param, float* padded_input, float* input); 13 | 14 | template 15 | void packed_sgeconv_init(booster::ConvParam* conv_param, int kc, float* packA, float* A); 16 | 17 | template 18 | void packed_sgeconv_im2col_activation(booster::ConvParam* conv_param, float *packA, float *B, const int ldb, float *C, const int ldc, const int nc, const int kc, float* bias, int num_threads, float* pack_array); 19 | }; -------------------------------------------------------------------------------- /src/booster/include/booster/sgemm.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | /* 18 | * Performs single-float matrix multiply C = A * B in row-major fashion, 19 | * where C is MxN, A is MxK and B is KxN. 
20 | * Allocation requirement: C: get_aligned_size(M, N), A: get_aligned_size(M, K) 21 | */ 22 | 23 | int get_aligned_size(int M, int N); 24 | 25 | template 26 | void packed_sgemm_init(int M, int K, int kc, float* packA, float* A, int lda); 27 | 28 | //void packed_sgemm(int M, int N, int K, float *packA, float *B, int ldb, float *C, int ldc, int nc, int kc); 29 | template 30 | void packed_sgemm_activation(int M, int N, int K, float *packA, float *b, int ldb, float *c, int ldc, int nc, int kc, float* bias, int num_threads, float* pack_array); 31 | -------------------------------------------------------------------------------- /src/booster/include/booster/sgemv.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #pragma once 16 | #include 17 | 18 | void matrixTranspose(float* array, size_t m, size_t n, float *buffer); 19 | template 20 | void fully_connected_inference_direct(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr); 21 | template 22 | void fully_connected_transpose_inference(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr); -------------------------------------------------------------------------------- /src/booster/include/booster/thpool.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | #pragma once 15 | 16 | #include 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | #include 26 | 27 | class ThreadPool { 28 | public: 29 | ThreadPool(size_t); 30 | template 31 | auto enqueue(F&& f, Args&&... 
args) 32 | -> std::future::type>; 33 | ~ThreadPool(); 34 | 35 | size_t threadNum(); 36 | int threadID(std::thread::id std_id); 37 | private: 38 | // need to keep track of threads so we can join them 39 | std::vector< std::thread > workers; 40 | // the task queue 41 | std::queue< std::function > tasks; 42 | 43 | // synchronization 44 | std::mutex queue_mutex; 45 | std::condition_variable condition; 46 | bool stop; 47 | 48 | //thread IDs 49 | std::map< std::thread::id, int > id_map; 50 | }; 51 | 52 | // the constructor just launches some amount of workers 53 | inline ThreadPool::ThreadPool(size_t threads) 54 | : stop(false) 55 | { 56 | for(size_t i = 0;i task; 64 | 65 | { 66 | std::unique_lock lock(this->queue_mutex); 67 | this->condition.wait(lock, 68 | [this]{ return this->stop || !this->tasks.empty(); }); 69 | if(this->stop && this->tasks.empty()) 70 | return; 71 | task = std::move(this->tasks.front()); 72 | this->tasks.pop(); 73 | 74 | this->id_map[std::this_thread::get_id()] = i; 75 | //std::cout << std::this_thread::get_id() << std::endl; 76 | } 77 | 78 | task(); 79 | } 80 | } 81 | ); 82 | } 83 | } 84 | 85 | // add new work item to the pool 86 | template 87 | auto ThreadPool::enqueue(F&& f, Args&&... args) 88 | -> std::future::type> 89 | { 90 | using return_type = typename std::result_of::type; 91 | 92 | auto task = std::make_shared< std::packaged_task >( 93 | std::bind(std::forward(f), std::forward(args)...) 
94 | ); 95 | 96 | std::future res = task->get_future(); 97 | { 98 | std::unique_lock lock(queue_mutex); 99 | 100 | // don't allow enqueueing after stopping the pool 101 | if(stop) 102 | throw std::runtime_error("enqueue on stopped ThreadPool"); 103 | 104 | tasks.emplace([task](){ (*task)(); }); 105 | } 106 | condition.notify_one(); 107 | return res; 108 | } 109 | 110 | // the destructor joins all threads 111 | inline ThreadPool::~ThreadPool() 112 | { 113 | { 114 | std::unique_lock lock(queue_mutex); 115 | stop = true; 116 | } 117 | condition.notify_all(); 118 | for(std::thread &worker : workers) 119 | worker.join(); 120 | } 121 | 122 | inline size_t ThreadPool::threadNum() 123 | { 124 | return workers.size(); 125 | } 126 | 127 | inline int ThreadPool::threadID(std::thread::id std_id) 128 | { 129 | return id_map[std_id]; 130 | } 131 | -------------------------------------------------------------------------------- /src/booster/include/booster/winograd_kernels.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #pragma once 16 | #include 17 | #include 18 | #include 19 | 20 | enum WinogradOutType 21 | { 22 | Nothing, Relu, Bias, BiasReLU 23 | }; 24 | 25 | //UT larger than 16 * inChannels * outChannels 26 | void transformKernel(float* UT, float* kernel, int inChannels, int outChannels, float *ST); 27 | 28 | //VT larger than 16 * (inputw / 2 - 1) * (inputh / 2 - 1) * inChannels 29 | //WT larger than 16 * (inputw / 2 - 1) * (inputh / 2 - 1) * outChannels 30 | void winogradNonFusedTransform(float *output, int outChannels, float* WT, float* VT, float* UT, float* input, int inChannels, int inputw, int inputh, WinogradOutType outType, float* biasArr, int num_threads); 31 | 32 | size_t getPackArraySize_F6x6_3x3(int inChannels, int num_threads); 33 | void transformKernel_F6x6_3x3(float* UT, float* kernel, int inChannels, int outChannels); 34 | void winogradNonFusedTransform_F6x6_3x3(float *output, int outChannels, float* WT, float* VT, float* UT, float* input, int inChannels, int inputw, int inputh, WinogradOutType outType, float* biasArr, float* pack_array, int num_threads); 35 | 36 | namespace Winograd_F63_Fused 37 | { 38 | void transformKernel_F6x6_3x3(float* UT, float* kernel, int inChannels, int outChannels); 39 | 40 | template 41 | void WinogradF63Fused(booster::ConvParam* conv_param, float* output, const float* input, const float* transformed_weights, const float* bias, float* buffers, ThreadPool* thpool); 42 | }; -------------------------------------------------------------------------------- /src/layer.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #ifndef _WIN32 16 | #include 17 | #endif 18 | 19 | #include "layer.h" 20 | 21 | namespace feather 22 | { 23 | Layer::Layer(RuntimeParameter* rt_param) 24 | : _fusible(false), 25 | _inplace(false), 26 | rt_param(rt_param), 27 | common_mempool(rt_param->common_mempool()) 28 | { 29 | } 30 | 31 | Layer::~Layer() 32 | { 33 | if (!_inplace) 34 | { 35 | for (int i = 0; i < tops.size(); ++i) 36 | { 37 | delete tops[i]; 38 | } 39 | } 40 | for (int i = 0; i < weights.size(); ++i) 41 | { 42 | delete weights[i]; 43 | } 44 | } 45 | 46 | int Layer::FindBottomIDByName(std::string name) 47 | { 48 | for( int i = 0; i < this->bottoms.size(); ++i) 49 | { 50 | if (this->bottoms[i]->name.compare(name) == 0) 51 | { 52 | return i; 53 | } 54 | } 55 | return -1; 56 | } 57 | 58 | int Layer::FindTopIDByName(std::string name) 59 | { 60 | for( int i = 0; i < this->tops.size(); ++i) 61 | { 62 | if (this->tops[i]->name.compare(name) == 0) 63 | { 64 | return i; 65 | } 66 | } 67 | return -1; 68 | } 69 | 70 | int Layer::LoadParam(const ncnn::ParamDict& pd) 71 | { 72 | // Do nothing. 73 | return 0; 74 | } 75 | 76 | int Layer::LoadWeights(const ncnn::ModelBin& mb) 77 | { 78 | // Do nothing 79 | return 0; 80 | } 81 | 82 | int Layer::TryFuse(Layer *next_layer) 83 | { 84 | //Judge if next_layer points to this layer. 
85 | for (int i = 0; i < next_layer->bottoms.size(); ++i) 86 | { 87 | for (int j = 0; j < this->tops.size(); ++j) 88 | { 89 | if (this->tops[j]->name.compare(next_layer->bottoms[i]->name) == 0) 90 | { 91 | return Fuse(next_layer); 92 | } 93 | } 94 | } 95 | return 0; 96 | } 97 | 98 | int Layer::Fuse(Layer* next_layer) 99 | { 100 | return 0; 101 | } 102 | 103 | int Layer::Reshape() 104 | { 105 | /* GenerateTopBlobs 106 | * infers top blob shape and allocate space. 107 | * 108 | * The default behavior is allocate a top with the same shape of bottom 109 | */ 110 | if (tops.size() != 1 || bottoms.size() != 1) 111 | return -400; //False calling base layer. 112 | tops[0]->ReshapeWithRealloc(bottoms[0]->num(), bottoms[0]->channels(), bottoms[0]->height(), bottoms[0]->width()); 113 | return 0; 114 | } 115 | 116 | int Layer::Init() 117 | { 118 | return 0; 119 | } 120 | 121 | int Layer::Forward() 122 | { 123 | return false; 124 | } 125 | 126 | int Layer::ForwardReshape() 127 | { 128 | //Default Reshape Assertation: 129 | // 130 | //There should be a single top blob as well as bottom blob. 131 | //The default behaviour is that the top blob is identical to the bottom blob 132 | //Use default reallocation. 133 | 134 | tops[0]->ReshapeWithRealloc(bottoms[0]); 135 | this->Forward(); 136 | return true; 137 | } 138 | 139 | bool Layer::fusible() const 140 | { 141 | return _fusible; 142 | } 143 | }; -------------------------------------------------------------------------------- /src/layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "blob.h" 18 | #include "mempool.h" 19 | #include "rt_param.h" 20 | #include "utils.h" 21 | 22 | #include "ncnn/paramdict.h" 23 | #include "ncnn/modelbin.h" 24 | 25 | #include 26 | 27 | namespace feather 28 | { 29 | class Layer 30 | { 31 | public: 32 | Layer(RuntimeParameter* rt_param); 33 | ~Layer(); 34 | 35 | /* LoadParam LoadWeights 36 | * 37 | * Load layer specifc paramters and corresponding weight data into memory. 38 | * The two functions rely on `ncnn` model files. 39 | */ 40 | virtual int LoadParam(const ncnn::ParamDict& pd); 41 | virtual int LoadWeights(const ncnn::ModelBin& mb); 42 | // int CopyDataFromMat(Blob* dst, const ncnn::Mat &src); 43 | 44 | /* GenerateTopBlobs 45 | * 46 | * Infer top blob shape and allocate memory. 47 | */ 48 | virtual int Reshape(); 49 | 50 | /* Init 51 | * 52 | * Preprocess the weights in order to reduce inference overhead. 53 | * Common memory pool first memory allocation occurs in this place 54 | * when specify an initial input. 55 | */ 56 | virtual int Init(); 57 | 58 | virtual int Forward(); 59 | virtual int ForwardReshape(); 60 | 61 | /* Fusion functions 62 | * 63 | * Layer fusion is an important technique to imporove memory accessing efficiency. 64 | * We currently support three patterns: Convolutoin-Bias-ReLU, BN-Scale-Relu, InnerProduct-Bias-ReLU 65 | */ 66 | virtual int Fuse(Layer* next_layer); //Fuse layers when possible. 
67 | int TryFuse(Layer *next_layer); 68 | bool fusible() const; 69 | 70 | /* Utility functions for blob retrieval by name*/ 71 | int FindBottomIDByName(std::string name); 72 | int FindTopIDByName(std::string name); 73 | 74 | public: // We just make everything public. Take care when you write a derived layer. 75 | std::string name; 76 | std::string type; 77 | 78 | std::vector* > bottoms; 79 | std::vector* > tops; 80 | 81 | std::vector* > weights; 82 | 83 | bool _fusible; 84 | bool _inplace; 85 | 86 | CommonMemPool *common_mempool; 87 | RuntimeParameter *rt_param; 88 | }; 89 | }; 90 | -------------------------------------------------------------------------------- /src/layer_factory.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #include "layer_factory.h" 16 | #include "layers/input_layer.h" 17 | #include "layers/conv_layer.h" 18 | #include "layers/pooling_layer.h" 19 | #include "layers/relu_layer.h" 20 | #include "layers/inner_product_layer.h" 21 | #include "layers/dropout_layer.h" 22 | #include "layers/softmax_layer.h" 23 | #include "layers/batchnorm_layer.h" 24 | #include "layers/scale_layer.h" 25 | #include "layers/split_layer.h" 26 | #include "layers/eltwise_layer.h" 27 | #include "layers/concat_layer.h" 28 | 29 | #include 30 | 31 | namespace feather 32 | { 33 | /* An example to register a layer: 34 | * 35 | * feather layer name: ConvLayer, ncnn type name: Convolution 36 | * 1. Define a layer creator: DEFINE_LAYER_CREATOR(Conv) 37 | * 2. Register layer in the register_layer_creators() function: REGISTER_LAYER_CREATOR(Convolution, Conv); 38 | */ 39 | 40 | DEFINE_LAYER_CREATOR(Input) 41 | DEFINE_LAYER_CREATOR(Conv) 42 | DEFINE_LAYER_CREATOR(Relu) 43 | DEFINE_LAYER_CREATOR(Pooling) 44 | DEFINE_LAYER_CREATOR(InnerProduct) 45 | DEFINE_LAYER_CREATOR(Dropout) 46 | DEFINE_LAYER_CREATOR(Softmax) 47 | DEFINE_LAYER_CREATOR(BatchNorm) 48 | DEFINE_LAYER_CREATOR(Scale) 49 | DEFINE_LAYER_CREATOR(Split) 50 | DEFINE_LAYER_CREATOR(Eltwise) 51 | DEFINE_LAYER_CREATOR(Concat) 52 | 53 | void register_layer_creators() 54 | { 55 | REGISTER_LAYER_CREATOR(Input, Input); 56 | REGISTER_LAYER_CREATOR(Convolution, Conv); 57 | REGISTER_LAYER_CREATOR(ConvolutionDepthWise, Conv); 58 | REGISTER_LAYER_CREATOR(ReLU, Relu); 59 | REGISTER_LAYER_CREATOR(Pooling, Pooling); 60 | REGISTER_LAYER_CREATOR(InnerProduct, InnerProduct); 61 | REGISTER_LAYER_CREATOR(Dropout, Dropout); 62 | REGISTER_LAYER_CREATOR(Softmax, Softmax); 63 | REGISTER_LAYER_CREATOR(BatchNorm, BatchNorm); 64 | REGISTER_LAYER_CREATOR(Scale, Scale); 65 | REGISTER_LAYER_CREATOR(Split, Split); 66 | REGISTER_LAYER_CREATOR(Eltwise, Eltwise); 67 | REGISTER_LAYER_CREATOR(Concat, Concat); 68 | } 69 | }; 70 | 
-------------------------------------------------------------------------------- /src/layer_factory.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | /* 16 | * The layer factory modifies from caffe. 17 | */ 18 | #pragma once 19 | 20 | #include "layer.h" 21 | 22 | #include 23 | #include 24 | #include 25 | 26 | using namespace std; 27 | 28 | namespace feather 29 | { 30 | class Layer; 31 | class LayerRegistry 32 | { 33 | public: 34 | typedef Layer* (*Creator)(RuntimeParameter *); 35 | typedef std::map CreatorRegistry; 36 | 37 | static CreatorRegistry &Registry() 38 | { 39 | static CreatorRegistry *g_registry_ = new CreatorRegistry(); 40 | return *g_registry_; 41 | } 42 | 43 | // Adds a creator. 44 | static void AddCreator(const string &type, Creator creator) 45 | { 46 | CreatorRegistry ®istry = Registry(); 47 | registry[type] = creator; 48 | } 49 | 50 | // Get a layer using a LayerParameter. 
51 | static Layer *CreateLayer(std::string type, RuntimeParameter *rt_param) 52 | { 53 | // const string &type = param->type()->str(); 54 | CreatorRegistry ®istry = Registry(); 55 | if (registry.find(type) != registry.end()) 56 | { 57 | return registry[type](rt_param); 58 | } 59 | else 60 | { 61 | fprintf(stderr, "Layer type %s is not supported in FeatherCNN...Aborting\n", type.c_str()); 62 | return NULL; 63 | } 64 | } 65 | 66 | private: 67 | // Layer registry should never be instantiated - everything is done with its 68 | // static variables. 69 | LayerRegistry() {} 70 | }; 71 | 72 | 73 | class LayerRegisterer 74 | { 75 | public: 76 | LayerRegisterer(const string &type, 77 | Layer * (*creator)(RuntimeParameter* )) 78 | { 79 | LayerRegistry::AddCreator(type, creator); 80 | } 81 | }; 82 | 83 | void register_layer_creators(); 84 | 85 | #define DEFINE_LAYER_CREATOR(feather_layer_name) \ 86 | static Layer *GetLayer##feather_layer_name(RuntimeParameter * rt_param) \ 87 | {return (Layer *) new feather_layer_name##Layer(rt_param);} 88 | 89 | #define REGISTER_LAYER_CREATOR(ncnn_type_name, feather_layer_name) \ 90 | static LayerRegisterer g_creator_f_##ncnn_type_name(#ncnn_type_name, GetLayer##feather_layer_name); 91 | }; 92 | -------------------------------------------------------------------------------- /src/layers/batchnorm_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | #include 20 | 21 | namespace feather 22 | { 23 | class BatchNormLayer : Layer 24 | { 25 | public: 26 | BatchNormLayer(RuntimeParameter* rt_param) 27 | : channels(0), 28 | scale_bias_term(false), 29 | fuse_scale(false), 30 | fuse_relu(false), 31 | Layer(rt_param) 32 | { 33 | _fusible = true; 34 | } 35 | 36 | int LoadParam(const ncnn::ParamDict &pd) 37 | { 38 | this->channels = pd.get(0, 0); 39 | this->eps = pd.get(1, 0.f); 40 | return 0; 41 | } 42 | 43 | int LoadWeights(const ncnn::ModelBin &mb) 44 | { 45 | ncnn::Mat slope_data, mean_data, var_data, bias_data; 46 | slope_data = mb.load(channels, 1); 47 | if (slope_data.empty()) 48 | return -100; 49 | 50 | mean_data = mb.load(channels, 1); 51 | if (mean_data.empty()) 52 | return -100; 53 | 54 | var_data = mb.load(channels, 1); 55 | if (var_data.empty()) 56 | return -100; 57 | 58 | bias_data = mb.load(channels, 1); 59 | if (bias_data.empty()) 60 | return -100; 61 | 62 | Blob * alpha_blob = new Blob; 63 | Blob * beta_blob = new Blob; 64 | this->weights.push_back(alpha_blob); 65 | this->weights.push_back(beta_blob); 66 | alpha_blob->ReshapeWithRealloc(1,1,1,channels); 67 | beta_blob->ReshapeWithRealloc(1,1,1,channels); 68 | float* alpha_data = alpha_blob->data(); 69 | float* beta_data = beta_blob->data(); 70 | for (int i = 0; i < channels; i++) 71 | { 72 | float sqrt_var = sqrt(var_data[i] + this->eps); 73 | alpha_data[i] = bias_data[i] - slope_data[i] * mean_data[i] / sqrt_var; 74 | beta_data[i] = slope_data[i] / sqrt_var; 
75 | } 76 | return 0; 77 | } 78 | 79 | int Init() 80 | { 81 | const Blob *p_blob = this->bottoms[0]; 82 | if (this->channels != p_blob->channels()) 83 | { 84 | printf("Mismatch channel in layer %s, expected %d but the bottom %s has %d channels.\n", this->name.c_str(), this->channels, p_blob->name.c_str(), p_blob->channels()); 85 | return -100; 86 | } 87 | SetKernel(); 88 | 89 | return 0; 90 | } 91 | 92 | int Forward() 93 | { 94 | const float *input = bottoms[0]->data(); 95 | float *output = tops[0]->data(); 96 | float* alpha_data = weights[0]->data(); 97 | float* beta_data = weights[1]->data(); 98 | float* scale_data = NULL; 99 | float* scale_bias_data = NULL; 100 | if (fuse_scale) 101 | { 102 | scale_data = weights[2]->data(); 103 | } 104 | if (scale_bias_term) 105 | { 106 | scale_bias_data = weights[3]->data(); 107 | } 108 | // memset(output, 0xFF, sizeof(float) * this->top_blob(0)->data_size()); 109 | size_t stride = bottoms[0]->width() * bottoms[0]->height(); 110 | bn_kernel(channels, stride, alpha_data, beta_data, scale_bias_data, scale_data, input, output, 1); 111 | return 0; 112 | } 113 | 114 | int Fuse(Layer *next_layer) 115 | { 116 | if (next_layer->type.compare("Scale") == 0) 117 | { 118 | printf("BN %s fuse Scale layer %s\n", this->name.c_str(), next_layer->name.c_str()); 119 | fuse_scale = true; 120 | for (int i = 0; i < next_layer->weights.size(); ++i) 121 | { 122 | Blob *p_blob = new Blob(); 123 | p_blob->Copy(next_layer->weights[i]); 124 | // _weight_blobs.push_back(p_blob); 125 | } 126 | // scale_bias_term = ((ScaleLayer *)next_layer)->bias_term; 127 | return 1; 128 | } 129 | else if (next_layer->type.compare("ReLU") == 0) 130 | { 131 | printf("BN %s fuse ReLU layer %s\n", this->name.c_str(), next_layer->name.c_str()); 132 | fuse_relu = true; 133 | return 1; 134 | } 135 | else 136 | return 0; 137 | } 138 | 139 | private: 140 | int SetKernel() 141 | { 142 | int pattern_code = 0; 143 | pattern_code += (scale_bias_term) ? 
0x1 : 0; 144 | pattern_code += (fuse_scale) ? 0x10 : 0; 145 | pattern_code += (fuse_relu) ? 0x100 : 0; 146 | //printf("pat_code %x\n", pat_code); 147 | switch (pattern_code) 148 | { 149 | case 0x000: 150 | bn_kernel = booster::batchnorm; 151 | break; 152 | case 0x001: 153 | bn_kernel = booster::batchnorm; 154 | break; 155 | case 0x010: 156 | bn_kernel = booster::batchnorm; 157 | break; 158 | case 0x011: 159 | bn_kernel = booster::batchnorm; 160 | break; 161 | case 0x100: 162 | bn_kernel = booster::batchnorm; 163 | break; 164 | case 0x101: 165 | bn_kernel = booster::batchnorm; 166 | break; 167 | case 0x110: 168 | bn_kernel = booster::batchnorm; 169 | break; 170 | case 0x111: 171 | bn_kernel = booster::batchnorm; 172 | break; 173 | default: 174 | fprintf(stderr, "Invalid pattern code 0x%x for batchnorm kernel\n", pattern_code); 175 | return -1; 176 | } 177 | return 0; 178 | } 179 | void (*bn_kernel)(const size_t channels, const size_t stride, const float* alpha, const float* beta, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads); 180 | 181 | private: 182 | // size_t input_channels; 183 | // size_t input_width; 184 | // size_t input_height; 185 | int channels; 186 | float eps; 187 | 188 | bool fuse_scale; 189 | bool scale_bias_term; 190 | bool fuse_relu; 191 | }; 192 | }; 193 | -------------------------------------------------------------------------------- /src/layers/concat_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | namespace feather 20 | { 21 | class ConcatLayer : public Layer 22 | { 23 | public: 24 | ConcatLayer(RuntimeParameter* rt_param) 25 | : Layer(rt_param), 26 | axis(0) 27 | { 28 | 29 | } 30 | 31 | int LoadParam(const ncnn::ParamDict& pd) 32 | { 33 | this->axis = pd.get(0, 0); 34 | return 0; 35 | } 36 | 37 | int Forward() 38 | { 39 | float* top_ptr = tops[0]->data(); 40 | for (int i = 0; i < bottoms.size(); ++i) 41 | { 42 | const float* bottom_ptr = bottoms[i]->data(); 43 | memcpy(top_ptr, bottom_ptr, sizeof(float) * bottoms[i]->data_size()); 44 | top_ptr += bottoms[i]->data_size(); 45 | } 46 | return 0; 47 | } 48 | 49 | int Reshape() 50 | { 51 | auto first_blob = this->bottoms[0]; 52 | size_t num = 1; 53 | size_t channels = first_blob->channels(); 54 | size_t width = first_blob->width(); 55 | size_t height = first_blob->height(); 56 | // printf("bottom %s\n", first_blob->name.c_str()); 57 | for (int i = 1; i < bottoms.size(); ++i) 58 | { 59 | auto p_blob = bottoms[i]; 60 | // printf("bottom %s\n", p_blob->name.c_str()); 61 | if (this->axis == 0) 62 | { 63 | if(!(width == p_blob->width() && height == p_blob->height())) 64 | { 65 | printf("Images of different shapes cannot be concatenated together\n"); 66 | return -100; 67 | } 68 | channels += p_blob->channels(); 69 | } 70 | else 71 | { 72 | LOGE("FeatherCNN only supports concat at axis = 0."); 73 | return -100; 74 | } 75 | } 76 | // printf("Concat output shape %d %d %d\n", channels, height, width); 77 | 
tops[0]->ReshapeWithRealloc(1, channels, height, width); 78 | return 0; 79 | } 80 | private: 81 | int axis; 82 | }; 83 | }; 84 | -------------------------------------------------------------------------------- /src/layers/conv_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | namespace feather 25 | { 26 | class ConvLayer : public Layer 27 | { 28 | public: 29 | ConvLayer(RuntimeParameter* rt_param) 30 | : Layer(rt_param), 31 | conv_booster(), 32 | conv_param(), 33 | bias_data(NULL), 34 | processed_kernel(NULL) 35 | { 36 | // this->_fusible = true; 37 | } 38 | 39 | int LoadParam(const ncnn::ParamDict& pd) 40 | { 41 | int dilation_w = pd.get(2, 1); 42 | int dilation_h = pd.get(12, dilation_w); 43 | if ((dilation_w > 1) || (dilation_h > 1)) 44 | { 45 | printf("Dilated convolution is not supported in FeatherCNN. Please refer to the ncnn repository.\n"); 46 | return -200; //Not supported 47 | } 48 | 49 | int int8_scale_term = pd.get(8, 0); 50 | if (int8_scale_term) 51 | { 52 | printf("Dilated convolution is not supported in FeatherCNN. 
Please refer to the ncnn repository.\n"); 53 | return -200; //Not supported 54 | } 55 | 56 | conv_param.kernel_w = pd.get(1, 0); 57 | conv_param.kernel_h = pd.get(11, conv_param.kernel_w); 58 | conv_param.stride_w = pd.get(3, 1); 59 | conv_param.stride_h = pd.get(13, conv_param.stride_w); 60 | conv_param.pad_left = pd.get(4, 0); 61 | conv_param.pad_bottom = pd.get(14, conv_param.pad_left); 62 | conv_param.pad_right = pd.get(4, 0); 63 | conv_param.pad_top = pd.get(14, conv_param.pad_left); 64 | conv_param.group = pd.get(7, 1); 65 | conv_param.output_channels = pd.get(0, 0); 66 | conv_param.bias_term = pd.get(5, 0); 67 | conv_param.activation = booster::None; 68 | int weight_data_size = pd.get(6, 0); 69 | if(conv_param.group==0 || conv_param.output_channels%conv_param.group) 70 | { 71 | printf("Layer %s output_channels is not divisible by its group\n", this->name); 72 | exit(0); 73 | } 74 | else conv_param.output_channels /= conv_param.group; 75 | conv_param.input_channels = weight_data_size / conv_param.output_channels / conv_param.kernel_h / conv_param.kernel_w; 76 | 77 | // printf("ic=%d oc=%d (kw,kh)=(%d,%d) (sw,sh)=(%d,%d) (pad)=(%d,%d,%d,%d) group=%d\n", conv_param.input_channels, conv_param.output_channels, conv_param.kernel_w, conv_param.kernel_h, conv_param.stride_w, conv_param.stride_h, conv_param.pad_left, conv_param.pad_bottom, conv_param.pad_right, conv_param.pad_top, conv_param.group); 78 | 79 | // The params are known, therefore we can allocate space for weights. 
80 | Blob *conv_weights = new Blob(this->name + "_weights"); 81 | conv_weights->ReshapeWithRealloc(conv_param.output_channels, conv_param.input_channels, conv_param.kernel_h, conv_param.kernel_w); 82 | weights.push_back(conv_weights); 83 | if (conv_param.bias_term) 84 | { 85 | Blob *bias_weights = new Blob(this->name + "_bias"); 86 | bias_weights->ReshapeWithRealloc(conv_param.output_channels, 1, 1, 1); 87 | weights.push_back(bias_weights); 88 | } 89 | return 0; 90 | } 91 | 92 | int Reshape() 93 | { 94 | // Allocate space for the layer's own top. 95 | const Blob *bottom_blob = this->bottoms[0]; 96 | conv_param.input_w = bottom_blob->width(); 97 | conv_param.input_h = bottom_blob->height(); 98 | if (conv_param.input_channels != bottom_blob->channels()) 99 | { 100 | LOGE("Loaded convolution layer %s has %d input channels while bottom blob has %zu channels\n", this->name.c_str(), conv_param.input_channels, bottom_blob->channels()); 101 | return -300; //Topology error 102 | } 103 | // printf("##########################\n"); 104 | // conv_param.LogParams(this->name.c_str()); 105 | conv_param.AssignOutputDim(); 106 | // conv_param.LogParams(this->name.c_str()); 107 | tops[0]->ReshapeWithRealloc(1, conv_param.output_channels, conv_param.output_h, conv_param.output_w); 108 | conv_booster.SelectAlgo(&this->conv_param); 109 | int buffer_size = 0; 110 | int dull = 0; 111 | int ret = conv_booster.GetBufferSize(&conv_param, &buffer_size, &dull); 112 | MEMPOOL_CHECK_RETURN(this->common_mempool->Request(sizeof(float) * buffer_size)); 113 | return 0; 114 | } 115 | 116 | int LoadWeights(const ncnn::ModelBin& mb) 117 | { 118 | int weight_data_size = conv_param.input_channels * conv_param.output_channels * conv_param.kernel_h * conv_param.kernel_w; 119 | ncnn::Mat weight_data = mb.load(weight_data_size, 0); 120 | if (weight_data.empty()) 121 | return -100; 122 | if (this->weights.empty()) 123 | return -100; 124 | this->weights[0]->CopyDataFromMat(weight_data); 125 | 126 | if 
(conv_param.bias_term) 127 | { 128 | ncnn::Mat bias_data = mb.load(conv_param.output_channels, 1); 129 | if (bias_data.empty()) 130 | return -100; 131 | if (this->weights.size() < 2) 132 | { 133 | LOGE("In layer %s: Bias weight blob not allocated.", this->name.c_str()); 134 | return -100; 135 | } 136 | weights[1]->CopyDataFromMat(bias_data); 137 | } 138 | return 0; 139 | } 140 | 141 | int Forward() 142 | { 143 | // conv_param.LogParams(this->name().c_str()); 144 | float* input = this->bottoms[0]->data(); 145 | float* output = this->tops[0]->data(); 146 | float* buffer = NULL; 147 | MEMPOOL_CHECK_RETURN(this->common_mempool->GetPtr(&buffer)); 148 | printf("thread num =%d\n", this->rt_param->num_threads()); 149 | conv_booster.Forward(&conv_param, output, input, processed_kernel, buffer, bias_data, this->rt_param->num_threads()); 150 | return 0; 151 | } 152 | 153 | int Init() 154 | { 155 | int buffer_size = 0; 156 | int processed_kernel_size = 0; 157 | int ret = conv_booster.GetBufferSize(&conv_param, &buffer_size, &processed_kernel_size); 158 | Blob * processed_weights = new Blob(this->name + "_proc_weights"); 159 | processed_weights->ReshapeWithRealloc(1, 1, 1, processed_kernel_size); 160 | float* kernel_data = this->weights[0]->data(); 161 | float* processed_kernel = processed_weights->data(); 162 | conv_booster.Init(&conv_param, processed_kernel, kernel_data); 163 | delete this->weights[0]; 164 | this->weights[0] = processed_weights; 165 | this->processed_kernel = processed_weights->data(); 166 | if (conv_param.bias_term) 167 | { 168 | bias_data = this->weights[1]->data(); 169 | } 170 | // MEMPOOL_CHECK_RETURN(this->common_mempool->Request(sizeof(float) * buffer_size)); 171 | return 0; 172 | } 173 | 174 | int Fuse(Layer *next_layer) 175 | { 176 | if (next_layer->type.compare("ReLU") == 0) 177 | { 178 | conv_param.activation = booster::ReLU; 179 | return 1; 180 | } 181 | else 182 | { 183 | return 0; 184 | } 185 | } 186 | 187 | protected: 188 | booster::ConvBooster 
conv_booster; 189 | booster::ConvParam conv_param; 190 | 191 | float *bias_data; 192 | float *processed_kernel; 193 | }; 194 | }; 195 | -------------------------------------------------------------------------------- /src/layers/dropout_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | namespace feather 20 | { 21 | class DropoutLayer : public Layer 22 | { 23 | public: 24 | DropoutLayer(RuntimeParameter* rt_param) 25 | : Layer(rt_param) 26 | { 27 | _inplace = false; 28 | } 29 | 30 | int LoadParam(const ncnn::ParamDict &pd) 31 | { 32 | scale = pd.get(0, 1.f); 33 | return 0; 34 | } 35 | 36 | int Forward() 37 | { 38 | if (scale == 1.f) 39 | { 40 | memcpy(tops[0]->data(), bottoms[0]->data(), bottoms[0]->data_size() * sizeof(float)); 41 | } 42 | else 43 | { 44 | int w = bottoms[0]->width(); 45 | int h = bottoms[0]->height(); 46 | int channels = bottoms[0]->channels(); 47 | int size = w * h; 48 | 49 | float* inp = bottoms[0]->data(); 50 | float* outp = tops[0]->data(); 51 | for (int i = 0; i < bottoms[0]->data_size(); ++i) 52 | { 53 | outp[i] = inp[i] * scale; 54 | } 55 | } 56 | return 0; 57 | } 58 | private: 59 | float scale; 60 | }; 61 | }; -------------------------------------------------------------------------------- /src/layers/eltwise_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
#pragma once

#include "../layer.h"

namespace feather
{
/* EltwiseLayer
 * Element-wise combination of bottoms. Only SUM (op_type == 1) without
 * coefficients is supported; everything else is rejected in LoadParam.
 */
class EltwiseLayer : public Layer
{
    public:
        EltwiseLayer(RuntimeParameter* rt_param)
            : Layer(rt_param),
              op_type(1),
              fuse_relu(0)
        {
        }

        // Validates that all bottoms share one shape and sizes every top to it.
        int Reshape()
        {
            size_t n = bottoms[0]->num();
            size_t c = bottoms[0]->channels();
            size_t h = bottoms[0]->height();
            size_t w = bottoms[0]->width();
            // Check bottom shapes
            for (int i = 1; i < bottoms.size(); ++i)
            {
                if ((n != bottoms[i]->num()) || (c != bottoms[i]->channels()) || (h != bottoms[i]->height()) || (w != bottoms[i]->width()))
                {
                    LOGE("Shape mismatch among bottoms of layer %s.", this->name.c_str());
                    return -100;
                }
            }
            for (int i = 0; i < tops.size(); ++i)
            {
                tops[i]->ReshapeWithRealloc(n, c, h, w);
            }
            return 0;
        }

        // Accepts only coefficient-free SUM; anything else returns -100.
        int LoadParam(const ncnn::ParamDict &pd)
        {
            op_type = pd.get(0, 0);
            ncnn::Mat coeffs = pd.get(1, ncnn::Mat());
            if (!coeffs.empty())
            {
                LOGE("FeatherCNN doesn't support coeffs in eltwise layer. Please refer to ncnn.");
                return -100;
            }
            if (op_type != 1)
            {
                LOGE("FeatherCNN doesn't support ops rather than SUM. Please refer to ncnn.");
                return -100;
            }
            return 0;
        }

        // NOTE(review): only bottoms[0] and bottoms[1] are combined even
        // though Reshape accepts more bottoms — confirm callers never wire
        // more than two.
        int Forward()
        {
            float* input_alpha = bottoms[0]->data();
            float* input_beta = bottoms[1]->data();
            float* output_data = tops[0]->data();
            size_t data_size = bottoms[0]->data_size();

            // NOTE(review): the template arguments on booster::add_relu were
            // stripped by extraction — both branches read identically in this
            // copy; presumably <true>/<false> for the ReLU variant. Restore
            // from the booster header.
            if (fuse_relu)
                booster::add_relu(output_data, input_alpha, input_beta, data_size, 1);
            else
                booster::add_relu(output_data, input_alpha, input_beta, data_size, 1);
            return 0;
        }

        enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };

    private:
        int op_type;   // ncnn eltwise op; only Operation_SUM supported
        // NOTE(review): fuse_relu is never set anywhere visible in this file
        // (_fusible stays false and there is no Fuse override) — confirm.
        int fuse_relu;

};
};
#pragma once

#include "../layer.h"
#include "booster/sgemv.h"

// NOTE(review): the next two #include targets were lost when this dump was
// generated (angle-bracketed text stripped) -- likely <cstdio> and <cstdlib>;
// confirm against the original file.
#include
#include

namespace feather
{
// Fully-connected (inner-product) layer: y = W * x (+ optional bias), executed
// through a function-pointer kernel selected once in Init().
class InnerProductLayer : public Layer
{
    public:
        InnerProductLayer(RuntimeParameter* rt_param)
            // NOTE(review): members initialize in declaration order, so the
            // Layer base is constructed before fuse_relu despite this list's
            // order; most compilers warn about the reordering.
            : fuse_relu(false), Layer(rt_param)
        {
        }

        // Runs the selected sgemv kernel on bottoms[0], writing into tops[0].
        int Forward()
        {
            //this->bottom_blob(0)->PrintBlobInfo();
            //this->top_blob(0)->PrintBlobInfo();
            const float *input = bottoms[0]->data();
            float *output = tops[0]->data();
            sgemv_kernel((int)input_size, (int)output_size, input, kernel_data, output, rt_param->num_threads(), bias_data);
            return 0;
        }

        // Absorbs an immediately-following ReLU layer. Returns 1 when fused
        // (caller may drop the ReLU layer), 0 otherwise.
        int Fuse(Layer *next_layer)
        {
            if (next_layer->type.compare("ReLU") == 0)
            {
                fuse_relu = true;
                return 1;
            }
            else
            {
                return 0;
            }
        }

        // Selects the sgemv kernel, and for 8-aligned shapes transposes each
        // 8-row panel of the weight matrix via a temporary scratch blob.
        int Init()
        {
            // NOTE(review): Blob's template argument (upstream: Blob<float>)
            // was stripped by the extraction, here and below.
            Blob *p_blob = new Blob;
            // NOTE(review): input_size is size_t; "%d" is the wrong printf
            // specifier ("%zu" is correct).
            printf("input size %d\n", input_size);
            p_blob->ReshapeWithRealloc(1, 1, 1, input_size * 8);
            this->kernel_data = weights[0]->data();
            float* buffer = p_blob->data();
            if (input_size % 8 == 0 && output_size % 8 == 0)
            {
                // In-place panel transpose of W using the scratch buffer.
                // (int loop variable compared against size_t expression.)
                for (int i = 0; i < output_size / 8; i++)
                    matrixTranspose(this->kernel_data + i * 8 * input_size, 8, input_size, buffer);
            }
            delete p_blob;
            if (output_size % 8 == 0 && input_size % 8 == 0)
            {
                // NOTE(review): all four branches assign the same symbol. The
                // upstream code selects template specializations (e.g.
                // fully_connected_transpose_inference<bias, relu>) whose <...>
                // arguments were stripped by the extraction -- confirm against
                // the FeatherCNN sources before simplifying this chain.
                if (bias_term && fuse_relu)
                    sgemv_kernel = fully_connected_transpose_inference;
                else if (bias_term && !fuse_relu)
                    sgemv_kernel = fully_connected_transpose_inference;
                else if (!bias_term && fuse_relu)
                    sgemv_kernel = fully_connected_transpose_inference;
                else if (!bias_term && !fuse_relu)
                    sgemv_kernel = fully_connected_transpose_inference;
            }
            else
            {
                // Same stripped-template concern as above.
                if (bias_term && fuse_relu)
                    sgemv_kernel = fully_connected_inference_direct;
                else if (bias_term && !fuse_relu)
                    sgemv_kernel = fully_connected_inference_direct;
                else if (!bias_term && fuse_relu)
                    sgemv_kernel = fully_connected_inference_direct;
                else if (!bias_term && !fuse_relu)
                    sgemv_kernel = fully_connected_inference_direct;
            }

            // NOTE(review): weights[1] is only pushed by LoadParam when
            // bias_term is set; this unconditional access indexes past the end
            // of `weights` for bias-free layers. Guard with `if (bias_term)`.
            this->bias_data = this->weights[1]->data();
            return 0;
        }

        // Validates the bottom blob's element count against the expected
        // input_size and reallocates tops[0] as 1 x output_size x 1 x 1.
        int Reshape()
        {
            // Allocate space for the layer's own top.
            const Blob *bottom_blob = bottoms[0];
            // NOTE(review): these three locals are unused (superseded by the
            // commented-out input_size computation below).
            int input_width = bottom_blob->width();
            int input_height = bottom_blob->height();
            int input_channels = bottom_blob->channels();
            // this->input_size = input_width * input_height * input_channels;
            if (input_size != bottom_blob->data_size())
            {
                LOGE("In Layer %s: Bottom %s data size %zu is inconsistant with expected input size %zu.", this->name.c_str(), bottom_blob->name.c_str(), bottom_blob->data_size(), input_size);
                return -100;
            }
            this->tops[0]->ReshapeWithRealloc(1, output_size, 1, 1);
            return 0;
        }

        // Reads ncnn InnerProduct params (id 0: num_output, 1: bias_term,
        // 2: weight_data_size) and allocates the weight blobs accordingly.
        int LoadParam(const ncnn::ParamDict &pd)
        {
            this->output_size = pd.get(0, 0);
            this->bias_term = pd.get(1, 0);
            this->weight_data_size = pd.get(2, 0);
            // NOTE(review): divides by output_size with no zero check; a
            // malformed param file (num_output == 0) divides by zero here.
            this->input_size = this->weight_data_size / this->output_size;

            // The params are known, therefore we can allocate space for weights.
            Blob *fc_weights = new Blob(this->name + "_weights");
            fc_weights->ReshapeWithRealloc(this->output_size, this->input_size, 1, 1);
            weights.push_back(fc_weights);
            if (this->bias_term)
            {
                Blob *bias_weights = new Blob(this->name + "_bias");
                bias_weights->ReshapeWithRealloc(output_size, 1, 1, 1);
                weights.push_back(bias_weights);
            }
            return 0;
        }

        // Copies weight (and optional bias) data from the ncnn model file
        // into the blobs allocated by LoadParam.
        int LoadWeights(const ncnn::ModelBin& mb)
        {
            printf("Loading dimension %zu %zu\n", output_size, input_size);
            printf("weight data size %zu\n", weight_data_size);
            ncnn::Mat weight_data = mb.load(weight_data_size, 0);
            if (weight_data.empty())
                return -100;
            if (this->weights.empty())
                return -100;
            this->weights[0]->CopyDataFromMat(weight_data);

            if (this->bias_term)
            {
                ncnn::Mat bias_data = mb.load(output_size, 1);
                if (bias_data.empty())
                    return -100;
                if (this->weights.size() < 2)
                {
                    LOGE("In layer %s: Bias weight blob not allocated.", this->name.c_str());
                    return -100;
                }
                weights[1]->CopyDataFromMat(bias_data);
            }
            return 0;
        }
    protected:
        size_t weight_data_size;  // total float count of the weight matrix

        size_t input_size;        // K: input feature length
        size_t output_size;       // N: output feature length

        bool bias_term;           // true when a bias blob follows the weights

        float *kernel_data;       // borrowed pointer into weights[0]
        float *bias_data;         // borrowed pointer into weights[1] (see Init note)

        bool fuse_relu;           // set by Fuse() when a following ReLU was absorbed
        void (*sgemv_kernel)(const int, const int, const float *, const float *, float *, const int, float*);
};
};
-------------------------------------------------------------------------------- /src/layers/input_layer.h: --------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.

#pragma once

#include "../layer.h"
// NOTE(review): the three #include targets below were stripped by the
// extraction; confirm against the original file.
#include
#include

#include

namespace feather
{
// Network entry layer. Holds the input blob; it performs no computation of
// its own and deliberately overrides Reshape()/Init() to no-ops.
class InputLayer : public Layer
{
    public:
        InputLayer(RuntimeParameter* rt_param)
            : Layer(rt_param)
        {
        }

        // Reads ncnn Input params (0: w, 1: h, 2: c).
        // NOTE(review): w/h/c are currently unused -- the reshape that would
        // consume them is commented out; presumably the caller sizes the input
        // blob instead. Verify before removing the reads.
        int LoadParam(const ncnn::ParamDict& pd)
        {
            int w = pd.get(0, 0);
            int h = pd.get(1, 0);
            int c = pd.get(2, 0);
            // this->tops[0]->ReshapeWithRealloc(1, c, h, w);
            return 0;
        }

        int Reshape()
        {
            // Nothing to do, don't call base class version.
            return 0;
        }

        // No weights, no kernel selection: nothing to initialize.
        int Init()
        {
            return 0;
        }
};
};
-------------------------------------------------------------------------------- /src/layers/pooling_layer.h: --------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License.
You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.

#pragma once

#include "../layer.h"

// NOTE(review): the two #include targets below were stripped by the extraction
// (the code uses ceil and std::numeric_limits, suggesting <cmath>/<limits>);
// confirm against the original file.
#include
#include

// NOTE(review): these macros lack outer parentheses, so an expression such as
// `2 * MAX(a, b)` parses as `(2 * ((a)>(b))) ? (a) : (b)`. The usages below are
// standalone initializers and unaffected, but the macros are unsafe in general.
#define MAX(a,b) ((a)>(b))?(a):(b)
#define MIN(a,b) ((a)<(b))?(a):(b)

namespace feather
{
// Max / average pooling over each channel independently.
// pooling_type == 0 selects max pooling; any other value selects average.
class PoolingLayer : public Layer
{
    public:
        PoolingLayer(RuntimeParameter* rt_param)
            : stride_h(1),
              stride_w(1),
              Layer(rt_param)
        {
        }


        // Single-threaded reference pooling over bottoms[0] into tops[0].
        int Forward()
        {
            const float *input = bottoms[0]->data();
            float *output = tops[0]->data();
            // NOTE(review): shadowed by the inner `p` below; this one is unused.
            float *p = output;

            // NOTE(review): unused leftover of a collapsed-loop variant (see
            // the commented pragma and index-recovery line below).
            int slot = input_channels * output_h;

            //#pragma omp parallel for schedule(static) num_threads(num_threads)
            for (int i = 0; i < input_channels; ++i)
            {
                for (int j = 0; j < output_h; j ++)
                {
                    // int i=slot/output_h, j=slot%output_h;
                    float *p = output + i * output_h * output_w + j * output_w;
                    // Row init: 0 for average pooling, -max for max pooling.
                    // NOTE(review): numeric_limits' template argument was
                    // stripped by the extraction (upstream:
                    // std::numeric_limits<float>::max()).
                    for (int l = 0; l < output_w; l++)
                        p[l] = (this->pooling_type != 0 ? 0 : -1 * std::numeric_limits::max());

                    // Vertical input window [x_min, x_max) for output row j.
                    // NOTE(review): subtracting BOTH pad_top and pad_bottom
                    // looks wrong -- the usual formula offsets by pad_top
                    // only; confirm against a reference implementation.
                    int tmp_pos = j * stride_h - pad_top - pad_bottom;
                    int x_min = MAX(tmp_pos, 0);
                    int x_max = MIN((int)(tmp_pos + kernel_h), (int) input_h);

                    for (int k = 0; k < output_w; k ++)
                    {
                        int counter = 0;
                        float total = (this->pooling_type != 0 ? 0 : -1 * std::numeric_limits::max());
                        for (int x = x_min; x < x_max; ++x)
                        {
                            int xpos = i * input_h * input_w + x * input_w;
                            // Horizontal window [y_min, y_max); same double-pad
                            // subtraction concern as above (left AND right).
                            int local_pos = k * stride_w - pad_left - pad_right;
                            int y_min = MAX(local_pos, 0);
                            int y_max = MIN((int)(local_pos + kernel_w), (int) input_w);

                            for (int y = y_min; y < y_max; ++y)
                            {
                                float value = input[xpos + y];
                                if (this->pooling_type != 0)
                                    total += value, counter++;
                                else
                                    total = total > value ? total : value;
                            }
                        }
                        if (this->pooling_type != 0)
                            // NOTE(review): `counter` is 0 when the window lies
                            // entirely inside padding -> division by zero.
                            p[k] += total / (counter);
                        else
                            p[k] = (p[k] > total) ? p[k] : total;
                    }
                }
            }
            return 0;
        }

        // Reads ncnn Pooling params; secondary IDs (11..15) default to their
        // primary counterparts, mirroring ncnn's param conventions.
        int LoadParam(const ncnn::ParamDict& pd)
        {
            pooling_type = pd.get(0, 0); //Pooling type?
            kernel_w = pd.get(1, 0);
            kernel_h = pd.get(11, kernel_w);
            stride_w = pd.get(2, 1);
            stride_h = pd.get(12, stride_w);
            pad_left = pd.get(3, 0);
            pad_right = pd.get(14, pad_left);
            pad_top = pd.get(13, pad_left);
            pad_bottom = pd.get(15, pad_top);
            global_pooling = pd.get(4, 0);
            tf_pad_mode = pd.get(5, 0);
            // printf("$$ global_pooling %d\n", global_pooling);
            // printf("$$ padding %d %d %d %d\n", pad_left, pad_bottom, pad_right, pad_top);
            // printf("$$ stride %d %d\n", stride_h, stride_w);
            return 0;
        }

        // Computes the output geometry (whole-image kernel when
        // global_pooling) and reallocates tops[0].
        int Reshape()
        {
            //Only accept a single bottom blob.
            const Blob *bottom_blob = bottoms[0];
            input_h = bottom_blob->height();
            input_w = bottom_blob->width();
            input_channels = bottom_blob->channels();
            // printf("$$ input %d %d %d\n", input_channels, input_h, input_w);
            if (global_pooling)
            {
                kernel_h = input_h;
                kernel_w = input_w;
                output_h = 1;
                output_w = 1;
                output_channels = input_channels;
            }
            else
            {
                //General pooling.
                output_channels = input_channels;
                // NOTE(review): the static_cast target types were stripped by
                // the extraction (ceil of a float division, cast back to int).
                output_h = static_cast(ceil(static_cast(input_h + pad_top + pad_bottom - kernel_h) / stride_h)) + 1;
                output_w = static_cast(ceil(static_cast(input_w + pad_left + pad_right - kernel_w) / stride_w)) + 1;
            }
            this->tops[0]->ReshapeWithRealloc(1, output_channels, output_h, output_w);
            return 0;
        }

    private:
        int input_h;
        int input_w;
        int input_channels;
        int output_h;
        int output_w;
        int output_channels;
        int pad_left;
        int pad_bottom;
        int pad_right;
        int pad_top;
        int kernel_h;
        int kernel_w;
        int stride_h;
        int stride_w;
        bool global_pooling;  // kernel spans the whole input plane
        int pooling_type;     // 0 = max, otherwise average
        int tf_pad_mode;      // parsed but not consumed in this chunk
};
};
-------------------------------------------------------------------------------- /src/layers/relu_layer.h: --------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | namespace feather 20 | { 21 | class ReluLayer : public Layer 22 | { 23 | public: 24 | ReluLayer(RuntimeParameter* rt_param) 25 | : Layer(rt_param) 26 | { 27 | } 28 | 29 | int Forward() 30 | { 31 | const Blob *p_bottom = bottoms[0]; 32 | const float *input = p_bottom->data(); 33 | const size_t data_size = p_bottom->num() * p_bottom->channels() * p_bottom->height() * p_bottom->width(); 34 | 35 | float *output = tops[0]->data(); 36 | for (size_t i = 0; i < data_size; ++i) 37 | { 38 | output[i] = input[i] > 0 ? input[i] : 0; 39 | } 40 | return 0; 41 | } 42 | }; 43 | }; 44 | -------------------------------------------------------------------------------- /src/layers/scale_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
#pragma once

#include "../layer.h"
// NOTE(review): this #include target was stripped by the extraction (the code
// references booster::scale, so likely a booster kernels header); confirm.
#include

namespace feather
{
// Per-channel affine layer: output = input * scale (+ optional bias),
// dispatched through a function-pointer kernel chosen in Init().
class ScaleLayer : public Layer
{
    public:
        ScaleLayer(RuntimeParameter* rt_param)
            // NOTE(review): the Layer base is constructed before these members
            // regardless of list order; compilers warn about the reordering.
            : channels(0),
              bias_term(0),
              scale_data_size(0),
              Layer(rt_param)
        {
        }

        // Reads ncnn Scale params (0: scale_data_size, 1: bias_term).
        // NOTE(review): rejecting every negative size also rejects the ncnn
        // sentinel -233, which LoadWeights below still tests for -- that
        // branch is therefore unreachable; confirm which behavior is intended.
        int LoadParam(const ncnn::ParamDict &pd)
        {
            scale_data_size = pd.get(0, 0);
            bias_term = pd.get(1, 0);
            if (scale_data_size < 0)
            {
                LOGE("feather doesn't accept negative scale data size, please use ncnn to run this model.\n");
                return -100;
            }
            return 0;
        }

        // Loads the scale vector (and optional bias vector) of length
        // scale_data_size into weight blobs.
        int LoadWeights(const ncnn::ModelBin &mb)
        {
            ncnn::Mat scale_mat;
            ncnn::Mat bias_mat;
            // Unreachable after LoadParam's negative-size rejection; see note above.
            if (scale_data_size == -233)
                return 0;

            scale_mat = mb.load(scale_data_size, 1);
            if (scale_mat.empty())
                return -100;
            channels = scale_data_size;
            // NOTE(review): Blob's template argument (upstream: Blob<float>)
            // was stripped by the extraction, here and below.
            Blob *scale_blob = new Blob;
            scale_blob->ReshapeWithRealloc(1, 1, 1, channels);
            scale_blob->CopyDataFromMat(scale_mat);
            weights.push_back(scale_blob);

            if (bias_term)
            {
                bias_mat = mb.load(scale_data_size, 1);
                if (bias_mat.empty())
                    return -100;
                Blob *bias_blob = new Blob;
                bias_blob->ReshapeWithRealloc(1, 1, 1, channels);
                bias_blob->CopyDataFromMat(bias_mat);
                weights.push_back(bias_blob);
            }
            return 0;
        }

        // Applies the selected scale kernel channel-by-channel.
        int Forward()
        {
            const float *input = bottoms[0]->data();
            float *output = tops[0]->data();
            size_t stride = bottoms[0]->width() * bottoms[0]->height();
            const float* scale_data = weights[0]->data();
            const float* bias_data = NULL;
            if (bias_term)
                bias_data = weights[1]->data();
            // NOTE(review): thread count is hardcoded to 1 here rather than
            // taken from rt_param; confirm whether that is intentional.
            scale_kernel(channels, stride, bias_data, scale_data, input, output, 1);
            return 0;
        }

        // Chooses the kernel variant based on bias_term.
        int Init()
        {
            // const Blob *p_blob = bottoms[0];
            // input_channels = p_blob->channels();
            // input_height = p_blob->height();
            // input_width = p_blob->width();
            // printf("input %d %d %d", input_channels, input_width, input_height);
            // scale_data = _weight_blobs[0]->data();
            //printf("bias_term %d\n", _bias_term ? 1: 0);
            // NOTE(review): both branches assign the same symbol; upstream
            // selects template specializations (e.g. booster::scale<bias>)
            // whose <...> arguments were stripped by the extraction.
            if (bias_term)
            {
                scale_kernel = booster::scale;
            }
            else
            {
                scale_kernel = booster::scale;
            }
            return 0;
        }

    private:
        size_t channels;        // element count of the scale vector
        int bias_term;          // nonzero when a bias vector follows the scales
        int scale_data_size;    // raw param value; copied into channels on load

    private:
        void (*scale_kernel)(const size_t channels, const size_t stride, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads);
};
};
-------------------------------------------------------------------------------- /src/layers/softmax_layer.h: --------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | #include 20 | #include 21 | 22 | namespace feather 23 | { 24 | class SoftmaxLayer : public Layer 25 | { 26 | public: 27 | SoftmaxLayer(RuntimeParameter* rt_param) 28 | : Layer(rt_param) 29 | { 30 | } 31 | 32 | int Forward() 33 | { 34 | const Blob *p_bottom = bottoms[0]; 35 | const float *input = p_bottom->data(); 36 | const size_t data_size = p_bottom->num() * p_bottom->channels() * p_bottom->height() * p_bottom->width(); 37 | float *output = tops[0]->data(); 38 | 39 | float sum = 0.0; 40 | float max = -FLT_MAX; 41 | for (size_t i = 0; i < data_size; ++i) 42 | { 43 | max = std::max(max, input[i]); 44 | } 45 | for (size_t i = 0; i < data_size; ++i) 46 | { 47 | output[i] = static_cast(exp(input[i] - max)); 48 | sum += output[i]; 49 | } 50 | for (size_t i = 0; i < data_size; ++i) 51 | { 52 | output[i] = output[i] / sum; 53 | } 54 | return 0; 55 | } 56 | }; 57 | }; 58 | -------------------------------------------------------------------------------- /src/layers/split_layer.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
14 | 15 | #pragma once 16 | 17 | #include "../layer.h" 18 | 19 | namespace feather 20 | { 21 | class SplitLayer : public Layer 22 | { 23 | public: 24 | SplitLayer(RuntimeParameter* rt_param) 25 | : Layer(rt_param) 26 | { 27 | 28 | } 29 | 30 | int Reshape() 31 | { 32 | size_t n = bottoms[0]->num(); 33 | size_t c = bottoms[0]->channels(); 34 | size_t h = bottoms[0]->height(); 35 | size_t w = bottoms[0]->width(); 36 | for (int i = 0; i < tops.size(); ++i) 37 | { 38 | tops[i]->ReshapeWithRealloc(n, c, h, w); 39 | } 40 | return 0; 41 | } 42 | 43 | int Forward() 44 | { 45 | float* src_data = bottoms[0]->data(); 46 | size_t data_size = bottoms[0]->data_size(); 47 | 48 | for (int i = 0; i < tops.size(); ++i) 49 | { 50 | memcpy(tops[i]->data(), src_data, sizeof(float) * data_size); 51 | } 52 | return 0; 53 | } 54 | }; 55 | }; -------------------------------------------------------------------------------- /src/mempool.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 
#include "mempool.h"

// NOTE(review): the two #include targets below were stripped by the extraction
// (the code uses fprintf and _mm_malloc -- likely <stdio.h> and an intrinsics
// or mm_malloc header); confirm against the original file.
#include
#include

#include "utils.h"

// NOTE(review): throughout this file the extraction stripped the template
// parameter lists -- upstream reads `template <typename PTR_TYPE>` and
// `CommonMemPool<PTR_TYPE>::...`; the PTR_TYPE usages below confirm this.

// Destructor: release everything still held by the pool.
template
CommonMemPool::~CommonMemPool()
{
    if (common_memory || common_size_map.size() || common_ptr_map.size())
    {
        Free();
    }
}

// Allocates the single common buffer (sized by prior Request() calls) plus one
// buffer per entry of common_size_map, all 128-byte aligned.
template
bool CommonMemPool::Alloc()
{
    if (common_memory)
    {
        fprintf(stderr, "Error: common memory already allocated.\n");
        return false;
    }
    if (common_size > 0)
    {
        common_memory = (PTR_TYPE *) _mm_malloc(common_size, 128);
        if (!common_memory)
        {
            fprintf(stderr, "Error: cannot allocate common memory.\n");
            return false;
        }
        allocated_size = common_size;
    }
    if (common_size_map.size())
    {
        // NOTE(review): the map's template arguments were stripped by the
        // extraction (keys/values are used as ids/sizes below).
        std::map::iterator it
            = common_size_map.begin();
        while (it != common_size_map.end())
        {
            PTR_TYPE *wptr = NULL;
            wptr = (PTR_TYPE *) _mm_malloc(it->second, 128);
            if (!wptr)
            {
                // NOTE(review): "%ld" is the wrong specifier for size_t
                // operands ("%zu" is correct). Allocation failure is reported
                // but a NULL pointer is still stored in the map below.
                fprintf(stderr, "Allocation for size %ld id %ld failed\n", it->second, it->first);
            }
            common_ptr_map[it->first] = wptr;
            ++it;
        }
    }
    return (common_ptr_map.size() == common_size_map.size()) ? true : false;
}

// Releases the common buffer.
// NOTE(review): memory obtained with _mm_malloc must be released with
// _mm_free, not free() -- this mismatch is undefined behavior on platforms
// where _mm_malloc over-allocates for alignment. Also the buffers recorded in
// common_ptr_map are never freed here (leak).
template
bool CommonMemPool::Free()
{
    if (common_memory)
    {
        free(common_memory);
        allocated_size = 0;
        common_memory = NULL;
    }
    return true;
}

// Drops the recorded size requirement and frees the buffer.
template
bool CommonMemPool::Reset()
{
    common_size = 0;
    return this->Free();
}

// Records a size requirement; the pool keeps the running maximum so a single
// buffer can serve every requester.
template
bool CommonMemPool::Request(size_t size_byte)
{
    common_size = (common_size > size_byte) ? common_size : size_byte;
    return true;
}

// Hands out the common buffer, lazily growing it when Request() has raised
// the requirement since the last Alloc().
template
bool CommonMemPool::GetPtr(PTR_TYPE ** ptr)
{
    if (!common_memory)
    {
        // NOTE(review): "memroy" typo in the runtime message; also this path
        // warns but falls through and returns a NULL *ptr (the early return
        // is commented out) -- confirm that is intended.
        fprintf(stderr, "Common memroy not allocated\n");
        // return false;
    }
    if (this->common_size > allocated_size)
    {
        this->Free();
        this->Alloc();
    }
    *ptr = common_memory;
    return true;
}

// NOTE(review): the explicit instantiation arguments were stripped by the
// extraction (upstream instantiates four element types); confirm before use.
template class CommonMemPool;
template class CommonMemPool;
template class CommonMemPool;
template class CommonMemPool;
-------------------------------------------------------------------------------- /src/mempool.h: --------------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
#pragma once

// NOTE(review): this #include target was stripped by the extraction (the class
// uses std::map, so likely <map>); confirm against the original file.
#include

// Convenience guard: log file/line and bail out of a bool-returning function
// when `var` is falsy.
#define MEMPOOL_CHECK_RETURN(var) {if(!var){fprintf(stderr, "Err in file %s line %d\n", __FILE__, __LINE__);return false;}}

// Grow-only scratch-memory pool: callers Request() their peak byte size, the
// pool allocates one aligned buffer big enough for the maximum, and GetPtr()
// hands it out. Definitions live in mempool.cpp with explicit instantiations.
// NOTE(review): the template parameter list was stripped by the extraction
// (upstream: template <typename PTR_TYPE>).
template
class CommonMemPool
{
    public:
        CommonMemPool(): common_size(0), allocated_size(0), common_memory(NULL) {}
        ~CommonMemPool();

        bool Request(size_t size_byte);    // record a (peak) size requirement
        bool GetPtr(PTR_TYPE ** ptr);      // obtain the shared buffer, growing it if needed
        bool Reset();                      // forget the requirement and free
        bool Free();                       // release the shared buffer
        bool Alloc();                      // allocate per current requirements

    private:
        //Default common memory pool
        size_t common_size;       // largest size requested so far (bytes)
        size_t allocated_size;    // size of the currently-held buffer (bytes)
        PTR_TYPE * common_memory; // the shared buffer, or NULL

        //Map common ID to size
        std::map common_size_map;
        //Map common ID to pointer
        std::map common_ptr_map;
};
-------------------------------------------------------------------------------- /src/ncnn/allocator.cpp: --------------------------------------------------------------------------------
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
#include "allocator.h"

// NOTE(review): the two #include targets below were stripped by the
// extraction; confirm against the original file.
#include
#include
#include "gpu.h"

namespace ncnn {

// NOTE(review): throughout this file the extraction stripped the std::pair
// template arguments -- declarations in allocator.h pair a buffer size with
// its pointer, i.e. std::pair<size_t, void*>; the it->first / it->second
// usages below confirm this.

Allocator::~Allocator()
{

}

PoolAllocator::PoolAllocator()
{
    size_compare_ratio = 192;// 0.75f * 256
}

// Free cached buffers; report (but do not free twice) any buffer still leased.
PoolAllocator::~PoolAllocator()
{
    clear();

    if (!payouts.empty())
    {
        fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n");
        std::list< std::pair >::iterator it = payouts.begin();
        for (; it != payouts.end(); it++)
        {
            void* ptr = it->second;
            fprintf(stderr, "%p still in use\n", ptr);
        }
    }
}

// Returns every cached (not currently leased) buffer to the system.
void PoolAllocator::clear()
{
    budgets_lock.lock();

    std::list< std::pair >::iterator it = budgets.begin();
    for (; it != budgets.end(); it++)
    {
        void* ptr = it->second;
        ncnn::fastFree(ptr);
    }
    budgets.clear();

    budgets_lock.unlock();
}

// scr in [0,1]: how closely a cached buffer must fit a request before it is
// reused; stored as a fixed-point fraction out of 256.
void PoolAllocator::set_size_compare_ratio(float scr)
{
    if (scr < 0.f || scr > 1.f)
    {
        fprintf(stderr, "invalid size compare ratio %f\n", scr);
        return;
    }

    size_compare_ratio = (unsigned int)(scr * 256);
}

// Lease a buffer: reuse a cached one whose size bs satisfies
// bs >= size and bs*ratio <= size (i.e. not wastefully large), otherwise
// allocate fresh. Leased buffers move from `budgets` to `payouts`.
// `budgets_lock` and `payouts_lock` are never held at the same time.
void* PoolAllocator::fastMalloc(size_t size)
{
    budgets_lock.lock();

    // find free budget
    std::list< std::pair >::iterator it = budgets.begin();
    for (; it != budgets.end(); it++)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            budgets.erase(it);

            budgets_lock.unlock();

            payouts_lock.lock();

            payouts.push_back(std::make_pair(bs, ptr));

            payouts_lock.unlock();

            return ptr;
        }
    }

    budgets_lock.unlock();

    // new
    void* ptr = ncnn::fastMalloc(size);

    payouts_lock.lock();

    payouts.push_back(std::make_pair(size, ptr));

    payouts_lock.unlock();

    return ptr;
}

// Ends a lease: move the buffer (looked up by pointer) back to the cache.
// A pointer not found in `payouts` was not leased by this pool ("wild") and
// is freed directly after a diagnostic.
void PoolAllocator::fastFree(void* ptr)
{
    payouts_lock.lock();

    // return to budgets
    std::list< std::pair >::iterator it = payouts.begin();
    for (; it != payouts.end(); it++)
    {
        if (it->second == ptr)
        {
            size_t size = it->first;

            payouts.erase(it);

            payouts_lock.unlock();

            budgets_lock.lock();

            budgets.push_back(std::make_pair(size, ptr));

            budgets_lock.unlock();

            return;
        }
    }

    payouts_lock.unlock();

    fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr);
    ncnn::fastFree(ptr);
}

// Lock-free variant of PoolAllocator for single-threaded use; the algorithm
// below mirrors the locked version exactly, minus the mutexes.

UnlockedPoolAllocator::UnlockedPoolAllocator()
{
    size_compare_ratio = 192;// 0.75f * 256
}

UnlockedPoolAllocator::~UnlockedPoolAllocator()
{
    clear();

    if (!payouts.empty())
    {
        fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n");
        std::list< std::pair >::iterator it = payouts.begin();
        for (; it != payouts.end(); it++)
        {
            void* ptr = it->second;
            fprintf(stderr, "%p still in use\n", ptr);
        }
    }
}

void UnlockedPoolAllocator::clear()
{
    std::list< std::pair >::iterator it = budgets.begin();
    for (; it != budgets.end(); it++)
    {
        void* ptr = it->second;
        ncnn::fastFree(ptr);
    }
    budgets.clear();
}

void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
{
    if (scr < 0.f || scr > 1.f)
    {
        fprintf(stderr, "invalid size compare ratio %f\n", scr);
        return;
    }

    size_compare_ratio = (unsigned int)(scr * 256);
}

void* UnlockedPoolAllocator::fastMalloc(size_t size)
{
    // find free budget
    std::list< std::pair >::iterator it = budgets.begin();
    for (; it != budgets.end(); it++)
    {
        size_t bs = it->first;

        // size_compare_ratio ~ 100%
        if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size)
        {
            void* ptr = it->second;

            budgets.erase(it);

            payouts.push_back(std::make_pair(bs, ptr));

            return ptr;
        }
    }

    // new
    void* ptr = ncnn::fastMalloc(size);

    payouts.push_back(std::make_pair(size, ptr));

    return ptr;
}

void UnlockedPoolAllocator::fastFree(void* ptr)
{
    // return to budgets
    std::list< std::pair >::iterator it = payouts.begin();
    for (; it != payouts.end(); it++)
    {
        if (it->second == ptr)
        {
            size_t size = it->first;

            payouts.erase(it);

            budgets.push_back(std::make_pair(size, ptr));

            return;
        }
    }

    fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr);
    ncnn::fastFree(ptr);
}

} // namespace ncnn
-------------------------------------------------------------------------------- /src/ncnn/mat.cpp: --------------------------------------------------------------------------------
// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
14 | 15 | #include "mat.h" 16 | 17 | #if __ARM_NEON 18 | #include 19 | #endif // __ARM_NEON 20 | #include 21 | 22 | // #include "cpu.h" 23 | 24 | // #include "layer_type.h" 25 | // #include "layer.h" 26 | 27 | namespace ncnn { 28 | 29 | #ifndef COMPILE_WITH_FEATHERCNN 30 | void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_vals) 31 | { 32 | ncnn::Layer* op; 33 | 34 | if (mean_vals && !norm_vals) 35 | { 36 | // substract mean only 37 | op = ncnn::create_layer(ncnn::LayerType::Bias); 38 | 39 | ncnn::ParamDict pd; 40 | pd.set(0, c); 41 | 42 | op->load_param(pd); 43 | 44 | ncnn::Mat weights[1]; 45 | weights[0] = Mat(c); 46 | for (int q=0; qload_model(ncnn::ModelBinFromMatArray(weights)); 52 | } 53 | else if (!mean_vals && norm_vals) 54 | { 55 | // normalize only 56 | op = ncnn::create_layer(ncnn::LayerType::Scale); 57 | 58 | ncnn::ParamDict pd; 59 | pd.set(0, c); 60 | 61 | op->load_param(pd); 62 | 63 | ncnn::Mat weights[1]; 64 | weights[0] = Mat(c); 65 | for (int q=0; qload_model(ncnn::ModelBinFromMatArray(weights)); 71 | } 72 | else if (mean_vals && norm_vals) 73 | { 74 | // substract mean and normalize 75 | op = ncnn::create_layer(ncnn::LayerType::Scale); 76 | 77 | ncnn::ParamDict pd; 78 | pd.set(0, c); 79 | pd.set(1, 1); 80 | 81 | op->load_param(pd); 82 | 83 | ncnn::Mat weights[2]; 84 | weights[0] = Mat(c); 85 | weights[1] = Mat(c); 86 | for (int q=0; qload_model(ncnn::ModelBinFromMatArray(weights)); 93 | } 94 | else // if (!mean_vals && !norm_vals) 95 | { 96 | return; 97 | } 98 | 99 | op->forward_inplace(*this, ncnn::get_default_option()); 100 | 101 | delete op; 102 | } 103 | #endif 104 | // convert half precision floating point to float 105 | static float half2float(unsigned short value) 106 | { 107 | // 1 : 5 : 10 108 | unsigned short sign = (value & 0x8000) >> 15; 109 | unsigned short exponent = (value & 0x7c00) >> 10; 110 | unsigned short significand = value & 0x03FF; 111 | 112 | // fprintf(stderr, "%d %d %d\n", sign, exponent, 
significand); 113 | 114 | // 1 : 8 : 23 115 | union 116 | { 117 | unsigned int u; 118 | float f; 119 | } tmp; 120 | if (exponent == 0) 121 | { 122 | if (significand == 0) 123 | { 124 | // zero 125 | tmp.u = (sign << 31); 126 | } 127 | else 128 | { 129 | // denormal 130 | exponent = 0; 131 | // find non-zero bit 132 | while ((significand & 0x200) == 0) 133 | { 134 | significand <<= 1; 135 | exponent++; 136 | } 137 | significand <<= 1; 138 | significand &= 0x3FF; 139 | tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13); 140 | } 141 | } 142 | else if (exponent == 0x1F) 143 | { 144 | // infinity or NaN 145 | tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13); 146 | } 147 | else 148 | { 149 | // normalized 150 | tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13); 151 | } 152 | 153 | return tmp.f; 154 | } 155 | 156 | Mat Mat::from_float16(const unsigned short* data, int size) 157 | { 158 | Mat m(size); 159 | if (m.empty()) 160 | return m; 161 | 162 | float* ptr = m;//.data; 163 | 164 | #if __ARM_NEON && (__ARM_FP & 2) 165 | int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0; 166 | int remain = size - (nn << 2); 167 | #else 168 | int remain = size; 169 | #endif // __ARM_NEON 170 | 171 | #if __ARM_NEON && (__ARM_FP & 2) 172 | #if __aarch64__ 173 | if (nn > 0) 174 | { 175 | asm volatile( 176 | "0: \n" 177 | "ld1 {v0.4h}, [%1], #8 \n" 178 | "fcvtl v1.4s, v0.4h \n" 179 | "subs %w0, %w0, #1 \n" 180 | "st1 {v1.4s}, [%2], #16 \n" 181 | "bne 0b \n" 182 | : "=r"(nn), // %0 183 | "=r"(data), // %1 184 | "=r"(ptr) // %2 185 | : "0"(nn), 186 | "1"(data), 187 | "2"(ptr) 188 | : "cc", "memory", "v0", "v1" 189 | ); 190 | } 191 | #else 192 | if (nn > 0) 193 | { 194 | asm volatile( 195 | "0: \n" 196 | "pld [%1, #64] \n" 197 | "vld1.s16 {d0}, [%1 :64]! \n" 198 | "vcvt.f32.f16 q1, d0 \n" 199 | "subs %0, #1 \n" 200 | "vst1.f32 {d2-d3}, [%2 :128]! 
\n" 201 | "bne 0b \n" 202 | : "=r"(nn), // %0 203 | "=r"(data), // %1 204 | "=r"(ptr) // %2 205 | : "0"(nn), 206 | "1"(data), 207 | "2"(ptr) 208 | : "cc", "memory", "q0", "q1" 209 | ); 210 | } 211 | #endif // __aarch64__ 212 | #endif // __ARM_NEON 213 | for (; remain>0; remain--) 214 | { 215 | *ptr = half2float(*data); 216 | 217 | data++; 218 | ptr++; 219 | } 220 | 221 | return m; 222 | } 223 | 224 | #ifndef COMPILE_WITH_FEATHERCNN 225 | void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* allocator, int num_threads) 226 | { 227 | ncnn::Layer* padding = ncnn::create_layer(ncnn::LayerType::Padding); 228 | 229 | ncnn::ParamDict pd; 230 | pd.set(0, top); 231 | pd.set(1, bottom); 232 | pd.set(2, left); 233 | pd.set(3, right); 234 | pd.set(4, type); 235 | pd.set(5, v); 236 | 237 | padding->load_param(pd); 238 | 239 | ncnn::Option opt = ncnn::get_default_option(); 240 | opt.num_threads = num_threads; 241 | opt.blob_allocator = allocator; 242 | 243 | padding->forward(src, dst, opt); 244 | 245 | delete padding; 246 | } 247 | 248 | void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, Allocator* allocator, int num_threads) 249 | { 250 | ncnn::Layer* crop = ncnn::create_layer(ncnn::LayerType::Crop); 251 | 252 | ncnn::ParamDict pd; 253 | pd.set(0, left); 254 | pd.set(1, top); 255 | pd.set(2, 0); 256 | pd.set(3, src.w - left - right); 257 | pd.set(4, src.h - top - bottom); 258 | pd.set(5, src.c); 259 | 260 | crop->load_param(pd); 261 | 262 | ncnn::Option opt = ncnn::get_default_option(); 263 | opt.num_threads = num_threads; 264 | opt.blob_allocator = allocator; 265 | 266 | crop->forward(src, dst, opt); 267 | 268 | delete crop; 269 | } 270 | 271 | void resize_bilinear(const Mat& src, Mat& dst, int w, int h, Allocator* allocator, int num_threads) 272 | { 273 | ncnn::Layer* interp = ncnn::create_layer(ncnn::LayerType::Interp); 274 | 275 | ncnn::ParamDict pd; 276 | pd.set(0, 2); 
277 | pd.set(3, h); 278 | pd.set(4, w); 279 | 280 | interp->load_param(pd); 281 | 282 | ncnn::Option opt = ncnn::get_default_option(); 283 | opt.num_threads = num_threads; 284 | opt.blob_allocator = allocator; 285 | 286 | interp->forward(src, dst, opt); 287 | 288 | delete interp; 289 | } 290 | 291 | void resize_bicubic(const Mat& src, Mat& dst, int w, int h, Allocator* allocator, int num_threads) 292 | { 293 | ncnn::Layer* interp = ncnn::create_layer(ncnn::LayerType::Interp); 294 | 295 | ncnn::ParamDict pd; 296 | pd.set(0, 3); 297 | pd.set(3, h); 298 | pd.set(4, w); 299 | 300 | interp->load_param(pd); 301 | 302 | ncnn::Option opt = ncnn::get_default_option(); 303 | opt.num_threads = num_threads; 304 | opt.blob_allocator = allocator; 305 | 306 | interp->forward(src, dst, opt); 307 | 308 | delete interp; 309 | } 310 | 311 | void convert_packing(const Mat& src, Mat& dst, int _packing, Allocator* allocator, int num_threads) 312 | { 313 | ncnn::Layer* packing = ncnn::create_layer(ncnn::LayerType::Packing); 314 | 315 | ncnn::ParamDict pd; 316 | pd.set(0, _packing); 317 | 318 | packing->load_param(pd); 319 | 320 | ncnn::Option opt = ncnn::get_default_option(); 321 | opt.num_threads = num_threads; 322 | opt.blob_allocator = allocator; 323 | 324 | packing->forward(src, dst, opt); 325 | 326 | delete packing; 327 | } 328 | 329 | void cast_float32_to_float16(const Mat& src, Mat& dst, Allocator* allocator, int num_threads) 330 | { 331 | ncnn::Layer* cast = ncnn::create_layer(ncnn::LayerType::Cast); 332 | 333 | ncnn::ParamDict pd; 334 | pd.set(0, 1); 335 | pd.set(1, 2); 336 | 337 | cast->load_param(pd); 338 | 339 | ncnn::Option opt = ncnn::get_default_option(); 340 | opt.num_threads = num_threads; 341 | opt.blob_allocator = allocator; 342 | 343 | cast->forward(src, dst, opt); 344 | 345 | delete cast; 346 | } 347 | 348 | void cast_float16_to_float32(const Mat& src, Mat& dst, Allocator* allocator, int num_threads) 349 | { 350 | ncnn::Layer* cast = 
ncnn::create_layer(ncnn::LayerType::Cast); 351 | 352 | ncnn::ParamDict pd; 353 | pd.set(0, 2); 354 | pd.set(1, 1); 355 | 356 | cast->load_param(pd); 357 | 358 | ncnn::Option opt = ncnn::get_default_option(); 359 | opt.num_threads = num_threads; 360 | opt.blob_allocator = allocator; 361 | 362 | cast->forward(src, dst, opt); 363 | 364 | delete cast; 365 | } 366 | #endif 367 | } // namespace ncnn 368 | -------------------------------------------------------------------------------- /src/ncnn/modelbin.cpp: -------------------------------------------------------------------------------- 1 | // Tencent is pleased to support the open source community by making ncnn available. 2 | // 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. 4 | // 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | // in compliance with the License. You may obtain a copy of the License at 7 | // 8 | // https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | // Unless required by applicable law or agreed to in writing, software distributed 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | // specific language governing permissions and limitations under the License. 
14 | 15 | #include "modelbin.h" 16 | 17 | #include 18 | #include 19 | #include 20 | #include "platform.h" 21 | 22 | namespace ncnn { 23 | 24 | Mat ModelBin::load(int w, int h, int type) const 25 | { 26 | Mat m = load(w * h, type); 27 | if (m.empty()) 28 | return m; 29 | 30 | return m.reshape(w, h); 31 | } 32 | 33 | Mat ModelBin::load(int w, int h, int c, int type) const 34 | { 35 | Mat m = load(w * h * c, type); 36 | if (m.empty()) 37 | return m; 38 | 39 | return m.reshape(w, h, c); 40 | } 41 | 42 | #if NCNN_STDIO 43 | ModelBinFromStdio::ModelBinFromStdio(FILE* _binfp) : binfp(_binfp) 44 | { 45 | } 46 | 47 | Mat ModelBinFromStdio::load(int w, int type) const 48 | { 49 | if (!binfp) 50 | return Mat(); 51 | 52 | if (type == 0) 53 | { 54 | int nread; 55 | 56 | union 57 | { 58 | struct 59 | { 60 | unsigned char f0; 61 | unsigned char f1; 62 | unsigned char f2; 63 | unsigned char f3; 64 | }; 65 | unsigned int tag; 66 | } flag_struct; 67 | 68 | nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp); 69 | if (nread != 1) 70 | { 71 | fprintf(stderr, "ModelBin read flag_struct failed %d\n", nread); 72 | return Mat(); 73 | } 74 | 75 | unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; 76 | 77 | if (flag_struct.tag == 0x01306B47) 78 | { 79 | // half-precision data 80 | int align_data_size = alignSize(w * sizeof(unsigned short), 4); 81 | std::vector float16_weights; 82 | float16_weights.resize(align_data_size); 83 | nread = fread(float16_weights.data(), align_data_size, 1, binfp); 84 | if (nread != 1) 85 | { 86 | fprintf(stderr, "ModelBin read float16_weights failed %d\n", nread); 87 | return Mat(); 88 | } 89 | 90 | return Mat::from_float16(float16_weights.data(), w); 91 | } 92 | else if (flag_struct.tag == 0x000D4B38) 93 | { 94 | // int8 data 95 | int align_data_size = alignSize(w, 4); 96 | std::vector int8_weights; 97 | int8_weights.resize(align_data_size); 98 | nread = fread(int8_weights.data(), align_data_size, 1, binfp); 99 | if 
(nread != 1) 100 | { 101 | fprintf(stderr, "ModelBin read int8_weights failed %d\n", nread); 102 | return Mat(); 103 | } 104 | 105 | Mat m(w, (size_t)1u); 106 | if (m.empty()) 107 | return m; 108 | 109 | memcpy(m.data, int8_weights.data(), w); 110 | 111 | return m; 112 | } 113 | else if (flag_struct.tag == 0x0002C056) 114 | { 115 | Mat m(w); 116 | if (m.empty()) 117 | return m; 118 | 119 | // raw data with extra scaling 120 | nread = fread(m, sizeof(float), w, binfp); 121 | if (nread != 1) 122 | { 123 | fprintf(stderr, "ModelBin read weight_data failed %d\n", nread); 124 | return Mat(); 125 | } 126 | 127 | return m; 128 | } 129 | 130 | Mat m(w); 131 | if (m.empty()) 132 | return m; 133 | 134 | if (flag != 0) 135 | { 136 | // quantized data 137 | float quantization_value[256]; 138 | nread = fread(quantization_value, 256 * sizeof(float), 1, binfp); 139 | if (nread != 1) 140 | { 141 | fprintf(stderr, "ModelBin read quantization_value failed %d\n", nread); 142 | return Mat(); 143 | } 144 | 145 | int align_weight_data_size = alignSize(w * sizeof(unsigned char), 4); 146 | std::vector index_array; 147 | index_array.resize(align_weight_data_size); 148 | nread = fread(index_array.data(), align_weight_data_size, 1, binfp); 149 | if (nread != 1) 150 | { 151 | fprintf(stderr, "ModelBin read index_array failed %d\n", nread); 152 | return Mat(); 153 | } 154 | 155 | float* ptr = m; 156 | for (int i = 0; i < w; i++) 157 | { 158 | ptr[i] = quantization_value[ index_array[i] ]; 159 | } 160 | } 161 | else if (flag_struct.f0 == 0) 162 | { 163 | // raw data 164 | nread = fread(m, w * sizeof(float), 1, binfp); 165 | if (nread != 1) 166 | { 167 | fprintf(stderr, "ModelBin read weight_data failed %d\n", nread); 168 | return Mat(); 169 | } 170 | } 171 | 172 | return m; 173 | } 174 | else if (type == 1) 175 | { 176 | Mat m(w); 177 | if (m.empty()) 178 | return m; 179 | 180 | // raw data 181 | int nread = fread(m, w * sizeof(float), 1, binfp); 182 | if (nread != 1) 183 | { 184 | 
fprintf(stderr, "ModelBin read weight_data failed %d\n", nread); 185 | return Mat(); 186 | } 187 | 188 | return m; 189 | } 190 | else 191 | { 192 | fprintf(stderr, "ModelBin load type %d not implemented\n", type); 193 | return Mat(); 194 | } 195 | 196 | return Mat(); 197 | } 198 | #endif // NCNN_STDIO 199 | 200 | ModelBinFromMemory::ModelBinFromMemory(const unsigned char*& _mem) : mem(_mem) 201 | { 202 | } 203 | 204 | Mat ModelBinFromMemory::load(int w, int type) const 205 | { 206 | if (!mem) 207 | return Mat(); 208 | 209 | if (type == 0) 210 | { 211 | union 212 | { 213 | struct 214 | { 215 | unsigned char f0; 216 | unsigned char f1; 217 | unsigned char f2; 218 | unsigned char f3; 219 | }; 220 | unsigned int tag; 221 | } flag_struct; 222 | 223 | memcpy(&flag_struct, mem, sizeof(flag_struct)); 224 | mem += sizeof(flag_struct); 225 | 226 | unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3; 227 | 228 | if (flag_struct.tag == 0x01306B47) 229 | { 230 | // half-precision data 231 | Mat m = Mat::from_float16((unsigned short*)mem, w); 232 | mem += alignSize(w * sizeof(unsigned short), 4); 233 | return m; 234 | } 235 | else if (flag_struct.tag == 0x000D4B38) 236 | { 237 | // int8 data 238 | Mat m = Mat(w, (signed char*)mem, 1u); 239 | mem += alignSize(w, 4); 240 | return m; 241 | } 242 | else if (flag_struct.tag == 0x0002C056) 243 | { 244 | // raw data with extra scaling 245 | Mat m = Mat(w, (float*)mem); 246 | mem += w * sizeof(float); 247 | return m; 248 | } 249 | 250 | if (flag != 0) 251 | { 252 | // quantized data 253 | const float* quantization_value = (const float*)mem; 254 | mem += 256 * sizeof(float); 255 | 256 | const unsigned char* index_array = (const unsigned char*)mem; 257 | mem += alignSize(w * sizeof(unsigned char), 4); 258 | 259 | Mat m(w); 260 | if (m.empty()) 261 | return m; 262 | 263 | float* ptr = m; 264 | for (int i = 0; i < w; i++) 265 | { 266 | ptr[i] = quantization_value[ index_array[i] ]; 267 | } 268 | 269 | 
return m; 270 | } 271 | else if (flag_struct.f0 == 0) 272 | { 273 | // raw data 274 | Mat m = Mat(w, (float*)mem); 275 | mem += w * sizeof(float); 276 | return m; 277 | } 278 | } 279 | else if (type == 1) 280 | { 281 | // raw data 282 | Mat m = Mat(w, (float*)mem); 283 | mem += w * sizeof(float); 284 | return m; 285 | } 286 | else 287 | { 288 | fprintf(stderr, "ModelBin load type %d not implemented\n", type); 289 | return Mat(); 290 | } 291 | 292 | return Mat(); 293 | } 294 | 295 | ModelBinFromMatArray::ModelBinFromMatArray(const Mat* _weights) : weights(_weights) 296 | { 297 | } 298 | 299 | Mat ModelBinFromMatArray::load(int /*w*/, int /*type*/) const 300 | { 301 | if (!weights) 302 | return Mat(); 303 | 304 | Mat m = weights[0]; 305 | weights++; 306 | return m; 307 | } 308 | 309 | } // namespace ncnn 310 | -------------------------------------------------------------------------------- /src/ncnn/modelbin.h: -------------------------------------------------------------------------------- 1 | // Tencent is pleased to support the open source community by making ncnn available. 2 | // 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. 4 | // 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | // in compliance with the License. You may obtain a copy of the License at 7 | // 8 | // https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | // Unless required by applicable law or agreed to in writing, software distributed 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | // specific language governing permissions and limitations under the License. 
14 | 15 | #ifndef NCNN_MODELBIN_H 16 | #define NCNN_MODELBIN_H 17 | 18 | #include 19 | #include "mat.h" 20 | #include "platform.h" 21 | 22 | namespace ncnn { 23 | 24 | class Net; 25 | class ModelBin 26 | { 27 | public: 28 | // element type 29 | // 0 = auto 30 | // 1 = float32 31 | // 2 = float16 32 | // 3 = int8 33 | // load vec 34 | virtual Mat load(int w, int type) const = 0; 35 | // load image 36 | virtual Mat load(int w, int h, int type) const; 37 | // load dim 38 | virtual Mat load(int w, int h, int c, int type) const; 39 | }; 40 | 41 | #if NCNN_STDIO 42 | class ModelBinFromStdio : public ModelBin 43 | { 44 | public: 45 | // construct from file 46 | ModelBinFromStdio(FILE* binfp); 47 | 48 | virtual Mat load(int w, int type) const; 49 | 50 | protected: 51 | FILE* binfp; 52 | }; 53 | #endif // NCNN_STDIO 54 | 55 | class ModelBinFromMemory : public ModelBin 56 | { 57 | public: 58 | // construct from external memory 59 | ModelBinFromMemory(const unsigned char*& mem); 60 | 61 | virtual Mat load(int w, int type) const; 62 | 63 | protected: 64 | const unsigned char*& mem; 65 | }; 66 | 67 | class ModelBinFromMatArray : public ModelBin 68 | { 69 | public: 70 | // construct from weight blob array 71 | ModelBinFromMatArray(const Mat* weights); 72 | 73 | virtual Mat load(int w, int type) const; 74 | 75 | protected: 76 | mutable const Mat* weights; 77 | }; 78 | 79 | } // namespace ncnn 80 | 81 | #endif // NCNN_MODELBIN_H 82 | -------------------------------------------------------------------------------- /src/ncnn/paramdict.cpp: -------------------------------------------------------------------------------- 1 | // Tencent is pleased to support the open source community by making ncnn available. 2 | // 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. 4 | // 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | // in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | // https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | // Unless required by applicable law or agreed to in writing, software distributed 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | // specific language governing permissions and limitations under the License. 14 | 15 | #include 16 | #include 17 | #include 18 | #include "paramdict.h" 19 | #include "platform.h" 20 | 21 | namespace ncnn { 22 | 23 | ParamDict::ParamDict() 24 | { 25 | use_winograd_convolution = 1; 26 | use_sgemm_convolution = 1; 27 | use_int8_inference = 1; 28 | use_vulkan_compute = 0; 29 | 30 | clear(); 31 | } 32 | 33 | int ParamDict::get(int id, int def) const 34 | { 35 | return params[id].loaded ? params[id].i : def; 36 | } 37 | 38 | float ParamDict::get(int id, float def) const 39 | { 40 | return params[id].loaded ? params[id].f : def; 41 | } 42 | 43 | Mat ParamDict::get(int id, const Mat& def) const 44 | { 45 | return params[id].loaded ? params[id].v : def; 46 | } 47 | 48 | void ParamDict::set(int id, int i) 49 | { 50 | params[id].loaded = 1; 51 | params[id].i = i; 52 | } 53 | 54 | void ParamDict::set(int id, float f) 55 | { 56 | params[id].loaded = 1; 57 | params[id].f = f; 58 | } 59 | 60 | void ParamDict::set(int id, const Mat& v) 61 | { 62 | params[id].loaded = 1; 63 | params[id].v = v; 64 | } 65 | 66 | void ParamDict::clear() 67 | { 68 | for (int i = 0; i < NCNN_MAX_PARAM_COUNT; i++) 69 | { 70 | params[i].loaded = 0; 71 | params[i].v = Mat(); 72 | } 73 | } 74 | 75 | #if NCNN_STDIO 76 | #if NCNN_STRING 77 | static bool vstr_is_float(const char vstr[16]) 78 | { 79 | // look ahead for determine isfloat 80 | for (int j=0; j<16; j++) 81 | { 82 | if (vstr[j] == '\0') 83 | break; 84 | 85 | if (vstr[j] == '.' 
|| tolower(vstr[j]) == 'e') 86 | return true; 87 | } 88 | 89 | return false; 90 | } 91 | 92 | int ParamDict::load_param(FILE* fp) 93 | { 94 | clear(); 95 | 96 | // 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0 97 | 98 | // parse each key=value pair 99 | int id = 0; 100 | while (fscanf(fp, "%d=", &id) == 1) 101 | { 102 | bool is_array = id <= -23300; 103 | if (is_array) 104 | { 105 | id = -id - 23300; 106 | } 107 | 108 | if (is_array) 109 | { 110 | int len = 0; 111 | int nscan = fscanf(fp, "%d", &len); 112 | if (nscan != 1) 113 | { 114 | fprintf(stderr, "ParamDict read array length fail\n"); 115 | return -1; 116 | } 117 | 118 | params[id].v.create(len); 119 | 120 | for (int j = 0; j < len; j++) 121 | { 122 | char vstr[16]; 123 | nscan = fscanf(fp, ",%15[^,\n ]", vstr); 124 | if (nscan != 1) 125 | { 126 | fprintf(stderr, "ParamDict read array element fail\n"); 127 | return -1; 128 | } 129 | 130 | bool is_float = vstr_is_float(vstr); 131 | 132 | if (is_float) 133 | { 134 | float* ptr = params[id].v; 135 | nscan = sscanf(vstr, "%f", &ptr[j]); 136 | } 137 | else 138 | { 139 | int* ptr = params[id].v; 140 | nscan = sscanf(vstr, "%d", &ptr[j]); 141 | } 142 | if (nscan != 1) 143 | { 144 | fprintf(stderr, "ParamDict parse array element fail\n"); 145 | return -1; 146 | } 147 | } 148 | } 149 | else 150 | { 151 | char vstr[16]; 152 | int nscan = fscanf(fp, "%15s", vstr); 153 | if (nscan != 1) 154 | { 155 | fprintf(stderr, "ParamDict read value fail\n"); 156 | return -1; 157 | } 158 | 159 | bool is_float = vstr_is_float(vstr); 160 | 161 | if (is_float) 162 | nscan = sscanf(vstr, "%f", ¶ms[id].f); 163 | else 164 | nscan = sscanf(vstr, "%d", ¶ms[id].i); 165 | if (nscan != 1) 166 | { 167 | fprintf(stderr, "ParamDict parse value fail\n"); 168 | return -1; 169 | } 170 | } 171 | params[id].loaded = 1; 172 | } 173 | return 0; 174 | } 175 | 176 | #if _MSC_VER 177 | static inline int mem_sscanf_with_n(int* _internal_nconsumed_ptr, const char*& ptr, const char* format, ...) 
178 | { 179 | *_internal_nconsumed_ptr = 0; 180 | 181 | va_list args; 182 | va_start(args, format); 183 | 184 | int _n = vsscanf(ptr, format, args); 185 | 186 | va_end(args); 187 | 188 | ptr += *_internal_nconsumed_ptr; 189 | 190 | return *_internal_nconsumed_ptr > 0 ? _n : 0; 191 | } 192 | #define mem_sscanf(ptr, format, ...) mem_sscanf_with_n(&_internal_nconsumed, ptr, format "%n", __VA_ARGS__, &_internal_nconsumed) 193 | #else 194 | // return value from macro requires gcc extension https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html 195 | #define mem_sscanf(ptr, format, ...) ({int _b=0; int _n = sscanf(ptr, format "%n", __VA_ARGS__, &_b); ptr+=_b;_b>0?_n:0;}) 196 | #endif // _MSC_VER 197 | 198 | int ParamDict::load_param_mem(const char*& mem) 199 | { 200 | #if _MSC_VER 201 | int _internal_nconsumed; 202 | #endif 203 | 204 | clear(); 205 | 206 | // 0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0 207 | 208 | // parse each key=value pair 209 | int id = 0; 210 | while (mem_sscanf(mem, "%d=", &id) == 1) 211 | { 212 | bool is_array = id <= -23300; 213 | if (is_array) 214 | { 215 | id = -id - 23300; 216 | } 217 | 218 | if (is_array) 219 | { 220 | int len = 0; 221 | int nscan = mem_sscanf(mem, "%d", &len); 222 | if (nscan != 1) 223 | { 224 | fprintf(stderr, "ParamDict read array length fail\n"); 225 | return -1; 226 | } 227 | 228 | params[id].v.create(len); 229 | 230 | for (int j = 0; j < len; j++) 231 | { 232 | char vstr[16]; 233 | nscan = mem_sscanf(mem, ",%15[^,\n ]", vstr); 234 | if (nscan != 1) 235 | { 236 | fprintf(stderr, "ParamDict read array element fail\n"); 237 | return -1; 238 | } 239 | 240 | bool is_float = vstr_is_float(vstr); 241 | 242 | if (is_float) 243 | { 244 | float* ptr = params[id].v; 245 | nscan = sscanf(vstr, "%f", &ptr[j]); 246 | } 247 | else 248 | { 249 | int* ptr = params[id].v; 250 | nscan = sscanf(vstr, "%d", &ptr[j]); 251 | } 252 | if (nscan != 1) 253 | { 254 | fprintf(stderr, "ParamDict parse array element fail\n"); 255 | return -1; 256 
| } 257 | } 258 | } 259 | else 260 | { 261 | char vstr[16]; 262 | int nscan = mem_sscanf(mem, "%15s", vstr); 263 | if (nscan != 1) 264 | { 265 | fprintf(stderr, "ParamDict read value fail\n"); 266 | return -1; 267 | } 268 | 269 | bool is_float = vstr_is_float(vstr); 270 | 271 | if (is_float) 272 | nscan = sscanf(vstr, "%f", ¶ms[id].f); 273 | else 274 | nscan = sscanf(vstr, "%d", ¶ms[id].i); 275 | if (nscan != 1) 276 | { 277 | fprintf(stderr, "ParamDict parse value fail\n"); 278 | return -1; 279 | } 280 | } 281 | 282 | params[id].loaded = 1; 283 | } 284 | return 0; 285 | } 286 | #endif // NCNN_STRING 287 | 288 | int ParamDict::load_param_bin(FILE* fp) 289 | { 290 | clear(); 291 | 292 | // binary 0 293 | // binary 100 294 | // binary 1 295 | // binary 1.250000 296 | // binary 3 | array_bit 297 | // binary 5 298 | // binary 0.1 299 | // binary 0.2 300 | // binary 0.4 301 | // binary 0.8 302 | // binary 1.0 303 | // binary -233(EOP) 304 | 305 | int id = 0; 306 | fread(&id, sizeof(int), 1, fp); 307 | 308 | while (id != -233) 309 | { 310 | bool is_array = id <= -23300; 311 | if (is_array) 312 | { 313 | id = -id - 23300; 314 | } 315 | 316 | if (is_array) 317 | { 318 | int len = 0; 319 | fread(&len, sizeof(int), 1, fp); 320 | 321 | params[id].v.create(len); 322 | 323 | float* ptr = params[id].v; 324 | fread(ptr, sizeof(float), len, fp); 325 | } 326 | else 327 | { 328 | fread(¶ms[id].f, sizeof(float), 1, fp); 329 | } 330 | 331 | params[id].loaded = 1; 332 | 333 | fread(&id, sizeof(int), 1, fp); 334 | } 335 | 336 | return 0; 337 | } 338 | #endif // NCNN_STDIO 339 | 340 | int ParamDict::load_param(const unsigned char*& mem) 341 | { 342 | clear(); 343 | 344 | int id = *(int*)(mem); 345 | mem += 4; 346 | 347 | while (id != -233) 348 | { 349 | bool is_array = id <= -23300; 350 | if (is_array) 351 | { 352 | id = -id - 23300; 353 | } 354 | 355 | if (is_array) 356 | { 357 | int len = *(int*)(mem); 358 | mem += 4; 359 | 360 | params[id].v.create(len); 361 | 362 | 
memcpy(params[id].v.data, mem, len * 4); 363 | mem += len * 4; 364 | } 365 | else 366 | { 367 | params[id].f = *(float*)(mem); 368 | mem += 4; 369 | } 370 | 371 | params[id].loaded = 1; 372 | 373 | id = *(int*)(mem); 374 | mem += 4; 375 | } 376 | 377 | return 0; 378 | } 379 | 380 | } // namespace ncnn 381 | -------------------------------------------------------------------------------- /src/ncnn/paramdict.h: -------------------------------------------------------------------------------- 1 | // Tencent is pleased to support the open source community by making ncnn available. 2 | // 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. 4 | // 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | // in compliance with the License. You may obtain a copy of the License at 7 | // 8 | // https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | // Unless required by applicable law or agreed to in writing, software distributed 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | // specific language governing permissions and limitations under the License. 
14 | 15 | #ifndef NCNN_PARAMDICT_H 16 | #define NCNN_PARAMDICT_H 17 | 18 | #include 19 | #include "mat.h" 20 | #include "platform.h" 21 | 22 | // at most 20 parameters 23 | #define NCNN_MAX_PARAM_COUNT 20 24 | 25 | namespace ncnn { 26 | 27 | class Net; 28 | class ParamDict 29 | { 30 | public: 31 | // empty 32 | ParamDict(); 33 | 34 | // get int 35 | int get(int id, int def) const; 36 | // get float 37 | float get(int id, float def) const; 38 | // get array 39 | Mat get(int id, const Mat& def) const; 40 | 41 | // set int 42 | void set(int id, int i); 43 | // set float 44 | void set(int id, float f); 45 | // set array 46 | void set(int id, const Mat& v); 47 | 48 | public: 49 | int use_winograd_convolution; 50 | int use_sgemm_convolution; 51 | int use_int8_inference; 52 | int use_vulkan_compute; 53 | 54 | public: 55 | // friend class Net; 56 | 57 | void clear(); 58 | 59 | #if NCNN_STDIO 60 | #if NCNN_STRING 61 | int load_param(FILE* fp); 62 | int load_param_mem(const char*& mem); 63 | #endif // NCNN_STRING 64 | int load_param_bin(FILE* fp); 65 | #endif // NCNN_STDIO 66 | int load_param(const unsigned char*& mem); 67 | 68 | protected: 69 | struct 70 | { 71 | int loaded; 72 | union { int i; float f; }; 73 | Mat v; 74 | } params[NCNN_MAX_PARAM_COUNT]; 75 | }; 76 | 77 | } // namespace ncnn 78 | 79 | #endif // NCNN_PARAMDICT_H 80 | -------------------------------------------------------------------------------- /src/ncnn/platform.h: -------------------------------------------------------------------------------- 1 | // Tencent is pleased to support the open source community by making ncnn available. 2 | // 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved. 4 | // 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | // in compliance with the License. 
// You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifndef NCNN_PLATFORM_H
#define NCNN_PLATFORM_H

// Compile-time feature switches for the vendored ncnn sources.
#define NCNN_STDIO 1         // enable FILE*-based model/param loading
#define NCNN_STRING 1        // enable text .param parsing
#define NCNN_OPENCV 0
#define NCNN_BENCHMARK 0
#define NCNN_PIXEL 1
#define NCNN_PIXEL_ROTATE 1
#define NCNN_VULKAN 0
#define NCNN_REQUANT 0
#define NCNN_IM2COL_SGEMM 0
// When defined, compiles out the ncnn-layer-dependent code paths
// (see the #ifndef COMPILE_WITH_FEATHERCNN guards in mat.cpp).
#define COMPILE_WITH_FEATHERCNN
#endif // NCNN_PLATFORM_H
// --------------------------------------------------------------------------
// /src/net.h:
// --------------------------------------------------------------------------
//Tencent is pleased to support the open source community by making FeatherCNN available.

//Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.

//Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
//in compliance with the License. You may obtain a copy of the License at
//
//https://opensource.org/licenses/BSD-3-Clause
//
//Unless required by applicable law or agreed to in writing, software distributed
//under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
//CONDITIONS OF ANY KIND, either express or implied. See the License for the
//specific language governing permissions and limitations under the License.
14 | 15 | #pragma once 16 | 17 | #include "layer.h" 18 | #include "rt_param.h" 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include 27 | 28 | namespace feather 29 | { 30 | class Net 31 | { 32 | public: 33 | Net(); 34 | 35 | ~Net(); 36 | 37 | int LoadParam(const char * param_path); 38 | int LoadParam(FILE* fp); 39 | int LoadWeights(const char * weights_path); 40 | int LoadWeights(FILE* fp); 41 | 42 | // int FeedInput(const char* input_name, const int w, const int h, const int c, const float* input_data); 43 | 44 | int FeedInput(const char* input_name, ncnn::Mat& in); 45 | 46 | int Forward(); 47 | 48 | int Extract(std::string blob_name, float** output_ptr, int* n, int *c, int* h, int* w); 49 | 50 | int Extract(std::string blob_name, ncnn::Mat& out); 51 | 52 | int BuildBlobMap(); 53 | 54 | std::map *> blob_map; 55 | 56 | private: 57 | int InitLayers(); 58 | int Reshape(); 59 | RuntimeParameter *rt_param; 60 | std::vector layers; 61 | 62 | /* Flag varibles indicating Net status. 63 | * 64 | * _weights_loaded: if Net has loaded the weights. 65 | * _net_initialized: if the weights are already initialized. 66 | */ 67 | int _param_loaded; 68 | int _weights_loaded; 69 | int _net_initialized; 70 | }; 71 | }; // namespace feather 72 | -------------------------------------------------------------------------------- /src/rt_param.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | /* 16 | * For runtime parameters 17 | */ 18 | 19 | #pragma once 20 | 21 | #include "blob.h" 22 | #include "mempool.h" 23 | #include "utils.h" 24 | #include 25 | 26 | template 27 | class RuntimeParameter 28 | { 29 | public: 30 | RuntimeParameter() : _common_mempool(NULL), 31 | _num_threads(1) 32 | { 33 | } 34 | RuntimeParameter(CommonMemPool *common_mempool, size_t num_threads) 35 | : _common_mempool(common_mempool), 36 | _num_threads(num_threads) 37 | { 38 | } 39 | ~RuntimeParameter() 40 | { 41 | } 42 | 43 | CommonMemPool *common_mempool() const 44 | { 45 | return _common_mempool; 46 | } 47 | size_t num_threads() const 48 | { 49 | return _num_threads; 50 | } 51 | 52 | private: 53 | CommonMemPool *_common_mempool; 54 | size_t _num_threads; 55 | }; 56 | -------------------------------------------------------------------------------- /src/utils.cpp: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #include "utils.h" 16 | // #include "booster/helper.h" 17 | #include 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | using namespace std; 26 | 27 | int ChkParamHeader(FILE* fp) 28 | { 29 | fseek(fp, 0, SEEK_SET); 30 | int magic = 0; 31 | int nbr = fscanf(fp, "%d", &magic); 32 | if (nbr != 1) 33 | { 34 | fprintf(stderr, "issue with param file\n"); 35 | return -1; 36 | } 37 | if (magic != 7767517) 38 | { 39 | fprintf(stderr, "param is too old, please regenerate\n"); 40 | return -1; 41 | } 42 | return 0; 43 | } 44 | 45 | int min(int a, int b) 46 | { 47 | return (a < b) ? 
a : b; 48 | } 49 | 50 | #if (defined(__linux__) && !(defined(__aarch64__)))|| defined(__APPLE_CC__) 51 | #else 52 | void* _mm_malloc(size_t sz, size_t align) 53 | { 54 | void *ptr; 55 | #if (defined __APPLE__) || (defined _WIN32) 56 | return malloc(sz); 57 | #else 58 | int alloc_result = posix_memalign(&ptr, align, sz); 59 | if (alloc_result != 0) 60 | { 61 | return NULL; 62 | } 63 | return ptr; 64 | #endif 65 | } 66 | 67 | void _mm_free(void* ptr) 68 | { 69 | if (NULL != ptr) 70 | { 71 | free(ptr); 72 | ptr = NULL; 73 | } 74 | } 75 | #endif 76 | 77 | void StringTool::SplitString(const std::string &input, const std::string &delim, std::vector &parts) 78 | { 79 | for (char *s = strtok((char *)input.data(), (char *)delim.data()); s; s = strtok(NULL, (char *)delim.data())) 80 | { 81 | if (s != NULL) 82 | { 83 | parts.push_back(s); 84 | } 85 | } 86 | } 87 | 88 | void StringTool::RelaceString(std::string &input, const std::string &delim, const std::string& repstr) 89 | { 90 | size_t pos = input.find(delim); 91 | while (pos != std::string::npos) 92 | { 93 | // Replace this occurrence of Sub String 94 | input.replace(pos, delim.size(), repstr); 95 | // Get the next occurrence from the current position 96 | pos = input.find(delim, pos + delim.size()); 97 | } 98 | } 99 | 100 | #ifdef FEATHER_OPENCL 101 | bool judge_android7_opencl() 102 | { 103 | //libOpenCL.so 104 | //android7.0 sdk api 24 105 | char sdk[93] = ""; 106 | __system_property_get("ro.build.version.sdk", sdk); 107 | if (std::atoi(sdk) < 24) 108 | { 109 | LOGI("[device] sdk [%d] < 24\n", std::atoi(sdk)); 110 | return true; 111 | } 112 | 113 | bool flage = false; 114 | std::string lib_name1 = "libOpenCL.so"; 115 | std::string lib_name2 = "libGLES_mali.so"; 116 | std::vector libraries_list = 117 | { 118 | "/vendor/etc/public.libraries.txt", 119 | "/system/etc/public.libraries.txt", 120 | }; 121 | for (int i = 0; i < libraries_list.size(); i++) 122 | { 123 | std::ifstream out; 124 | std::string line; 125 | 
out.open(libraries_list[i].c_str()); 126 | while (!out.eof()) 127 | { 128 | std::getline(out, line); 129 | if (line.find(lib_name1) != line.npos || line.find(lib_name2) != line.npos) 130 | { 131 | LOGI("[public] %s:%s", libraries_list[i].c_str(), line.c_str()); 132 | flage = true; 133 | break; 134 | } 135 | 136 | } 137 | out.close(); 138 | } 139 | if(flage == false) 140 | return flage; 141 | 142 | flage = false; 143 | const std::vector libpaths = 144 | { 145 | "libOpenCL.so", 146 | #if defined(__aarch64__) 147 | // Qualcomm Adreno with Android 148 | "/system/vendor/lib64/libOpenCL.so", 149 | "/system/lib64/libOpenCL.so", 150 | // Mali with Android 151 | "/system/vendor/lib64/egl/libGLES_mali.so", 152 | "/system/lib64/egl/libGLES_mali.so", 153 | // Typical Linux board 154 | "/usr/lib/aarch64-linux-gnu/libOpenCL.so", 155 | #else 156 | // Qualcomm Adreno with Android 157 | "/system/vendor/lib/libOpenCL.so", 158 | "/system/lib/libOpenCL.so", 159 | // Mali with Android 160 | "/system/vendor/lib/egl/libGLES_mali.so", 161 | "/system/lib/egl/libGLES_mali.so", 162 | // Typical Linux board 163 | "/usr/lib/arm-linux-gnueabihf/libOpenCL.so", 164 | #endif 165 | }; 166 | for (int i = 0; i < libpaths.size(); i++) 167 | { 168 | ifstream f(libpaths[i].c_str()); 169 | if (f.good()) 170 | { 171 | flage = true; 172 | LOGI("[libpaths]:%s", libpaths[i].c_str()); 173 | break; 174 | } 175 | } 176 | return flage; 177 | } 178 | #endif 179 | 180 | unsigned short hs_floatToHalf(float f) 181 | { 182 | union 183 | { 184 | float d; 185 | unsigned int i; 186 | } u = { f }; 187 | int s = (u.i >> 16) & 0x8000; 188 | int e = ((u.i >> 23) & 0xff) - 112; 189 | int m = u.i & 0x7fffff; 190 | if (e <= 0) 191 | { 192 | if (e < -10) return s; /* underflowed */ 193 | /* force leading 1 and round */ 194 | m |= 0x800000; 195 | int t = 14 - e; 196 | int a = (1 << (t - 1)) - 1; 197 | int b = (m >> t) & 1; 198 | return s | ((m + a + b) >> t); 199 | } 200 | if (e == 143) 201 | { 202 | if (m == 0) return s | 
0x7c00; /* +/- infinity */ 203 | 204 | /* NaN, m == 0 forces us to set at least one bit and not become an infinity */ 205 | m >>= 13; 206 | return s | 0x7c00 | m | (m == 0); 207 | } 208 | 209 | /* round the normalized float */ 210 | m = m + 0xfff + ((m >> 13) & 1); 211 | 212 | /* significand overflow */ 213 | if (m & 0x800000) 214 | { 215 | m = 0; 216 | e += 1; 217 | } 218 | 219 | /* exponent overflow */ 220 | if (e > 30) return s | 0x7c00; 221 | 222 | return s | (e << 10) | (m >> 13); 223 | } 224 | 225 | int hs_halfToFloatRep(unsigned short c) 226 | { 227 | int s = (c >> 15) & 0x001; 228 | int e = (c >> 10) & 0x01f; 229 | int m = c & 0x3ff; 230 | if (e == 0) 231 | { 232 | if (m == 0) /* +/- 0 */ return s << 31; 233 | /* denormalized, renormalize it */ 234 | while (!(m & 0x400)) 235 | { 236 | m <<= 1; 237 | e -= 1; 238 | } 239 | e += 1; 240 | m &= ~0x400; 241 | } 242 | else if (e == 31) return (s << 31) | 0x7f800000 | (m << 13); /* NaN or +/- infinity */ 243 | e += 112; 244 | m <<= 13; 245 | return (s << 31) | (e << 23) | m; 246 | } 247 | 248 | float hs_halfToFloat(unsigned short c) 249 | { 250 | union 251 | { 252 | float d; 253 | unsigned int i; 254 | } u; 255 | u.i = hs_halfToFloatRep(c); 256 | return u.d; 257 | } 258 | -------------------------------------------------------------------------------- /src/utils.h: -------------------------------------------------------------------------------- 1 | //Tencent is pleased to support the open source community by making FeatherCNN available. 2 | 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved. 4 | 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except 6 | //in compliance with the License. 
You may obtain a copy of the License at 7 | // 8 | //https://opensource.org/licenses/BSD-3-Clause 9 | // 10 | //Unless required by applicable law or agreed to in writing, software distributed 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the 13 | //specific language governing permissions and limitations under the License. 14 | 15 | #pragma once 16 | 17 | #include 18 | #include 19 | #include 20 | #include 21 | #ifdef _WIN32 22 | #define HAVE_STRUCT_TIMESPEC 23 | #endif 24 | 25 | 26 | #if 0 27 | #include 28 | #define LOGI(...) __android_log_print(ANDROID_LOG_INFO, "FeatherLib", __VA_ARGS__) 29 | #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "FeatherLib", __VA_ARGS__) 30 | #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "FeatherLib", __VA_ARGS__) 31 | #else 32 | #include 33 | #define LOGI(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n"); 34 | #define LOGD(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n"); 35 | #define LOGE(...) fprintf(stderr, __VA_ARGS__);fprintf(stderr,"\n"); 36 | #endif 37 | 38 | 39 | typedef unsigned short half; 40 | 41 | class StringTool 42 | { 43 | public: 44 | static void SplitString(const std::string &input, const std::string &delim, std::vector &parts); 45 | static void RelaceString(std::string &input, const std::string &delim, const std::string& repstr); 46 | }; 47 | 48 | int ChkParamHeader(FILE* fp); 49 | 50 | int min(int a, int b); 51 | 52 | #if (defined(__linux__) && !defined(__aarch64__)) || defined(__APPLE_CC__) 53 | #include 54 | #else 55 | void* _mm_malloc(size_t sz, size_t align); 56 | void _mm_free(void* ptr); 57 | #endif 58 | 59 | unsigned short hs_floatToHalf(float f); 60 | int hs_halfToFloatRep(unsigned short c); 61 | float hs_halfToFloat(unsigned short c); 62 | --------------------------------------------------------------------------------