├── CMakeLists.txt
├── README.md
├── build_scripts
│   ├── Info.plist
│   ├── build_android.sh
│   ├── build_ios.sh
│   ├── build_linux_aarch64.sh
│   ├── build_linux_avx.sh
│   ├── build_macos_avx.sh
│   ├── ios.toolchain.cmake
│   ├── linux-aarch64.toolchain.cmake
│   └── pack_ios_framework.sh
└── src
    ├── CMakeLists.txt
    ├── blob.cpp
    ├── blob.h
    ├── booster
    │   ├── CMakeLists.txt
    │   ├── arm
    │   │   ├── CMakeLists.txt
    │   │   ├── booster.cpp
    │   │   ├── caffe_interp.cpp
    │   │   ├── depthwise.cpp
    │   │   ├── generic_kernels.cpp
    │   │   ├── helper.cpp
    │   │   ├── sgeconv.cpp
    │   │   ├── sgemm.cpp
    │   │   ├── sgemm_legacy.cpp
    │   │   ├── sgemm_legacy.h
    │   │   ├── sgemv.cpp
    │   │   ├── winograd_kernels.cpp
    │   │   └── winograd_kernels_F63.cpp
    │   ├── avx
    │   │   ├── CMakeLists.txt
    │   │   ├── booster.cpp
    │   │   ├── caffe_interp.cpp
    │   │   ├── depthwise.cpp
    │   │   ├── generic_kernels.cpp
    │   │   ├── helper.cpp
    │   │   ├── sgeconv.cpp
    │   │   ├── sgemm.cpp
    │   │   ├── sgemv.cpp
    │   │   ├── winograd_kernels_F63.cpp
    │   │   └── winograd_kernels_F63_fused.cpp
    │   └── include
    │       └── booster
    │           ├── booster.h
    │           ├── caffe_interp.h
    │           ├── depthwise.h
    │           ├── generic_kernels.h
    │           ├── helper.h
    │           ├── power.h
    │           ├── sgeconv.h
    │           ├── sgemm.h
    │           ├── sgemv.h
    │           ├── thpool.h
    │           └── winograd_kernels.h
    ├── layer.cpp
    ├── layer.h
    ├── layer_factory.cpp
    ├── layer_factory.h
    ├── layers
    │   ├── batchnorm_layer.h
    │   ├── concat_layer.h
    │   ├── conv_layer.h
    │   ├── dropout_layer.h
    │   ├── eltwise_layer.h
    │   ├── inner_product_layer.h
    │   ├── input_layer.h
    │   ├── pooling_layer.h
    │   ├── relu_layer.h
    │   ├── scale_layer.h
    │   ├── softmax_layer.h
    │   └── split_layer.h
    ├── mempool.cpp
    ├── mempool.h
    ├── ncnn
    │   ├── allocator.cpp
    │   ├── allocator.h
    │   ├── mat.cpp
    │   ├── mat.h
    │   ├── mat_pixel.cpp
    │   ├── mat_pixel_resize.cpp
    │   ├── modelbin.cpp
    │   ├── modelbin.h
    │   ├── paramdict.cpp
    │   ├── paramdict.h
    │   └── platform.h
    ├── net.cpp
    ├── net.h
    ├── rt_param.h
    ├── utils.cpp
    └── utils.h
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | if(CMAKE_TOOLCHAIN_FILE)
2 | set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to")
3 | # get absolute path; get_filename_component(ABSOLUTE) only resolves against the source dir, so use find_file here :(
4 | get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
5 | find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH)
6 | message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
7 | endif()
8 |
9 | if(NOT DEFINED CMAKE_INSTALL_PREFIX)
10 | set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
11 | endif()
12 | message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")
13 |
14 | cmake_minimum_required(VERSION 2.8)
15 |
16 | project(feather)
17 |
18 | #set(CMAKE_BUILD_TYPE Debug)
19 | #set(CMAKE_BUILD_TYPE Release)
20 |
21 | option(FEATHER_OPENMP "openmp support" ON)
22 |
23 | if(FEATHER_OPENMP)
24 | if(CMAKE_HOST_APPLE)
25 | #if(1)
26 | if(IOS)
27 | #if(0)
28 | message(STATUS "iOS doesn't support OpenMP, use GCD instead.")
29 | set(OPENMP_FOUND false)
30 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fembed-bitcode")
31 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fembed-bitcode")
32 | else()
33 | set(OpenMP_C_FLAGS)
34 | set(OpenMP_CXX_FLAGS)
35 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
36 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
37 | message(STATUS ${OpenMP_C_FLAGS})
38 | message(STATUS ${OpenMP_CXX_FLAGS})
39 | endif()
40 | else()
41 | #find_package(OpenMP)
42 | include(FindOpenMP)
43 | if(OPENMP_FOUND)
44 | #if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
45 | #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
46 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
47 | message(STATUS ${OpenMP_C_FLAGS})
48 | message(STATUS ${OpenMP_CXX_FLAGS})
49 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
50 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
51 | endif()
52 | message(STATUS "OpenMP flags ${CMAKE_CXX_FLAGS}")
53 | endif()
54 | endif()
55 |
56 | #add_definitions(-Wall -Wextra -Wno-unused-function)
57 | add_definitions(-fPIC)
58 | add_definitions(-Ofast)
59 | add_definitions(-ffast-math)
60 | # add_definitions(-march=native)
61 |
62 | # add_definitions(-flto)
63 |
64 | add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
65 |
66 | if(ANDROID)
67 | # disable shared library on android
68 | #set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
69 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti ")
70 | add_definitions("-DFEATHER_ANDROID_LOG")
71 | add_definitions("-D_NDK_MATH_NO_SOFTFP=1")
72 | if(${ANDROID_ABI} STREQUAL "armeabi-v7a")
73 | add_definitions("-mfpu=neon-vfpv4")
74 | endif()
75 | elseif(IOS)
76 | # disable shared library on xcode ios
77 | add_definitions(-isysroot ${IOS_SDK_PATH} -arch ${IOS_ARCH})
78 | set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
79 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti ")
80 | endif()
81 |
82 | ##############################################
83 |
84 | # add_subdirectory(examples)
85 | # add_subdirectory(benchmark)
86 | add_subdirectory(src)
87 | #if(NOT ANDROID AND NOT IOS)
88 | # add_subdirectory(tools)
89 | #endif()
90 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | [License (BSD 3-Clause)](https://github.com/Tencent/FeatherCNN/blob/master/LICENSE)
4 | [Releases](https://github.com/Tencent/FeatherCNN/releases)
5 | [Pull Requests Welcome](https://github.com/Tencent/FeatherCNN/pulls)
6 |
7 | ## Introduction
8 |
9 | FeatherCNN is a high-performance lightweight CNN inference library, developed by the Tencent AI Platform Department.
10 | FeatherCNN originated in our game AI project for King of Glory (Chinese: 王者荣耀), in which we aimed to build a neural model for MOBA game AI and run it on mobile devices.
11 | FeatherCNN currently targets ARM CPUs.
12 | We will extend it to cover other architectures in the near future.
13 |
14 | Compared with other libraries, FeatherCNN has the following features:
15 |
16 | - **High Performance** FeatherCNN delivers state-of-the-art inference computing performance on a wide range of devices, including mobile phones (iOS/Android), embedded devices (Linux) as well as ARM-based servers (Linux).
17 |
18 | - **Easy Deployment** FeatherCNN packs everything in a single code base to get rid of third-party dependencies. Hence, it facilitates deployment on mobile platforms.
19 |
20 | - **Featherweight** The compiled FeatherCNN library is small-sized (hundreds of KBs).
21 |
22 | Please open an issue in this repo for bug reports and enhancement suggestions. We are grateful for user feedback and will actively polish this library.
23 |
24 | ## Citation
25 |
26 | FeatherCNN: Fast Inference Computation with TensorGEMM on ARM Architectures (TPDS September 2019, In press, DOI:10.1109/TPDS.2019.2939785)
27 |
28 | ## Clone hints
29 | The FeatherCNN repository has a heavy development history; please clone only the master branch as follows:
30 | ```
31 | git clone -b master --single-branch https://github.com/tencent/FeatherCNN.git
32 | ```
33 |
34 | ## Detailed Instructions for iOS/Android/Linux
35 |
36 | [**Build From Source**](https://github.com/Tencent/FeatherCNN/wiki/Build-From-Source)
37 |
38 | [**iOS Guide**](https://github.com/Tencent/FeatherCNN/wiki/iOS-Guide)
39 |
40 | [**Android Guide**](https://github.com/Tencent/FeatherCNN/wiki/Android-Guide)
41 |
42 | [**Android ADB Guide**](https://github.com/Tencent/FeatherCNN/wiki/Android-ADB-Guide)
43 |
44 | ## Usage
45 |
46 | ### Model Format Conversion
47 |
48 | FeatherCNN accepts Caffe models. It merges the network structure file (.prototxt) and the weight file (.caffemodel) into a single binary model (.feathermodel). The conversion tool requires protobuf, but the library itself does not.
49 |
50 | [**Model Convert Guide**](https://github.com/Tencent/FeatherCNN/wiki/Model-Convert-Guide).
51 |
52 | ### Runtime Interfaces
53 |
54 | The basic user interfaces are listed in feather/net.h. Currently we are using raw pointers to reference data.
55 | We may provide more convenient interfaces in the near future.
56 |
57 | Before inference, FeatherCNN requires two steps to initialize the network.
58 | ```cpp
59 | feather::Net forward_net(num_threads);
60 | forward_net.InitFromPath(FILE_PATH_TO_FEATHERMODEL);
61 | ```
62 | The net can also be initialized from raw buffers and FILE pointers.
63 | We can then perform forward computation with a raw `float*` input buffer.
64 | ```cpp
65 | forward_net.Forward(PTR_TO_YOUR_INPUT_DATA);
66 | ```
67 | The output can be extracted from the net by blob name. Blob names are kept consistent with the Caffe prototxt.
68 | ```cpp
69 | forward_net.ExtractBlob(PTR_TO_YOUR_OUTPUT_BUFFER, BLOB_NAME);
70 | ```
71 | You can also query a blob's data size by calling
72 | ```cpp
73 | size_t data_size = 0;
74 | forward_net.GetBlobDataSize(&data_size, BLOB_NAME);
75 | ```
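
Putting these pieces together, a minimal end-to-end sketch looks like the following. The header path, model path, input dimensions and blob name are placeholders that depend on your install layout and your converted model:

```cpp
#include <feather/net.h>   // adjust the include path to your install layout

#include <vector>

int main()
{
    const size_t num_threads = 1;
    feather::Net forward_net(num_threads);
    forward_net.InitFromPath("mobilenet.feathermodel");      // placeholder model path

    // Placeholder input size: a 3x224x224 image, already preprocessed.
    std::vector<float> input(3 * 224 * 224, 0.f);
    forward_net.Forward(input.data());

    // Query the output size first, then extract the blob by its prototxt name.
    const char* blob_name = "prob";                           // placeholder blob name
    size_t data_size = 0;
    forward_net.GetBlobDataSize(&data_size, blob_name);

    std::vector<float> output(data_size);
    forward_net.ExtractBlob(output.data(), blob_name);
    return 0;
}
```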
76 |
77 | ## Performance Benchmarks
78 | We have tested FeatherCNN on a wide range of devices; see [**this page**](https://github.com/Tencent/FeatherCNN/wiki/Benchmarks) for details.
79 |
80 | ## User Groups
81 |
82 | Telegram: https://t.me/FeatherCNN
83 |
84 | QQ: 728147343
85 |
--------------------------------------------------------------------------------
/build_scripts/Info.plist:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
3 | <plist version="1.0">
4 | <dict>
5 | <key>CFBundleName</key>
6 | <string>feather</string>
7 | <key>CFBundleIdentifier</key>
8 | <string>com.tencent.feather</string>
9 | <key>CFBundleVersion</key>
10 | <string>0.1</string>
11 | <key>CFBundleShortVersionString</key>
12 | <string>0.1</string>
13 | <key>CFBundleSignature</key>
14 | <string>????</string>
15 | <key>CFBundlePackageType</key>
16 | <string>FMWK</string>
17 | </dict>
18 | </plist>
19 |
--------------------------------------------------------------------------------
/build_scripts/build_android.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p build-android
4 | pushd build-android
5 | mkdir -p arm64-v8a
6 | pushd arm64-v8a
7 | cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DBOOSTER_ARM=1 -DCOMPILE_OPENCL=1 ../..
8 | make -j6
9 | make install
10 | popd
11 |
12 | mkdir -p armeabi-v7a
13 | pushd armeabi-v7a
14 | cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-16 -DBOOSTER_ARM=1 -DCOMPILE_OPENCL=1 ../..
15 | make -j6
16 | make install
17 | popd
18 |
19 | #mkdir -p armeabi
20 | #pushd armeabi
21 | #cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi" -DANDROID_PLATFORM=android-16 -DFEATHER_ARM=0 ../..
22 | #make -j4
23 | #make install
24 | #popd
25 |
26 | mkdir -p feather
27 | mkdir -p booster
28 | pushd feather
29 | mkdir -p include
30 | mkdir -p include/feather
31 | cp -r ../arm64-v8a/install/feather/include/* ./include/feather/
32 | mkdir -p arm64-v8a
33 | cp ../arm64-v8a/install/feather/lib/* ./arm64-v8a/
34 | mkdir -p armeabi-v7a
35 | cp ../armeabi-v7a/install/feather/lib/* ./armeabi-v7a/
36 | #mkdir -p armeabi
37 | #cp ../armeabi/install/feather/lib/* ./armeabi/
38 | #popd
39 | popd
40 | pushd booster
41 | mkdir -p include/booster
42 | cp -r ../arm64-v8a/install/booster/include/* ./include/
43 | mkdir -p arm64-v8a
44 | cp ../arm64-v8a/install/booster/lib/* ./arm64-v8a/
45 | mkdir -p armeabi-v7a
46 | cp ../armeabi-v7a/install/booster/lib/* ./armeabi-v7a/
47 | popd
48 |
--------------------------------------------------------------------------------
/build_scripts/build_ios.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo $(xcrun --sdk iphoneos --show-sdk-path)
4 | mkdir -p build-ios
5 | pushd build-ios
6 | mkdir -p arm64
7 | pushd arm64
8 | cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=arm64 -DBOOSTER_ARM=1 ../..
9 | make -j4
10 | make install
11 | popd
12 |
13 | mkdir -p armv7s
14 | pushd armv7s
15 | cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=armv7s -DBOOSTER_ARM=1 ../..
16 | make -j4
17 | make install
18 | popd
19 |
20 | #mkdir -p armv7
21 | #pushd armv7
22 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=armv7 ../..
23 | #make -j4
24 | #make install
25 | #popd
26 |
27 | #mkdir -p x86_64
28 | #pushd x86_64
29 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphonesimulator --show-sdk-path) -DIOS_ARCH=x86_64 ../..
30 | #make -j4
31 | #make install
32 | #popd
33 |
34 | #mkdir -p i386
35 | #pushd i386
36 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphonesimulator --show-sdk-path) -DIOS_ARCH=i386 ../..
37 | #make -j4
38 | #make install
39 | #popd
40 |
41 | popd
42 | bash ./build_scripts/pack_ios_framework.sh
43 |
--------------------------------------------------------------------------------
/build_scripts/build_linux_aarch64.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p build-linux-aarch64
4 | pushd build-linux-aarch64
5 | #cmake -DCMAKE_TOOLCHAIN_FILE=../build_scripts/linux-aarch64.toolchain.cmake .. -DFEATHER_ARM=true -DCOMPILE_OPENCL=false
6 | cmake .. -DBOOSTER_ARM=true -DCOMPILE_OPENCL=false -DCMAKE_BUILD_TYPE=Release
7 | make -j4
8 | make install
9 | popd
10 |
--------------------------------------------------------------------------------
/build_scripts/build_linux_avx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p build-linux-avx
4 | pushd build-linux-avx
5 | cmake .. -DBOOSTER_AVX=1 -DCMAKE_BUILD_TYPE=Release
6 | make VERBOSE=1
7 | make install
8 | popd
--------------------------------------------------------------------------------
/build_scripts/build_macos_avx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | mkdir -p build-macos-avx
4 | pushd build-macos-avx
5 | cmake .. -DBOOSTER_AVX=1 -DCMAKE_BUILD_TYPE=Release
6 | make -j4
7 | make install
8 | popd
9 |
--------------------------------------------------------------------------------
/build_scripts/ios.toolchain.cmake:
--------------------------------------------------------------------------------
1 | # Toolchain file for cross-compiling FeatherCNN for iOS with clang.
2 | # Usage (see build_scripts/build_ios.sh):
3 | #   cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake \
4 | #         -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) \
5 | #         -DIOS_ARCH=arm64 <path-to-source>
6 | # You may have to adjust CMAKE_FIND_ROOT_PATH if libraries or headers are not
7 | # found under the iOS SDK.
8 | set(CMAKE_SYSTEM_NAME Darwin)
9 | set(CMAKE_SYSTEM_VERSION 1)
10 | set(UNIX True)
11 | set(APPLE True)
12 | set(IOS True)
13 |
14 | # specify the cross compiler as clang.
15 | set(CMAKE_C_COMPILER clang)
16 | set(CMAKE_CXX_COMPILER clang++)
17 |
18 | # To build the tests, we need to set where the target environment containing
19 | # the required library is.
20 | set(CMAKE_FIND_ROOT_PATH ${IOS_SDK_PATH})
21 | # search for programs in the build host directories
22 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
23 | # for libraries and headers in the target directories
24 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
25 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
26 |
27 |
28 | # Set additional variables.
29 | # If we don't set some of these, CMake will end up using the host version.
30 | # We want the full path, however, so we can pass EXISTS and other checks in
31 | # our CMake code.
32 | find_program(CC_FULL_PATH clang)
33 | if (NOT CC_FULL_PATH)
34 | message(FATAL_ERROR "Cross-compiler clang not found")
35 | endif ()
36 | get_filename_component(CC_DIR ${CC_FULL_PATH} PATH)
37 | message(STATUS "CC path is ${CC_FULL_PATH}")
38 | #set(IOS_ARCH arm64)
39 |
40 | #SET(CMAKE_LINKER ${CC_DIR}/aarch64-${TARGET_ABI}-ld CACHE FILEPATH "linker")
41 | #SET(CMAKE_ASM_COMPILER ${CC_DIR}/aarch64-${TARGET_ABI}-as CACHE FILEPATH "assembler")
42 | #SET(CMAKE_OBJCOPY ${CC_DIR}/aarch64-${TARGET_ABI}-objcopy CACHE FILEPATH "objcopy")
43 | #SET(CMAKE_STRIP ${CC_DIR}/aarch64-${TARGET_ABI}-strip CACHE FILEPATH "strip")
44 | #SET(CMAKE_CPP ${CC_DIR}/aarch64-${TARGET_ABI}-cpp CACHE FILEPATH "cpp")
45 |
46 | set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE 1)
47 | # Without this, Xcode adds -fembed-bitcode-marker compile options instead of -fembed-bitcode.
48 | set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode")
49 | set(BITCODE_FLAGS "-fembed-bitcode")
50 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${BITCODE_FLAGS}" CACHE INTERNAL "ios c compiler flags" FORCE)
51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BITCODE_FLAGS}" CACHE INTERNAL "ios cxx compiler flags" FORCE)
52 |
53 |
54 |
55 |
--------------------------------------------------------------------------------
/build_scripts/linux-aarch64.toolchain.cmake:
--------------------------------------------------------------------------------
1 | # **********************************************************
2 | # Copyright (c) 2014-2017 Google, Inc. All rights reserved.
3 | # **********************************************************
4 |
5 | # Redistribution and use in source and binary forms, with or without
6 | # modification, are permitted provided that the following conditions are met:
7 | #
8 | # * Redistributions of source code must retain the above copyright notice,
9 | # this list of conditions and the following disclaimer.
10 | #
11 | # * Redistributions in binary form must reproduce the above copyright notice,
12 | # this list of conditions and the following disclaimer in the documentation
13 | # and/or other materials provided with the distribution.
14 | #
15 | # * Neither the name of Google, Inc. nor the names of its contributors may be
16 | # used to endorse or promote products derived from this software without
17 | # specific prior written permission.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE
23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 | # DAMAGE.
30 |
31 | # For cross-compiling on arm64 Linux using gcc-aarch64-linux-gnu package:
32 | # - install AArch64 tool chain:
33 | # $ sudo apt-get install g++-aarch64-linux-gnu
34 | # - cross-compiling config
35 | # $ cmake -DCMAKE_TOOLCHAIN_FILE=../dynamorio/make/toolchain-arm64.cmake ../dynamorio
36 | # You may have to set CMAKE_FIND_ROOT_PATH to point to the target environment, e.g.
37 | # by passing -DCMAKE_FIND_ROOT_PATH=/usr/aarch64-linux-gnu on Debian-like systems.
38 | set(CMAKE_SYSTEM_NAME Linux)
39 | set(CMAKE_SYSTEM_PROCESSOR aarch64)
40 | set(TARGET_ABI "linux-gnu")
41 | # specify the cross compiler
42 | SET(CMAKE_C_COMPILER aarch64-${TARGET_ABI}-gcc)
43 | SET(CMAKE_CXX_COMPILER aarch64-${TARGET_ABI}-g++)
44 |
45 | # To build the tests, we need to set where the target environment containing
46 | # the required library is. On Debian-like systems, this is
47 | # /usr/aarch64-linux-gnu.
48 | SET(CMAKE_FIND_ROOT_PATH "/usr/aarch64-${TARGET_ABI}")
49 | # search for programs in the build host directories
50 | SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
51 | # for libraries and headers in the target directories
52 | SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
53 | SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
54 |
55 | # Set additional variables.
56 | # If we don't set some of these, CMake will end up using the host version.
57 | # We want the full path, however, so we can pass EXISTS and other checks in
58 | # our CMake code.
59 | find_program(GCC_FULL_PATH aarch64-${TARGET_ABI}-gcc)
60 | if (NOT GCC_FULL_PATH)
61 | message(FATAL_ERROR "Cross-compiler aarch64-${TARGET_ABI}-gcc not found")
62 | endif ()
63 | get_filename_component(GCC_DIR ${GCC_FULL_PATH} PATH)
64 | SET(CMAKE_LINKER ${GCC_DIR}/aarch64-${TARGET_ABI}-ld CACHE FILEPATH "linker")
65 | SET(CMAKE_ASM_COMPILER ${GCC_DIR}/aarch64-${TARGET_ABI}-as CACHE FILEPATH "assembler")
66 | SET(CMAKE_OBJCOPY ${GCC_DIR}/aarch64-${TARGET_ABI}-objcopy CACHE FILEPATH "objcopy")
67 | SET(CMAKE_STRIP ${GCC_DIR}/aarch64-${TARGET_ABI}-strip CACHE FILEPATH "strip")
68 | SET(CMAKE_CPP ${GCC_DIR}/aarch64-${TARGET_ABI}-cpp CACHE FILEPATH "cpp")
69 |
--------------------------------------------------------------------------------
/build_scripts/pack_ios_framework.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | NAME=feather
4 |
5 | ##### package android lib
6 | #ANDROIDPKGNAME=${NAME}-android-lib
7 | #rm -rf $ANDROIDPKGNAME
8 | #mkdir -p $ANDROIDPKGNAME
9 | #mkdir -p $ANDROIDPKGNAME/armeabi-v7a
10 | #mkdir -p $ANDROIDPKGNAME/arm64-v8a
11 | #mkdir -p $ANDROIDPKGNAME/include
12 | #cp build-android-armv7/install/lib/lib${NAME}.a $ANDROIDPKGNAME/armeabi-v7a/
13 | #cp build-android-aarch64/install/lib/lib${NAME}.a $ANDROIDPKGNAME/arm64-v8a/
14 | #cp build-android-aarch64/install/include/* $ANDROIDPKGNAME/include/
15 | #rm -f $ANDROIDPKGNAME.zip
16 | #zip -9 -r $ANDROIDPKGNAME.zip $ANDROIDPKGNAME
17 |
18 | ##### package ios framework
19 | IOSPKGNAME=./build-ios/${NAME}.framework
20 | rm -rf $IOSPKGNAME
21 | mkdir -p $IOSPKGNAME/Versions/A/Headers
22 | mkdir -p $IOSPKGNAME/Versions/A/Resources
23 | ln -s A $IOSPKGNAME/Versions/Current
24 | ln -s Versions/Current/Headers $IOSPKGNAME/Headers
25 | ln -s Versions/Current/Resources $IOSPKGNAME/Resources
26 | ln -s Versions/Current/${NAME} $IOSPKGNAME/${NAME}
27 | lipo -create \
28 | build-ios/arm64/install/${NAME}/lib/lib${NAME}.a \
29 | build-ios/armv7s/install/${NAME}/lib/lib${NAME}.a \
30 | -o $IOSPKGNAME/Versions/A/${NAME}
31 | #build-ios/x86_64/install/${NAME}/lib/lib${NAME}.a \
32 | #build-ios/i386/install/${NAME}/lib/lib${NAME}.a \
33 | #build-ios-sim/install/${NAME}/lib/lib${NAME}.a \
34 | cp -r build-ios/arm64/install/${NAME}/include/* $IOSPKGNAME/Versions/A/Headers/
35 |
36 | #HEADER_PATH=$IOSPKGNAME/Versions/A/Headers
37 | #HEADERS_TO_EDIT=$HEADER_PATH/feather_simple_generated.h\ $HEADER_PATH/flatbuffers/flatbuffers.h\ $HEADER_PATH/flatbuffers/base.h
38 | #HEADERS_TO_EDIT=$HEADER_PATH/flatbuffers/flatbuffers.h
39 | #HEADERS_TO_EDIT=$HEADER_PATH/flatbuffers/base.h
40 |
41 | # Fix the relative path for the framework package.
42 | #for FILE in $HEADERS_TO_EDIT
43 | #do
44 | # echo $FILE
45 | # sed -i.bak 's/flatbuffers\//feather\/flatbuffers\//' $FILE
46 | # echo $FILE.bak
47 | # rm $FILE.bak
48 | #done
49 |
50 | cp ./build_scripts/Info.plist ${IOSPKGNAME}/Versions/A/Resources/
51 | rm -f $IOSPKGNAME.zip
52 | zip -9 -y -r $IOSPKGNAME.zip $IOSPKGNAME
53 |
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | file(GLOB LIB_HEADERS *.h)
2 | file(GLOB LIB_SRC *.cpp)
3 | file(GLOB LAYER_HEADERS layers/*.h)
4 | file(GLOB LAYER_SRC layers/*.cpp)
5 | file(GLOB FLATBUFFERS_HEADERS flatbuffers/*.h)
6 |
7 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")
8 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -std=c++11 -Wall")
9 |
10 | include_directories("${PROJECT_SOURCE_DIR}/src")
11 | include_directories("booster/include")
12 |
13 | message(STATUS "Using Booster backend.")
14 | add_subdirectory(./booster)
15 | if(BOOSTER_ARM)
16 | message(STATUS "Compiling for Arm backend.")
17 | add_library(feather STATIC ${LIB_SRC} ${LIB_HEADERS} ${LAYER_SRC} ${LAYER_HEADERS} $<TARGET_OBJECTS:booster_arm_obj>)
18 | elseif(BOOSTER_AVX)
19 | message(STATUS "Compiling for AVX backend.")
20 | add_library(feather STATIC ${LIB_SRC} ${LIB_HEADERS} ${LAYER_SRC} ${LAYER_HEADERS} $<TARGET_OBJECTS:booster_avx_obj>)
21 | else()
22 | message(FATAL_ERROR "You have to specify a backend, either BOOSTER_ARM or BOOSTER_AVX.")
23 | endif()
24 |
25 | set(FEATHER_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/feather")
26 |
27 | message(Library headers: ${LIB_HEADERS})
28 | list(REMOVE_ITEM LIB_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/feather_simple_generated.h)
29 | message(Library headers: ${LIB_HEADERS})
30 | install(TARGETS feather DESTINATION "${FEATHER_INSTALL_DIR}/lib")
31 | install(FILES ${LIB_HEADERS} DESTINATION "${FEATHER_INSTALL_DIR}/include")
32 |
--------------------------------------------------------------------------------
/src/blob.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #include "blob.h"
16 |
17 | #include <cstring> // for memcpy
18 |
19 | namespace feather
20 | {
21 | template<class Dtype>
22 | void Blob<Dtype>::Alloc()
23 | {
24 | size_t dim_byte = _num * _channels * _height * _width * sizeof(Dtype);
25 | _data = (Dtype*) _mm_malloc(dim_byte, 32);
26 | }
27 | template<class Dtype>
28 | void Blob<Dtype>::Free()
29 | {
30 | if (this->_data)
31 | {
32 | _mm_free(this->_data); // pairs with the _mm_malloc in Alloc()/Realloc()
33 | this->_data = NULL;
34 | }
35 | }
36 |
37 | template<class Dtype>
38 | void Blob<Dtype>::ReshapeWithRealloc(const Blob<Dtype> *p_blob)
39 | {
40 | int num = p_blob->num();
41 | int channels = p_blob->channels();
42 | int height = p_blob->height();
43 | int width = p_blob->width();
44 |
45 | ReshapeWithRealloc(num, channels, height, width);
46 | }
47 |
48 | template<class Dtype>
49 | void Blob<Dtype>::ReshapeWithRealloc(int num, int channels, int height, int width)
50 | {
51 | // LOGI("Reallc: (%d %d %d %d) to (%d %d %d %d)", _num, _channels, _height, _width, num, channels, height, width);
52 | int elem_size = num * channels * height * width;
53 | Realloc(elem_size);
54 | this->_num = num;
55 | this->_channels = channels;
56 | this->_height = height;
57 | this->_width = width;
58 | }
59 |
60 | template<class Dtype>
61 | void Blob<Dtype>::Realloc(size_t elem_size)
62 | {
63 | if (elem_size > this->data_size())
64 | {
65 | Free();
66 | _data = (Dtype*) _mm_malloc(elem_size * sizeof(Dtype), 32);
67 | }
68 | }
69 |
70 | template<class Dtype>
71 | int Blob<Dtype>::CopyFromMat(const ncnn::Mat& mat)
72 | {
73 | this->ReshapeWithRealloc(1, mat.c, mat.h, mat.w);
74 | this->CopyDataFromMat(mat);
75 | return 0;
76 | }
77 |
78 | template<class Dtype>
79 | int Blob<Dtype>::CopyDataFromMat(const ncnn::Mat& mat)
80 | {
81 | if (this->data_size() != mat.c * mat.h * mat.w)
82 | {
83 | LOGE("In Blob %s: Mat and target blob shape mismatch. blob shape (%zu %zu %zu %zu), mat shape (%d %d %d)\n", this->name.c_str(), num(), channels(), height(), width(), mat.c, mat.h, mat.w);
84 | return -500; // BAD DATA DIMENSION
85 | }
86 | Dtype* dst_p = (Dtype *) this->_data;
87 | size_t copy_stride = mat.h * mat.w;
88 | for (int c = 0; c < mat.c; ++c )
89 | {
90 | ncnn::Mat channel_mat = mat.channel(c);
91 | memcpy(dst_p, channel_mat.data, copy_stride * sizeof(Dtype));
92 | dst_p += copy_stride;
93 | }
94 | return 0;
95 | }
96 |
97 | template class Blob<float>;
98 | template class Blob<double>;
99 | template class Blob<char>;
100 | };
101 |
--------------------------------------------------------------------------------
/src/blob.h:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #pragma once
16 |
17 | #include "utils.h"
18 |
19 | #include "ncnn/mat.h"
20 |
21 | #include <string>
22 |
23 | namespace feather
24 | {
25 | template<class Dtype>
26 | class Blob
27 | {
28 | public:
29 | Blob()
30 | : name(), _num(0), _channels(0), _height(0), _width(0), _data(NULL)
31 | {}
32 |
33 | explicit Blob(std::string name)
34 | : name(name), _num(0), _channels(0), _height(0), _width(0), _data(NULL)
35 | {}
36 |
37 | explicit Blob(const size_t num, const size_t channels, const size_t height, const size_t width)
38 | : name(), _data(NULL), _num(num), _channels(channels), _height(height), _width(width)
39 | {}
40 |
41 | explicit Blob(Dtype* data, const size_t num, const size_t channels, const size_t height, const size_t width)
42 | : name(), _data(data), _num(num), _channels(channels), _height(height), _width(width)
43 | {}
44 |
45 | ~Blob()
46 | {
47 | Free();
48 | }
49 |
50 | void Free();
51 | void Alloc();
52 |
53 | void ReshapeWithRealloc(const Blob *p_blob);
54 | void ReshapeWithRealloc(int num, int channels, int height, int width);
55 | void Realloc(size_t elem_size);
56 |
57 | int CopyFromMat(const ncnn::Mat &src_mat);
58 | int CopyDataFromMat(const ncnn::Mat &src_mat);
59 |
60 | void CopyData(const Dtype* data)
61 | {
62 | size_t size = _num * _channels * _height * _width;
63 | memcpy(_data, data, sizeof(Dtype) * size);
64 | }
65 | void CopyShape(const Blob* p_blob)
66 | {
67 | this->_num = p_blob->num();
68 | this->_channels = p_blob->channels();
69 | this->_width = p_blob->width();
70 | this->_height = p_blob->height();
71 | }
72 | void Copy(const Blob* p_blob)
73 | {
74 | this->Free();
75 | CopyShape(p_blob);
76 | this->Alloc();
77 | CopyData(p_blob->data());
78 | }
79 |
80 | Dtype* data() const
81 | {
82 | return (Dtype*) _data;
83 | }
84 |
85 | size_t data_size() const
86 | {
87 | return _num * _channels * _height * _width;
88 | }
89 | size_t num() const
90 | {
91 | return _num;
92 | }
93 | size_t channels() const
94 | {
95 | return _channels;
96 | }
97 | size_t height() const
98 | {
99 | return _height;
100 | }
101 | size_t width() const
102 | {
103 | return _width;
104 | }
105 | void PrintBlobInfo() const
106 | {
107 | printf("----BlobShape----\n");
108 | printf("NCHW=(%zu %zu %zu %zu)\n", _num, _channels, _height, _width);
109 | printf("----------------\n");
110 | }
111 |
112 | std::string name;
113 |
114 | void* _data;
115 | size_t _elemsize;
116 |
117 | size_t _num;
118 | size_t _channels;
119 | size_t _height;
120 | size_t _width;
121 | };
122 | };
123 |
--------------------------------------------------------------------------------
/src/booster/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | cmake_minimum_required(VERSION 2.8.10)
2 |
3 | file(GLOB LIB_HEADERS ./include/booster/*.h)
4 |
5 | if(CMAKE_SYSTEM_NAME MATCHES "Windows")
6 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O2 -std=c++11")
7 | else()
8 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")
9 | endif()
10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -std=c++11 -Wall")
11 |
12 | include_directories("./include/")
13 |
14 | if(BOOSTER_AVX)
15 | message(STATUS "Compiling booster AVX version.")
16 | add_subdirectory(./avx)
17 | if(COMPILE_OPENCL)
18 | add_subdirectory(./cl)
19 | add_library(booster STATIC $<TARGET_OBJECTS:booster_avx_obj> $<TARGET_OBJECTS:booster_cl_obj>)
20 | else()
21 | add_library(booster STATIC $<TARGET_OBJECTS:booster_avx_obj>)
22 | endif()
23 | elseif(BOOSTER_ARM)
24 | add_subdirectory(./arm)
25 | if(COMPILE_OPENCL)
26 | add_subdirectory(./cl)
27 | add_library(booster STATIC $<TARGET_OBJECTS:booster_arm_obj> $<TARGET_OBJECTS:booster_cl_obj>)
28 | else()
29 | add_library(booster STATIC $<TARGET_OBJECTS:booster_arm_obj>)
30 | endif()
31 | else()
32 | message(FATAL_ERROR "Unknown booster configuration.")
33 | endif()
34 |
35 |
36 | set(BOOSTER_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/booster")
37 |
38 | message(Library headers: ${LIB_HEADERS})
39 | install(TARGETS booster DESTINATION "${BOOSTER_INSTALL_DIR}/lib")
40 | install(FILES ${LIB_HEADERS} DESTINATION "${BOOSTER_INSTALL_DIR}/include/booster")
41 |
--------------------------------------------------------------------------------
/src/booster/arm/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | file(GLOB ARM_SRC ./*.cpp)
2 | file(GLOB ARM_HEADERS ../include/*.h)
3 | list(REMOVE_ITEM ARM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/./sgemm_legacy.cpp")
4 |
5 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a -fopenmp")
6 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -Wall")
7 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")
8 |
9 | add_library(booster_arm_obj OBJECT ${ARM_SRC} ${ARM_HEADERS})
10 | #add_library(arm_backend STATIC ${ARM_SRC} ${ARM_HEADERS})
11 |
12 | #target_include_directories(arm_backend PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
13 | #set(ARM_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/booster_arm/")
14 | #install(TARGETS arm_backend DESTINATION ${ARM_INSTALL_DIR}/lib)
15 | #install(FILES ${ARM_HEADERS} DESTINATION "${ARM_INSTALL_DIR}/include")
16 |
--------------------------------------------------------------------------------
/src/booster/arm/caffe_interp.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #include <booster/caffe_interp.h>
16 |
17 | // Bi-linear interpolation
18 | // IN : [channels height1 width1] cropped from a bigger [Height1 Width1] image
19 | // OUT: [channels height2 width2] cropped from a bigger [Height2 Width2] image
20 | template <typename Dtype, bool packed>
21 | void caffe_cpu_interp2(const int channels,
22 | const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
23 | Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2)
24 | {
25 | // CHECK(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0);
26 | // CHECK(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2);
27 | // special case: just copy
28 | if (height1 == height2 && width1 == width2)
29 | {
30 | for (int h2 = 0; h2 < height2; ++h2)
31 | {
32 | const int h1 = h2;
33 | for (int w2 = 0; w2 < width2; ++w2)
34 | {
35 | const int w1 = w2;
36 | if (packed)
37 | {
38 | const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
39 | Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
40 | for (int c = 0; c < channels; ++c)
41 | {
42 | pos2[0] = pos1[0];
43 | pos1++;
44 | pos2++;
45 | }
46 | }
47 | else
48 | {
49 | const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
50 | Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
51 | for (int c = 0; c < channels; ++c)
52 | {
53 | pos2[0] = pos1[0];
54 | pos1 += Width1 * Height1;
55 | pos2 += Width2 * Height2;
56 | }
57 | }
58 | }
59 | }
60 | return;
61 | }
62 | const float rheight = (height2 > 1) ? static_cast<float>(height1) / (height2) : 0.f;
63 | const float rwidth = (width2 > 1) ? static_cast<float>(width1) / (width2) : 0.f;
64 | for (int h2 = 0; h2 < height2; ++h2)
65 | {
66 | const float h1r = rheight * h2;
67 | const int h1 = h1r;
68 | const int h1p = (h1 < height1 - 1) ? 1 : 0;
69 | const Dtype h1lambda = h1r - h1;
70 | const Dtype h0lambda = Dtype(1.) - h1lambda;
71 | for (int w2 = 0; w2 < width2; ++w2)
72 | {
73 | const float w1r = rwidth * w2;
74 | const int w1 = w1r;
75 | const int w1p = (w1 < width1 - 1) ? 1 : 0;
76 | const Dtype w1lambda = w1r - w1;
77 | const Dtype w0lambda = Dtype(1.) - w1lambda;
78 | if (packed)
79 | {
80 | const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
81 | Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
82 | for (int c = 0; c < channels; ++c)
83 | {
84 | pos2[0] =
85 | h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) +
86 | h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]);
87 | pos1++;
88 | pos2++;
89 | }
90 | }
91 | else
92 | {
93 | const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
94 | Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
95 | for (int c = 0; c < channels; ++c)
96 | {
97 | pos2[0] =
98 | h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
99 | h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]);
100 | pos1 += Width1 * Height1;
101 | pos2 += Width2 * Height2;
102 | }
103 | }
104 | }
105 | }
106 | }
107 |
108 | template void caffe_cpu_interp2<float, true>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
109 | template void caffe_cpu_interp2<float, false>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
110 | template void caffe_cpu_interp2<double, true>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
111 | template void caffe_cpu_interp2<double, false>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
112 |
--------------------------------------------------------------------------------
/src/booster/arm/helper.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #include <booster/helper.h>
16 |
17 | #include <arm_neon.h>
18 | #include <math.h>
19 | #include <stdio.h>
20 | #include <stdlib.h>
21 |
22 | void print_vec2(float32x4_t* vp)
23 | {
24 | float* ep = (float *) vp;
25 | printf("input %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
26 | }
27 |
28 | void print_vec3(float32x4_t* vp)
29 | {
30 | float* ep = (float *) vp;
31 | printf("transformed %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
32 | }
33 |
34 | void print_vec(float32x4_t* vp, const char* comment)
35 | {
36 | float* ep = (float *) vp;
37 | printf("%s %.3f, %.3f, %.3f, %.3f\n", comment, *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
38 | }
39 |
40 |
41 | void print_vec(float32x4_t* vp)
42 | {
43 | float* ep = (float *) vp;
44 | printf("vec %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
45 | }
46 |
47 | void print_arr(float* vp)
48 | {
49 | float* ep = (float *) vp;
50 | printf("arr %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
51 | }
52 |
53 | void print_floats(const float* arr, const int len)
54 | {
55 | for (int i = 0; i < len; ++i)
56 | {
57 | printf("%.2f ", arr[i]);
58 | }
59 | printf("\n\n");
60 | }
61 |
62 | void print_floats(const float* arr, const int dimX, const int dimY)
63 | {
64 | for (int i = 0; i < dimX; ++i)
65 | {
66 | for (int j = 0; j < dimY; ++j)
67 | printf("%.2f ", arr[i * dimY + j]);
68 | printf("\n");
69 | }
70 | printf("\n\n");
71 | }
72 |
73 |
74 | void diff(float* arr1, float* arr2, int len)
75 | {
76 | float dif = 0.0f;
77 | for (int i = 0; i < len; ++i)
78 | {
79 | float err = fabsf(arr1[i] - arr2[i]);
80 | if (err > 1.0f)
81 | {
82 | dif += err;
83 | }
84 | }
85 | LOGD("The difference is %.2f\n", dif);
86 | }
87 | void diff(float* arr1, float* arr2, int M, int N)
88 | {
89 | float dif = 0.0f;
90 | for (int i = 0; i < M; ++i)
91 | {
92 | for (int j = 0; j < N; ++j)
93 | {
94 | float err = fabsf(arr1[i * N + j] - arr2[i * N + j]);
95 | if (err > 1.0f)
96 | {
97 | dif += err;
98 | LOGD("Error position (%d, %d), value %.2f, %.2f\n", i, j, arr1[i * N + j], arr2[i * N + j]);
99 | }
100 | }
101 | }
102 | LOGD("The difference is %.2f\n", dif);
103 | }
104 |
105 | #include <time.h>
106 |
107 | void Timer::startBench()
108 | {
109 | clock_gettime(CLOCK_MONOTONIC, &start);
110 | }
111 |
112 | double Timer::endBench()
113 | {
114 | clock_gettime(CLOCK_MONOTONIC, &stop);
115 | return (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
116 | }
117 |
118 | void Timer::endBench(const char* comment)
119 | {
120 | clock_gettime(CLOCK_MONOTONIC, &stop);
121 | double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
122 | LOGD("%s %lfms\n", comment, elapsedTime);
123 | }
124 |
125 | void Timer::endBench(const char* comment, double fold)
126 | {
127 | clock_gettime(CLOCK_MONOTONIC, &stop);
128 | double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
129 | printf("%s %lfms\n", comment, elapsedTime / fold);
130 | }
131 |
--------------------------------------------------------------------------------
/src/booster/arm/sgemm_legacy.h:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #ifndef TCNN_SGEMM_H_
16 | #define TCNN_SGEMM_H_
17 |
18 |
19 | void externalPackA(int M, int L, float* packA, float* a, int lda);//External packing for A, requires space allocation for packA
20 | void block_sgemm_external_pack_threading(int M, int N, int L, float *A, float *B, float *C, int num_threads);
21 |
22 |
23 | void externalPackA8(int M, int L, float* packA, float* a, int lda);//External packing for A, requires space allocation for packA
24 | void block_sgemm_external_pack_threading_8x8(int M, int N, int L, float *A, float *B, float *C, int num_threads);
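// Usage sketch (assumptions inferred from the signatures above, not taken from
// the original header): A is M x L row-major with leading dimension lda, B is
// L x N, and C is M x N. A must first be packed into a caller-allocated buffer
// via externalPackA / externalPackA8; M * L floats is the unpadded lower bound
// for that buffer, and the kernels may need padding to their blocking size.
//
//     std::vector<float> packA(M * L);
//     externalPackA(M, L, packA.data(), A, L);
//     block_sgemm_external_pack_threading(M, N, L, packA.data(), B, C, num_threads);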
25 |
26 |
27 | #endif
28 |
--------------------------------------------------------------------------------
/src/booster/arm/sgemv.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #include <booster/sgemv.h>
16 |
17 | #include <arm_neon.h>
18 | #include <assert.h>
19 | #include <stdio.h>
20 |
21 | template<bool fuseBias, bool fuseRelu>
22 | void fully_connected_inference_direct(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr)
23 | {
24 | #pragma omp parallel for schedule(static) num_threads(num_threads)
25 | for (int i = 0; i < output_size; i++)
26 | {
27 | float sum = 0;
28 | for (int j = 0; j < input_size; j++)
29 | sum += x[j] * y[i * input_size + j];
30 | if (fuseBias)
31 | sum += bias_arr[i];
32 | if (fuseRelu)
33 | sum = (sum > 0.f) ? sum : 0.f;
34 | z[i] = sum;
35 | }
36 | }
37 |
38 | template<bool fuseBias, bool fuseRelu>
39 | void fully_connected_transpose_inference(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr)
40 | {
41 | assert(input_size % 8 == 0);
42 | assert(output_size % 8 == 0);
43 | #pragma omp parallel for schedule(static) num_threads(num_threads)
44 | for (int k = 0; k < output_size / 8; k++)
45 | {
46 | float32x4_t vBias = vld1q_f32(bias_arr + k * 8);
47 | float32x4_t vBias1 = vld1q_f32(bias_arr + k * 8 + 4);
48 | float32x4_t vZero = vdupq_n_f32(0.f);
49 | const float *yPtr = y + k * 8 * input_size;
50 | float32x4_t res = {0.0, 0.0, 0.0, 0.0};
51 | float32x4_t res1 = {0.0, 0.0, 0.0, 0.0};
52 | float32x4_t va, vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7;
53 | for (int i = 0; i < input_size; i += 4)
54 | {
55 | // float32x4_t v1, v2;
56 | va = vld1q_f32(x + i);
57 |
58 | vb0 = vld1q_f32(yPtr);
59 | vb1 = vld1q_f32(yPtr + 4);
60 | vb2 = vld1q_f32(yPtr + 8);
61 | vb3 = vld1q_f32(yPtr + 12);
62 | vb4 = vld1q_f32(yPtr + 16);
63 | vb5 = vld1q_f32(yPtr + 20);
64 | vb6 = vld1q_f32(yPtr + 24);
65 | vb7 = vld1q_f32(yPtr + 28);
66 |
67 | #if __aarch64__
68 | res = vfmaq_laneq_f32(res, vb0, va, 0);
69 | res1 = vfmaq_laneq_f32(res1, vb1, va, 0);
70 | res = vfmaq_laneq_f32(res, vb2, va, 1);
71 | res1 = vfmaq_laneq_f32(res1, vb3, va, 1);
72 | res = vfmaq_laneq_f32(res, vb4, va, 2);
73 | res1 = vfmaq_laneq_f32(res1, vb5, va, 2);
74 | res = vfmaq_laneq_f32(res, vb6, va, 3);
75 | res1 = vfmaq_laneq_f32(res1, vb7, va, 3);
76 | #else
77 | res = vmlaq_f32(res, vb0, vld1q_dup_f32(x + i + 0));
78 | res1 = vmlaq_f32(res1, vb1, vld1q_dup_f32(x + i + 0));
79 | res = vmlaq_f32(res, vb2, vld1q_dup_f32(x + i + 1));
80 | res1 = vmlaq_f32(res1, vb3, vld1q_dup_f32(x + i + 1));
81 | res = vmlaq_f32(res, vb4, vld1q_dup_f32(x + i + 2));
82 | res1 = vmlaq_f32(res1, vb5, vld1q_dup_f32(x + i + 2));
83 | res = vmlaq_f32(res, vb6, vld1q_dup_f32(x + i + 3));
84 | res1 = vmlaq_f32(res1, vb7, vld1q_dup_f32(x + i + 3));
85 | #endif
86 | yPtr += 32;
87 | }
88 |
89 | if (fuseBias)
90 | {
91 | res = vaddq_f32(res, vBias);
92 | res1 = vaddq_f32(res1, vBias1);
93 | }
94 | if (fuseRelu)
95 | {
96 | res = vmaxq_f32(res, vZero);
97 | res1 = vmaxq_f32(res1, vZero);
98 | }
99 | vst1q_f32((float32_t *)(z + 8 * k), res);
100 | vst1q_f32((float32_t *)(z + 8 * k + 4), res1);
101 | }
102 | }
103 |
104 | template void fully_connected_inference_direct<false, false>(const int, const int, const float *, const float *, float *, const int, float*);
105 | template void fully_connected_inference_direct<false, true>(const int, const int, const float *, const float *, float *, const int, float*);
106 | template void fully_connected_inference_direct<true, false>(const int, const int, const float *, const float *, float *, const int, float*);
107 | template void fully_connected_inference_direct<true, true>(const int, const int, const float *, const float *, float *, const int, float*);
108 |
109 | template void fully_connected_transpose_inference<false, false>(const int, const int, const float *, const float *, float *, const int, float*);
110 | template void fully_connected_transpose_inference<false, true>(const int, const int, const float *, const float *, float *, const int, float*);
111 | template void fully_connected_transpose_inference<true, false>(const int, const int, const float *, const float *, float *, const int, float*);
112 | template void fully_connected_transpose_inference<true, true>(const int, const int, const float *, const float *, float *, const int, float*);
113 |
114 | #if 0
115 | void fully_connected_inference_direct_BiasReLU(int input_size, int output_size, float *x, float *y, float *z, float* biasArr, int num_threads)
116 | {
117 | #pragma omp parallel for schedule(static) num_threads(num_threads)
118 | for (int i = 0; i < output_size; i++)
119 | {
120 | float sum = 0.f;
121 | for (int j = 0; j < input_size; j++)
122 | sum += x[j] * y[i * input_size + j];
123 |
124 | sum += biasArr[i];
125 | if (sum < 0.f) sum = 0.f;
126 | z[i] = sum;
127 | }
128 | }
129 |
130 | void fully_connected_transpose_inference_neon8_BiasReLU(int input_size, int output_size, float *x, float *y, float *z, float* biasArr, int num_threads)
131 | {
132 | assert(input_size % 8 == 0);
133 | assert(output_size % 8 == 0);
134 | #pragma omp parallel for schedule(static) num_threads(num_threads)
135 | for (int k = 0; k < output_size / 8; k++)
136 | {
137 | float *yPtr = y + k * 8 * input_size;
138 | const float32x4_t vzero = vdupq_n_f32(0.f);
139 |
140 | float32x4_t res = vld1q_f32(biasArr + k * 8);
141 | float32x4_t res1 = vld1q_f32(biasArr + k * 8 + 4);
142 |
143 | float32x4_t va, vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7;
144 | for (int i = 0; i < input_size; i += 4)
145 | {
146 | va = vld1q_f32(x + i);
147 |
148 | vb0 = vld1q_f32(yPtr);
149 | vb1 = vld1q_f32(yPtr + 4);
150 | vb2 = vld1q_f32(yPtr + 8);
151 | vb3 = vld1q_f32(yPtr + 12);
152 | vb4 = vld1q_f32(yPtr + 16);
153 | vb5 = vld1q_f32(yPtr + 20);
154 | vb6 = vld1q_f32(yPtr + 24);
155 | vb7 = vld1q_f32(yPtr + 28);
156 |
157 | #if __aarch64__
158 | res = vfmaq_laneq_f32(res, vb0, va, 0);
159 | res1 = vfmaq_laneq_f32(res1, vb1, va, 0);
160 | res = vfmaq_laneq_f32(res, vb2, va, 1);
161 | res1 = vfmaq_laneq_f32(res1, vb3, va, 1);
162 | res = vfmaq_laneq_f32(res, vb4, va, 2);
163 | res1 = vfmaq_laneq_f32(res1, vb5, va, 2);
164 | res = vfmaq_laneq_f32(res, vb6, va, 3);
165 | res1 = vfmaq_laneq_f32(res1, vb7, va, 3);
166 | #else
167 | res = vmlaq_f32(res, vb0, vld1q_dup_f32(x + i + 0));
168 | res1 = vmlaq_f32(res1, vb1, vld1q_dup_f32(x + i + 0));
169 | res = vmlaq_f32(res, vb2, vld1q_dup_f32(x + i + 1));
170 | res1 = vmlaq_f32(res1, vb3, vld1q_dup_f32(x + i + 1));
171 | res = vmlaq_f32(res, vb4, vld1q_dup_f32(x + i + 2));
172 | res1 = vmlaq_f32(res1, vb5, vld1q_dup_f32(x + i + 2));
173 | res = vmlaq_f32(res, vb6, vld1q_dup_f32(x + i + 3));
174 | res1 = vmlaq_f32(res1, vb7, vld1q_dup_f32(x + i + 3));
175 | #endif
176 | yPtr += 32;
177 | }
178 |
179 | //res = vaddq_f32(res, vBias);
180 | //res1 = vaddq_f32(res, vBias1);
181 |
182 | res = vmaxq_f32(res, vzero);
183 | res1 = vmaxq_f32(res1, vzero);
184 |
185 | vst1q_f32((float32_t *)(z + 8 * k), res);
186 | vst1q_f32((float32_t *)(z + 8 * k + 4), res1);
187 | }
188 | }
189 | /*
190 | void fully_connected_transpose_inference_neon(int input_size, int output_size, float *x, float *y, float *z)
191 | {
192 | assert(input_size %4==0);
193 | assert(output_size%4==0);
194 | //#pragma omp parallel for num_threads(32) schedule(static)
195 | for(int k=0; k A[n][m]
238 | {
239 | for (int i = 0; i < m; i++) for (int j = 0; j < n; j++)
240 | buffer[j * m + i] = array[i * n + j];
241 | memcpy(array, buffer, m * n * sizeof(float));
242 | }
243 |
--------------------------------------------------------------------------------
/src/booster/avx/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | file(GLOB AVX_SRC ./*.cpp)
2 | file(GLOB AVX_HEADERS ./*.h)
3 |
4 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a -fopenmp")
5 |
6 | if(CMAKE_SYSTEM_NAME MATCHES "Windows")
7 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")
8 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
9 | else()
10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++11 -march=core-avx2 -g -Wall")
11 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -std=c++11 -march=core-avx2 -O3 -Wno-format -Wno-unused-parameter")
12 | endif()
13 |
14 | add_library(booster_avx_obj OBJECT ${AVX_SRC} ${AVX_HEADERS})
15 | #add_library(arm_backend STATIC ${AVX_SRC} ${AVX_HEADERS})
16 |
17 | #target_include_directories(arm_backend PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
18 | #set(AVX_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/feather_backend_avx/")
19 | #install(TARGETS arm_backend DESTINATION ${AVX_INSTALL_DIR}/lib)
20 | #install(FILES ${AVX_HEADERS} DESTINATION "${AVX_INSTALL_DIR}/include")
21 |
--------------------------------------------------------------------------------
/src/booster/avx/caffe_interp.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #include <booster/caffe_interp.h>
16 |
17 | // Bi-linear interpolation
18 | // IN : [channels height1 width1] cropped from a bigger [Height1 Width1] image
19 | // OUT: [channels height2 width2] cropped from a bigger [Height2 Width2] image
20 | template <typename Dtype, bool packed>
21 | void caffe_cpu_interp2(const int channels,
22 | const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
23 | Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2)
24 | {
25 | // CHECK(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0);
26 | // CHECK(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2);
27 | // special case: just copy
28 | if (height1 == height2 && width1 == width2)
29 | {
30 | for (int h2 = 0; h2 < height2; ++h2)
31 | {
32 | const int h1 = h2;
33 | for (int w2 = 0; w2 < width2; ++w2)
34 | {
35 | const int w1 = w2;
36 | if (packed)
37 | {
38 | const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
39 | Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
40 | for (int c = 0; c < channels; ++c)
41 | {
42 | pos2[0] = pos1[0];
43 | pos1++;
44 | pos2++;
45 | }
46 | }
47 | else
48 | {
49 | const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
50 | Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
51 | for (int c = 0; c < channels; ++c)
52 | {
53 | pos2[0] = pos1[0];
54 | pos1 += Width1 * Height1;
55 | pos2 += Width2 * Height2;
56 | }
57 | }
58 | }
59 | }
60 | return;
61 | }
62 | const float rheight = (height2 > 1) ? static_cast<float>(height1) / (height2) : 0.f;
63 | const float rwidth = (width2 > 1) ? static_cast<float>(width1) / (width2) : 0.f;
64 | for (int h2 = 0; h2 < height2; ++h2)
65 | {
66 | const float h1r = rheight * h2;
67 | const int h1 = h1r;
68 | const int h1p = (h1 < height1 - 1) ? 1 : 0;
69 | const Dtype h1lambda = h1r - h1;
70 | const Dtype h0lambda = Dtype(1.) - h1lambda;
71 | for (int w2 = 0; w2 < width2; ++w2)
72 | {
73 | const float w1r = rwidth * w2;
74 | const int w1 = w1r;
75 | const int w1p = (w1 < width1 - 1) ? 1 : 0;
76 | const Dtype w1lambda = w1r - w1;
77 | const Dtype w0lambda = Dtype(1.) - w1lambda;
78 | if (packed)
79 | {
80 | const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
81 | Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
82 | for (int c = 0; c < channels; ++c)
83 | {
84 | pos2[0] =
85 | h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) +
86 | h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]);
87 | pos1++;
88 | pos2++;
89 | }
90 | }
91 | else
92 | {
93 | const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
94 | Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
95 | for (int c = 0; c < channels; ++c)
96 | {
97 | pos2[0] =
98 | h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
99 | h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]);
100 | pos1 += Width1 * Height1;
101 | pos2 += Width2 * Height2;
102 | }
103 | }
104 | }
105 | }
106 | }
107 |
108 | template void caffe_cpu_interp2<float, true>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
109 | template void caffe_cpu_interp2<float, false>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
110 | template void caffe_cpu_interp2<double, true>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
111 | template void caffe_cpu_interp2<double, false>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
112 |
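Note: the instantiations above expose caffe_cpu_interp2 for both the packed (channel-interleaved, HWC) and planar (CHW) layouts. The following is a minimal, hypothetical usage sketch, not part of the repository, that upsamples a planar feature map from 4x4 to 8x8 with no cropping; the declaration is repeated only so the snippet is self-contained.

#include <vector>

template <typename Dtype, bool packed>
void caffe_cpu_interp2(const int channels,
                       const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
                       Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2);

int main()
{
    const int channels = 3, h1 = 4, w1 = 4, h2 = 8, w2 = 8;
    std::vector<float> src(channels * h1 * w1, 1.0f);   // source feature map (CHW)
    std::vector<float> dst(channels * h2 * w2, 0.0f);   // destination buffer (CHW)
    // No cropping: the interpolated region covers the whole image, so the
    // x/y offsets are 0 and the enclosing sizes equal the region sizes.
    caffe_cpu_interp2<float, false>(channels,
                                    src.data(), 0, 0, h1, w1, h1, w1,
                                    dst.data(), 0, 0, h2, w2, h2, w2);
    return 0;
}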
--------------------------------------------------------------------------------
/src/booster/avx/depthwise.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 |
16 | #include <booster/depthwise.h>
17 | #include <booster/helper.h>
18 | #include <assert.h>
19 | #include <stdio.h>
20 | #include <string.h>
21 |
22 | //#include <arm_neon.h>
23 |
24 | #ifdef __APPLE__
25 | #else
26 | #include <omp.h>
27 | #endif
28 |
29 |
30 | template <bool fuseBias, bool fuseRelu>
31 | void globalDwConv(float *output, const float *input, int input_channels, int inw, int inh, float *kernel, int group, int nThreads, float *bias_arr)
32 | {
33 | assert(group > 0 && input_channels % group == 0);
34 | int step = inw * inh;
35 | int block = input_channels / group;
36 | int groupKernelSize = inw * inh * group;
37 |
38 | for (int i = 0; i < input_channels; i++)
39 | {
40 | int k = i / group, u = i % group;
41 | output[i] = 0;
42 | for (int j = 0; j < step; j++)
43 | {
44 | output[i] += input[i * step + j] * kernel[k * groupKernelSize + u * step + j];
45 | }
46 | if (fuseBias)
47 | {
48 | output[i] += bias_arr[i];
49 | }
50 | if (fuseRelu)
51 | {
52 | output[i] = (output[i] > 0.f) ? output[i] : 0.f;
53 | }
54 | }
55 |
56 | /*
57 | int kw = inw, kh = inh;
58 | int width = kw * kh;
59 | int widthAligned = width & 0xFFFFFFFC;
60 | int widthRem = width & 0x03; // int widthRem = width & 0x11;
61 | int height = group;
62 | int heightAligned = group & 0xFFFFFFFC;
63 | int heightRem = height & 0x03; // int heightRem = height & 0x11;
64 | float ext[8];
65 | for(int i = 0; i < heightAligned; i += 4)
66 | {
67 | float32x4_t sum = vdupq_n_f32(0.f);
68 | float* p0 = const_cast<float*>(input) + width * i;
69 | float* p1 = p0 + width;
70 | float* p2 = p1 + width;
71 | float* p3 = p2 + width;
72 | float* k0 = kernel + width * i;
73 | float* k1 = k0 + width;
74 | float* k2 = k1 + width;
75 | float* k3 = k2 + width;
76 |
77 | for(int j = 0; j < widthAligned; j += 4)
78 | {
79 | float32x4_t v0 = vld1q_f32(p0);
80 | p0 += 4;
81 | float32x4_t v1 = vld1q_f32(p1);
82 | p1 += 4;
83 | float32x4_t v2 = vld1q_f32(p2);
84 | p2 += 4;
85 | float32x4_t v3 = vld1q_f32(p3);
86 | p3 += 4;
87 |
88 | float32x4_t r0 = vld1q_f32(k0);
89 | k0 += 4;
90 | float32x4_t r1 = vld1q_f32(k1);
91 | k1 += 4;
92 | float32x4_t r2 = vld1q_f32(k2);
93 | k2 += 4;
94 | float32x4_t r3 = vld1q_f32(k3);
95 | k3 += 4;
96 |
97 | float32x4x2_t row01 = vtrnq_f32(v0, v1);
98 | float32x4x2_t row23 = vtrnq_f32(v2, v3);
99 |
100 | // * row0 = ( x00 x10 x20 x30 )
101 | // * row1 = ( x01 x11 x21 x31 )
102 | // * row2 = ( x02 x12 x22 x32 )
103 | // * row3 = ( x03 x13 x23 x33 )
104 |
105 | v0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
106 | v1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
107 | v2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
108 | v3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
109 | row01 = vtrnq_f32(r0, r1);
110 | row23 = vtrnq_f32(r2, r3);
111 | r0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
112 | r1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
113 | r2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
114 | r3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
115 | #ifdef __aarch64__
116 | sum = vfmaq_f32(sum, v0, r0);
117 | sum = vfmaq_f32(sum, v1, r1);
118 | sum = vfmaq_f32(sum, v2, r2);
119 | sum = vfmaq_f32(sum, v3, r3);
120 | #else
121 | sum = vmlaq_f32(sum, v0, r0);
122 | sum = vmlaq_f32(sum, v1, r1);
123 | sum = vmlaq_f32(sum, v2, r2);
124 | sum = vmlaq_f32(sum, v3, r3);
125 | #endif
126 | }
127 | if(widthRem){
128 | for(int j = 0; j < widthRem; ++j)
129 | {
130 | ext[0] = p0[j];
131 | ext[1] = p1[j];
132 | ext[2] = p2[j];
133 | ext[3] = p3[j];
134 | ext[4] = k0[j];
135 | ext[5] = k1[j];
136 | ext[6] = k2[j];
137 | ext[7] = k3[j];
138 | #ifdef __aarch64__
139 | sum = vfmaq_f32(sum, vld1q_f32(ext + 4), vld1q_f32(ext));
140 | #else
141 | sum = vmlaq_f32(sum, vld1q_f32(ext + 4), vld1q_f32(ext));
142 | #endif
143 | }
144 | }
145 | vst1q_f32(output + i, sum);
146 | }
147 | for(int i = heightAligned; i < height; ++i)
148 | {
149 | float* p = const_cast<float*>(input) + i * width;
150 | float* k = kernel + i * width;
151 | float sum = 0.f;
152 | for(int j = 0; j < width; ++j)
153 | {
154 | sum += p[j] * k[j];
155 | }
156 | output[i] = sum; // output[heightAligned + i] = sum;
157 | }
158 | */
159 | }
160 |
161 | template <bool fuseBias, bool fuseRelu>
162 | void dwConv_template(float *output, float *input, int input_channels, int inw, int inh, int stridew, int strideh, float *kernel, int kw, int kh, int group, int nThreads, float *bias_arr)
163 | {
164 | if ((kw == inw) && (kh == inh))
165 | {
166 | globalDwConv<fuseBias, fuseRelu>(output, input, input_channels, inw, inh, kernel, group, nThreads, bias_arr);
167 | }
168 | else
169 | {
170 | int outw = (inw - kw) / stridew + 1; //for the strided case with odd dimensions, the output dim takes the floor value.
171 | int outh = (inh - kh) / strideh + 1;
172 |
173 | // #pragma omp parallel for num_threads(nThreads) schedule(static)
174 | //printf("dw param %d kernel %d %d stride %d %d input %d %d %d output %d %d\n", group, kh, kw, strideh, stridew, input_channels, inh, inw, outh, outw);
175 | for (int g = 0; g < group; ++g)
176 | {
177 | float *kp = kernel + kw * kh * g;
178 | float *outg = output + g * outw * outh;
179 | float *ing = input + g * inw * inh;
180 | for (int i = 0; i < outh; ++i)
181 | {
182 | for (int j = 0; j < outw; ++j)
183 | {
184 | float *inp = ing + inw * (i * strideh) + (j * stridew); // row offset uses the vertical stride, column offset the horizontal stride
185 | float convSum = 0.f;
186 | for (int m = 0; m < kh; m++)
187 | {
188 | for (int n = 0; n < kw; n++)
189 | {
190 | convSum += inp[m * inw + n] * kp[m * kw + n];
191 | }
192 | }
193 | if (fuseBias)
194 | {
195 | convSum += bias_arr[g];
196 | }
197 | if (fuseRelu)
198 | {
199 | convSum = (convSum > 0.f) ? convSum : 0.f;
200 | }
201 | outg[j] = convSum;
202 | }
203 | outg += outw;
204 | }
205 | }
206 | }
207 | }
208 |
209 | template void dwConv_template<false, false>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
210 | template void dwConv_template<false, true>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
211 | template void dwConv_template<true, false>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
212 | template void dwConv_template<true, true>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
213 |
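A minimal, hypothetical calling sketch for dwConv_template, not part of the repository: it shows how the output dimensions follow (in - k) / stride + 1 and how the two template flags pick the bias/ReLU-fused variant at compile time. The declaration is repeated so the snippet is self-contained; the four explicit instantiations above provide the definitions at link time.

#include <vector>

template <bool fuseBias, bool fuseRelu>
void dwConv_template(float *output, float *input, int input_channels, int inw, int inh, int stridew, int strideh, float *kernel, int kw, int kh, int group, int nThreads, float *bias_arr);

int main()
{
    const int group = 8, inw = 13, inh = 13, kw = 3, kh = 3, stridew = 2, strideh = 2;
    const int outw = (inw - kw) / stridew + 1;   // floor((13 - 3) / 2) + 1 = 6
    const int outh = (inh - kh) / strideh + 1;   // 6
    std::vector<float> input(group * inw * inh, 1.0f);
    std::vector<float> kernel(group * kw * kh, 0.1f);
    std::vector<float> bias(group, 0.5f);
    std::vector<float> output(group * outw * outh, 0.0f);
    // <true, true> fuses both the bias add and the ReLU into the convolution loop.
    dwConv_template<true, true>(output.data(), input.data(), group, inw, inh,
                                stridew, strideh, kernel.data(), kw, kh,
                                group, /*nThreads=*/1, bias.data());
    return 0;
}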
--------------------------------------------------------------------------------
/src/booster/avx/helper.cpp:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | #include <booster/helper.h>
16 |
17 | #include <stdio.h>
18 | #include <stdlib.h>
19 | #include <math.h>
20 |
21 | void print_arr(float* vp)
22 | {
23 | float* ep = (float *) vp;
24 | printf("arr %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
25 | }
26 |
27 | void print_floats(const float* arr, const int len)
28 | {
29 | for (int i = 0; i < len; ++i)
30 | {
31 | printf("%.2f ", arr[i]);
32 | }
33 | printf("\n\n");
34 | }
35 |
36 | void print_floats(const float* arr, const int dimX, const int dimY)
37 | {
38 | for (int i = 0; i < dimX; ++i)
39 | {
40 | for (int j = 0; j < dimY; ++j)
41 | printf("%.2f ", arr[i * dimY + j]);
42 | printf("\n");
43 | }
44 | printf("\n\n");
45 | }
46 |
47 |
48 | void diff(float* arr1, float* arr2, int len)
49 | {
50 | float dif = 0.0f;
51 | for (int i = 0; i < len; ++i)
52 | {
53 | float err = fabsf(arr1[i] - arr2[i]);
54 | if (err > 1.0f)
55 | {
56 | dif += err;
57 | }
58 | }
59 | printf("The difference is %.2f\n", dif);
60 | }
61 | void diff(float* arr1, float* arr2, int M, int N)
62 | {
63 | float dif = 0.0f;
64 | for (int i = 0; i < M; ++i)
65 | {
66 | for (int j = 0; j < N; ++j)
67 | {
68 | float err = fabsf(arr1[i * N + j] - arr2[i * N + j]);
69 | if (err > 1.0f)
70 | {
71 | dif += err;
72 | printf("Error position (%d, %d), value %.2f, %.2f\n", i, j, arr1[i * N + j], arr2[i * N + j]);
73 | }
74 | }
75 | }
76 | printf("The difference is %.2f\n", dif);
77 | }
78 |
79 | #include <time.h>
80 |
81 | #ifdef _WIN32
82 | #define CLOCK_MONOTONIC 0
83 | int clock_gettime(int no_use, struct timespec *spec)
84 | {
85 | return timespec_get(spec, TIME_UTC);
86 | }
87 | #endif
88 |
89 | void Timer::startBench()
90 | {
91 | clock_gettime(CLOCK_MONOTONIC, &start);
92 | }
93 |
94 | double Timer::endBench()
95 | {
96 | clock_gettime(CLOCK_MONOTONIC, &stop);
97 | return (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
98 | }
99 |
100 | void Timer::endBench(const char* comment)
101 | {
102 | clock_gettime(CLOCK_MONOTONIC, &stop);
103 | double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
104 | printf("%s %lfms\n", comment, elapsedTime);
105 | }
106 |
107 | void Timer::endBench(const char* comment, double fold)
108 | {
109 | clock_gettime(CLOCK_MONOTONIC, &stop);
110 | double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
111 | printf("%s %lfms\n", comment, elapsedTime / fold);
112 | }
113 |
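A minimal, hypothetical sketch of the Timer helpers above, not part of the repository; it assumes the Timer class (with its timespec start/stop members) is declared in booster/helper.h.

#include <booster/helper.h>
#include <stdio.h>

void bench_example()
{
    Timer t;
    t.startBench();
    // ... run the kernel under test once ...
    printf("single run: %lf ms\n", t.endBench());

    t.startBench();
    // ... run the kernel under test 10 times ...
    t.endBench("average over 10 runs", 10.0);   // prints elapsed / 10 in ms
}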
--------------------------------------------------------------------------------
/src/booster/include/booster/booster.h:
--------------------------------------------------------------------------------
1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
2 |
3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 |
5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 | //in compliance with the License. You may obtain a copy of the License at
7 | //
8 | //https://opensource.org/licenses/BSD-3-Clause
9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 |
15 | // Booster is the standalone backend of FeatherCNN, in order to facilitate unit testing
16 | // and multi-purpose deployment. I am currently focusing on the fast convolution kernels,
17 | // and will pack other operators as well. This backend library is now supporting
18 | // AVX and Neon, and is going to support OpenCL/GLES in the future.
19 | // Booster won't grow into a huge, abstract lib. I'll keep it simple and stupid.
20 | // -- Haidong Lan @ Tencent AI Platform, 08/30/2018
21 |
22 | #pragma once
23 |
24 | #include
25 | #include
26 | #include