├── CMakeLists.txt
├── README.md
├── build_scripts
    ├── Info.plist
    ├── build_android.sh
    ├── build_ios.sh
    ├── build_linux_aarch64.sh
    ├── build_linux_avx.sh
    ├── build_macos_avx.sh
    ├── ios.toolchain.cmake
    ├── linux-aarch64.toolchain.cmake
    └── pack_ios_framework.sh
└── src
    ├── CMakeLists.txt
    ├── blob.cpp
    ├── blob.h
    ├── booster
        ├── CMakeLists.txt
        ├── arm
        │   ├── CMakeLists.txt
        │   ├── booster.cpp
        │   ├── caffe_interp.cpp
        │   ├── depthwise.cpp
        │   ├── generic_kernels.cpp
        │   ├── helper.cpp
        │   ├── sgeconv.cpp
        │   ├── sgemm.cpp
        │   ├── sgemm_legacy.cpp
        │   ├── sgemm_legacy.h
        │   ├── sgemv.cpp
        │   ├── winograd_kernels.cpp
        │   └── winograd_kernels_F63.cpp
        ├── avx
        │   ├── CMakeLists.txt
        │   ├── booster.cpp
        │   ├── caffe_interp.cpp
        │   ├── depthwise.cpp
        │   ├── generic_kernels.cpp
        │   ├── helper.cpp
        │   ├── sgeconv.cpp
        │   ├── sgemm.cpp
        │   ├── sgemv.cpp
        │   ├── winograd_kernels_F63.cpp
        │   └── winograd_kernels_F63_fused.cpp
        └── include
        │   └── booster
        │       ├── booster.h
        │       ├── caffe_interp.h
        │       ├── depthwise.h
        │       ├── generic_kernels.h
        │       ├── helper.h
        │       ├── power.h
        │       ├── sgeconv.h
        │       ├── sgemm.h
        │       ├── sgemv.h
        │       ├── thpool.h
        │       └── winograd_kernels.h
    ├── layer.cpp
    ├── layer.h
    ├── layer_factory.cpp
    ├── layer_factory.h
    ├── layers
        ├── batchnorm_layer.h
        ├── concat_layer.h
        ├── conv_layer.h
        ├── dropout_layer.h
        ├── eltwise_layer.h
        ├── inner_product_layer.h
        ├── input_layer.h
        ├── pooling_layer.h
        ├── relu_layer.h
        ├── scale_layer.h
        ├── softmax_layer.h
        └── split_layer.h
    ├── mempool.cpp
    ├── mempool.h
    ├── ncnn
        ├── allocator.cpp
        ├── allocator.h
        ├── mat.cpp
        ├── mat.h
        ├── mat_pixel.cpp
        ├── mat_pixel_resize.cpp
        ├── modelbin.cpp
        ├── modelbin.h
        ├── paramdict.cpp
        ├── paramdict.h
        └── platform.h
    ├── net.cpp
    ├── net.h
    ├── rt_param.h
    ├── utils.cpp
    └── utils.h


/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | if(CMAKE_TOOLCHAIN_FILE)
 2 | set(LIBRARY_OUTPUT_PATH_ROOT ${CMAKE_BINARY_DIR} CACHE PATH "root for library output, set this to change where android libs are compiled to")
 3 | # get absolute path, but get_filename_component ABSOLUTE only refer with source dir, so find_file here :(
 4 | get_filename_component(CMAKE_TOOLCHAIN_FILE_NAME ${CMAKE_TOOLCHAIN_FILE} NAME)
 5 | find_file(CMAKE_TOOLCHAIN_FILE ${CMAKE_TOOLCHAIN_FILE_NAME} PATHS ${CMAKE_SOURCE_DIR} NO_DEFAULT_PATH)
 6 | message(STATUS "CMAKE_TOOLCHAIN_FILE = ${CMAKE_TOOLCHAIN_FILE}")
 7 | endif()
 8 | 
 9 | if(NOT DEFINED CMAKE_INSTALL_PREFIX)
10 | set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation Directory")
11 | endif()
12 | message(STATUS "CMAKE_INSTALL_PREFIX = ${CMAKE_INSTALL_PREFIX}")
13 | 
14 | project(feather)
15 | 
16 | cmake_minimum_required(VERSION 2.8)
17 | 
18 | #set(CMAKE_BUILD_TYPE Debug)
19 | #set(CMAKE_BUILD_TYPE Release)
20 | 
21 | option(FEATHER_OPENMP "openmp support" ON)
22 | 
23 | if(FEATHER_OPENMP)
24 | 	if(CMAKE_HOST_APPLE)
25 | 		#if(1)
26 | 		if(IOS)
27 | 		#if(0)
28 | 			message(STATUS "iOS doesn't support OpenMP, use GCD instead.")
29 | 			set(OPENMP_FOUND false)
30 | 			set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fembed-bitcode")
31 | 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fembed-bitcode")
32 | 		else()
33 | 			set(OpenMP_C_FLAGS)
34 | 			set(OpenMP_CXX_FLAGS)
35 | 			set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
36 | 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
37 | 			message(STATUS ${OpenMP_C_FLAGS})
38 | 			message(STATUS ${OpenMP_CXX_FLAGS})
39 | 		endif()
40 | 	else()
41 | 		#find_package(OpenMP)
42 | 		include(FindOpenMP)
43 | 		if(OPENMP_FOUND)
44 | 			#if(OpenMP_CXX_FOUND OR OPENMP_FOUND)
45 | 			#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
46 | 			#set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
47 | 			message(STATUS ${OpenMP_C_FLAGS})
48 | 			message(STATUS ${OpenMP_CXX_FLAGS})
49 | 			set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
50 | 			set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
51 | 		endif()
52 | 		message(STATUS "OpenMP flags ${CMAKE_CXX_FLAGS}")
53 | 	endif()
54 | endif()
55 | 
56 | #add_definitions(-Wall -Wextra -Wno-unused-function)
57 | add_definitions(-fPIC)
58 | add_definitions(-Ofast)
59 | add_definitions(-ffast-math)
60 | # add_definitions(-march=native)
61 | 
62 | # add_definitions(-flto)
63 | 
64 | add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
65 | 
66 | if(ANDROID)
67 |     # disable shared library on android
68 |     #set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
69 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti ")
70 |     add_definitions("-DFEATHER_ANDROID_LOG")
71 |     add_definitions("-D_NDK_MATH_NO_SOFTFP=1")
72 |     if(${ANDROID_ABI} STREQUAL "armeabi-v7a")
73 | 	    add_definitions("-mfpu=neon-vfpv4")
74 |     endif()
75 | elseif(IOS)
76 |     # disable shared library on xcode ios
77 |     add_definitions(-isysroot ${IOS_SDK_PATH} -arch ${IOS_ARCH})
78 |     set_property(GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS FALSE)
79 |     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-rtti ")
80 | endif()
81 | 
82 | ##############################################
83 | 
84 | # add_subdirectory(examples)
85 | # add_subdirectory(benchmark)
86 | add_subdirectory(src)
87 | #if(NOT ANDROID AND NOT IOS)
88 | #	add_subdirectory(tools)
89 | #endif()
90 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <img width="420"  src="https://github.com/Tencent/FeatherCNN/wiki/Images/logo.png"/>
 2 | 
 3 | [![license](http://img.shields.io/badge/license-BSD3-blue.svg?style=flat)](https://github.com/Tencent/FeatherCNN/blob/master/LICENSE)
 4 | [![Release Version](https://img.shields.io/badge/release-0.1.0-red.svg)](https://github.com/Tencent/FeatherCNN/releases)
 5 | [![PRs Welcome](https://img.shields.io/badge/PRs-welcome-brightgreen.svg)](https://github.com/Tencent/FeatherCNN/pulls)
 6 | 
 7 | ## Introduction
 8 | 
 9 | FeatherCNN is a high-performance lightweight CNN inference library, developed by Tencent AI Platform Department. 
10 | FeatureCNN origins from our game AI project for King of Glory (Chinese: 王者荣耀), in which we aim to build a neural model for MOBA game AI and run it on mobile devices. 
11 | FeatherCNN currently targets at ARM CPUs. 
12 | We will extend it to cover other architecutures in the near future.
13 | 
14 | Comparing with other libraries, FeatherCNN has the following features: 
15 | 
16 | - **High Performance** FeatherCNN delivers state-of-the-art inference computing performance on a wide range of devices, including mobile phones (iOS/Android), embedded devices (Linux) as well as ARM-based servers (Linux). 
17 | 
18 | - **Easy Deployment** FeatherCNN packs everything in a single code base to get rid of third-party dependencies. Hence, it facilitates deployment on mobile platforms. 
19 | 
20 | - **Featherweight** The compiled FeatherCNN library is small-sized (hundreds of KBs). 
21 | 
22 | Please kindly open an issue in this repo for bug reports and enhancement suggests. We are grateful to user responses and will actively polish this library.
23 | 
24 | ## Citation
25 | 
26 | FeatherCNN: Fast Inference Computation with TensorGEMM on ARM Architectures (TPDS September 2019, In press, DOI:10.1109/TPDS.2019.2939785)
27 | 
28 | ## Clone hints
29 | The FeatherCNN repository has a heavy development history, please only clone the master branch as follows:
30 | ```
31 | git clone -b master --single-branch https://github.com/tencent/FeatherCNN.git
32 | ```
33 | 
34 | ## Detailed Instructions for iOS/Android/Linux
35 | 
36 | [**Build From Source**](https://github.com/Tencent/FeatherCNN/wikis/Build-From-Source)
37 | 
38 | [**iOS Guide**](https://github.com/Tencent/FeatherCNN/wikis/iOS-Guide)
39 | 
40 | [**Android Guide**](https://github.com/Tencent/FeatherCNN/wiki/Android-Guide)
41 | 
42 | [**Android ADB Guide**](https://github.com/Tencent/FeatherCNN/wiki/Android-ADB-Guide)
43 | 
44 | ## Usage
45 | 
46 | ### Model Format Conversion
47 | 
48 | FeatherCNN accepts Caffemodels. It merges the structure file (.prototxt) and the weight file (.caffemodel) into a single binary model (.feathermodel). The convert tool requires protobuf, but you don't need them for the library. 
49 | 
50 | [**Model Convert Guide**](https://github.com/Tencent/FeatherCNN/wikis/Model-Convert-Guide).
51 | 
52 | ### Runtime Interfaces
53 | 
54 | The basic user interfaces are listed in feather/net.h. Currently we are using raw pointers to reference data.
55 | We may provide more convenient interfaces in the near future.
56 | 
57 | Before inference, FeatherCNN requires two steps to initialize the network.
58 | ```cpp
59 | feather::Net forward_net(num_threads);
60 | forward_net.InitFromPath(FILE_PATH_TO_FEATHERMODEL);
61 | ```
62 | The net can also be initialized with raw buffers and FILE pointers.
63 | We can perform forward computation with raw `float*` buffer consequently. 
64 | ```cpp
65 | forward_net.Forward(PTR_TO_YOUR_INPUT_DATA);
66 | ```
67 | The output can be extracted from the net by the name of blobs. The blob names are kept consistent with caffe prototxt.
68 | ```cpp
69 | forward_net.ExtractBlob(PTR_TO_YOUR_OUTPUT_BUFFER, BLOB_NAME);
70 | ```
71 | BTW, you can also get the blob's data size by calling
72 | ```cpp
73 | size_t data_size = 0;
74 | forward_net.GetBlobDataSize(&data_size, BLOB_NAME);
75 | ```
76 | 
77 | ## Performance Benchmarks
78 | We have tested FeatherCNN on a bunch of devices, see [**this page**](https://github.com/Tencent/FeatherCNN/wikis/Benchmarks) for details.
79 | 
80 | ## User Groups
81 | 
82 | Telegram: https://t.me/FeatherCNN
83 | 
84 | QQ: 728147343
85 | 


--------------------------------------------------------------------------------
/build_scripts/Info.plist:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="UTF-8"?>
 2 | <!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
 3 | <plist version="1.0">
 4 | <dict>
 5 |     <key>CFBundleName</key>
 6 |     <string>feather</string>
 7 |     <key>CFBundleIdentifier</key>
 8 |     <string>com.tencent.feather</string>
 9 |     <key>CFBundleVersion</key>
10 |     <string>0.1</string>
11 |     <key>CFBundleShortVersionString</key>
12 |     <string>0.1</string>
13 |     <key>CFBundleSignature</key>
14 |     <string>????</string>
15 |     <key>CFBundlePackageType</key>
16 |     <string>FMWK</string>
17 | </dict>
18 | </plist>
19 | 


--------------------------------------------------------------------------------
/build_scripts/build_android.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | mkdir -p build-android
 4 | pushd build-android
 5 | mkdir -p arm64-v8a
 6 | pushd arm64-v8a
 7 | cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="arm64-v8a" -DANDROID_PLATFORM=android-21 -DBOOSTER_ARM=1 -DCOMPILE_OPENCL=1 ../.. 
 8 | make -j6
 9 | make install
10 | popd
11 | 
12 | mkdir -p armeabi-v7a
13 | pushd armeabi-v7a
14 | cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi-v7a" -DANDROID_ARM_NEON=ON -DANDROID_PLATFORM=android-16 -DBOOSTER_ARM=1 -DCOMPILE_OPENCL=1 ../..
15 | make -j6
16 | make install
17 | popd
18 | 
19 | #mkdir -p armeabi
20 | #pushd armeabi
21 | #cmake -DCMAKE_TOOLCHAIN_FILE=$NDK_ROOT/build/cmake/android.toolchain.cmake -DANDROID_ABI="armeabi" -DANDROID_PLATFORM=android-16 -DFEATHER_ARM=0 ../..
22 | #make -j4
23 | #make install
24 | #popd
25 | 
26 | mkdir -p feather
27 | mkdir -p booster
28 | pushd feather
29 | mkdir -p include
30 | mkdir -p include/feather
31 | cp -r ../arm64-v8a/install/feather/include/* ./include/feather/
32 | mkdir -p arm64-v8a
33 | cp ../arm64-v8a/install/feather/lib/* ./arm64-v8a/
34 | mkdir -p armeabi-v7a
35 | cp ../armeabi-v7a/install/feather/lib/* ./armeabi-v7a/
36 | #mkdir -p armeabi
37 | #cp ../armeabi/install/feather/lib/* ./armeabi/
38 | #popd
39 | popd
40 | pushd booster
41 | mkdir -p include/booster
42 | cp -r ../arm64-v8a/install/booster/include/* ./include/
43 | mkdir -p arm64-v8a
44 | cp ../arm64-v8a/install/booster/lib/* ./arm64-v8a/
45 | mkdir -p armeabi-v7a
46 | cp ../armeabi-v7a/install/booster/lib/* ./armeabi-v7a/
47 | popd
48 | 


--------------------------------------------------------------------------------
/build_scripts/build_ios.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | echo $(xcrun --sdk iphoneos --show-sdk-path)
 4 | mkdir -p build-ios
 5 | pushd build-ios
 6 | mkdir -p arm64 
 7 | pushd arm64
 8 | cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=arm64 -DBOOSTER_ARM=1 ../..
 9 | make -j4
10 | make install
11 | popd
12 | 
13 | mkdir -p armv7s
14 | pushd armv7s
15 | cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=armv7s -DBOOSTER_ARM=1 ../..
16 | make -j4
17 | make install
18 | popd
19 | 
20 | #mkdir -p armv7
21 | #pushd armv7
22 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphoneos --show-sdk-path) -DIOS_ARCH=armv7 ../..
23 | #make -j4
24 | #make install
25 | #popd
26 | 
27 | #mkdir -p x86_64
28 | #pushd x86_64
29 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphonesimulator --show-sdk-path) -DIOS_ARCH=x86_64 ../..
30 | #make -j4
31 | #make install
32 | #popd
33 | 
34 | #mkdir -p i386
35 | #pushd i386
36 | #cmake -DCMAKE_TOOLCHAIN_FILE=../../build_scripts/ios.toolchain.cmake -DIOS_SDK_PATH=$(xcrun --sdk iphonesimulator --show-sdk-path) -DIOS_ARCH=i386 ../..
37 | #make -j4
38 | #make install
39 | #popd
40 | 
41 | popd
42 | bash ./build_scripts/pack_ios_framework.sh
43 | 


--------------------------------------------------------------------------------
/build_scripts/build_linux_aarch64.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | mkdir -p build-linux-aarch64
 4 | pushd build-linux-aarch64
 5 | #cmake -DCMAKE_TOOLCHAIN_FILE=../build_scripts/linux-aarch64.toolchain.cmake .. -DFEATHER_ARM=true -DCOMPILE_OPENCL=false
 6 | cmake  .. -DBOOSTER_ARM=true -DCOMPILE_OPENCL=false -DCMAKE_BUILD_TYPE=Release
 7 | make -j4
 8 | make install
 9 | popd
10 | 


--------------------------------------------------------------------------------
/build_scripts/build_linux_avx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -p build-linux-avx
4 | pushd build-linux-avx
5 | cmake .. -DBOOSTER_AVX=1 -DCMAKE_BUILD_TYPE=Release
6 | make VERBOSE=1
7 | make install
8 | popd


--------------------------------------------------------------------------------
/build_scripts/build_macos_avx.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | mkdir -p build-macos-avx
4 | pushd build-macos-avx
5 | cmake .. -DBOOSTER_AVX=1 -DCMAKE_BUILD_TYPE=Release
6 | make -j4
7 | make install
8 | popd
9 | 


--------------------------------------------------------------------------------
/build_scripts/ios.toolchain.cmake:
--------------------------------------------------------------------------------
 1 | # For cross-compiling on arm64 Linux using gcc-aarch64-linux-gnu package:
 2 | # - install AArch64 tool chain:
 3 | #   $ sudo apt-get install g++-aarch64-linux-gnu
 4 | # - cross-compiling config
 5 | #   $ cmake -DCMAKE_TOOLCHAIN_FILE=../dynamorio/make/toolchain-arm64.cmake ../dynamorio
 6 | # You may have to set CMAKE_FIND_ROOT_PATH to point to the target enviroment, e.g.
 7 | # by passing -DCMAKE_FIND_ROOT_PATH=/usr/aarch64-linux-gnu on Debian-like systems.
 8 | set(CMAKE_SYSTEM_NAME Darwin)
 9 | set(CMAKE_SYSTEM_VERSION 1)
10 | set(UNIX True)
11 | set(APPLE True)
12 | set(IOS True)
13 | 
14 | # specify the cross compiler as clang.
15 | set(CMAKE_C_COMPILER clang)
16 | set(CMAKE_CXX_COMPILER clang++)
17 | 
18 | # To build the tests, we need to set where the target environment containing
19 | # the required library is. 
20 | set(CMAKE_FIND_ROOT_PATH ${IOS_SDK_PATH})
21 | # search for programs in the build host directories
22 | set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
23 | # for libraries and headers in the target directories
24 | set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
25 | set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
26 | 
27 | 
28 | # Set additional variables.
29 | # If we don't set some of these, CMake will end up using the host version.
30 | # We want the full path, however, so we can pass EXISTS and other checks in
31 | # the our CMake code.
32 | find_program(CC_FULL_PATH clang)
33 | if (NOT CC_FULL_PATH)
34 |   message(FATAL_ERROR "Cross-compiler clang not found")
35 | endif ()
36 | get_filename_component(CC_DIR ${CC_FULL_PATH} PATH)
37 | message(STATUS "CC path is ${CC_FULL_PATH}")
38 | #set(IOS_ARCH arm64)
39 | 
40 | #SET(CMAKE_LINKER       ${CC_DIR}/aarch64-${TARGET_ABI}-ld      CACHE FILEPATH "linker")
41 | #SET(CMAKE_ASM_COMPILER ${CC_DIR}/aarch64-${TARGET_ABI}-as      CACHE FILEPATH "assembler")
42 | #SET(CMAKE_OBJCOPY      ${CC_DIR}/aarch64-${TARGET_ABI}-objcopy CACHE FILEPATH "objcopy")
43 | #SET(CMAKE_STRIP        ${CC_DIR}/aarch64-${TARGET_ABI}-strip   CACHE FILEPATH "strip")
44 | #SET(CMAKE_CPP          ${CC_DIR}/aarch64-${TARGET_ABI}-cpp     CACHE FILEPATH "cpp")
45 | 
46 | set(CMAKE_XCODE_ATTRIBUTE_ENABLE_BITCODE 1)
47 | # Without this, Xcode adds -fembed-bitcode-marker compile options instead of -fembed-bitcode set(CMAKE_C_FLAGS "-fembed-bitcode ${CMAKE_C_FLAGS}")
48 | set(CMAKE_XCODE_ATTRIBUTE_BITCODE_GENERATION_MODE "bitcode") 
49 | set(BITCODE_FLAGS "-fembed-bitcode")
50 | set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${BITCODE_FLAGS}"  CACHE INTERNAL "ios c compiler flags" FORCE) 
51 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${BITCODE_FLAGS}" CACHE INTERNAL "ios c compiler flags" FORCE)
52 | 
53 | 
54 | 
55 | 


--------------------------------------------------------------------------------
/build_scripts/linux-aarch64.toolchain.cmake:
--------------------------------------------------------------------------------
 1 | # **********************************************************
 2 | # Copyright (c) 2014-2017 Google, Inc.    All rights reserved.
 3 | # **********************************************************
 4 | 
 5 | # Redistribution and use in source and binary forms, with or without
 6 | # modification, are permitted provided that the following conditions are met:
 7 | #
 8 | # * Redistributions of source code must retain the above copyright notice,
 9 | #   this list of conditions and the following disclaimer.
10 | #
11 | # * Redistributions in binary form must reproduce the above copyright notice,
12 | #   this list of conditions and the following disclaimer in the documentation
13 | #   and/or other materials provided with the distribution.
14 | #
15 | # * Neither the name of Google, Inc. nor the names of its contributors may be
16 | #   used to endorse or promote products derived from this software without
17 | #   specific prior written permission.
18 | #
19 | # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 | # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 | # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 | # ARE DISCLAIMED. IN NO EVENT SHALL GOOGLE, INC. OR CONTRIBUTORS BE LIABLE
23 | # FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 | # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 | # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 | # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 | # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 | # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 | # DAMAGE.
30 | 
31 | # For cross-compiling on arm64 Linux using gcc-aarch64-linux-gnu package:
32 | # - install AArch64 tool chain:
33 | #   $ sudo apt-get install g++-aarch64-linux-gnu
34 | # - cross-compiling config
35 | #   $ cmake -DCMAKE_TOOLCHAIN_FILE=../dynamorio/make/toolchain-arm64.cmake ../dynamorio
36 | # You may have to set CMAKE_FIND_ROOT_PATH to point to the target enviroment, e.g.
37 | # by passing -DCMAKE_FIND_ROOT_PATH=/usr/aarch64-linux-gnu on Debian-like systems.
38 | set(CMAKE_SYSTEM_NAME Linux)
39 | set(CMAKE_SYSTEM_PROCESSOR aarch64)
40 | set(TARGET_ABI "linux-gnu")
41 | # specify the cross compiler
42 | SET(CMAKE_C_COMPILER   aarch64-${TARGET_ABI}-gcc)
43 | SET(CMAKE_CXX_COMPILER aarch64-${TARGET_ABI}-g++)
44 | 
45 | # To build the tests, we need to set where the target environment containing
46 | # the required library is. On Debian-like systems, this is
47 | # /usr/aarch64-linux-gnu.
48 | SET(CMAKE_FIND_ROOT_PATH "/usr/aarch64-${TARGET_ABI}")
49 | # search for programs in the build host directories
50 | SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
51 | # for libraries and headers in the target directories
52 | SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
53 | SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
54 | 
55 | # Set additional variables.
56 | # If we don't set some of these, CMake will end up using the host version.
57 | # We want the full path, however, so we can pass EXISTS and other checks in
58 | # the our CMake code.
59 | find_program(GCC_FULL_PATH aarch64-${TARGET_ABI}-gcc)
60 | if (NOT GCC_FULL_PATH)
61 |   message(FATAL_ERROR "Cross-compiler aarch64-${TARGET_ABI}-gcc not found")
62 | endif ()
63 | get_filename_component(GCC_DIR ${GCC_FULL_PATH} PATH)
64 | SET(CMAKE_LINKER       ${GCC_DIR}/aarch64-${TARGET_ABI}-ld      CACHE FILEPATH "linker")
65 | SET(CMAKE_ASM_COMPILER ${GCC_DIR}/aarch64-${TARGET_ABI}-as      CACHE FILEPATH "assembler")
66 | SET(CMAKE_OBJCOPY      ${GCC_DIR}/aarch64-${TARGET_ABI}-objcopy CACHE FILEPATH "objcopy")
67 | SET(CMAKE_STRIP        ${GCC_DIR}/aarch64-${TARGET_ABI}-strip   CACHE FILEPATH "strip")
68 | SET(CMAKE_CPP          ${GCC_DIR}/aarch64-${TARGET_ABI}-cpp     CACHE FILEPATH "cpp")
69 | 


--------------------------------------------------------------------------------
/build_scripts/pack_ios_framework.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | NAME=feather
 4 | 
 5 | ##### package android lib
 6 | #ANDROIDPKGNAME=${NAME}-android-lib
 7 | #rm -rf $ANDROIDPKGNAME
 8 | #mkdir -p $ANDROIDPKGNAME
 9 | #mkdir -p $ANDROIDPKGNAME/armeabi-v7a
10 | #mkdir -p $ANDROIDPKGNAME/arm64-v8a
11 | #mkdir -p $ANDROIDPKGNAME/include
12 | #cp build-android-armv7/install/lib/lib${NAME}.a $ANDROIDPKGNAME/armeabi-v7a/
13 | #cp build-android-aarch64/install/lib/lib${NAME}.a $ANDROIDPKGNAME/arm64-v8a/
14 | #cp build-android-aarch64/install/include/* $ANDROIDPKGNAME/include/
15 | #rm -f $ANDROIDPKGNAME.zip
16 | #zip -9 -r $ANDROIDPKGNAME.zip $ANDROIDPKGNAME
17 | 
18 | ##### package ios framework
19 | IOSPKGNAME=./build-ios/${NAME}.framework
20 | rm -rf $IOSPKGNAME
21 | mkdir -p $IOSPKGNAME/Versions/A/Headers
22 | mkdir -p $IOSPKGNAME/Versions/A/Resources
23 | ln -s A $IOSPKGNAME/Versions/Current
24 | ln -s Versions/Current/Headers $IOSPKGNAME/Headers
25 | ln -s Versions/Current/Resources $IOSPKGNAME/Resources
26 | ln -s Versions/Current/${NAME} $IOSPKGNAME/${NAME}
27 | lipo -create \
28 |     build-ios/arm64/install/${NAME}/lib/lib${NAME}.a \
29 |     build-ios/armv7/install/${NAME}/lib/lib${NAME}.a \
30 |     build-ios/x86_64/install/${NAME}/lib/lib${NAME}.a \
31 |     build-ios/i386/install/${NAME}/lib/lib${NAME}.a \
32 |     -o $IOSPKGNAME/Versions/A/${NAME}
33 | #build-ios-sim/install/${NAME}/lib/lib${NAME}.a \
34 | cp -r build-ios/arm64/install/${NAME}/include/* $IOSPKGNAME/Versions/A/Headers/
35 | 
36 | #HEADER_PATH=$IOSPKGNAME/Versions/A/Headers
37 | #HEADERS_TO_EDIT=$HEADER_PATH/feather_simple_generated.h\ $HEADER_PATH/flatbuffers/flatbuffers.h\ $HEADER_PATH/flatbuffers/base.h
38 | #HEADERS_TO_EDIT=$HEADER_PATH/flatbuffers/flatbuffers.h
39 | #HEADERS_TO_EDIT=$HEADER_PATH/flatbuffers/base.h
40 | 
41 | # Fix the relative path for the framework package.
42 | #for FILE in $HEADERS_TO_EDIT
43 | #do
44 | #	echo $FILE
45 | #	sed -i.bak 's/flatbuffers\//feather\/flatbuffers\//' $FILE
46 | #	echo $FILE.bak
47 | #	rm $FILE.bak
48 | #done
49 | 
50 | cp ./build_scripts/Info.plist ${IOSPKGNAME}/Versions/A/Resources/
51 | rm -f $IOSPKGNAME.zip
52 | zip -9 -y -r $IOSPKGNAME.zip $IOSPKGNAME
53 | 


--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB LIB_HEADERS *.h)
 2 | file(GLOB LIB_SRC *.cpp)
 3 | file(GLOB LAYER_HEADERS layers/*.h)
 4 | file(GLOB LAYER_SRC layers/*.cpp)
 5 | file(GLOB FLATBUFFERS_HEADERS flatbuffers/*.h)
 6 | 
 7 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")
 8 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -std=c++11 -Wall")
 9 | 
10 | include_directories("${PROJECT_SOURCE_DIR}/src")
11 | include_directories("booster/include")
12 | 
13 | message(STATUS "Using Booster backend.")
14 | add_subdirectory(./booster)
15 | if(BOOSTER_ARM)
16 | message(STATUS "Compiling for Arm backend.")
17 | add_library(feather STATIC ${LIB_SRC} ${LIB_HEADERS} ${LAYER_SRC} ${LAYER_HEADERS} $<TARGET_OBJECTS:booster_arm_obj>)
18 | elseif(BOOSTER_AVX)
19 | message(STATUS "Compiling for AVX backend.")
20 | add_library(feather STATIC ${LIB_SRC} ${LIB_HEADERS} ${LAYER_SRC} ${LAYER_HEADERS} $<TARGET_OBJECTS:booster_avx_obj>)
21 | else()
22 | error("You have to specify a backend, either FEATHER_ARM or FEATHER_AVX")
23 | endif()
24 | 
25 | set(FEATHER_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/feather")
26 | 
27 | message(Library headers: ${LIB_HEADERS})
28 | list(REMOVE_ITEM LIB_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/feather_simple_generated.h)
29 | message(Library headers: ${LIB_HEADERS})
30 | install(TARGETS feather DESTINATION "${FEATHER_INSTALL_DIR}/lib")
31 | install(FILES ${LIB_HEADERS} DESTINATION "${FEATHER_INSTALL_DIR}/include")
32 | 


--------------------------------------------------------------------------------
/src/blob.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include "blob.h"
 16 | 
 17 | #include <assert.h>
 18 | 
 19 | namespace feather
 20 | {
 21 | template<class Dtype>
 22 | void Blob<Dtype>::Alloc()
 23 | {
 24 |     size_t dim_byte = _num * _channels * _height * _width * sizeof(Dtype);
 25 |     _data = (Dtype*) _mm_malloc(dim_byte, 32);
 26 | }
 27 | template<class Dtype>
 28 | void Blob<Dtype>::Free()
 29 | {
 30 |     if (this->_data)
 31 |     {
 32 |         free(this->_data);
 33 |         this->_data = NULL;
 34 |     }
 35 | }
 36 | 
 37 | template<class Dtype>
 38 | void Blob<Dtype>::ReshapeWithRealloc(const Blob<Dtype> *p_blob)
 39 | {
 40 |     int num      = p_blob->num();
 41 |     int channels = p_blob->channels();
 42 |     int height   = p_blob->height();
 43 |     int width    = p_blob->width();
 44 | 
 45 |     ReshapeWithRealloc(num, channels, height, width);
 46 | }
 47 | 
 48 | template<class Dtype>
 49 | void Blob<Dtype>::ReshapeWithRealloc(int num, int channels, int height, int width)
 50 | {
 51 |     // LOGI("Reallc: (%d %d %d %d) to (%d %d %d %d)", _num, _channels, _height, _width, num, channels, height, width);
 52 |     int elem_size = num * channels * height * width;
 53 |     Realloc(elem_size);
 54 |     this->_num      = num;
 55 |     this->_channels = channels;
 56 |     this->_height   = height;
 57 |     this->_width    = width;
 58 | }
 59 | 
 60 | template<class Dtype>
 61 | void Blob<Dtype>::Realloc(size_t elem_size)
 62 | {
 63 |     if (elem_size > this->data_size())
 64 |     {
 65 |         Free();
 66 |         _data = (Dtype*) _mm_malloc(elem_size * sizeof(Dtype), 32);
 67 |     }
 68 | }
 69 | 
 70 | template<class Dtype>
 71 | int Blob<Dtype>::CopyFromMat(const ncnn::Mat& mat)
 72 | {
 73 |     this->ReshapeWithRealloc(1, mat.c, mat.h, mat.w);
 74 |     this->CopyDataFromMat(mat);
 75 |     return 0;
 76 | }
 77 | 
 78 | template<class Dtype>
 79 | int Blob<Dtype>::CopyDataFromMat(const ncnn::Mat& mat)
 80 | {
 81 |     if (this->data_size() != mat.c * mat.h * mat.w)
 82 |     {
 83 |         LOGE("In Blob %s: Mat and target blob shape mismatch. blob shape (%zu %zu %zu %zu), mat shape (%d %d %d)\n", this->name.c_str(), num(), channels(), height(), width(), mat.c, mat.h, mat.w);
 84 |         return -500; // BAD DATA DIMENSION
 85 |     }
 86 |     Dtype* dst_p = (Dtype *) this->_data;
 87 |     size_t copy_stride = mat.h * mat.w;
 88 |     for (int c = 0; c < mat.c; ++c )
 89 |     {
 90 |         ncnn::Mat channel_mat = mat.channel(c);
 91 |         memcpy(dst_p, channel_mat.data, copy_stride * sizeof(Dtype));
 92 |         dst_p += copy_stride;
 93 |     }
 94 |     return 0;
 95 | }
 96 | 
 97 | template class Blob<float>;
 98 | template class Blob<uint16_t>;
 99 | template class Blob<char>;
100 | };
101 | 


--------------------------------------------------------------------------------
/src/blob.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #pragma once
 16 | 
 17 | #include "utils.h"
 18 | 
 19 | #include "ncnn/mat.h"
 20 | 
 21 | #include <string>
 22 | 
 23 | namespace feather
 24 | {
 25 | template <class Dtype>
 26 | class Blob
 27 | {
 28 |     public:
 29 |         Blob()
 30 |             : name(), _num(0), _channels(0), _height(0), _width(0), _data(NULL)
 31 |         {}
 32 | 
 33 |         explicit Blob(std::string name)
 34 |             : name(name), _num(0), _channels(0), _height(0), _width(0), _data(NULL)
 35 |         {}
 36 | 
 37 |         explicit Blob(const size_t num, const size_t channels, const size_t height, const size_t width)
 38 |             : name(), _data(NULL), _num(num), _channels(channels), _height(height), _width(width)
 39 |         {}
 40 | 
 41 |         explicit Blob(Dtype* data, const size_t num, const size_t channels, const size_t height, const size_t width)
 42 |             : name(), _data(data), _num(num), _channels(channels), _height(height), _width(width)
 43 |         {}
 44 | 
 45 |         ~Blob()
 46 |         {
 47 |             Free();
 48 |         }
 49 | 
 50 |         void Free();
 51 |         void Alloc();
 52 | 
 53 |         void ReshapeWithRealloc(const Blob<Dtype> *p_blob);
 54 |         void ReshapeWithRealloc(int num, int channels, int height, int width);
 55 |         void Realloc(size_t elem_size);
 56 | 
 57 |         int CopyFromMat(const ncnn::Mat &src_mat);
 58 |         int CopyDataFromMat(const ncnn::Mat &src_mat);
 59 | 
 60 |         void CopyData(const Dtype* data)
 61 |         {
 62 |             size_t size = _num * _channels * _height * _width;
 63 |             memcpy(_data, data, sizeof(Dtype) * size);
 64 |         }
 65 |         void CopyShape(const Blob<Dtype>* p_blob)
 66 |         {
 67 |             this->_num = p_blob->num();
 68 |             this->_channels = p_blob->channels();
 69 |             this->_width = p_blob->width();
 70 |             this->_height = p_blob->height();
 71 |         }
 72 |         void Copy(const Blob<Dtype>* p_blob)
 73 |         {
 74 |             this->Free();
 75 |             CopyShape(p_blob);
 76 |             this->Alloc();
 77 |             CopyData(p_blob->data());
 78 |         }
 79 | 
 80 |         Dtype* data() const
 81 |         {
 82 |             return (Dtype*) _data;
 83 |         }
 84 | 
 85 |         size_t data_size() const
 86 |         {
 87 |             return _num * _channels * _height * _width;
 88 |         }
 89 |         size_t num() const
 90 |         {
 91 |             return _num;
 92 |         }
 93 |         size_t channels() const
 94 |         {
 95 |             return _channels;
 96 |         }
 97 |         size_t height() const
 98 |         {
 99 |             return _height;
100 |         }
101 |         size_t width() const
102 |         {
103 |             return _width;
104 |         }
105 |         void PrintBlobInfo() const
106 |         {
107 |             printf("----BlobShape----\n");
108 |             printf("NCHW=(%zu %zu %zu %zu)\n", _num, _channels, _height, _width);
109 |             printf("----------------\n");
110 |         }
111 | 
112 |         std::string name;
113 | 
114 |         void* _data;
115 |         size_t _elemsize;
116 | 
117 |         size_t _num;
118 |         size_t _channels;
119 |         size_t _height;
120 |         size_t _width;
121 | };
122 | };
123 | 


--------------------------------------------------------------------------------
/src/booster/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | cmake_minimum_required(VERSION 2.8.10)
 2 | 
 3 | file(GLOB LIB_HEADERS ./include/booster/*.h)
 4 | 
 5 | if(CMAKE_SYSTEM_NAME MATCHES "Windows")
 6 | 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O2 -std=c++11")
 7 | else()
 8 | 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")
 9 | endif()
10 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -std=c++11 -Wall")
11 | 
12 | include_directories("./include/")
13 | 
14 | if(BOOSTER_AVX)
15 | message(STATUS "Compiling booster AVX version.")
16 | add_subdirectory(./avx)
17 |   if(COMPILE_OPENCL)
18 |     add_subdirectory(./cl)
19 |     add_library(booster STATIC $<TARGET_OBJECTS:booster_avx_obj> $<TARGET_OBJECTS:booster_cl_obj>)
20 |   else()
21 |     add_library(booster STATIC $<TARGET_OBJECTS:booster_avx_obj>)
22 |   endif()
23 | elseif(BOOSTER_ARM)
24 | add_subdirectory(./arm)
25 |   if(COMPILE_OPENCL)
26 |     add_subdirectory(./cl)
27 |     add_library(booster STATIC $<TARGET_OBJECTS:booster_arm_obj> $<TARGET_OBJECTS:booster_cl_obj>)
28 |   else()
29 |     add_library(booster STATIC $<TARGET_OBJECTS:booster_arm_obj>)
30 |     endif()
31 | else()
32 | error("Unkown booster configuration.")
33 | endif()
34 | 
35 | 
36 | set(BOOSTER_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/booster")
37 | 
38 | message(Library headers: ${LIB_HEADERS})
39 | install(TARGETS booster DESTINATION "${BOOSTER_INSTALL_DIR}/lib")
40 | install(FILES ${LIB_HEADERS} DESTINATION "${BOOSTER_INSTALL_DIR}/include/booster")
41 | 


--------------------------------------------------------------------------------
/src/booster/arm/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB ARM_SRC ./*.cpp)
 2 | file(GLOB ARM_HEADERS ../include/*.h)
 3 | list(REMOVE_ITEM ARM_SRC "${CMAKE_CURRENT_SOURCE_DIR}/./sgemm_legacy.cpp")
 4 | 
 5 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a -fopenmp")
 6 | set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -g -Wall")
 7 | set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -O3 -std=c++11 -Wno-format")
 8 | 
 9 | add_library(booster_arm_obj OBJECT ${ARM_SRC} ${ARM_HEADERS})
10 | #add_library(arm_backend STATIC ${ARM_SRC} ${ARM_HEADERS})
11 | 
12 | #target_include_directories(arm_backend PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
13 | #set(ARM_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/booster_arm/")
14 | #install(TARGETS arm_backend DESTINATION ${ARM_INSTALL_DIR}/lib)
15 | #install(FILES ${ARM_HEADERS} DESTINATION "${ARM_INSTALL_DIR}/include")
16 | 


--------------------------------------------------------------------------------
/src/booster/arm/caffe_interp.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include <booster/caffe_interp.h>
 16 | 
 17 | // Bi-linear interpolation
 18 | // IN : [channels height1 width1] cropped from a bigger [Height1 Width1] image
 19 | // OUT: [channels height2 width2] cropped from a bigger [Height2 Width2] image
 20 | template <typename Dtype, bool packed>
 21 | void caffe_cpu_interp2(const int channels,
 22 |                        const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
 23 |                        Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2)
 24 | {
 25 |     // CHECK(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0);
 26 |     // CHECK(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2);
 27 |     // special case: just copy
 28 |     if (height1 == height2 && width1 == width2)
 29 |     {
 30 |         for (int h2 = 0; h2 < height2; ++h2)
 31 |         {
 32 |             const int h1 = h2;
 33 |             for (int w2 = 0; w2 < width2; ++w2)
 34 |             {
 35 |                 const int w1 = w2;
 36 |                 if (packed)
 37 |                 {
 38 |                     const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
 39 |                     Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
 40 |                     for (int c = 0; c < channels; ++c)
 41 |                     {
 42 |                         pos2[0] = pos1[0];
 43 |                         pos1++;
 44 |                         pos2++;
 45 |                     }
 46 |                 }
 47 |                 else
 48 |                 {
 49 |                     const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
 50 |                     Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
 51 |                     for (int c = 0; c < channels; ++c)
 52 |                     {
 53 |                         pos2[0] = pos1[0];
 54 |                         pos1 += Width1 * Height1;
 55 |                         pos2 += Width2 * Height2;
 56 |                     }
 57 |                 }
 58 |             }
 59 |         }
 60 |         return;
 61 |     }
 62 |     const float rheight = (height2 > 1) ? static_cast<float>(height1) / (height2) : 0.f;
 63 |     const float rwidth = (width2 > 1) ? static_cast<float>(width1) / (width2) : 0.f;
 64 |     for (int h2 = 0; h2 < height2; ++h2)
 65 |     {
 66 |         const float h1r = rheight * h2;
 67 |         const int h1 = h1r;
 68 |         const int h1p = (h1 < height1 - 1) ? 1 : 0;
 69 |         const Dtype h1lambda = h1r - h1;
 70 |         const Dtype h0lambda = Dtype(1.) - h1lambda;
 71 |         for (int w2 = 0; w2 < width2; ++w2)
 72 |         {
 73 |             const float w1r = rwidth * w2;
 74 |             const int w1 = w1r;
 75 |             const int w1p = (w1 < width1 - 1) ? 1 : 0;
 76 |             const Dtype w1lambda = w1r - w1;
 77 |             const Dtype w0lambda = Dtype(1.) - w1lambda;
 78 |             if (packed)
 79 |             {
 80 |                 const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
 81 |                 Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
 82 |                 for (int c = 0; c < channels; ++c)
 83 |                 {
 84 |                     pos2[0] =
 85 |                         h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) +
 86 |                         h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]);
 87 |                     pos1++;
 88 |                     pos2++;
 89 |                 }
 90 |             }
 91 |             else
 92 |             {
 93 |                 const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
 94 |                 Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
 95 |                 for (int c = 0; c < channels; ++c)
 96 |                 {
 97 |                     pos2[0] =
 98 |                         h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
 99 |                         h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]);
100 |                     pos1 += Width1 * Height1;
101 |                     pos2 += Width2 * Height2;
102 |                 }
103 |             }
104 |         }
105 |     }
106 | }
107 | 
108 | template void caffe_cpu_interp2<float, false>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
109 | template void caffe_cpu_interp2<float, true>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
110 | template void caffe_cpu_interp2<double, false>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
111 | template void caffe_cpu_interp2<double, true>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
112 | 


--------------------------------------------------------------------------------
/src/booster/arm/helper.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include <booster/helper.h>
 16 | 
 17 | #include <stdio.h>
 18 | #include <stdio.h>
 19 | #include <arm_neon.h>
 20 | #include <math.h>
 21 | 
 22 | void print_vec2(float32x4_t* vp)
 23 | {
 24 |     float* ep = (float *) vp;
 25 |     printf("input %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
 26 | }
 27 | 
 28 | void print_vec3(float32x4_t* vp)
 29 | {
 30 |     float* ep = (float *) vp;
 31 |     printf("transformed %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
 32 | }
 33 | 
 34 | void print_vec(float32x4_t* vp, const char* comment)
 35 | {
 36 |     float* ep = (float *) vp;
 37 |     printf("%s %.3f, %.3f, %.3f, %.3f\n", comment, *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
 38 | }
 39 | 
 40 | 
 41 | void print_vec(float32x4_t* vp)
 42 | {
 43 |     float* ep = (float *) vp;
 44 |     printf("vec %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
 45 | }
 46 | 
 47 | void print_arr(float* vp)
 48 | {
 49 |     float* ep = (float *) vp;
 50 |     printf("arr %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
 51 | }
 52 | 
 53 | void print_floats(const float* arr, const int len)
 54 | {
 55 |     for (int i = 0; i < len; ++i)
 56 |     {
 57 |         printf("%.2f ", arr[i]);
 58 |     }
 59 |     printf("\n\n");
 60 | }
 61 | 
 62 | void print_floats(const float* arr, const int dimX, const int dimY)
 63 | {
 64 |     for (int i = 0; i < dimX; ++i)
 65 |     {
 66 |         for (int j = 0; j < dimY; ++j)
 67 |             printf("%.2f ", arr[i * dimY + j]);
 68 |         printf("\n");
 69 |     }
 70 |     printf("\n\n");
 71 | }
 72 | 
 73 | 
 74 | void diff(float* arr1, float* arr2, int len)
 75 | {
 76 |     float dif = 0.0f;
 77 |     for (int i = 0; i < len; ++i)
 78 |     {
 79 |         float err = fabsf(arr1[i] - arr2[i]);
 80 |         if (err > 1.0f)
 81 |         {
 82 |             dif += err;
 83 |         }
 84 |     }
 85 |     LOGD("The difference is %.2f\n", dif);
 86 | }
 87 | void diff(float* arr1, float* arr2, int M, int N)
 88 | {
 89 |     float dif = 0.0f;
 90 |     for (int i = 0; i < M; ++i)
 91 |     {
 92 |         for (int j = 0; j < N; ++j)
 93 |         {
 94 |             float err = fabsf(arr1[i * N + j] - arr2[i * N + j]);
 95 |             if (err > 1.0f)
 96 |             {
 97 |                 dif += err;
 98 |                 LOGD("Error position (%d, %d), value %.2f, %.2f\n", i, j, arr1[i * N + j], arr2[i * N + j]);
 99 |             }
100 |         }
101 |     }
102 |     LOGD("The difference is %.2f\n", dif);
103 | }
104 | 
105 | #include <time.h>
106 | 
107 | void Timer::startBench()
108 | {
109 |     clock_gettime(CLOCK_MONOTONIC, &start);
110 | }
111 | 
112 | double Timer::endBench()
113 | {
114 |     clock_gettime(CLOCK_MONOTONIC, &stop);
115 |     return (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
116 | }
117 | 
118 | void Timer::endBench(const char* comment)
119 | {
120 |     clock_gettime(CLOCK_MONOTONIC, &stop);
121 |     double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
122 |     LOGD("%s %lfms\n", comment, elapsedTime);
123 | }
124 | 
125 | void Timer::endBench(const char* comment, double fold)
126 | {
127 |     clock_gettime(CLOCK_MONOTONIC, &stop);
128 |     double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
129 |     printf("%s %lfms\n", comment, elapsedTime / fold);
130 | }
131 | 


--------------------------------------------------------------------------------
/src/booster/arm/sgemm_legacy.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #ifndef TCNN_SGEMM_H_
16 | #define TCNN_SGEMM_H_
17 | 
18 | 
19 | void externalPackA(int M, int L, float* packA, float* a, int lda);//External packing for A, requires space allocation for packA
20 | void block_sgemm_external_pack_threading(int M, int N, int L, float *A, float *B, float *C, int num_threads);
21 | 
22 | 
23 | void externalPackA8(int M, int L, float* packA, float* a, int lda);//External packing for A, requires space allocation for packA
24 | void block_sgemm_external_pack_threading_8x8(int M, int N, int L, float *A, float *B, float *C, int num_threads);
25 | 
26 | 
27 | #endif
28 | 


--------------------------------------------------------------------------------
/src/booster/arm/sgemv.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include <booster/sgemv.h>
 16 | 
 17 | #include <assert.h>
 18 | #include <arm_neon.h>
 19 | #include <string.h>
 20 | 
 21 | template <bool fuseBias, bool fuseRelu>
 22 | void fully_connected_inference_direct(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr)
 23 | {
 24 |     #pragma omp parallel for schedule(static) num_threads(num_threads)
 25 |     for (int i = 0; i < output_size; i++)
 26 |     {
 27 |         float sum = 0;
 28 |         for (int j = 0; j < input_size; j++)
 29 |             sum += x[j] * y[i * input_size + j];
 30 |         if (fuseBias)
 31 |             sum += bias_arr[i];
 32 |         if (fuseRelu)
 33 |             sum = (sum > 0.f) ? sum : 0.f;
 34 |         z[i] = sum;
 35 |     }
 36 | }
 37 | 
 38 | template <bool fuseBias, bool fuseRelu>
 39 | void fully_connected_transpose_inference(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr)
 40 | {
 41 |     assert(input_size % 8 == 0);
 42 |     assert(output_size % 8 == 0);
 43 |     #pragma omp parallel for schedule(static) num_threads(num_threads)
 44 |     for (int k = 0; k < output_size / 8; k++)
 45 |     {
 46 |         float32x4_t vBias = vld1q_f32(bias_arr + k * 8);
 47 |         float32x4_t vBias1 = vld1q_f32(bias_arr + k * 8 + 4);
 48 |         float32x4_t vZero = vdupq_n_f32(0.f);
 49 |         const float *yPtr = y + k * 8 * input_size;
 50 |         float32x4_t res = {0.0, 0.0, 0.0, 0.0};
 51 |         float32x4_t res1 = {0.0, 0.0, 0.0, 0.0};
 52 |         float32x4_t va, vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7;
 53 |         for (int i = 0; i < input_size; i += 4)
 54 |         {
 55 |             //          float32x4_t v1, v2;
 56 |             va = vld1q_f32(x + i);
 57 | 
 58 |             vb0 = vld1q_f32(yPtr);
 59 |             vb1 = vld1q_f32(yPtr + 4);
 60 |             vb2 = vld1q_f32(yPtr + 8);
 61 |             vb3 = vld1q_f32(yPtr + 12);
 62 |             vb4 = vld1q_f32(yPtr + 16);
 63 |             vb5 = vld1q_f32(yPtr + 20);
 64 |             vb6 = vld1q_f32(yPtr + 24);
 65 |             vb7 = vld1q_f32(yPtr + 28);
 66 | 
 67 | #if __aarch64__
 68 |             res = vfmaq_laneq_f32(res, vb0, va, 0);
 69 |             res1 = vfmaq_laneq_f32(res1, vb1, va, 0);
 70 |             res = vfmaq_laneq_f32(res, vb2, va, 1);
 71 |             res1 = vfmaq_laneq_f32(res1, vb3, va, 1);
 72 |             res = vfmaq_laneq_f32(res, vb4, va, 2);
 73 |             res1 = vfmaq_laneq_f32(res1, vb5, va, 2);
 74 |             res = vfmaq_laneq_f32(res, vb6, va, 3);
 75 |             res1 = vfmaq_laneq_f32(res1, vb7, va, 3);
 76 | #else
 77 |             res = vmlaq_f32(res, vb0, vld1q_dup_f32(x + i + 0));
 78 |             res1 = vmlaq_f32(res1, vb1, vld1q_dup_f32(x + i + 0));
 79 |             res = vmlaq_f32(res, vb2, vld1q_dup_f32(x + i + 1));
 80 |             res1 = vmlaq_f32(res1, vb3, vld1q_dup_f32(x + i + 1));
 81 |             res = vmlaq_f32(res, vb4, vld1q_dup_f32(x + i + 2));
 82 |             res1 = vmlaq_f32(res1, vb5, vld1q_dup_f32(x + i + 2));
 83 |             res = vmlaq_f32(res, vb6, vld1q_dup_f32(x + i + 3));
 84 |             res1 = vmlaq_f32(res1, vb7, vld1q_dup_f32(x + i + 3));
 85 | #endif
 86 |             yPtr += 32;
 87 |         }
 88 | 
 89 |         if (fuseBias)
 90 |         {
 91 |             res = vaddq_f32(res, vBias);
 92 |             res1 = vaddq_f32(res1, vBias1);
 93 |         }
 94 |         if (fuseRelu)
 95 |         {
 96 |             res = vmaxq_f32(res, vZero);
 97 |             res1 = vmaxq_f32(res1, vZero);
 98 |         }
 99 |         vst1q_f32((float32_t *)(z + 8 * k), res);
100 |         vst1q_f32((float32_t *)(z + 8 * k + 4), res1);
101 |     }
102 | }
103 | 
104 | template void fully_connected_inference_direct<false, false>(const int, const int, const float *, const float *, float *, const int, float*);
105 | template void fully_connected_inference_direct<false,  true>(const int, const int, const float *, const float *, float *, const int, float*);
106 | template void fully_connected_inference_direct<true,  false>(const int, const int, const float *, const float *, float *, const int, float*);
107 | template void fully_connected_inference_direct<true,   true>(const int, const int, const float *, const float *, float *, const int, float*);
108 | 
109 | template void fully_connected_transpose_inference<false, false>(const int, const int, const float *, const float *, float *, const int, float*);
110 | template void fully_connected_transpose_inference<false,  true>(const int, const int, const float *, const float *, float *, const int, float*);
111 | template void fully_connected_transpose_inference<true,  false>(const int, const int, const float *, const float *, float *, const int, float*);
112 | template void fully_connected_transpose_inference<true,   true>(const int, const int, const float *, const float *, float *, const int, float*);
113 | 
114 | #if 0
115 | void fully_connected_inference_direct_BiasReLU(int input_size, int output_size, float *x, float *y, float *z, float* biasArr, int num_threads)
116 | {
117 |     #pragma omp parallel for schedule(static) num_threads(num_threads)
118 |     for (int i = 0; i < output_size; i++)
119 |     {
120 |         float sum = 0.f;
121 |         for (int j = 0; j < input_size; j++)
122 |             sum += x[j] * y[i * input_size + j];
123 | 
124 |         sum += biasArr[i];
125 |         if (sum < 0.f) sum = 0.f;
126 |         z[i] = sum;
127 |     }
128 | }
129 | 
130 | void fully_connected_transpose_inference_neon8_BiasReLU(int input_size, int output_size, float *x, float *y, float *z, float* biasArr, int num_threads)
131 | {
132 |     assert(input_size % 8 == 0);
133 |     assert(output_size % 8 == 0);
134 |     #pragma omp parallel for schedule(static) num_threads(num_threads)
135 |     for (int k = 0; k < output_size / 8; k++)
136 |     {
137 |         float *yPtr = y + k * 8 * input_size;
138 |         const float32x4_t vzero = vdupq_n_f32(0.f);
139 | 
140 |         float32x4_t res  = vld1q_f32(biasArr + k * 8);
141 |         float32x4_t res1 = vld1q_f32(biasArr + k * 8 + 4);
142 | 
143 |         float32x4_t va, vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7;
144 |         for (int i = 0; i < input_size; i += 4)
145 |         {
146 |             va = vld1q_f32(x + i);
147 | 
148 |             vb0 = vld1q_f32(yPtr);
149 |             vb1 = vld1q_f32(yPtr + 4);
150 |             vb2 = vld1q_f32(yPtr + 8);
151 |             vb3 = vld1q_f32(yPtr + 12);
152 |             vb4 = vld1q_f32(yPtr + 16);
153 |             vb5 = vld1q_f32(yPtr + 20);
154 |             vb6 = vld1q_f32(yPtr + 24);
155 |             vb7 = vld1q_f32(yPtr + 28);
156 | 
157 | #if __aarch64__
158 |             res = vfmaq_laneq_f32(res, vb0, va, 0);
159 |             res1 = vfmaq_laneq_f32(res1, vb1, va, 0);
160 |             res = vfmaq_laneq_f32(res, vb2, va, 1);
161 |             res1 = vfmaq_laneq_f32(res1, vb3, va, 1);
162 |             res = vfmaq_laneq_f32(res, vb4, va, 2);
163 |             res1 = vfmaq_laneq_f32(res1, vb5, va, 2);
164 |             res = vfmaq_laneq_f32(res, vb6, va, 3);
165 |             res1 = vfmaq_laneq_f32(res1, vb7, va, 3);
166 | #else
167 |             res = vmlaq_f32(res, vb0, vld1q_dup_f32(x + i + 0));
168 |             res1 = vmlaq_f32(res1, vb1, vld1q_dup_f32(x + i + 0));
169 |             res = vmlaq_f32(res, vb2, vld1q_dup_f32(x + i + 1));
170 |             res1 = vmlaq_f32(res1, vb3, vld1q_dup_f32(x + i + 1));
171 |             res = vmlaq_f32(res, vb4, vld1q_dup_f32(x + i + 2));
172 |             res1 = vmlaq_f32(res1, vb5, vld1q_dup_f32(x + i + 2));
173 |             res = vmlaq_f32(res, vb6, vld1q_dup_f32(x + i + 3));
174 |             res1 = vmlaq_f32(res1, vb7, vld1q_dup_f32(x + i + 3));
175 | #endif
176 |             yPtr += 32;
177 |         }
178 | 
179 |         //res  = vaddq_f32(res, vBias);
180 |         //res1 = vaddq_f32(res, vBias1);
181 | 
182 |         res  = vmaxq_f32(res, vzero);
183 |         res1 = vmaxq_f32(res1, vzero);
184 | 
185 |         vst1q_f32((float32_t *)(z + 8 * k), res);
186 |         vst1q_f32((float32_t *)(z + 8 * k + 4), res1);
187 |     }
188 | }
189 | /*
190 | void fully_connected_transpose_inference_neon(int input_size, int output_size, float *x, float *y, float *z)
191 | {
192 |     assert(input_size %4==0);
193 |     assert(output_size%4==0);
194 | //#pragma omp parallel for num_threads(32) schedule(static)
195 |     for(int k=0; k<output_size/4; k++)
196 |     {
197 |         float *yPtr = y + k*4*input_size;
198 |         float32x4_t res = {0.0,0.0,0.0,0.0};
199 | 
200 |         for(int i=0; i<input_size; i+=4)
201 |         {
202 |             float32x4_t v1, v2;
203 |             v2 = vld1q_f32(x + i);
204 | 
205 | #if __aarch64__
206 |             v1 = vld1q_f32(yPtr);
207 |             res = vfmaq_laneq_f32(res, v1, v2, 0);
208 | 
209 |             v1 = vld1q_f32(yPtr + 4);
210 |             res = vfmaq_laneq_f32(res, v1, v2, 1);
211 | 
212 |             v1 = vld1q_f32(yPtr + 8);
213 |             res = vfmaq_laneq_f32(res, v1, v2, 2);
214 | 
215 |             v1 = vld1q_f32(yPtr + 12);
216 |             res = vfmaq_laneq_f32(res, v1, v2, 3);
217 | #else
218 |             v1 = vld1q_f32(yPtr);
219 |             res = vmlaq_f32(res, v1, vld1q_dup_f32(x + i + 0));
220 | 
221 |             v1 = vld1q_f32(yPtr + 4);
222 |             res = vmlaq_f32(res, v1, vld1q_dup_f32(x + i + 1));
223 | 
224 |             v1 = vld1q_f32(yPtr + 8);
225 |             res = vmlaq_f32(res, v1, vld1q_dup_f32(x + i + 2));
226 | 
227 |             v1 = vld1q_f32(yPtr + 12);
228 |             res = vmlaq_f32(res, v1, vld1q_dup_f32(x + i + 3));
229 | #endif
230 |             yPtr += 16;
231 |         }
232 |         vst1q_f32((float32_t *) (z+4*k), res);
233 |     }
234 | }
235 | */
236 | #endif
237 | void matrixTranspose(float* array, size_t m, size_t n, float *buffer)//  A[m][n] -> A[n][m]
238 | {
239 |     for (int i = 0; i < m; i++)    for (int j = 0; j < n; j++)
240 |             buffer[j * m + i] = array[i * n + j];
241 |     memcpy(array, buffer, m * n * sizeof(float));
242 | }
243 | 


--------------------------------------------------------------------------------
/src/booster/avx/CMakeLists.txt:
--------------------------------------------------------------------------------
 1 | file(GLOB AVX_SRC ./*.cpp)
 2 | file(GLOB AVX_HEADERS ./*.h)
 3 | 
 4 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a -fopenmp")
 5 | 
 6 | if(CMAKE_SYSTEM_NAME MATCHES "Windows")
 7 | 	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++11 -Wall")
 8 | 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -std=c++11 -O2")
 9 | else()
10 | 	set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS} -std=c++11 -march=core-avx2 -g -Wall")
11 | 	set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -std=c++11 -march=core-avx2 -O3 -Wno-format -Wno-unused-parameter")
12 | endif()
13 | 
14 | add_library(booster_avx_obj OBJECT ${AVX_SRC} ${AVX_HEADERS})
15 | #add_library(arm_backend STATIC ${AVX_SRC} ${AVX_HEADERS})
16 | 
17 | #target_include_directories(arm_backend PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
18 | #set(AVX_INSTALL_DIR "${PROJECT_BINARY_DIR}/install/feather_backend_avx/")
19 | #install(TARGETS arm_backend DESTINATION ${AVX_INSTALL_DIR}/lib)
20 | #install(FILES ${AVX_HEADERS} DESTINATION "${AVX_INSTALL_DIR}/include")
21 | 


--------------------------------------------------------------------------------
/src/booster/avx/caffe_interp.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include <booster/caffe_interp.h>
 16 | 
 17 | // Bi-linear interpolation
 18 | // IN : [channels height1 width1] cropped from a bigger [Height1 Width1] image
 19 | // OUT: [channels height2 width2] cropped from a bigger [Height2 Width2] image
 20 | template <typename Dtype, bool packed>
 21 | void caffe_cpu_interp2(const int channels,
 22 |                        const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
 23 |                        Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2)
 24 | {
 25 |     // CHECK(x1 >= 0 && y1 >= 0 && height1 > 0 && width1 > 0 && x2 >= 0 && y2 >= 0 && height2 > 0 && width2 > 0);
 26 |     // CHECK(Width1 >= width1 + x1 && Height1 >= height1 + y1 && Width2 >= width2 + x2 && Height2 >= height2 + y2);
 27 |     // special case: just copy
 28 |     if (height1 == height2 && width1 == width2)
 29 |     {
 30 |         for (int h2 = 0; h2 < height2; ++h2)
 31 |         {
 32 |             const int h1 = h2;
 33 |             for (int w2 = 0; w2 < width2; ++w2)
 34 |             {
 35 |                 const int w1 = w2;
 36 |                 if (packed)
 37 |                 {
 38 |                     const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
 39 |                     Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
 40 |                     for (int c = 0; c < channels; ++c)
 41 |                     {
 42 |                         pos2[0] = pos1[0];
 43 |                         pos1++;
 44 |                         pos2++;
 45 |                     }
 46 |                 }
 47 |                 else
 48 |                 {
 49 |                     const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
 50 |                     Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
 51 |                     for (int c = 0; c < channels; ++c)
 52 |                     {
 53 |                         pos2[0] = pos1[0];
 54 |                         pos1 += Width1 * Height1;
 55 |                         pos2 += Width2 * Height2;
 56 |                     }
 57 |                 }
 58 |             }
 59 |         }
 60 |         return;
 61 |     }
 62 |     const float rheight = (height2 > 1) ? static_cast<float>(height1) / (height2) : 0.f;
 63 |     const float rwidth = (width2 > 1) ? static_cast<float>(width1) / (width2) : 0.f;
 64 |     for (int h2 = 0; h2 < height2; ++h2)
 65 |     {
 66 |         const float h1r = rheight * h2;
 67 |         const int h1 = h1r;
 68 |         const int h1p = (h1 < height1 - 1) ? 1 : 0;
 69 |         const Dtype h1lambda = h1r - h1;
 70 |         const Dtype h0lambda = Dtype(1.) - h1lambda;
 71 |         for (int w2 = 0; w2 < width2; ++w2)
 72 |         {
 73 |             const float w1r = rwidth * w2;
 74 |             const int w1 = w1r;
 75 |             const int w1p = (w1 < width1 - 1) ? 1 : 0;
 76 |             const Dtype w1lambda = w1r - w1;
 77 |             const Dtype w0lambda = Dtype(1.) - w1lambda;
 78 |             if (packed)
 79 |             {
 80 |                 const Dtype *pos1 = &data1[channels * ((y1 + h1) * Width1 + (x1 + w1))];
 81 |                 Dtype *pos2 = &data2[channels * ((y2 + h2) * Width2 + (x2 + w2))];
 82 |                 for (int c = 0; c < channels; ++c)
 83 |                 {
 84 |                     pos2[0] =
 85 |                         h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[channels * w1p]) +
 86 |                         h1lambda * (w0lambda * pos1[channels * h1p * Width1] + w1lambda * pos1[channels * (h1p * Width1 + w1p)]);
 87 |                     pos1++;
 88 |                     pos2++;
 89 |                 }
 90 |             }
 91 |             else
 92 |             {
 93 |                 const Dtype *pos1 = &data1[(y1 + h1) * Width1 + (x1 + w1)];
 94 |                 Dtype *pos2 = &data2[(y2 + h2) * Width2 + (x2 + w2)];
 95 |                 for (int c = 0; c < channels; ++c)
 96 |                 {
 97 |                     pos2[0] =
 98 |                         h0lambda * (w0lambda * pos1[0] + w1lambda * pos1[w1p]) +
 99 |                         h1lambda * (w0lambda * pos1[h1p * Width1] + w1lambda * pos1[h1p * Width1 + w1p]);
100 |                     pos1 += Width1 * Height1;
101 |                     pos2 += Width2 * Height2;
102 |                 }
103 |             }
104 |         }
105 |     }
106 | }
107 | 
108 | template void caffe_cpu_interp2<float, false>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
109 | template void caffe_cpu_interp2<float, true>(const int, const float *, const int, const int, const int, const int, const int, const int, float *, const int, const int, const int, const int, const int, const int);
110 | template void caffe_cpu_interp2<double, false>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
111 | template void caffe_cpu_interp2<double, true>(const int, const double *, const int, const int, const int, const int, const int, const int, double *, const int, const int, const int, const int, const int, const int);
112 | 


--------------------------------------------------------------------------------
/src/booster/avx/depthwise.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | 
 16 | #include <booster/depthwise.h>
 17 | #include <math.h>
 18 | #include <string.h>
 19 | #include <stdlib.h>
 20 | #include <assert.h>
 21 | 
 22 | //#include <arm_neon.h>
 23 | 
 24 | #ifdef __APPLE__
 25 | #else
 26 | #include <omp.h>
 27 | #endif
 28 | 
 29 | 
 30 | template <bool fuseBias, bool fuseRelu>
 31 | void globalDwConv(float *output, const float *input, int input_channels, int inw, int inh, float *kernel, int group, int nThreads, float *bias_arr)
 32 | {
 33 |     assert(group > 0 || input_channels % group == 0);
 34 |     int step = inw * inh;
 35 |     int block = input_channels / group;
 36 |     int groupKernelSize = inw * inh * group;
 37 | 
 38 |     for (int i = 0; i < input_channels; i++)
 39 |     {
 40 |         int k = i / group, u = i % group;
 41 |         output[i] = 0;
 42 |         for (int j = 0; j < step; j++)
 43 |         {
 44 |             output[i] += input[i * step + j] * kernel[k * groupKernelSize + u * step + j];
 45 |         }
 46 |         if (fuseBias)
 47 |         {
 48 |             output[i] += bias_arr[i];
 49 |         }
 50 |         if (fuseRelu)
 51 |         {
 52 |             output[i] = (output[i] > 0.f) ? output[i] : 0.f;
 53 |         }
 54 |     }
 55 | 
 56 |     /*
 57 |     int kw = inw, kh = inh;
 58 |     int width = kw * kh;
 59 |     int widthAligned = width & 0xFFFFFFFC;
 60 |     int widthRem = width & 0x03; // int widthRem = width & 0x11;
 61 |     int height = group;
 62 |     int heightAligned = group & 0xFFFFFFFC;
 63 |     int heightRem = height & 0x03; // int heightRem = height & 0x11;
 64 |     float ext[8];
 65 |     for(int i = 0; i < heightAligned; i += 4)
 66 |     {
 67 |         float32x4_t sum = vdupq_n_f32(0.f);
 68 |         float* p0 = const_cast<float *>(input) + width * i;
 69 |         float* p1 = p0 + width;
 70 |         float* p2 = p1 + width;
 71 |         float* p3 = p2 + width;
 72 |         float* k0 = kernel + width * i;
 73 |         float* k1 = k0 + width;
 74 |         float* k2 = k1 + width;
 75 |         float* k3 = k2 + width;
 76 | 
 77 |         for(int j = 0; j < widthAligned; j += 4)
 78 |         {
 79 |             float32x4_t v0 = vld1q_f32(p0);
 80 |             p0 += 4;
 81 |             float32x4_t v1 = vld1q_f32(p1);
 82 |             p1 += 4;
 83 |             float32x4_t v2 = vld1q_f32(p2);
 84 |             p2 += 4;
 85 |             float32x4_t v3 = vld1q_f32(p3);
 86 |             p3 += 4;
 87 | 
 88 |             float32x4_t r0 = vld1q_f32(k0);
 89 |             k0 += 4;
 90 |             float32x4_t r1 = vld1q_f32(k1);
 91 |             k1 += 4;
 92 |             float32x4_t r2 = vld1q_f32(k2);
 93 |             k2 += 4;
 94 |             float32x4_t r3 = vld1q_f32(k3);
 95 |             k3 += 4;
 96 | 
 97 |             float32x4x2_t row01 = vtrnq_f32(v0, v1);
 98 |             float32x4x2_t row23 = vtrnq_f32(v2, v3);
 99 | 
100 |     //           * row0 = ( x00 x10 x20 x30 )
101 |     //           * row1 = ( x01 x11 x21 x31 )
102 |     //           * row2 = ( x02 x12 x22 x32 )
103 |     //           * row3 = ( x03 x13 x23 x33 )
104 | 
105 |             v0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
106 |             v1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
107 |             v2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
108 |             v3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
109 |             row01 = vtrnq_f32(r0, r1);
110 |             row23 = vtrnq_f32(r2, r3);
111 |             r0 = vcombine_f32(vget_low_f32(row01.val[0]), vget_low_f32(row23.val[0]));
112 |             r1 = vcombine_f32(vget_low_f32(row01.val[1]), vget_low_f32(row23.val[1]));
113 |             r2 = vcombine_f32(vget_high_f32(row01.val[0]), vget_high_f32(row23.val[0]));
114 |             r3 = vcombine_f32(vget_high_f32(row01.val[1]), vget_high_f32(row23.val[1]));
115 |     #ifdef __aarch64__
116 |             sum = vfmaq_f32(sum, v0, r0);
117 |             sum = vfmaq_f32(sum, v1, r1);
118 |             sum = vfmaq_f32(sum, v2, r2);
119 |             sum = vfmaq_f32(sum, v3, r3);
120 |     #else
121 |             sum = vmlaq_f32(sum, v0, r0);
122 |             sum = vmlaq_f32(sum, v1, r1);
123 |             sum = vmlaq_f32(sum, v2, r2);
124 |             sum = vmlaq_f32(sum, v3, r3);
125 |     #endif
126 |         }
127 |         if(widthRem){
128 |             for(int j = 0; j < widthRem; ++j)
129 |             {
130 |                 ext[0] = p0[j];
131 |                 ext[1] = p1[j];
132 |                 ext[2] = p2[j];
133 |                 ext[3] = p3[j];
134 |                 ext[4] = k0[j];
135 |                 ext[5] = k1[j];
136 |                 ext[6] = k2[j];
137 |                 ext[7] = k3[j];
138 |     #ifdef __aarch64__
139 |                 sum = vfmaq_f32(sum, vld1q_f32(ext + 4), vld1q_f32(ext));
140 |     #else
141 |                 sum = vmlaq_f32(sum, vld1q_f32(ext + 4), vld1q_f32(ext));
142 |     #endif
143 |             }
144 |         }
145 |         vst1q_f32(output + i, sum);
146 |     }
147 |     for(int i = heightAligned; i < height; ++i)
148 |     {
149 |         float* p = const_cast<float *>(input) + i * width;
150 |         float* k = kernel + i * width;
151 |         float sum = 0.f;
152 |         for(int j = 0; j < width; ++j)
153 |         {
154 |             sum += p[j] * k[j];
155 |         }
156 |         output[i] = sum; // output[heightAligned + i] = sum;
157 |     }
158 |     */
159 | }
160 | 
161 | template <bool fuseBias, bool fuseRelu>
162 | void dwConv_template(float *output, float *input, int input_channels, int inw, int inh, int stridew, int strideh, float *kernel, int kw, int kh, int group, int nThreads, float *bias_arr)
163 | {
164 |     if ((kw == inw) && (kh == inh))
165 |     {
166 |         globalDwConv<fuseBias, fuseRelu>(output, input, input_channels, inw, inh, kernel, group, nThreads, bias_arr);
167 |     }
168 |     else
169 |     {
170 |         int outw = (inw - kw) / stridew + 1; //for strided case in odd dimensions, should take the floor value as output dim.
171 |         int outh = (inh - kh) / strideh + 1;
172 | 
173 | // #pragma omp parallel for num_threads(nThreads) schedule(static)
174 |         //printf("dw param %d kernel %d %d stride %d %d input %d %d %d output %d %d\n", group, kh, kw, strideh, stridew, input_channels, inh, inw, outh, outw);
175 |         for (int g = 0; g < group; ++g)
176 |         {
177 |             float *kp = kernel + kw * kh * g;
178 |             float *outg = output + g * outw * outh;
179 |             float *ing = input + g * inw * inh;
180 |             for (int i = 0; i < outh; ++i)
181 |             {
182 |                 for (int j = 0; j < outw; ++j)
183 |                 {
184 |                     float *inp = ing + inw * (i * stridew) + (j * strideh);
185 |                     float convSum = 0.f;
186 |                     for (int m = 0; m < kh; m++)
187 |                     {
188 |                         for (int n = 0; n < kw; n++)
189 |                         {
190 |                             convSum += inp[m * inw + n] * kp[m * kw + n];
191 |                         }
192 |                     }
193 |                     if (fuseBias)
194 |                     {
195 |                         convSum += bias_arr[g];
196 |                     }
197 |                     if (fuseRelu)
198 |                     {
199 |                         convSum = (convSum > 0.f) ? convSum : 0.f;
200 |                     }
201 |                     outg[j] = convSum;
202 |                 }
203 |                 outg += outw;
204 |             }
205 |         }
206 |     }
207 | }
208 | 
209 | template void dwConv_template<false, false>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
210 | template void dwConv_template<false,  true>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
211 | template void dwConv_template<true,  false>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
212 | template void dwConv_template<true,   true>(float *, float *, int, int, int, int, int, float *, int, int, int, int, float *);
213 | 


--------------------------------------------------------------------------------
/src/booster/avx/helper.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include <booster/helper.h>
 16 | 
 17 | #include <stdio.h>
 18 | #include <stdio.h>
 19 | #include <math.h>
 20 | 
 21 | void print_arr(float* vp)
 22 | {
 23 |     float* ep = (float *) vp;
 24 |     printf("arr %.1f, %.1f, %.1f, %.1f\n", *(ep), *(ep + 1), *(ep + 2), *(ep + 3));
 25 | }
 26 | 
 27 | void print_floats(const float* arr, const int len)
 28 | {
 29 |     for (int i = 0; i < len; ++i)
 30 |     {
 31 |         printf("%.2f ", arr[i]);
 32 |     }
 33 |     printf("\n\n");
 34 | }
 35 | 
 36 | void print_floats(const float* arr, const int dimX, const int dimY)
 37 | {
 38 |     for (int i = 0; i < dimX; ++i)
 39 |     {
 40 |         for (int j = 0; j < dimY; ++j)
 41 |             printf("%.2f ", arr[i * dimY + j]);
 42 |         printf("\n");
 43 |     }
 44 |     printf("\n\n");
 45 | }
 46 | 
 47 | 
 48 | void diff(float* arr1, float* arr2, int len)
 49 | {
 50 |     float dif = 0.0f;
 51 |     for (int i = 0; i < len; ++i)
 52 |     {
 53 |         float err = fabsf(arr1[i] - arr2[i]);
 54 |         if (err > 1.0f)
 55 |         {
 56 |             dif += err;
 57 |         }
 58 |     }
 59 |     printf("The difference is %.2f\n", dif);
 60 | }
 61 | void diff(float* arr1, float* arr2, int M, int N)
 62 | {
 63 |     float dif = 0.0f;
 64 |     for (int i = 0; i < M; ++i)
 65 |     {
 66 |         for (int j = 0; j < N; ++j)
 67 |         {
 68 |             float err = fabsf(arr1[i * N + j] - arr2[i * N + j]);
 69 |             if (err > 1.0f)
 70 |             {
 71 |                 dif += err;
 72 |                 printf("Error position (%d, %d), value %.2f, %.2f\n", i, j, arr1[i * N + j], arr2[i * N + j]);
 73 |             }
 74 |         }
 75 |     }
 76 |     printf("The difference is %.2f\n", dif);
 77 | }
 78 | 
 79 | #include <time.h>
 80 | 
 81 | #ifdef _WIN32
 82 | #define CLOCK_MONOTONIC 0
 83 | int clock_gettime(int no_use, struct timespec *spec)
 84 | {
 85 |     return timespec_get(spec, TIME_UTC);
 86 | }
 87 | #endif
 88 | 
 89 | void Timer::startBench()
 90 | {
 91 |     clock_gettime(CLOCK_MONOTONIC, &start);
 92 | }
 93 | 
 94 | double Timer::endBench()
 95 | {
 96 |     clock_gettime(CLOCK_MONOTONIC, &stop);
 97 |     return (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
 98 | }
 99 | 
100 | void Timer::endBench(const char* comment)
101 | {
102 |     clock_gettime(CLOCK_MONOTONIC, &stop);
103 |     double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
104 |     printf("%s %lfms\n", comment, elapsedTime);
105 | }
106 | 
107 | void Timer::endBench(const char* comment, double fold)
108 | {
109 |     clock_gettime(CLOCK_MONOTONIC, &stop);
110 |     double elapsedTime = (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_nsec - start.tv_nsec) / 1000000.0;
111 |     printf("%s %lfms\n", comment, elapsedTime / fold);
112 | }
113 | 


--------------------------------------------------------------------------------
/src/booster/include/booster/booster.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | // Booster is the standalone backend of FeatherCNN, in order to facilitate unit testing
 16 | // and multi-purpose deployment. I am currently focusing on the fast convolution kernels,
 17 | // and will pack other operators as well. This backend library is now supporting
 18 | // AVX and Neon, and is going to supoort OpenCL/GLES in the future.
 19 | // Booster won't grow up into a hugh and abstract lib. I'll keep it simple and stupid.
 20 | // -- Haidong Lan @ Tencent AI Platform, 08/30/2018
 21 | 
 22 | #pragma once
 23 | 
 24 | #include <stdlib.h>
 25 | #include <stdio.h>
 26 | #include <map>
 27 | #include <string>
 28 | #include <vector>
 29 | #ifdef FEATHER_OPENCL
 30 | #include "CLHPP/clhpp_runtime.hpp"
 31 | #endif
 32 | 
 33 | #ifdef _WIN32
 34 | #define FEATHER_MEN_ALIGN(alignment) __declspec(align(alignment))
 35 | #else
 36 | #define FEATHER_MEN_ALIGN(alignment) __attribute__((aligned(alignment)))
 37 | #endif
 38 | 
 39 | namespace booster
 40 | {
 41 | 
 42 | enum ConvAlgo
 43 | {
 44 |     NAIVE,
 45 |     IM2COL,
 46 |     SGECONV,
 47 |     DEPTHWISE,
 48 |     WINOGRADF63,
 49 |     WINOGRADF63FUSED,
 50 |     WINOGRADF23,
 51 | };
 52 | 
 53 | enum ActivationType
 54 | {
 55 |     None,
 56 |     ReLU,
 57 | };
 58 | 
 59 | struct ConvParam
 60 | {
 61 |     int output_channels;
 62 |     int input_channels;
 63 |     int input_h;
 64 |     int input_w;
 65 |     int kernel_h;
 66 |     int kernel_w;
 67 |     int output_h;
 68 |     int output_w;
 69 |     int stride_h;
 70 |     int stride_w;
 71 |     int pad_left;
 72 |     int pad_bottom;
 73 |     int pad_right;
 74 |     int pad_top;
 75 |     int group;
 76 |     bool bias_term;
 77 |     ActivationType activation;
 78 | #ifdef FEATHER_OPENCL
 79 |     int channel_block_size;
 80 |     int padded_input_channels;
 81 |     int padded_output_channels;
 82 |     int height_block_size;
 83 |     int width_block_size;
 84 |     int padded_input_h;
 85 |     int padded_input_w;
 86 |     int padded_output_h;
 87 |     int padded_output_w;
 88 |     bool padding_needed;
 89 | 
 90 |     void AssignCLPaddedDim()
 91 |     {
 92 |         channel_block_size = 4;
 93 |         if (padded_input_channels % 8 == 0 && padded_output_channels % 8 == 0)
 94 |         {
 95 |             channel_block_size = 8;
 96 |         }
 97 | 
 98 |         height_block_size = 1;
 99 |         width_block_size = 1;
100 |         padding_needed = false;
101 |         if (input_w >= 12)
102 |         {
103 |             width_block_size = 2;
104 |             padded_output_h = (output_h + height_block_size - 1) / height_block_size * height_block_size;
105 |             padded_output_w = (output_w + width_block_size - 1) / width_block_size * width_block_size;
106 |             padded_input_h = (padded_output_h - 1) * stride_h + kernel_h;
107 |             padded_input_w = (padded_output_w - 1) * stride_w + kernel_w;
108 |             padding_needed = (padded_input_h != input_h) || (padded_input_w != input_w);
109 |         }
110 |     }
111 | #endif
112 | 
113 |     void AssignOutputDim()
114 |     {
115 |         //Validate default values
116 |         if (group == 0) group = 1;
117 |         if (stride_h == 0) stride_h = 1;
118 |         if (stride_w == 0) stride_w = 1;
119 |         output_h = (input_h + pad_top + pad_bottom - kernel_h) / stride_h + 1;
120 |         output_w = (input_w + pad_left + pad_right - kernel_w) / stride_w + 1;
121 |         if (group == input_channels)
122 |         {
123 |             output_channels = input_channels;
124 |         }
125 |     }
126 |     void AssignPaddedDim()
127 |     {
128 |         input_h = input_h + pad_top + pad_bottom;
129 |         input_w = input_w + pad_left + pad_right;
130 |         pad_left = 0;
131 |         pad_bottom = 0;
132 |         pad_right = 0;
133 |         pad_top = 0;
134 |     }
135 |     void LogParams(const char* layer_name)
136 |     {
137 |         printf("-----Layer %s ConvParam----\n", layer_name);
138 |         printf("Input CxHxW=(%d, %d, %d)\n", input_channels, input_h, input_w);
139 |         printf("Output CxHxW=(%d, %d, %d)\n", output_channels, output_h, output_w);
140 |         printf("Group = %d\n", group);
141 |         printf("Kernel HxW=(%d, %d)\n", kernel_h, kernel_w);
142 |         printf("Stride HxW=(%d, %d)\n", stride_h, stride_w);
143 |         printf("Paddings (%d %d %d %d)\n", pad_left, pad_bottom, pad_right, pad_top);
144 |     }
145 |     double GetFLOPS()
146 |     {
147 |         return 2.0 * this->output_channels * this->input_channels * this->output_h * this->output_w * this->kernel_h * this->kernel_w / this->group;
148 |     }
149 | };
150 | 
151 | typedef int (*GET_BUFFER_SIZE_FUNC)(ConvParam *param, int* buffer_size, int* processed_kernel_size);
152 | typedef int (*INIT_FUNC)(ConvParam *param, float* processed_kernel, float* kernel);
153 | typedef int (*FORWARD_FUNC)(ConvParam *param, float* output, float* input, float* kernel, float* buffer, float* bias_arr, int num_threads);
154 | 
155 | //ConvBooster doesn't allocate any memory.
156 | class ConvBooster
157 | {
158 |     public:
159 |         ConvBooster();
160 |         ~ConvBooster() {}
161 |         int SelectAlgo(ConvParam* param);
162 |         int ForceSelectAlgo(ConvAlgo algo);
163 |         int SetFuncs();
164 |         GET_BUFFER_SIZE_FUNC GetBufferSize;
165 |         INIT_FUNC Init;
166 |         FORWARD_FUNC Forward;
167 | 
168 |     private:
169 |         ConvAlgo algo;
170 | };
171 | 
172 | #ifdef FEATHER_OPENCL
173 | 
174 | struct CLBuffers
175 | {
176 |     cl::Buffer* input_mem;
177 |     cl::Buffer* padded_input_mem;
178 |     cl::Buffer* output_mem;
179 |     cl::Buffer* weight_mem;
180 |     cl::Buffer* bias_mem;
181 |     cl::Buffer* input_trans_mem;
182 |     cl::Buffer* out_trans_mem;
183 | };
184 | 
185 | 
186 | template <class Dtype>
187 | class ConvBoosterCL
188 | {
189 |     public:
190 | 
191 |         typedef int (*INIT_FUNC_CL)(const std::vector<std::string>& program_names,
192 |                                     const std::vector<std::string>& kernel_names,
193 |                                     std::map<std::string, clhpp_feather::CLKernelInfo>& cl_kernel_info_map);
194 |         typedef int (*FORWARD_FUNC_CL)(cl::CommandQueue cmd_q,
195 |                                        std::vector<std::string> kernel_names,
196 |                                        std::map<std::string, clhpp_feather::CLKernelInfo>& cl_kernel_info_map,
197 |                                        const ConvParam& param,
198 |                                        clhpp_feather::OpenCLRuntime* cl_runtime,
199 |                                        std::string layer_name);
200 |         typedef int (*WEIGHT_REFORM_FUNC_CL)(const ConvParam& param,
201 |                                              size_t n_grp_size,
202 |                                              size_t c_grp_size,
203 |                                              const Dtype* weight,
204 |                                              Dtype* weight_reformed);
205 |         typedef int (*SET_CONV_KERNEL_PARAMS_CL)(const ConvParam& param,
206 |                 const CLBuffers& buffers,
207 |                 const std::vector<std::string>& kernel_names,
208 |                 std::map<std::string, clhpp_feather::CLKernelInfo>& cl_kernel_info_map,
209 |                 clhpp_feather::OpenCLRuntime* cl_runtime,
210 |                 bool is_reshape);
211 |         typedef int (*SET_CONV_WORK_SIZE_CL)(const ConvParam& param,
212 |                                              std::map<std::string, clhpp_feather::CLKernelInfo>& cl_kernel_info_map,
213 |                                              const std::vector<std::string>& kernel_names,
214 |                                              clhpp_feather::OpenCLRuntime* cl_runtime);
215 |         typedef int (*SET_BUILD_OPTS_CL)(const ConvParam& param,
216 |                                          bool is_fp16,
217 |                                          const std::vector<std::string>& kernel_names,
218 |                                          std::map<std::string, clhpp_feather::CLKernelInfo>& cl_kernel_info_map);
219 | 
220 |         ConvBoosterCL();
221 |         ~ConvBoosterCL() {}
222 |         int SelectAlgo(ConvParam* param);
223 |         int ForceSelectAlgo(ConvAlgo algo);
224 |         int SetFuncs();
225 |         size_t GetWeightSize();
226 |         const std::vector<std::string>& GetProgramNames();
227 |         const std::vector<std::string>& GetKernelNames();
228 |         INIT_FUNC_CL Init;
229 |         FORWARD_FUNC_CL Forward;
230 |         WEIGHT_REFORM_FUNC_CL WeightReform;
231 |         SET_CONV_KERNEL_PARAMS_CL SetConvKernelParams;
232 |         SET_CONV_WORK_SIZE_CL SetConvWorkSize;
233 |         SET_BUILD_OPTS_CL SetBuildOpts;
234 |     private:
235 |         ConvAlgo algo;
236 |         size_t weight_size;
237 |         std::vector<std::string> program_names;
238 |         std::vector<std::string> kernel_names;
239 | 
240 | };
241 | #endif
242 | 
243 | };
244 | 


--------------------------------------------------------------------------------
/src/booster/include/booster/caffe_interp.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | template <typename Dtype, bool packed>
18 | void caffe_cpu_interp2(const int channels,
19 |                        const Dtype *data1, const int x1, const int y1, const int height1, const int width1, const int Height1, const int Width1,
20 |                        Dtype *data2, const int x2, const int y2, const int height2, const int width2, const int Height2, const int Width2);


--------------------------------------------------------------------------------
/src/booster/include/booster/depthwise.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include <stdio.h>
18 | 
19 | template<bool fuseBias, bool fuseRelu>
20 | void dwConv_template(float* output, float* input, int input_channels, int inw, int inh, int stridew, int strideh, float* kernel, int kw, int kh, int group, int nThreads, float* bias_arr);


--------------------------------------------------------------------------------
/src/booster/include/booster/generic_kernels.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include <stdio.h>
18 | #include <booster/booster.h>
19 | 
20 | namespace booster
21 | {
22 | void pad_input(float* padded, const float* input, const size_t input_channels, const size_t input_width, const size_t input_height, const size_t padding_left, const size_t padding_top, const size_t padding_right, const size_t padding_bottom);
23 | void im2col(ConvParam *conv_param, float *img_buffer, float *input);
24 | void naive_sgemm(int M, int N, int L, float *A, float *B, float *C);
25 | 
26 | template<bool fuse_relu>
27 | void add_relu(float* dst, const float* A, const float* B, const size_t len, const size_t num_threads);
28 | 
29 | template<bool has_bias>
30 | void scale(const size_t channels, const size_t stride, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads);
31 | 
32 | template<bool has_bias, bool has_scale, bool has_relu>
33 | void batchnorm(const size_t channels, const size_t stride, const float* alpha, const float* beta, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads);
34 | 
35 | //void dwConv(float* output, const float* input, const int inw, const int inh, const int stridew, const int strideh, const float* kernel, const int kw, const int kh, const int group, const int nThreads);
36 | void softmax(float* input, float n);
37 | bool pooling(float *A, float *B, const char *type, int input_channels, size_t kernelw, size_t kernelh, size_t outputw, size_t outputh, int output_channels);
38 | 
39 | void naive_gemm(int M, int N, int L, float *A, float *B, float *C);
40 | 
41 | void relu(float* arr, int len);
42 | void biasRelu(float* arr, int len, float bias);
43 | void reluVec(float* arr, int len);
44 | void biasVec(float* arr, int len, float bias);
45 | void biasReluVec(float* arr, int len, float bias);
46 | void reluVecOpenmp(float* arr, int len, int nThreads);
47 | void biasVecOpenmp(float* arr, int len, float bias, int nThreads);
48 | void biasReluVecOpenmp(float* arr, int len, float bias, int nThreads);
49 | };


--------------------------------------------------------------------------------
/src/booster/include/booster/helper.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | // #include "common.h"
17 | 
18 | #if 0
19 | #include <android/log.h>
20 | #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,  "FeatherLib", __VA_ARGS__)
21 | #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "FeatherLib", __VA_ARGS__)
22 | #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "FeatherLib", __VA_ARGS__)
23 | #else
24 | #include <stdio.h>
25 | #define LOGI(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n");
26 | #define LOGD(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n");
27 | #define LOGE(...) fprintf(stderr, __VA_ARGS__);fprintf(stderr,"\n");
28 | #endif
29 | 
30 | void print_floats(const float* arr, const int len);
31 | void print_floats(const float* arr, const int dimX, const int dimY);
32 | void diff(float* arr1, float* arr2, int len);
33 | void diff(float* arr1, float* arr2, int M, int N);
34 | 
35 | #if __ARM_NEON
36 | #include <arm_neon.h>
37 | 
38 | void print_vec2(float32x4_t* vp);
39 | void print_vec3(float32x4_t* vp);
40 | void print_vec(float32x4_t* vp, const char* comment);
41 | void print_vec(float32x4_t* vp);
42 | void print_arr(float* vp);
43 | 
44 | //Thanks nihui for this code snippet!
45 | #ifndef __aarch64__
46 | 
47 | #ifndef __APPLE__
48 | //static inline float32x4_t vfmaq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b)
49 | //{
50 | //  return vmlaq_f32(_s, _a, _b);
51 | //}
52 | #endif
53 | static inline float32x4_t vfmaq_laneq_f32(float32x4_t _s, float32x4_t _a, float32x4_t _b, int lane)
54 | {
55 |     if (lane == 0)      return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 0);
56 |     else if (lane == 1) return vmlaq_lane_f32(_s, _a, vget_low_f32(_b), 1);
57 |     else if (lane == 2) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 0);
58 |     else if (lane == 3) return vmlaq_lane_f32(_s, _a, vget_high_f32(_b), 1);
59 |     else return vdupq_n_f32(0.f);
60 | }
61 | #endif
62 | #endif
63 | 
64 | #include <time.h>
65 | class Timer
66 | {
67 |     public:
68 |         Timer() {}
69 |         virtual ~Timer() {}
70 |         void startBench();
71 |         double endBench();
72 |         void endBench(const char *commets);
73 |         void endBench(const char *commets, double fold);
74 |     private:
75 |         timespec start, stop;
76 | };
77 | 


--------------------------------------------------------------------------------
/src/booster/include/booster/sgeconv.h:
--------------------------------------------------------------------------------
 1 | #pragma once
 2 | 
 3 | #include <booster/booster.h>
 4 | /*
 5 |  * Performs single-float matrix multiply C = A * B in row-major fashion,
 6 |  * where C is MxN, A is MxK and B is KxN.
 7 |  * Allocation requirement: packA: M * K
 8 |  */
 9 | 
10 | namespace booster
11 | {
12 | void pad_input_neon(booster::ConvParam *conv_param, float* padded_input, float* input);
13 | 
14 | template<int ROW_BATCH>
15 | void packed_sgeconv_init(booster::ConvParam* conv_param, int kc, float* packA, float* A);
16 | 
17 | template<bool fuseBias, bool fuseRelu>
18 | void packed_sgeconv_im2col_activation(booster::ConvParam* conv_param, float *packA, float *B, const int ldb, float *C, const int ldc, const int nc, const int kc, float* bias, int num_threads, float* pack_array);
19 | };


--------------------------------------------------------------------------------
/src/booster/include/booster/sgemm.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | /*
18 |  * Performs single-float matrix multiply C = A * B in row-major fashion,
19 |  * where C is MxN, A is MxK and B is KxN.
20 |  * Allocation requirement: C: get_aligned_size(M, N), A: get_aligned_size(M, K)
21 |  */
22 | 
23 | int get_aligned_size(int M, int N);
24 | 
25 | template<int ROW_BATCH>
26 | void packed_sgemm_init(int M, int K, int kc, float* packA, float* A, int lda);
27 | 
28 | //void packed_sgemm(int M, int N, int K, float *packA, float *B, int ldb, float *C, int ldc, int nc, int kc);
29 | template<bool fuseBias, bool fuseRelu>
30 | void packed_sgemm_activation(int M, int N, int K, float *packA, float *b, int ldb, float *c, int ldc, int nc, int kc, float* bias, int num_threads, float* pack_array);
31 | 


--------------------------------------------------------------------------------
/src/booster/include/booster/sgemv.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | #include <stdlib.h>
17 | 
18 | void matrixTranspose(float* array, size_t m, size_t n, float *buffer);
19 | template <bool fuseBias, bool fuseRelu>
20 | void fully_connected_inference_direct(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr);
21 | template <bool fuseBias, bool fuseRelu>
22 | void fully_connected_transpose_inference(const int input_size, const int output_size, const float *x, const float *y, float *z, const int num_threads, float* bias_arr);


--------------------------------------------------------------------------------
/src/booster/include/booster/thpool.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | #pragma once
 15 | 
 16 | #include <vector>
 17 | #include <queue>
 18 | #include <memory>
 19 | #include <thread>
 20 | #include <mutex>
 21 | #include <condition_variable>
 22 | #include <future>
 23 | #include <functional>
 24 | #include <stdexcept>
 25 | #include <map>
 26 | 
 27 | class ThreadPool {
 28 | public:
 29 |     ThreadPool(size_t);
 30 |     template<class F, class... Args>
 31 |     auto enqueue(F&& f, Args&&... args)
 32 |         -> std::future<typename std::result_of<F(Args...)>::type>;
 33 |     ~ThreadPool();
 34 | 
 35 |     size_t threadNum();
 36 |     int threadID(std::thread::id std_id);
 37 | private:
 38 |     // need to keep track of threads so we can join them
 39 |     std::vector< std::thread > workers;
 40 |     // the task queue
 41 |     std::queue< std::function<void()> > tasks;
 42 | 
 43 |     // synchronization
 44 |     std::mutex queue_mutex;
 45 |     std::condition_variable condition;
 46 |     bool stop;
 47 | 
 48 |     //thread IDs
 49 |     std::map< std::thread::id, int > id_map;
 50 | };
 51 | 
 52 | // the constructor just launches some amount of workers
 53 | inline ThreadPool::ThreadPool(size_t threads)
 54 |     :   stop(false)
 55 | {
 56 |     for(size_t i = 0;i<threads;++i)
 57 |     {
 58 |         workers.emplace_back(
 59 |             [this, i]
 60 |             {
 61 |                 for(;;)
 62 |                 {
 63 |                     std::function<void()> task;
 64 | 
 65 |                     {
 66 |                         std::unique_lock<std::mutex> lock(this->queue_mutex);
 67 |                         this->condition.wait(lock,
 68 |                             [this]{ return this->stop || !this->tasks.empty(); });
 69 |                         if(this->stop && this->tasks.empty())
 70 |                             return;
 71 |                         task = std::move(this->tasks.front());
 72 |                         this->tasks.pop();
 73 | 
 74 |                     this->id_map[std::this_thread::get_id()] = i;
 75 |                     //std::cout << std::this_thread::get_id() << std::endl;
 76 |                     }
 77 | 
 78 |                     task();
 79 |                 }
 80 |             }
 81 |         );
 82 |     }
 83 | }
 84 | 
 85 | // add new work item to the pool
 86 | template<class F, class... Args>
 87 | auto ThreadPool::enqueue(F&& f, Args&&... args)
 88 |     -> std::future<typename std::result_of<F(Args...)>::type>
 89 | {
 90 |     using return_type = typename std::result_of<F(Args...)>::type;
 91 | 
 92 |     auto task = std::make_shared< std::packaged_task<return_type()> >(
 93 |             std::bind(std::forward<F>(f), std::forward<Args>(args)...)
 94 |         );
 95 | 
 96 |     std::future<return_type> res = task->get_future();
 97 |     {
 98 |         std::unique_lock<std::mutex> lock(queue_mutex);
 99 | 
100 |         // don't allow enqueueing after stopping the pool
101 |         if(stop)
102 |             throw std::runtime_error("enqueue on stopped ThreadPool");
103 | 
104 |         tasks.emplace([task](){ (*task)(); });
105 |     }
106 |     condition.notify_one();
107 |     return res;
108 | }
109 | 
110 | // the destructor joins all threads
111 | inline ThreadPool::~ThreadPool()
112 | {
113 |     {
114 |         std::unique_lock<std::mutex> lock(queue_mutex);
115 |         stop = true;
116 |     }
117 |     condition.notify_all();
118 |     for(std::thread &worker : workers)
119 |         worker.join();
120 | }
121 | 
122 | inline size_t ThreadPool::threadNum()
123 | {
124 |     return workers.size();
125 | }
126 | 
127 | inline int ThreadPool::threadID(std::thread::id std_id)
128 | {
129 |     return id_map[std_id];
130 | }
131 | 


--------------------------------------------------------------------------------
/src/booster/include/booster/winograd_kernels.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | #include <stdio.h>
17 | #include <booster/booster.h>
18 | #include <booster/thpool.h>
19 | 
20 | enum WinogradOutType
21 | {
22 |     Nothing, Relu, Bias, BiasReLU
23 | };
24 | 
25 | //UT larger than 16 * inChannels * outChannels
26 | void transformKernel(float* UT, float* kernel, int inChannels, int outChannels, float *ST);
27 | 
28 | //VT larger than 16 * (inputw / 2 - 1) * (inputh / 2 - 1) * inChannels
29 | //WT larger than 16 * (inputw / 2 - 1) * (inputh / 2 - 1) * outChannels
30 | void winogradNonFusedTransform(float *output, int outChannels, float* WT, float* VT, float* UT, float* input, int inChannels, int inputw, int inputh, WinogradOutType outType, float* biasArr, int num_threads);
31 | 
32 | size_t getPackArraySize_F6x6_3x3(int inChannels, int num_threads);
33 | void transformKernel_F6x6_3x3(float* UT, float* kernel, int inChannels, int outChannels);
34 | void winogradNonFusedTransform_F6x6_3x3(float *output, int outChannels, float* WT, float* VT, float* UT, float* input, int inChannels, int inputw, int inputh, WinogradOutType outType, float* biasArr, float* pack_array, int num_threads);
35 | 
36 | namespace Winograd_F63_Fused
37 | {
38 | void transformKernel_F6x6_3x3(float* UT, float* kernel, int inChannels, int outChannels);
39 | 
40 | template <bool FuseReLU, bool FuseBias>
41 | void WinogradF63Fused(booster::ConvParam* conv_param, float* output, const float* input, const float* transformed_weights, const float* bias, float* buffers, ThreadPool* thpool);
42 | };


--------------------------------------------------------------------------------
/src/layer.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #ifndef _WIN32
 16 | #include <unistd.h>
 17 | #endif
 18 | 
 19 | #include "layer.h"
 20 | 
 21 | namespace feather
 22 | {
 23 | Layer::Layer(RuntimeParameter<float>* rt_param)
 24 |     : _fusible(false),
 25 |       _inplace(false),
 26 |       rt_param(rt_param),
 27 |       common_mempool(rt_param->common_mempool())
 28 | {
 29 | }
 30 | 
 31 | Layer::~Layer()
 32 | {
 33 |     if (!_inplace)
 34 |     {
 35 |         for (int i = 0; i < tops.size(); ++i)
 36 |         {
 37 |             delete tops[i];
 38 |         }
 39 |     }
 40 |     for (int i = 0; i < weights.size(); ++i)
 41 |     {
 42 |         delete weights[i];
 43 |     }
 44 | }
 45 | 
 46 | int Layer::FindBottomIDByName(std::string name)
 47 | {
 48 |     for( int i = 0; i < this->bottoms.size(); ++i)
 49 |     {
 50 |         if (this->bottoms[i]->name.compare(name) == 0)
 51 |         {
 52 |             return i;
 53 |         }
 54 |     }
 55 |     return -1;
 56 | }
 57 | 
 58 | int Layer::FindTopIDByName(std::string name)
 59 | {
 60 |     for( int i = 0; i < this->tops.size(); ++i)
 61 |     {
 62 |         if (this->tops[i]->name.compare(name) == 0)
 63 |         {
 64 |             return i;
 65 |         }
 66 |     }
 67 |     return -1;
 68 | }
 69 | 
 70 | int Layer::LoadParam(const ncnn::ParamDict& pd)
 71 | {
 72 |     // Do nothing.
 73 |     return 0;
 74 | }
 75 | 
 76 | int Layer::LoadWeights(const ncnn::ModelBin& mb)
 77 | {
 78 |     // Do nothing
 79 |     return 0;
 80 | }
 81 | 
 82 | int Layer::TryFuse(Layer *next_layer)
 83 | {
 84 |     //Judge if next_layer points to this layer.
 85 |     for (int i = 0; i < next_layer->bottoms.size(); ++i)
 86 |     {
 87 |         for (int j = 0; j < this->tops.size(); ++j)
 88 |         {
 89 |             if (this->tops[j]->name.compare(next_layer->bottoms[i]->name) == 0)
 90 |             {
 91 |                 return Fuse(next_layer);
 92 |             }
 93 |         }
 94 |     }
 95 |     return 0;
 96 | }
 97 | 
 98 | int Layer::Fuse(Layer* next_layer)
 99 | {
100 |     return 0;
101 | }
102 | 
103 | int Layer::Reshape()
104 | {
105 |     /* GenerateTopBlobs
106 |      * infers top blob shape and allocate space.
107 |      * 
108 |      * The default behavior is allocate a top with the same shape of bottom
109 |      */
110 |     if (tops.size() != 1 || bottoms.size() != 1)
111 |         return -400; //False calling base layer.
112 |     tops[0]->ReshapeWithRealloc(bottoms[0]->num(), bottoms[0]->channels(), bottoms[0]->height(), bottoms[0]->width());
113 |     return 0;
114 | }
115 | 
116 | int Layer::Init()
117 | {
118 |     return 0;
119 | }
120 | 
121 | int Layer::Forward()
122 | {
123 |     return false;
124 | }
125 | 
126 | int Layer::ForwardReshape()
127 | {
128 |     //Default Reshape Assertation:
129 |     //
130 |     //There should be a single top blob as well as bottom blob.
131 |     //The default behaviour is that the top blob is identical to the bottom blob
132 |     //Use default reallocation.
133 |     
134 |     tops[0]->ReshapeWithRealloc(bottoms[0]);
135 |     this->Forward();
136 |     return true;
137 | }
138 | 
139 | bool Layer::fusible() const
140 | {
141 |     return _fusible;
142 | }
143 | };


--------------------------------------------------------------------------------
/src/layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "blob.h"
18 | #include "mempool.h"
19 | #include "rt_param.h"
20 | #include "utils.h"
21 | 
22 | #include "ncnn/paramdict.h"
23 | #include "ncnn/modelbin.h"
24 | 
25 | #include <vector>
26 | 
27 | namespace feather
28 | {
29 | class Layer
30 | {
31 |     public:
32 |         Layer(RuntimeParameter<float>* rt_param);
33 |         ~Layer();
34 | 
35 |         /* LoadParam LoadWeights
36 |          * 
37 |          * Load layer specifc paramters and corresponding weight data into memory.
38 |          * The two functions rely on `ncnn` model files.
39 |          */
40 |         virtual int LoadParam(const ncnn::ParamDict& pd); 
41 |         virtual int LoadWeights(const ncnn::ModelBin& mb); 
42 |         // int CopyDataFromMat(Blob<float>* dst, const ncnn::Mat &src);
43 |         
44 |         /* GenerateTopBlobs
45 |          *
46 |          * Infer top blob shape and allocate memory.
47 |          */
48 |         virtual int Reshape(); 
49 | 
50 |         /* Init
51 |          *
52 |          * Preprocess the weights in order to reduce inference overhead.
53 |          * Common memory pool first memory allocation occurs in this place
54 |          * when specify an initial input.
55 |          */
56 |         virtual int Init();
57 |         
58 |         virtual int Forward();
59 |         virtual int ForwardReshape();
60 | 
61 |         /* Fusion functions
62 |          * 
63 |          * Layer fusion is an important technique to imporove memory accessing efficiency.
64 |          * We currently support three patterns: Convolutoin-Bias-ReLU, BN-Scale-Relu, InnerProduct-Bias-ReLU
65 |          */
66 |         virtual int Fuse(Layer* next_layer); //Fuse layers when possible.
67 |         int TryFuse(Layer *next_layer);
68 |         bool fusible() const;
69 | 
70 |         /* Utility functions for blob retrieval by name*/
71 |         int FindBottomIDByName(std::string name);
72 |         int FindTopIDByName(std::string name);
73 | 
74 |     public: // We just make everything public. Take care when you write a derived layer.
75 |         std::string name;
76 |         std::string type;
77 | 
78 |         std::vector<Blob<float>* > bottoms;
79 |         std::vector<Blob<float>* > tops;
80 | 
81 |         std::vector<Blob<float>* > weights;
82 | 
83 |         bool _fusible;
84 |         bool _inplace;
85 | 
86 |         CommonMemPool<float>    *common_mempool;
87 |         RuntimeParameter<float> *rt_param;
88 | };
89 | };
90 | 


--------------------------------------------------------------------------------
/src/layer_factory.cpp:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #include "layer_factory.h"
16 | #include "layers/input_layer.h"
17 | #include "layers/conv_layer.h"
18 | #include "layers/pooling_layer.h"
19 | #include "layers/relu_layer.h"
20 | #include "layers/inner_product_layer.h"
21 | #include "layers/dropout_layer.h"
22 | #include "layers/softmax_layer.h"
23 | #include "layers/batchnorm_layer.h"
24 | #include "layers/scale_layer.h"
25 | #include "layers/split_layer.h"
26 | #include "layers/eltwise_layer.h"
27 | #include "layers/concat_layer.h"
28 | 
29 | #include <stdio.h>
30 | 
31 | namespace feather
32 | {
33 | /* An example to register a layer:
34 |  *
35 |  * feather layer name: ConvLayer, ncnn type name: Convolution
36 |  * 1. Define a layer creator: DEFINE_LAYER_CREATOR(Conv)
37 |  * 2. Register layer in the register_layer_creators() function: REGISTER_LAYER_CREATOR(Convolution, Conv);
38 |  */
39 | 
40 | DEFINE_LAYER_CREATOR(Input)
41 | DEFINE_LAYER_CREATOR(Conv)
42 | DEFINE_LAYER_CREATOR(Relu)
43 | DEFINE_LAYER_CREATOR(Pooling)
44 | DEFINE_LAYER_CREATOR(InnerProduct)
45 | DEFINE_LAYER_CREATOR(Dropout)
46 | DEFINE_LAYER_CREATOR(Softmax)
47 | DEFINE_LAYER_CREATOR(BatchNorm)
48 | DEFINE_LAYER_CREATOR(Scale)
49 | DEFINE_LAYER_CREATOR(Split)
50 | DEFINE_LAYER_CREATOR(Eltwise)
51 | DEFINE_LAYER_CREATOR(Concat)
52 | 
53 | void register_layer_creators()
54 | {
55 |     REGISTER_LAYER_CREATOR(Input, Input);
56 |     REGISTER_LAYER_CREATOR(Convolution, Conv);
57 |     REGISTER_LAYER_CREATOR(ConvolutionDepthWise, Conv);
58 |     REGISTER_LAYER_CREATOR(ReLU, Relu);
59 |     REGISTER_LAYER_CREATOR(Pooling, Pooling);
60 |     REGISTER_LAYER_CREATOR(InnerProduct, InnerProduct);
61 |     REGISTER_LAYER_CREATOR(Dropout, Dropout);
62 |     REGISTER_LAYER_CREATOR(Softmax, Softmax);
63 |     REGISTER_LAYER_CREATOR(BatchNorm, BatchNorm);
64 |     REGISTER_LAYER_CREATOR(Scale, Scale);
65 |     REGISTER_LAYER_CREATOR(Split, Split);
66 |     REGISTER_LAYER_CREATOR(Eltwise, Eltwise);
67 |     REGISTER_LAYER_CREATOR(Concat, Concat);
68 | }
69 | };
70 | 


--------------------------------------------------------------------------------
/src/layer_factory.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | /*
16 |  * The layer factory modifies from caffe.
17 |  */
18 | #pragma once
19 | 
20 | #include "layer.h"
21 | 
22 | #include <map>
23 | #include <string>
24 | #include <vector>
25 | 
26 | using namespace std;
27 | 
28 | namespace feather
29 | {
30 | class Layer;
31 | class LayerRegistry
32 | {
33 |     public:
34 |         typedef Layer* (*Creator)(RuntimeParameter<float> *);
35 |         typedef std::map<string, Creator> CreatorRegistry;
36 | 
37 |         static CreatorRegistry &Registry()
38 |         {
39 |             static CreatorRegistry *g_registry_ = new CreatorRegistry();
40 |             return *g_registry_;
41 |         }
42 | 
43 |         // Adds a creator.
44 |         static void AddCreator(const string &type, Creator creator)
45 |         {
46 |             CreatorRegistry &registry = Registry();
47 |             registry[type] = creator;
48 |         }
49 | 
50 |         // Get a layer using a LayerParameter.
51 |         static Layer *CreateLayer(std::string type, RuntimeParameter<float> *rt_param)
52 |         {
53 |             // const string &type = param->type()->str();
54 |             CreatorRegistry &registry = Registry();
55 |             if (registry.find(type) != registry.end())
56 |             {
57 |                 return registry[type](rt_param);
58 |             }
59 |             else
60 |             {
61 |                 fprintf(stderr, "Layer type %s is not supported in FeatherCNN...Aborting\n", type.c_str());
62 |                 return NULL;
63 |             }
64 |         }
65 | 
66 |     private:
67 |         // Layer registry should never be instantiated - everything is done with its
68 |         // static variables.
69 |         LayerRegistry() {}
70 | };
71 | 
72 | 
73 | class LayerRegisterer
74 | {
75 |     public:
76 |         LayerRegisterer(const string &type,
77 |                         Layer * (*creator)(RuntimeParameter<float>* ))
78 |         {
79 |             LayerRegistry::AddCreator(type, creator);
80 |         }
81 | };
82 | 
83 | void register_layer_creators();
84 | 
85 | #define DEFINE_LAYER_CREATOR(feather_layer_name) \
86 |     static Layer *GetLayer##feather_layer_name(RuntimeParameter<float> * rt_param) \
87 |     {return (Layer *) new feather_layer_name##Layer(rt_param);}
88 | 
89 | #define REGISTER_LAYER_CREATOR(ncnn_type_name, feather_layer_name) \
90 |     static LayerRegisterer g_creator_f_##ncnn_type_name(#ncnn_type_name, GetLayer##feather_layer_name);
91 | };
92 | 


--------------------------------------------------------------------------------
/src/layers/batchnorm_layer.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #pragma once
 16 | 
 17 | #include "../layer.h"
 18 | 
 19 | #include <booster/generic_kernels.h>
 20 | 
 21 | namespace feather
 22 | {
 23 | class BatchNormLayer : Layer
 24 | {
 25 |     public:
 26 |         BatchNormLayer(RuntimeParameter<float>* rt_param)
 27 |             : channels(0),
 28 |               scale_bias_term(false),
 29 |               fuse_scale(false),
 30 |               fuse_relu(false),
 31 |               Layer(rt_param)
 32 |         {
 33 |             _fusible = true;
 34 |         }
 35 | 
 36 |         int LoadParam(const ncnn::ParamDict &pd)
 37 |         {
 38 |             this->channels = pd.get(0, 0);
 39 |             this->eps = pd.get(1, 0.f);
 40 |             return 0;
 41 |         }
 42 | 
 43 |         int LoadWeights(const ncnn::ModelBin &mb)
 44 |         {
 45 |             ncnn::Mat slope_data, mean_data, var_data, bias_data;
 46 |             slope_data = mb.load(channels, 1);
 47 |             if (slope_data.empty())
 48 |                 return -100;
 49 | 
 50 |             mean_data = mb.load(channels, 1);
 51 |             if (mean_data.empty())
 52 |                 return -100;
 53 | 
 54 |             var_data = mb.load(channels, 1);
 55 |             if (var_data.empty())
 56 |                 return -100;
 57 | 
 58 |             bias_data = mb.load(channels, 1);
 59 |             if (bias_data.empty())
 60 |                 return -100;
 61 | 
 62 |             Blob<float> * alpha_blob = new Blob<float>;
 63 |             Blob<float> * beta_blob = new Blob<float>;
 64 |             this->weights.push_back(alpha_blob);
 65 |             this->weights.push_back(beta_blob);
 66 |             alpha_blob->ReshapeWithRealloc(1,1,1,channels);
 67 |             beta_blob->ReshapeWithRealloc(1,1,1,channels);
 68 |             float* alpha_data = alpha_blob->data();
 69 |             float* beta_data = beta_blob->data();
 70 |             for (int i = 0; i < channels; i++)
 71 |             {
 72 |                 float sqrt_var = sqrt(var_data[i] + this->eps);
 73 |                 alpha_data[i] = bias_data[i] - slope_data[i] * mean_data[i] / sqrt_var;
 74 |                 beta_data[i] = slope_data[i] / sqrt_var;
 75 |             }
 76 |             return 0;
 77 |         }
 78 | 
 79 |         int Init()
 80 |         {
 81 |             const Blob<float> *p_blob = this->bottoms[0];
 82 |             if (this->channels != p_blob->channels())
 83 |             {
 84 |                 printf("Mismatch channel in layer %s, expected %d but the bottom %s has %d channels.\n", this->name.c_str(), this->channels, p_blob->name.c_str(), p_blob->channels());
 85 |                 return -100;
 86 |             }
 87 |             SetKernel();
 88 | 
 89 |             return 0;
 90 |         }
 91 | 
 92 |         int Forward()
 93 |         {
 94 |             const float *input = bottoms[0]->data();
 95 |             float *output = tops[0]->data();
 96 |             float* alpha_data = weights[0]->data();
 97 |             float* beta_data = weights[1]->data();
 98 |             float* scale_data = NULL;
 99 |             float* scale_bias_data = NULL;
100 |             if (fuse_scale)
101 |             {
102 |                 scale_data = weights[2]->data();
103 |             }
104 |             if (scale_bias_term)
105 |             {
106 |                 scale_bias_data = weights[3]->data();
107 |             }
108 |             // memset(output, 0xFF, sizeof(float) * this->top_blob(0)->data_size());
109 |             size_t stride = bottoms[0]->width() * bottoms[0]->height();
110 |             bn_kernel(channels, stride, alpha_data, beta_data, scale_bias_data, scale_data, input, output, 1);
111 |             return 0;
112 |         }
113 | 
114 |         int Fuse(Layer *next_layer)
115 |         {
116 |             if (next_layer->type.compare("Scale") == 0)
117 |             {
118 |                 printf("BN %s fuse Scale layer %s\n", this->name.c_str(), next_layer->name.c_str());
119 |                 fuse_scale = true;
120 |                 for (int i = 0; i < next_layer->weights.size(); ++i)
121 |                 {
122 |                     Blob<float> *p_blob = new Blob<float>();
123 |                     p_blob->Copy(next_layer->weights[i]);
124 |                     // _weight_blobs.push_back(p_blob);
125 |                 }
126 |                 // scale_bias_term = ((ScaleLayer *)next_layer)->bias_term;
127 |                 return 1;
128 |             }
129 |             else if (next_layer->type.compare("ReLU") == 0)
130 |             {
131 |                 printf("BN %s fuse ReLU layer %s\n", this->name.c_str(), next_layer->name.c_str());
132 |                 fuse_relu = true;
133 |                 return 1;
134 |             }
135 |             else
136 |                 return 0;
137 |         }
138 | 
139 |     private:
140 |         int SetKernel()
141 |         {
142 |             int pattern_code = 0;
143 |             pattern_code += (scale_bias_term) ? 0x1 : 0;
144 |             pattern_code += (fuse_scale) ? 0x10 : 0;
145 |             pattern_code += (fuse_relu) ? 0x100 : 0;
146 |             //printf("pat_code %x\n", pat_code);
147 |             switch (pattern_code)
148 |             {
149 |             case 0x000:
150 |                 bn_kernel = booster::batchnorm<false, false, false>;
151 |                 break;
152 |             case 0x001:
153 |                 bn_kernel = booster::batchnorm<true, false, false>;
154 |                 break;
155 |             case 0x010:
156 |                 bn_kernel = booster::batchnorm<false, true, false>;
157 |                 break;
158 |             case 0x011:
159 |                 bn_kernel = booster::batchnorm<true, true, false>;
160 |                 break;
161 |             case 0x100:
162 |                 bn_kernel = booster::batchnorm<false, false, true>;
163 |                 break;
164 |             case 0x101:
165 |                 bn_kernel = booster::batchnorm<true, false, true>;
166 |                 break;
167 |             case 0x110:
168 |                 bn_kernel = booster::batchnorm<false, true, true>;
169 |                 break;
170 |             case 0x111:
171 |                 bn_kernel = booster::batchnorm<true, true, true>;
172 |                 break;
173 |             default:
174 |                 fprintf(stderr, "Invalid pattern code 0x%x for batchnorm kernel\n", pattern_code);
175 |                 return -1;
176 |             }
177 |             return 0;
178 |         }
179 |         void (*bn_kernel)(const size_t channels, const size_t stride, const float* alpha, const float* beta, const float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads);
180 | 
181 |       private:
182 |         // size_t input_channels;
183 |         // size_t input_width;
184 |         // size_t input_height;
185 |         int channels;
186 |         float eps;
187 | 
188 |         bool fuse_scale;
189 |         bool scale_bias_term;
190 |         bool fuse_relu;
191 | };
192 | };
193 | 


--------------------------------------------------------------------------------
/src/layers/concat_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | 
19 | namespace feather
20 | {
21 | class ConcatLayer : public Layer
22 | {
23 |     public:
24 |         ConcatLayer(RuntimeParameter<float>* rt_param)
25 |             : Layer(rt_param),
26 |               axis(0)
27 |         {
28 | 
29 |         }
30 |         
31 |         int LoadParam(const ncnn::ParamDict& pd)
32 |         {
33 |             this->axis = pd.get(0, 0);
34 |             return 0;
35 |         }
36 | 
37 |         int Forward()
38 |         {
39 |             float* top_ptr = tops[0]->data();
40 |             for (int i = 0; i < bottoms.size(); ++i)
41 |             {
42 |                 const float* bottom_ptr = bottoms[i]->data();
43 |                 memcpy(top_ptr, bottom_ptr, sizeof(float) * bottoms[i]->data_size());
44 |                 top_ptr += bottoms[i]->data_size();
45 |             }
46 |             return 0;
47 |         }
48 | 
49 |         int Reshape()
50 |         {
51 |             auto first_blob = this->bottoms[0];
52 |             size_t num = 1;
53 |             size_t channels = first_blob->channels();
54 |             size_t width = first_blob->width();
55 |             size_t height = first_blob->height();
56 |             // printf("bottom %s\n", first_blob->name.c_str());
57 |             for (int i = 1; i < bottoms.size(); ++i)
58 |             {
59 |                 auto p_blob = bottoms[i];
60 |                 // printf("bottom %s\n", p_blob->name.c_str());
61 |                 if (this->axis == 0)
62 |                 {
63 |                     if(!(width == p_blob->width() && height == p_blob->height()))
64 |                     {
65 |                         printf("Images of different shapes cannot be concatenated together\n");
66 |                         return -100;
67 |                     }
68 |                     channels += p_blob->channels();
69 |                 }
70 |                 else
71 |                 {
72 |                     LOGE("FeatherCNN only supports concat at axis = 0.");
73 |                     return -100;
74 |                 }
75 |             }
76 |             // printf("Concat output shape %d %d %d\n", channels, height, width);
77 |             tops[0]->ReshapeWithRealloc(1, channels, height, width);
78 |             return 0;
79 |         }
80 |     private:
81 |         int axis;
82 | };
83 | };
84 | 


--------------------------------------------------------------------------------
/src/layers/conv_layer.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #pragma once
 16 | 
 17 | #include "../layer.h"
 18 | 
 19 | #include <booster/helper.h>
 20 | #include <booster/booster.h>
 21 | #include <assert.h>
 22 | #include <stdio.h>
 23 | 
 24 | namespace feather
 25 | {
 26 | class ConvLayer : public Layer
 27 | {
 28 |     public:
 29 |         ConvLayer(RuntimeParameter<float>* rt_param)
 30 |             : Layer(rt_param),
 31 |               conv_booster(),
 32 |               conv_param(),
 33 |               bias_data(NULL),
 34 |               processed_kernel(NULL)
 35 |         {
 36 |             // this->_fusible = true;
 37 |         }
 38 | 
 39 |         int LoadParam(const ncnn::ParamDict& pd)
 40 |         {
 41 |             int dilation_w = pd.get(2, 1);
 42 |             int dilation_h = pd.get(12, dilation_w);
 43 |             if ((dilation_w > 1) || (dilation_h > 1))
 44 |             {
 45 |                 printf("Dilated convolution is not supported in FeatherCNN. Please refer to the ncnn repository.\n");
 46 |                 return -200; //Not supported
 47 |             }
 48 | 
 49 |             int int8_scale_term = pd.get(8, 0);
 50 |             if (int8_scale_term)
 51 |             {
 52 |                 printf("Dilated convolution is not supported in FeatherCNN. Please refer to the ncnn repository.\n");
 53 |                 return -200; //Not supported
 54 |             }
 55 | 
 56 |             conv_param.kernel_w = pd.get(1, 0);
 57 |             conv_param.kernel_h = pd.get(11, conv_param.kernel_w);  
 58 |             conv_param.stride_w = pd.get(3, 1);
 59 |             conv_param.stride_h = pd.get(13, conv_param.stride_w);
 60 |             conv_param.pad_left = pd.get(4, 0);
 61 |             conv_param.pad_bottom = pd.get(14, conv_param.pad_left);
 62 |             conv_param.pad_right = pd.get(4, 0);
 63 |             conv_param.pad_top = pd.get(14, conv_param.pad_left);
 64 |             conv_param.group = pd.get(7, 1);
 65 |             conv_param.output_channels = pd.get(0, 0);
 66 |             conv_param.bias_term = pd.get(5, 0);
 67 |             conv_param.activation = booster::None;
 68 |             int weight_data_size = pd.get(6, 0);
 69 | 	    if(conv_param.group==0 || conv_param.output_channels%conv_param.group)	
 70 | 	    {
 71 | 		printf("Layer %s output_channels is not divisible by its group\n", this->name);
 72 | 		exit(0);
 73 | 	    }
 74 | 	    else	conv_param.output_channels /= conv_param.group;
 75 |             conv_param.input_channels = weight_data_size / conv_param.output_channels / conv_param.kernel_h / conv_param.kernel_w;
 76 | 
 77 | //            printf("ic=%d oc=%d (kw,kh)=(%d,%d) (sw,sh)=(%d,%d) (pad)=(%d,%d,%d,%d) group=%d\n", conv_param.input_channels, conv_param.output_channels, conv_param.kernel_w, conv_param.kernel_h, conv_param.stride_w, conv_param.stride_h, conv_param.pad_left, conv_param.pad_bottom, conv_param.pad_right, conv_param.pad_top, conv_param.group);
 78 | 
 79 |             // The params are known, therefore we can allocate space for weights.
 80 |             Blob<float> *conv_weights = new Blob<float>(this->name + "_weights");
 81 |             conv_weights->ReshapeWithRealloc(conv_param.output_channels, conv_param.input_channels, conv_param.kernel_h, conv_param.kernel_w);
 82 |             weights.push_back(conv_weights);
 83 |             if (conv_param.bias_term)
 84 |             {
 85 |                 Blob<float> *bias_weights = new Blob<float>(this->name + "_bias");
 86 |                 bias_weights->ReshapeWithRealloc(conv_param.output_channels, 1, 1, 1);
 87 |                 weights.push_back(bias_weights);
 88 |             }
 89 |             return 0;
 90 |         }
 91 | 
 92 |         int Reshape()
 93 |         {
 94 |             // Allocate space for the layer's own top.
 95 |             const Blob<float> *bottom_blob = this->bottoms[0];
 96 |             conv_param.input_w = bottom_blob->width();
 97 |             conv_param.input_h = bottom_blob->height();
 98 |             if (conv_param.input_channels != bottom_blob->channels())
 99 |             {
100 |                 LOGE("Loaded convolution layer %s has %d input channels while bottom blob has %zu channels\n", this->name.c_str(), conv_param.input_channels, bottom_blob->channels());
101 |                 return -300; //Topology error
102 |             }
103 |             // printf("##########################\n");
104 | //            conv_param.LogParams(this->name.c_str());
105 |             conv_param.AssignOutputDim();
106 | //            conv_param.LogParams(this->name.c_str());
107 |             tops[0]->ReshapeWithRealloc(1, conv_param.output_channels, conv_param.output_h, conv_param.output_w);
108 |             conv_booster.SelectAlgo(&this->conv_param);
109 |             int buffer_size = 0;
110 |             int dull = 0;
111 |             int ret = conv_booster.GetBufferSize(&conv_param, &buffer_size, &dull);
112 |             MEMPOOL_CHECK_RETURN(this->common_mempool->Request(sizeof(float) * buffer_size));
113 |             return 0;
114 |         }
115 | 
116 |         int LoadWeights(const ncnn::ModelBin& mb)
117 |         {
118 |             int weight_data_size = conv_param.input_channels * conv_param.output_channels * conv_param.kernel_h * conv_param.kernel_w;
119 |             ncnn::Mat weight_data = mb.load(weight_data_size, 0);
120 |             if (weight_data.empty())
121 |                 return -100;
122 |             if (this->weights.empty())
123 |                 return -100;
124 |             this->weights[0]->CopyDataFromMat(weight_data);
125 | 
126 |             if (conv_param.bias_term)
127 |             {
128 |                 ncnn::Mat bias_data = mb.load(conv_param.output_channels, 1);
129 |                 if (bias_data.empty())
130 |                     return -100;
131 |                 if (this->weights.size() < 2)
132 |                 {
133 |                     LOGE("In layer %s: Bias weight blob not allocated.", this->name.c_str());
134 |                     return -100;
135 |                 }
136 |                 weights[1]->CopyDataFromMat(bias_data);
137 |             }
138 |             return 0;
139 |         }
140 | 
141 |         int Forward()
142 |         {
143 | 	        // conv_param.LogParams(this->name().c_str());
144 |             float* input  = this->bottoms[0]->data();
145 |             float* output = this->tops[0]->data();
146 |             float* buffer = NULL;
147 |             MEMPOOL_CHECK_RETURN(this->common_mempool->GetPtr(&buffer));
148 | 	    printf("thread num =%d\n", this->rt_param->num_threads());
149 |             conv_booster.Forward(&conv_param, output, input, processed_kernel, buffer, bias_data,  this->rt_param->num_threads());
150 |             return 0;
151 |         }
152 | 
153 |         int Init()
154 |         {
155 |             int buffer_size = 0;
156 |             int processed_kernel_size = 0;
157 |             int ret = conv_booster.GetBufferSize(&conv_param, &buffer_size, &processed_kernel_size);
158 |             Blob<float> * processed_weights = new Blob<float>(this->name + "_proc_weights");
159 |             processed_weights->ReshapeWithRealloc(1, 1, 1, processed_kernel_size);
160 |             float* kernel_data = this->weights[0]->data();
161 |             float* processed_kernel = processed_weights->data();
162 |             conv_booster.Init(&conv_param, processed_kernel, kernel_data);
163 |             delete this->weights[0];
164 |             this->weights[0] = processed_weights;
165 |             this->processed_kernel = processed_weights->data();
166 |             if (conv_param.bias_term)
167 |             {
168 |                 bias_data = this->weights[1]->data();
169 |             }
170 |             // MEMPOOL_CHECK_RETURN(this->common_mempool->Request(sizeof(float) * buffer_size));
171 |             return 0;
172 |         }
173 |     
174 |         int Fuse(Layer *next_layer)
175 |         {
176 |             if (next_layer->type.compare("ReLU") == 0)
177 |             {
178 |                 conv_param.activation = booster::ReLU;
179 |                 return 1;
180 |             }
181 |             else
182 |             {
183 |                 return 0;
184 |             }
185 |         }
186 | 
187 |     protected:
188 |         booster::ConvBooster conv_booster;
189 |         booster::ConvParam conv_param;
190 | 
191 |         float *bias_data;
192 |         float *processed_kernel;
193 | };
194 | };
195 | 


--------------------------------------------------------------------------------
/src/layers/dropout_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | 
19 | namespace feather
20 | {
21 | class DropoutLayer : public Layer
22 | {
23 |     public:
24 |         DropoutLayer(RuntimeParameter<float>* rt_param)
25 |             : Layer(rt_param)
26 |         {
27 |             _inplace = false;
28 |         }
29 | 
30 |         int LoadParam(const ncnn::ParamDict &pd)
31 |         {
32 |             scale = pd.get(0, 1.f);
33 |             return 0;
34 |         }
35 | 
36 |         int Forward()
37 |         {
38 |             if (scale == 1.f)
39 |             {
40 |                 memcpy(tops[0]->data(), bottoms[0]->data(), bottoms[0]->data_size() * sizeof(float));
41 |             }
42 |             else
43 |             {
44 |                 int w = bottoms[0]->width();
45 |                 int h = bottoms[0]->height();
46 |                 int channels = bottoms[0]->channels();
47 |                 int size = w * h;
48 | 
49 |                 float* inp = bottoms[0]->data();
50 |                 float* outp = tops[0]->data();
51 |                 for (int i = 0; i < bottoms[0]->data_size(); ++i)
52 |                 {
53 |                         outp[i] = inp[i] * scale;
54 |                 }
55 |             }
56 |             return 0;
57 |         }
58 |     private:
59 |         float scale;
60 | };
61 | };


--------------------------------------------------------------------------------
/src/layers/eltwise_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | 
19 | namespace feather
20 | {
21 | class EltwiseLayer : public Layer
22 | {
23 |     public:
24 |         EltwiseLayer(RuntimeParameter<float>* rt_param)
25 |             : Layer(rt_param),
26 |               op_type(1),
27 |               fuse_relu(0)
28 |         {
29 |         }
30 | 
31 |         int Reshape()
32 |         {
33 |             size_t n = bottoms[0]->num();
34 |             size_t c = bottoms[0]->channels();
35 |             size_t h = bottoms[0]->height();
36 |             size_t w = bottoms[0]->width();
37 |             // Check bottom shapes
38 |             for (int i = 1; i < bottoms.size(); ++i)
39 |             {
40 |                 if ((n != bottoms[i]->num()) || (c != bottoms[i]->channels()) || (h != bottoms[i]->height()) || (w != bottoms[i]->width()))
41 |                 {
42 |                     LOGE("Shape mismatch among bottoms of layer %s.", this->name.c_str());
43 |                     return -100;
44 |                 }
45 |             }
46 |             for (int i = 0; i < tops.size(); ++i)
47 |             {
48 |                 tops[i]->ReshapeWithRealloc(n, c, h, w);
49 |             }
50 |             return 0;
51 |         }
52 | 
53 |         int LoadParam(const ncnn::ParamDict &pd)
54 |         {
55 |             op_type = pd.get(0, 0);
56 |             ncnn::Mat coeffs = pd.get(1, ncnn::Mat());
57 |             if (!coeffs.empty())
58 |             {
59 |                 LOGE("FeatherCNN doesn't support coeffs in eltwise layer. Please refer to ncnn.");
60 |                 return -100;
61 |             }
62 |             if (op_type != 1)
63 |             {
64 |                 LOGE("FeatherCNN doesn't support ops rather than SUM. Please refer to ncnn.");
65 |                 return -100;
66 |             }
67 |             return 0;
68 |         }
69 | 
70 |         int Forward()
71 |         {
72 |             float* input_alpha = bottoms[0]->data();
73 |             float* input_beta = bottoms[1]->data();
74 |             float* output_data = tops[0]->data();
75 |             size_t data_size = bottoms[0]->data_size();
76 | 
77 |             if (fuse_relu)
78 |                 booster::add_relu<true>(output_data, input_alpha, input_beta, data_size, 1);
79 |             else
80 |                 booster::add_relu<false>(output_data, input_alpha, input_beta, data_size, 1);
81 |             return 0;
82 |         }
83 |         
84 |         enum { Operation_PROD = 0, Operation_SUM = 1, Operation_MAX = 2 };
85 | 
86 |     private:
87 |         int op_type;
88 |         int fuse_relu;
89 | 
90 | };
91 | };


--------------------------------------------------------------------------------
/src/layers/inner_product_layer.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #pragma once
 16 | 
 17 | #include "../layer.h"
 18 | #include "booster/sgemv.h"
 19 | 
 20 | #include <assert.h>
 21 | #include <stdio.h>
 22 | 
 23 | namespace feather
 24 | {
 25 | class InnerProductLayer : public Layer
 26 | {
 27 |     public:
 28 |         InnerProductLayer(RuntimeParameter<float>* rt_param)
 29 |             : fuse_relu(false), Layer(rt_param)
 30 |         {
 31 |         }
 32 | 
 33 |         int Forward()
 34 |         {
 35 |             //this->bottom_blob(0)->PrintBlobInfo();
 36 |             //this->top_blob(0)->PrintBlobInfo();
 37 |             const float *input = bottoms[0]->data();
 38 |             float *output = tops[0]->data();
 39 |             sgemv_kernel((int)input_size, (int)output_size, input, kernel_data, output, rt_param->num_threads(), bias_data);
 40 |             return 0;
 41 |         }
 42 | 
 43 |         int Fuse(Layer *next_layer)
 44 |         {
 45 |             if (next_layer->type.compare("ReLU") == 0)
 46 |             {
 47 |                 fuse_relu = true;
 48 |                 return 1;
 49 |             }
 50 |             else
 51 |             {
 52 |                 return 0;
 53 |             }
 54 |         }
 55 |         int Init()
 56 |         {
 57 |             Blob<float> *p_blob = new Blob<float>;
 58 |             printf("input size %d\n", input_size);
 59 |             p_blob->ReshapeWithRealloc(1, 1, 1, input_size * 8);
 60 |             this->kernel_data = weights[0]->data();
 61 |             float* buffer = p_blob->data();
 62 |             if (input_size % 8 == 0 && output_size % 8 == 0)
 63 |             {
 64 |                 for (int i = 0; i < output_size / 8; i++)
 65 |                     matrixTranspose(this->kernel_data + i * 8 * input_size, 8, input_size, buffer);
 66 |             }
 67 |             delete p_blob;
 68 |             if (output_size % 8 == 0 && input_size % 8 == 0)
 69 |             {
 70 |                 if (bias_term && fuse_relu)
 71 |                     sgemv_kernel = fully_connected_transpose_inference<true, true>;
 72 |                 else if (bias_term && !fuse_relu)
 73 |                     sgemv_kernel = fully_connected_transpose_inference<true, false>;
 74 |                 else if (!bias_term && fuse_relu)
 75 |                     sgemv_kernel = fully_connected_transpose_inference<false, true>;
 76 |                 else if (!bias_term && !fuse_relu)
 77 |                     sgemv_kernel = fully_connected_transpose_inference<false, false>;
 78 |             }
 79 |             else
 80 |             {
 81 |                 if (bias_term && fuse_relu)
 82 |                     sgemv_kernel = fully_connected_inference_direct<true, true>;
 83 |                 else if (bias_term && !fuse_relu)
 84 |                     sgemv_kernel = fully_connected_inference_direct<true, false>;
 85 |                 else if (!bias_term && fuse_relu)
 86 |                     sgemv_kernel = fully_connected_inference_direct<false, true>;
 87 |                 else if (!bias_term && !fuse_relu)
 88 |                     sgemv_kernel = fully_connected_inference_direct<false, false>;
 89 |             }
 90 | 
 91 |             this->bias_data = this->weights[1]->data();
 92 |             return 0;
 93 |         }
 94 | 
 95 |         int Reshape()
 96 |         {
 97 |             // Allocate space for the layer's own top.
 98 |             const Blob<float> *bottom_blob = bottoms[0];
 99 |             int input_width = bottom_blob->width();
100 |             int input_height = bottom_blob->height();
101 |             int input_channels = bottom_blob->channels();
102 |             // this->input_size = input_width * input_height * input_channels;
103 |             if (input_size != bottom_blob->data_size())
104 |             {
105 |                 LOGE("In Layer %s: Bottom %s data size %zu is inconsistant with expected input size %zu.", this->name.c_str(), bottom_blob->name.c_str(), bottom_blob->data_size(), input_size);
106 |                 return -100;
107 |             }
108 |             this->tops[0]->ReshapeWithRealloc(1, output_size, 1, 1);
109 |             return 0;
110 |         }
111 | 
112 |         int LoadParam(const ncnn::ParamDict &pd)
113 |         {
114 |             this->output_size = pd.get(0, 0);
115 |             this->bias_term = pd.get(1, 0);
116 |             this->weight_data_size = pd.get(2, 0);
117 |             this->input_size = this->weight_data_size / this->output_size;
118 |             
119 |             // The params are known, therefore we can allocate space for weights.
120 |             Blob<float> *fc_weights = new Blob<float>(this->name + "_weights");
121 |             fc_weights->ReshapeWithRealloc(this->output_size, this->input_size, 1, 1);
122 |             weights.push_back(fc_weights);
123 |             if (this->bias_term)
124 |             {
125 |                 Blob<float> *bias_weights = new Blob<float>(this->name + "_bias");
126 |                 bias_weights->ReshapeWithRealloc(output_size, 1, 1, 1);
127 |                 weights.push_back(bias_weights);
128 |             }
129 |             return 0;
130 |         }
131 | 
132 |         int LoadWeights(const ncnn::ModelBin& mb)
133 |         {
134 |             printf("Loading dimension %zu %zu\n", output_size, input_size);
135 |             printf("weight data size %zu\n", weight_data_size);
136 |             ncnn::Mat weight_data = mb.load(weight_data_size, 0);
137 |             if (weight_data.empty())
138 |                 return -100;
139 |             if (this->weights.empty())
140 |                 return -100;
141 |             this->weights[0]->CopyDataFromMat(weight_data);
142 | 
143 |             if (this->bias_term)
144 |             {
145 |                 ncnn::Mat bias_data = mb.load(output_size, 1);
146 |                 if (bias_data.empty())
147 |                     return -100;
148 |                 if (this->weights.size() < 2)
149 |                 {
150 |                     LOGE("In layer %s: Bias weight blob not allocated.", this->name.c_str());
151 |                     return -100; 
152 |                 }
153 |                 weights[1]->CopyDataFromMat(bias_data);
154 |             }
155 |             return 0;
156 |         }
157 |     protected:
158 |         size_t weight_data_size;
159 | 
160 |         size_t input_size;
161 |         size_t output_size;
162 | 
163 |         bool bias_term;
164 | 
165 |         float *kernel_data;
166 |         float *bias_data;
167 | 
168 |         bool fuse_relu;
169 |         void (*sgemv_kernel)(const int, const int, const float *, const float *, float *, const int, float*);
170 | };
171 | };
172 | 


--------------------------------------------------------------------------------
/src/layers/input_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | #include <assert.h>
19 | #include <stdio.h>
20 | 
21 | #include <map>
22 | 
23 | namespace feather
24 | {
25 | class InputLayer : public Layer 
26 | {
27 |     public:
28 |         InputLayer(RuntimeParameter<float>* rt_param)
29 |             : Layer(rt_param)
30 |         {
31 |         }
32 | 
33 |         int LoadParam(const ncnn::ParamDict& pd)
34 |         {
35 |             int w = pd.get(0, 0);
36 |             int h = pd.get(1, 0);
37 |             int c = pd.get(2, 0);
38 |             // this->tops[0]->ReshapeWithRealloc(1, c, h, w);
39 |             return 0;
40 |         }
41 | 
42 |         int Reshape()
43 |         {
44 |             // Nothing to do, don't call base class version.
45 |             return 0;
46 |         }
47 | 
48 |         int Init()
49 |         {
50 |             return 0;
51 |         }
52 | };
53 | };
54 | 


--------------------------------------------------------------------------------
/src/layers/pooling_layer.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #pragma once
 16 | 
 17 | #include "../layer.h"
 18 | 
 19 | #include <math.h>
 20 | #include <limits>
 21 | 
 22 | #define MAX(a,b) ((a)>(b))?(a):(b)
 23 | #define MIN(a,b) ((a)<(b))?(a):(b)
 24 | 
 25 | namespace feather
 26 | {
 27 | class PoolingLayer : public Layer
 28 | {
 29 |     public:
 30 |         PoolingLayer(RuntimeParameter<float>* rt_param)
 31 |             : stride_h(1),
 32 |               stride_w(1),
 33 |               Layer(rt_param)
 34 |         {
 35 |         }
 36 | 
 37 | 
 38 |         int Forward()
 39 |         {
 40 |             const float *input = bottoms[0]->data();
 41 |             float *output = tops[0]->data();
 42 |             float *p = output;
 43 | 
 44 |             int slot = input_channels * output_h;
 45 | 
 46 |             //#pragma omp parallel for schedule(static) num_threads(num_threads)
 47 |             for (int i = 0; i < input_channels; ++i)
 48 |             {
 49 |                 for (int j = 0; j < output_h; j ++)
 50 |                 {
 51 |                     // int i=slot/output_h,  j=slot%output_h;
 52 |                     float *p = output + i * output_h * output_w + j * output_w;
 53 |                     for (int l = 0; l < output_w; l++) 
 54 |                         p[l] = (this->pooling_type != 0 ? 0 : -1 * std::numeric_limits<float>::max());
 55 | 
 56 |                     int tmp_pos = j * stride_h - pad_top - pad_bottom;
 57 |                     int x_min = MAX(tmp_pos, 0);
 58 |                     int x_max = MIN((int)(tmp_pos + kernel_h), (int) input_h);
 59 | 
 60 |                     for (int k = 0; k < output_w; k ++)
 61 |                     {
 62 |                         int counter = 0;
 63 |                         float total = (this->pooling_type != 0 ? 0 : -1 * std::numeric_limits<float>::max());
 64 |                         for (int x = x_min; x < x_max; ++x)
 65 |                         {
 66 |                             int xpos = i * input_h * input_w + x * input_w;
 67 |                             int local_pos = k * stride_w - pad_left - pad_right;
 68 |                             int y_min     = MAX(local_pos, 0);
 69 |                             int y_max     = MIN((int)(local_pos + kernel_w), (int) input_w);
 70 | 
 71 |                             for (int y = y_min; y < y_max; ++y)
 72 |                             {
 73 |                                 float value = input[xpos + y];
 74 |                                 if (this->pooling_type != 0)
 75 |                                     total += value, counter++;
 76 |                                 else
 77 |                                 total = total > value ? total : value;
 78 |                             }
 79 |                         }
 80 |                         if (this->pooling_type != 0)
 81 |                             p[k] += total / (counter);
 82 |                         else
 83 |                             p[k]  = (p[k] > total) ? p[k] : total;
 84 |                     }
 85 |                 }
 86 |             }
 87 |             return 0;
 88 |         }
 89 | 
 90 |         int LoadParam(const ncnn::ParamDict& pd)
 91 |         {
 92 |             pooling_type = pd.get(0, 0); //Pooling type?
 93 |             kernel_w = pd.get(1, 0);
 94 |             kernel_h = pd.get(11, kernel_w);
 95 |             stride_w = pd.get(2, 1);
 96 |             stride_h = pd.get(12, stride_w);
 97 |             pad_left = pd.get(3, 0);
 98 |             pad_right = pd.get(14, pad_left);
 99 |             pad_top = pd.get(13, pad_left);
100 |             pad_bottom = pd.get(15, pad_top);
101 |             global_pooling = pd.get(4, 0);
102 |             tf_pad_mode = pd.get(5, 0);
103 |             // printf("$$ global_pooling %d\n", global_pooling);
104 |             // printf("$$ padding %d %d %d %d\n", pad_left, pad_bottom, pad_right, pad_top);
105 |             // printf("$$ stride %d %d\n", stride_h, stride_w);
106 |             return 0;
107 |         }
108 | 
109 |         int Reshape()
110 |         {
111 |             //Only accept a single bottom blob.
112 |             const Blob<float> *bottom_blob = bottoms[0];
113 |             input_h = bottom_blob->height();
114 |             input_w = bottom_blob->width();
115 |             input_channels = bottom_blob->channels();
116 |             // printf("$$ input %d %d %d\n", input_channels, input_h, input_w);
117 |             if (global_pooling)
118 |             {
119 |                 kernel_h = input_h;
120 |                 kernel_w = input_w;
121 |                 output_h = 1;
122 |                 output_w = 1;
123 |                 output_channels = input_channels;
124 |             }
125 |             else
126 |             {
127 |                 //General pooling.
128 |                 output_channels = input_channels;
129 |                 output_h = static_cast<int>(ceil(static_cast<float>(input_h + pad_top + pad_bottom - kernel_h) / stride_h)) + 1;
130 |                 output_w = static_cast<int>(ceil(static_cast<float>(input_w + pad_left + pad_right - kernel_w) / stride_w)) + 1;
131 |             }
132 |             this->tops[0]->ReshapeWithRealloc(1, output_channels, output_h, output_w);
133 |             return 0;
134 |         }
135 | 
136 |     private:
137 |         int input_h;
138 |         int input_w;
139 |         int input_channels;
140 |         int output_h;
141 |         int output_w;
142 |         int output_channels;
143 |         int pad_left;
144 |         int pad_bottom;
145 |         int pad_right;
146 |         int pad_top;
147 |         int kernel_h;
148 |         int kernel_w;
149 |         int stride_h;
150 |         int stride_w;
151 |         bool global_pooling;
152 |         int pooling_type;
153 |         int tf_pad_mode;
154 | };
155 | };
156 | 


--------------------------------------------------------------------------------
/src/layers/relu_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | 
19 | namespace feather
20 | {
21 | class ReluLayer : public Layer
22 | {
23 |     public:
24 |         ReluLayer(RuntimeParameter<float>* rt_param)
25 |             : Layer(rt_param)
26 |         {
27 |         }
28 | 
29 |         int Forward()
30 |         {
31 |             const Blob<float> *p_bottom = bottoms[0];
32 |             const float *input = p_bottom->data();
33 |             const size_t data_size = p_bottom->num() * p_bottom->channels() * p_bottom->height() * p_bottom->width();
34 | 
35 |             float *output = tops[0]->data();
36 |             for (size_t i = 0; i < data_size; ++i)
37 |             {
38 |                 output[i] = input[i] > 0 ? input[i] : 0;
39 |             }
40 |             return 0;
41 |         }
42 | };
43 | };
44 | 


--------------------------------------------------------------------------------
/src/layers/scale_layer.h:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #pragma once
 16 | 
 17 | #include "../layer.h"
 18 | #include <booster/generic_kernels.h>
 19 | 
 20 | namespace feather
 21 | {
 22 | class ScaleLayer : public Layer
 23 | {
 24 |     public:
 25 |         ScaleLayer(RuntimeParameter<float>* rt_param)
 26 |             : channels(0),
 27 |               bias_term(0),
 28 |               scale_data_size(0),
 29 |               Layer(rt_param)
 30 |         {
 31 |         }
 32 | 
 33 |         int LoadParam(const ncnn::ParamDict &pd)
 34 |         {
 35 |             scale_data_size = pd.get(0, 0);
 36 |             bias_term = pd.get(1, 0);
 37 |             if (scale_data_size < 0)
 38 |             {
 39 |                 LOGE("feather doesn't accept negative scale data size, please use ncnn to run this model.\n");
 40 |                 return -100;
 41 |             }
 42 |             return 0;
 43 |         }
 44 | 
 45 |         int LoadWeights(const ncnn::ModelBin &mb)
 46 |         {
 47 |             ncnn::Mat scale_mat;
 48 |             ncnn::Mat bias_mat;
 49 |             if (scale_data_size == -233)
 50 |                 return 0;
 51 | 
 52 |             scale_mat = mb.load(scale_data_size, 1);
 53 |             if (scale_mat.empty())
 54 |                 return -100;
 55 |             channels = scale_data_size;
 56 |             Blob<float> *scale_blob = new Blob<float>;
 57 |             scale_blob->ReshapeWithRealloc(1, 1, 1, channels);
 58 |             scale_blob->CopyDataFromMat(scale_mat);
 59 |             weights.push_back(scale_blob);
 60 | 
 61 |             if (bias_term)
 62 |             {
 63 |                 bias_mat = mb.load(scale_data_size, 1);
 64 |                 if (bias_mat.empty())
 65 |                     return -100;
 66 |                 Blob<float> *bias_blob = new Blob<float>;
 67 |                 bias_blob->ReshapeWithRealloc(1, 1, 1, channels);
 68 |                 bias_blob->CopyDataFromMat(bias_mat);
 69 |                 weights.push_back(bias_blob);
 70 |             }
 71 |             return 0;
 72 |         }
 73 | 
 74 |         int Forward()
 75 |         {
 76 |             const float *input = bottoms[0]->data();
 77 |             float *output = tops[0]->data();
 78 |             size_t stride = bottoms[0]->width() * bottoms[0]->height();
 79 |             const float* scale_data = weights[0]->data();
 80 |             const float* bias_data = NULL;
 81 |             if (bias_term)
 82 |                 bias_data = weights[1]->data();
 83 |             scale_kernel(channels, stride, bias_data, scale_data, input, output, 1);
 84 |             return 0;
 85 |         }
 86 | 
 87 |         int Init()
 88 |         {
 89 |             // const Blob<float> *p_blob = bottoms[0];
 90 |             // input_channels = p_blob->channels();
 91 |             // input_height = p_blob->height();
 92 |             // input_width = p_blob->width();
 93 |             // printf("input %d %d %d", input_channels, input_width, input_height);
 94 |             // scale_data = _weight_blobs[0]->data();
 95 |             //printf("bias_term %d\n", _bias_term ? 1: 0);
 96 |             if (bias_term)
 97 |             {
 98 |                 scale_kernel = booster::scale<true>;
 99 |             }
100 |             else
101 |             {
102 |                 scale_kernel = booster::scale<false>;
103 |             }
104 |             return 0;
105 |         }
106 | 
107 |       private:
108 |         size_t channels;
109 |         int bias_term;
110 |         int scale_data_size;
111 | 
112 |     private:
113 |         void (*scale_kernel)(const size_t channels, const size_t stride, const  float* bias_data, const float* scale_data, const float* input, float* output, const size_t num_threads);
114 | };
115 | };
116 | 


--------------------------------------------------------------------------------
/src/layers/softmax_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | 
19 | #include <math.h>
20 | #include <float.h>
21 | 
22 | namespace feather
23 | {
24 | class SoftmaxLayer : public Layer
25 | {
26 |     public:
27 |         SoftmaxLayer(RuntimeParameter<float>* rt_param)
28 |             : Layer(rt_param)
29 |         {
30 |         }
31 |         
32 |         int Forward()
33 |         {
34 |             const Blob<float> *p_bottom = bottoms[0];
35 |             const float *input = p_bottom->data();
36 |             const size_t data_size = p_bottom->num() * p_bottom->channels() * p_bottom->height() * p_bottom->width();
37 |             float *output = tops[0]->data();
38 | 
39 |             float sum = 0.0;
40 |             float max = -FLT_MAX;
41 |             for (size_t i = 0; i < data_size; ++i)
42 |             {
43 |                 max = std::max<float>(max, input[i]);
44 |             }
45 |             for (size_t i = 0; i < data_size; ++i)
46 |             {
47 |                 output[i] = static_cast<float>(exp(input[i] - max));
48 |                 sum += output[i];
49 |             }
50 |             for (size_t i = 0; i < data_size; ++i)
51 |             {
52 |                 output[i] = output[i] / sum;
53 |             }
54 |             return 0;
55 |         }
56 | };
57 | };
58 | 


--------------------------------------------------------------------------------
/src/layers/split_layer.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "../layer.h"
18 | 
19 | namespace feather
20 | {
21 | class SplitLayer : public Layer
22 | {
23 |     public:
24 |         SplitLayer(RuntimeParameter<float>* rt_param)
25 |             : Layer(rt_param)
26 |         {
27 | 
28 |         }
29 | 
30 |         int Reshape()
31 |         {
32 |             size_t n = bottoms[0]->num();
33 |             size_t c = bottoms[0]->channels();
34 |             size_t h = bottoms[0]->height();
35 |             size_t w = bottoms[0]->width();
36 |             for (int i = 0; i < tops.size(); ++i)
37 |             {
38 |                 tops[i]->ReshapeWithRealloc(n, c, h, w);
39 |             }
40 |             return 0;
41 |         }
42 | 
43 |         int Forward()
44 |         {
45 |             float* src_data = bottoms[0]->data();
46 |             size_t data_size = bottoms[0]->data_size();
47 |             
48 |             for (int i = 0; i < tops.size(); ++i)
49 |             {
50 |                 memcpy(tops[i]->data(), src_data, sizeof(float) * data_size);
51 |             }
52 |             return 0;
53 |         }
54 | };
55 | };


--------------------------------------------------------------------------------
/src/mempool.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include "mempool.h"
 16 | 
 17 | #include <stdio.h>
 18 | #include <stdlib.h>
 19 | 
 20 | #include "utils.h"
 21 | 
 22 | template<typename PTR_TYPE>
 23 | CommonMemPool<PTR_TYPE>::~CommonMemPool()
 24 | {
 25 |     if (common_memory || common_size_map.size() || common_ptr_map.size())
 26 |     {
 27 |         Free();
 28 |     }
 29 | }
 30 | 
 31 | template<typename PTR_TYPE>
 32 | bool CommonMemPool<PTR_TYPE>::Alloc()
 33 | {
 34 |     if (common_memory)
 35 |     {
 36 |         fprintf(stderr, "Error: common memory already allocated.\n");
 37 |         return false;
 38 |     }
 39 |     if (common_size > 0)
 40 |     {
 41 |         common_memory = (PTR_TYPE *) _mm_malloc(common_size, 128);
 42 |         if (!common_memory)
 43 |         {
 44 |             fprintf(stderr, "Error: cannot allocate common memory.\n");
 45 |             return false;
 46 |         }
 47 |         allocated_size = common_size;
 48 |     }
 49 |     if (common_size_map.size())
 50 |     {
 51 |         std::map<size_t, size_t>::iterator it
 52 |             = common_size_map.begin();
 53 |         while (it != common_size_map.end())
 54 |         {
 55 |             PTR_TYPE *wptr = NULL;
 56 |             wptr = (PTR_TYPE *) _mm_malloc(it->second, 128);
 57 |             if (!wptr)
 58 |             {
 59 |                 fprintf(stderr, "Allocation for size %ld id %ld failed\n", it->second, it->first);
 60 |             }
 61 |             common_ptr_map[it->first] = wptr;
 62 |             ++it;
 63 |         }
 64 |     }
 65 |     return (common_ptr_map.size() == common_size_map.size()) ? true : false;
 66 | }
 67 | 
 68 | template<typename PTR_TYPE>
 69 | bool CommonMemPool<PTR_TYPE>::Free()
 70 | {
 71 |     if (common_memory)
 72 |     {
 73 |         free(common_memory);
 74 |         allocated_size = 0;
 75 |         common_memory = NULL;
 76 |     }
 77 |     return true;
 78 | }
 79 | 
 80 | template<typename PTR_TYPE>
 81 | bool CommonMemPool<PTR_TYPE>::Reset()
 82 | {
 83 |     common_size = 0;
 84 |     return this->Free();
 85 | }
 86 | 
 87 | template<typename PTR_TYPE>
 88 | bool CommonMemPool<PTR_TYPE>::Request(size_t size_byte)
 89 | {
 90 |     common_size = (common_size > size_byte) ? common_size : size_byte;
 91 |     return true;
 92 | }
 93 | 
 94 | template<typename PTR_TYPE>
 95 | bool CommonMemPool<PTR_TYPE>::GetPtr(PTR_TYPE ** ptr)
 96 | {
 97 |     if (!common_memory)
 98 |     {
 99 |         fprintf(stderr, "Common memroy not allocated\n");
100 |         // return false;
101 |     }
102 |     if (this->common_size > allocated_size)
103 |     {
104 |         this->Free();
105 |         this->Alloc();
106 |     }
107 |     *ptr = common_memory;
108 |     return true;
109 | }
110 | 
111 | template class CommonMemPool<float>;
112 | template class CommonMemPool<int>;
113 | template class CommonMemPool<uint16_t>;
114 | template class CommonMemPool<void>;


--------------------------------------------------------------------------------
/src/mempool.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include <map>
18 | 
19 | #define MEMPOOL_CHECK_RETURN(var) {if(!var){fprintf(stderr, "Err in file %s line %d\n", __FILE__, __LINE__);return false;}}
20 | 
21 | template<typename PTR_TYPE>
22 | class CommonMemPool
23 | {
24 |     public:
25 |         CommonMemPool(): common_size(0), allocated_size(0), common_memory(NULL) {}
26 |         ~CommonMemPool();
27 | 
28 |         bool Request(size_t size_byte);
29 |         bool GetPtr(PTR_TYPE ** ptr);
30 |         bool Reset();
31 |         bool Free();
32 |         bool Alloc();
33 | 
34 |     private:
35 |         //Default common memory pool
36 |         size_t common_size;
37 |         size_t allocated_size;
38 |         PTR_TYPE * common_memory;
39 | 
40 |         //Map common ID to size
41 |         std::map<size_t, size_t> common_size_map;
42 |         //Map common ID to pointer
43 |         std::map<size_t, PTR_TYPE *> common_ptr_map;
44 | };


--------------------------------------------------------------------------------
/src/ncnn/allocator.cpp:
--------------------------------------------------------------------------------
  1 | // Tencent is pleased to support the open source community by making ncnn available.
  2 | //
  3 | // Copyright (C) 2018 THL A29 Limited, a Tencent company. All rights reserved.
  4 | //
  5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | // in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | // https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | // Unless required by applicable law or agreed to in writing, software distributed
 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | // specific language governing permissions and limitations under the License.
 14 | 
 15 | #include "allocator.h"
 16 | 
 17 | #include <stdio.h>
 18 | #include <algorithm>
 19 | #include "gpu.h"
 20 | 
 21 | namespace ncnn {
 22 | 
 23 | Allocator::~Allocator() 
 24 | {
 25 | 
 26 | }
 27 | 
 28 | PoolAllocator::PoolAllocator()
 29 | {
 30 |     size_compare_ratio = 192;// 0.75f * 256
 31 | }
 32 | 
 33 | PoolAllocator::~PoolAllocator()
 34 | {
 35 |     clear();
 36 | 
 37 |     if (!payouts.empty())
 38 |     {
 39 |         fprintf(stderr, "FATAL ERROR! pool allocator destroyed too early\n");
 40 |         std::list< std::pair<size_t, void*> >::iterator it = payouts.begin();
 41 |         for (; it != payouts.end(); it++)
 42 |         {
 43 |             void* ptr = it->second;
 44 |             fprintf(stderr, "%p still in use\n", ptr);
 45 |         }
 46 |     }
 47 | }
 48 | 
 49 | void PoolAllocator::clear()
 50 | {
 51 |     budgets_lock.lock();
 52 | 
 53 |     std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
 54 |     for (; it != budgets.end(); it++)
 55 |     {
 56 |         void* ptr = it->second;
 57 |         ncnn::fastFree(ptr);
 58 |     }
 59 |     budgets.clear();
 60 | 
 61 |     budgets_lock.unlock();
 62 | }
 63 | 
 64 | void PoolAllocator::set_size_compare_ratio(float scr)
 65 | {
 66 |     if (scr < 0.f || scr > 1.f)
 67 |     {
 68 |         fprintf(stderr, "invalid size compare ratio %f\n", scr);
 69 |         return;
 70 |     }
 71 | 
 72 |     size_compare_ratio = (unsigned int)(scr * 256);
 73 | }
 74 | 
 75 | void* PoolAllocator::fastMalloc(size_t size)
 76 | {
 77 |     budgets_lock.lock();
 78 | 
 79 |     // find free budget
 80 |     std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
 81 |     for (; it != budgets.end(); it++)
 82 |     {
 83 |         size_t bs = it->first;
 84 | 
 85 |         // size_compare_ratio ~ 100%
 86 |         if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size)
 87 |         {
 88 |             void* ptr = it->second;
 89 | 
 90 |             budgets.erase(it);
 91 | 
 92 |             budgets_lock.unlock();
 93 | 
 94 |             payouts_lock.lock();
 95 | 
 96 |             payouts.push_back(std::make_pair(bs, ptr));
 97 | 
 98 |             payouts_lock.unlock();
 99 | 
100 |             return ptr;
101 |         }
102 |     }
103 | 
104 |     budgets_lock.unlock();
105 | 
106 |     // new
107 |     void* ptr = ncnn::fastMalloc(size);
108 | 
109 |     payouts_lock.lock();
110 | 
111 |     payouts.push_back(std::make_pair(size, ptr));
112 | 
113 |     payouts_lock.unlock();
114 | 
115 |     return ptr;
116 | }
117 | 
118 | void PoolAllocator::fastFree(void* ptr)
119 | {
120 |     payouts_lock.lock();
121 | 
122 |     // return to budgets
123 |     std::list< std::pair<size_t, void*> >::iterator it = payouts.begin();
124 |     for (; it != payouts.end(); it++)
125 |     {
126 |         if (it->second == ptr)
127 |         {
128 |             size_t size = it->first;
129 | 
130 |             payouts.erase(it);
131 | 
132 |             payouts_lock.unlock();
133 | 
134 |             budgets_lock.lock();
135 | 
136 |             budgets.push_back(std::make_pair(size, ptr));
137 | 
138 |             budgets_lock.unlock();
139 | 
140 |             return;
141 |         }
142 |     }
143 | 
144 |     payouts_lock.unlock();
145 | 
146 |     fprintf(stderr, "FATAL ERROR! pool allocator get wild %p\n", ptr);
147 |     ncnn::fastFree(ptr);
148 | }
149 | 
150 | UnlockedPoolAllocator::UnlockedPoolAllocator()
151 | {
152 |     size_compare_ratio = 192;// 0.75f * 256
153 | }
154 | 
155 | UnlockedPoolAllocator::~UnlockedPoolAllocator()
156 | {
157 |     clear();
158 | 
159 |     if (!payouts.empty())
160 |     {
161 |         fprintf(stderr, "FATAL ERROR! unlocked pool allocator destroyed too early\n");
162 |         std::list< std::pair<size_t, void*> >::iterator it = payouts.begin();
163 |         for (; it != payouts.end(); it++)
164 |         {
165 |             void* ptr = it->second;
166 |             fprintf(stderr, "%p still in use\n", ptr);
167 |         }
168 |     }
169 | }
170 | 
171 | void UnlockedPoolAllocator::clear()
172 | {
173 |     std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
174 |     for (; it != budgets.end(); it++)
175 |     {
176 |         void* ptr = it->second;
177 |         ncnn::fastFree(ptr);
178 |     }
179 |     budgets.clear();
180 | }
181 | 
182 | void UnlockedPoolAllocator::set_size_compare_ratio(float scr)
183 | {
184 |     if (scr < 0.f || scr > 1.f)
185 |     {
186 |         fprintf(stderr, "invalid size compare ratio %f\n", scr);
187 |         return;
188 |     }
189 | 
190 |     size_compare_ratio = (unsigned int)(scr * 256);
191 | }
192 | 
193 | void* UnlockedPoolAllocator::fastMalloc(size_t size)
194 | {
195 |     // find free budget
196 |     std::list< std::pair<size_t, void*> >::iterator it = budgets.begin();
197 |     for (; it != budgets.end(); it++)
198 |     {
199 |         size_t bs = it->first;
200 | 
201 |         // size_compare_ratio ~ 100%
202 |         if (bs >= size && ((bs * size_compare_ratio) >> 8) <= size)
203 |         {
204 |             void* ptr = it->second;
205 | 
206 |             budgets.erase(it);
207 | 
208 |             payouts.push_back(std::make_pair(bs, ptr));
209 | 
210 |             return ptr;
211 |         }
212 |     }
213 | 
214 |     // new
215 |     void* ptr = ncnn::fastMalloc(size);
216 | 
217 |     payouts.push_back(std::make_pair(size, ptr));
218 | 
219 |     return ptr;
220 | }
221 | 
222 | void UnlockedPoolAllocator::fastFree(void* ptr)
223 | {
224 |     // return to budgets
225 |     std::list< std::pair<size_t, void*> >::iterator it = payouts.begin();
226 |     for (; it != payouts.end(); it++)
227 |     {
228 |         if (it->second == ptr)
229 |         {
230 |             size_t size = it->first;
231 | 
232 |             payouts.erase(it);
233 | 
234 |             budgets.push_back(std::make_pair(size, ptr));
235 | 
236 |             return;
237 |         }
238 |     }
239 | 
240 |     fprintf(stderr, "FATAL ERROR! unlocked pool allocator get wild %p\n", ptr);
241 |     ncnn::fastFree(ptr);
242 | }
243 | 
244 | } // namespace ncnn
245 | 


--------------------------------------------------------------------------------
/src/ncnn/mat.cpp:
--------------------------------------------------------------------------------
  1 | // Tencent is pleased to support the open source community by making ncnn available.
  2 | //
  3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4 | //
  5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | // in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | // https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | // Unless required by applicable law or agreed to in writing, software distributed
 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | // specific language governing permissions and limitations under the License.
 14 | 
 15 | #include "mat.h"
 16 | 
 17 | #if __ARM_NEON
 18 | #include <arm_neon.h>
 19 | #endif // __ARM_NEON
 20 | #include <math.h>
 21 | 
 22 | // #include "cpu.h"
 23 | 
 24 | // #include "layer_type.h"
 25 | // #include "layer.h"
 26 | 
 27 | namespace ncnn {
 28 | 
 29 | #ifndef COMPILE_WITH_FEATHERCNN
 30 | void Mat::substract_mean_normalize(const float* mean_vals, const float* norm_vals)
 31 | {
 32 |     ncnn::Layer* op;
 33 | 
 34 |     if (mean_vals && !norm_vals)
 35 |     {
 36 |         // substract mean only
 37 |         op = ncnn::create_layer(ncnn::LayerType::Bias);
 38 | 
 39 |         ncnn::ParamDict pd;
 40 |         pd.set(0, c);
 41 | 
 42 |         op->load_param(pd);
 43 | 
 44 |         ncnn::Mat weights[1];
 45 |         weights[0] = Mat(c);
 46 |         for (int q=0; q<c; q++)
 47 |         {
 48 |             weights[0][q] = -mean_vals[q];
 49 |         }
 50 | 
 51 |         op->load_model(ncnn::ModelBinFromMatArray(weights));
 52 |     }
 53 |     else if (!mean_vals && norm_vals)
 54 |     {
 55 |         // normalize only
 56 |         op = ncnn::create_layer(ncnn::LayerType::Scale);
 57 | 
 58 |         ncnn::ParamDict pd;
 59 |         pd.set(0, c);
 60 | 
 61 |         op->load_param(pd);
 62 | 
 63 |         ncnn::Mat weights[1];
 64 |         weights[0] = Mat(c);
 65 |         for (int q=0; q<c; q++)
 66 |         {
 67 |             weights[0][q] = norm_vals[q];
 68 |         }
 69 | 
 70 |         op->load_model(ncnn::ModelBinFromMatArray(weights));
 71 |     }
 72 |     else if (mean_vals && norm_vals)
 73 |     {
 74 |         // substract mean and normalize
 75 |         op = ncnn::create_layer(ncnn::LayerType::Scale);
 76 | 
 77 |         ncnn::ParamDict pd;
 78 |         pd.set(0, c);
 79 |         pd.set(1, 1);
 80 | 
 81 |         op->load_param(pd);
 82 | 
 83 |         ncnn::Mat weights[2];
 84 |         weights[0] = Mat(c);
 85 |         weights[1] = Mat(c);
 86 |         for (int q=0; q<c; q++)
 87 |         {
 88 |             weights[0][q] = norm_vals[q];
 89 |             weights[1][q] = - mean_vals[q] * norm_vals[q];
 90 |         }
 91 | 
 92 |         op->load_model(ncnn::ModelBinFromMatArray(weights));
 93 |     }
 94 |     else // if (!mean_vals && !norm_vals)
 95 |     {
 96 |         return;
 97 |     }
 98 | 
 99 |     op->forward_inplace(*this, ncnn::get_default_option());
100 | 
101 |     delete op;
102 | }
103 | #endif
104 | // convert half precision floating point to float
105 | static float half2float(unsigned short value)
106 | {
107 |     // 1 : 5 : 10
108 |     unsigned short sign = (value & 0x8000) >> 15;
109 |     unsigned short exponent = (value & 0x7c00) >> 10;
110 |     unsigned short significand = value & 0x03FF;
111 | 
112 | //     fprintf(stderr, "%d %d %d\n", sign, exponent, significand);
113 | 
114 |     // 1 : 8 : 23
115 |     union
116 |     {
117 |         unsigned int u;
118 |         float f;
119 |     } tmp;
120 |     if (exponent == 0)
121 |     {
122 |         if (significand == 0)
123 |         {
124 |             // zero
125 |             tmp.u = (sign << 31);
126 |         }
127 |         else
128 |         {
129 |             // denormal
130 |             exponent = 0;
131 |             // find non-zero bit
132 |             while ((significand & 0x200) == 0)
133 |             {
134 |                 significand <<= 1;
135 |                 exponent++;
136 |             }
137 |             significand <<= 1;
138 |             significand &= 0x3FF;
139 |             tmp.u = (sign << 31) | ((-exponent + (-15 + 127)) << 23) | (significand << 13);
140 |         }
141 |     }
142 |     else if (exponent == 0x1F)
143 |     {
144 |         // infinity or NaN
145 |         tmp.u = (sign << 31) | (0xFF << 23) | (significand << 13);
146 |     }
147 |     else
148 |     {
149 |         // normalized
150 |         tmp.u = (sign << 31) | ((exponent + (-15 + 127)) << 23) | (significand << 13);
151 |     }
152 | 
153 |     return tmp.f;
154 | }
155 | 
156 | Mat Mat::from_float16(const unsigned short* data, int size)
157 | {
158 |     Mat m(size);
159 |     if (m.empty())
160 |         return m;
161 | 
162 |     float* ptr = m;//.data;
163 | 
164 | #if __ARM_NEON && (__ARM_FP & 2)
165 |     int nn = cpu_support_arm_vfpv4() ? size >> 2 : 0;
166 |     int remain = size - (nn << 2);
167 | #else
168 |     int remain = size;
169 | #endif // __ARM_NEON
170 | 
171 | #if __ARM_NEON && (__ARM_FP & 2)
172 | #if __aarch64__
173 |     if (nn > 0)
174 |     {
175 |     asm volatile(
176 |         "0:                             \n"
177 |         "ld1    {v0.4h}, [%1], #8       \n"
178 |         "fcvtl  v1.4s, v0.4h            \n"
179 |         "subs   %w0, %w0, #1            \n"
180 |         "st1    {v1.4s}, [%2], #16      \n"
181 |         "bne    0b                      \n"
182 |         : "=r"(nn),     // %0
183 |           "=r"(data),   // %1
184 |           "=r"(ptr)     // %2
185 |         : "0"(nn),
186 |           "1"(data),
187 |           "2"(ptr)
188 |         : "cc", "memory", "v0", "v1"
189 |     );
190 |     }
191 | #else
192 |     if (nn > 0)
193 |     {
194 |     asm volatile(
195 |         "0:                             \n"
196 |         "pld        [%1, #64]           \n"
197 |         "vld1.s16   {d0}, [%1 :64]!     \n"
198 |         "vcvt.f32.f16 q1, d0            \n"
199 |         "subs       %0, #1              \n"
200 |         "vst1.f32   {d2-d3}, [%2 :128]! \n"
201 |         "bne        0b                  \n"
202 |         : "=r"(nn),     // %0
203 |           "=r"(data),   // %1
204 |           "=r"(ptr)     // %2
205 |         : "0"(nn),
206 |           "1"(data),
207 |           "2"(ptr)
208 |         : "cc", "memory", "q0", "q1"
209 |     );
210 |     }
211 | #endif // __aarch64__
212 | #endif // __ARM_NEON
213 |     for (; remain>0; remain--)
214 |     {
215 |         *ptr = half2float(*data);
216 | 
217 |         data++;
218 |         ptr++;
219 |     }
220 | 
221 |     return m;
222 | }
223 | 
224 | #ifndef COMPILE_WITH_FEATHERCNN
225 | void copy_make_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int type, float v, Allocator* allocator, int num_threads)
226 | {
227 |     ncnn::Layer* padding = ncnn::create_layer(ncnn::LayerType::Padding);
228 | 
229 |     ncnn::ParamDict pd;
230 |     pd.set(0, top);
231 |     pd.set(1, bottom);
232 |     pd.set(2, left);
233 |     pd.set(3, right);
234 |     pd.set(4, type);
235 |     pd.set(5, v);
236 | 
237 |     padding->load_param(pd);
238 | 
239 |     ncnn::Option opt = ncnn::get_default_option();
240 |     opt.num_threads = num_threads;
241 |     opt.blob_allocator = allocator;
242 | 
243 |     padding->forward(src, dst, opt);
244 | 
245 |     delete padding;
246 | }
247 | 
248 | void copy_cut_border(const Mat& src, Mat& dst, int top, int bottom, int left, int right, Allocator* allocator, int num_threads)
249 | {
250 |     ncnn::Layer* crop = ncnn::create_layer(ncnn::LayerType::Crop);
251 | 
252 |     ncnn::ParamDict pd;
253 |     pd.set(0, left);
254 |     pd.set(1, top);
255 |     pd.set(2, 0);
256 |     pd.set(3, src.w - left - right);
257 |     pd.set(4, src.h - top - bottom);
258 |     pd.set(5, src.c);
259 | 
260 |     crop->load_param(pd);
261 | 
262 |     ncnn::Option opt = ncnn::get_default_option();
263 |     opt.num_threads = num_threads;
264 |     opt.blob_allocator = allocator;
265 | 
266 |     crop->forward(src, dst, opt);
267 | 
268 |     delete crop;
269 | }
270 | 
271 | void resize_bilinear(const Mat& src, Mat& dst, int w, int h, Allocator* allocator, int num_threads)
272 | {
273 |     ncnn::Layer* interp = ncnn::create_layer(ncnn::LayerType::Interp);
274 | 
275 |     ncnn::ParamDict pd;
276 |     pd.set(0, 2);
277 |     pd.set(3, h);
278 |     pd.set(4, w);
279 | 
280 |     interp->load_param(pd);
281 | 
282 |     ncnn::Option opt = ncnn::get_default_option();
283 |     opt.num_threads = num_threads;
284 |     opt.blob_allocator = allocator;
285 | 
286 |     interp->forward(src, dst, opt);
287 | 
288 |     delete interp;
289 | }
290 | 
291 | void resize_bicubic(const Mat& src, Mat& dst, int w, int h, Allocator* allocator, int num_threads)
292 | {
293 |     ncnn::Layer* interp = ncnn::create_layer(ncnn::LayerType::Interp);
294 | 
295 |     ncnn::ParamDict pd;
296 |     pd.set(0, 3);
297 |     pd.set(3, h);
298 |     pd.set(4, w);
299 | 
300 |     interp->load_param(pd);
301 | 
302 |     ncnn::Option opt = ncnn::get_default_option();
303 |     opt.num_threads = num_threads;
304 |     opt.blob_allocator = allocator;
305 | 
306 |     interp->forward(src, dst, opt);
307 | 
308 |     delete interp;
309 | }
310 | 
311 | void convert_packing(const Mat& src, Mat& dst, int _packing, Allocator* allocator, int num_threads)
312 | {
313 |     ncnn::Layer* packing = ncnn::create_layer(ncnn::LayerType::Packing);
314 | 
315 |     ncnn::ParamDict pd;
316 |     pd.set(0, _packing);
317 | 
318 |     packing->load_param(pd);
319 | 
320 |     ncnn::Option opt = ncnn::get_default_option();
321 |     opt.num_threads = num_threads;
322 |     opt.blob_allocator = allocator;
323 | 
324 |     packing->forward(src, dst, opt);
325 | 
326 |     delete packing;
327 | }
328 | 
329 | void cast_float32_to_float16(const Mat& src, Mat& dst, Allocator* allocator, int num_threads)
330 | {
331 |     ncnn::Layer* cast = ncnn::create_layer(ncnn::LayerType::Cast);
332 | 
333 |     ncnn::ParamDict pd;
334 |     pd.set(0, 1);
335 |     pd.set(1, 2);
336 | 
337 |     cast->load_param(pd);
338 | 
339 |     ncnn::Option opt = ncnn::get_default_option();
340 |     opt.num_threads = num_threads;
341 |     opt.blob_allocator = allocator;
342 | 
343 |     cast->forward(src, dst, opt);
344 | 
345 |     delete cast;
346 | }
347 | 
348 | void cast_float16_to_float32(const Mat& src, Mat& dst, Allocator* allocator, int num_threads)
349 | {
350 |     ncnn::Layer* cast = ncnn::create_layer(ncnn::LayerType::Cast);
351 | 
352 |     ncnn::ParamDict pd;
353 |     pd.set(0, 2);
354 |     pd.set(1, 1);
355 | 
356 |     cast->load_param(pd);
357 | 
358 |     ncnn::Option opt = ncnn::get_default_option();
359 |     opt.num_threads = num_threads;
360 |     opt.blob_allocator = allocator;
361 | 
362 |     cast->forward(src, dst, opt);
363 | 
364 |     delete cast;
365 | }
366 | #endif
367 | } // namespace ncnn
368 | 


--------------------------------------------------------------------------------
/src/ncnn/modelbin.cpp:
--------------------------------------------------------------------------------
  1 | // Tencent is pleased to support the open source community by making ncnn available.
  2 | //
  3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4 | //
  5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | // in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | // https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | // Unless required by applicable law or agreed to in writing, software distributed
 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | // specific language governing permissions and limitations under the License.
 14 | 
 15 | #include "modelbin.h"
 16 | 
 17 | #include <stdio.h>
 18 | #include <string.h>
 19 | #include <vector>
 20 | #include "platform.h"
 21 | 
 22 | namespace ncnn {
 23 | 
 24 | Mat ModelBin::load(int w, int h, int type) const
 25 | {
 26 |     Mat m = load(w * h, type);
 27 |     if (m.empty())
 28 |         return m;
 29 | 
 30 |     return m.reshape(w, h);
 31 | }
 32 | 
 33 | Mat ModelBin::load(int w, int h, int c, int type) const
 34 | {
 35 |     Mat m = load(w * h * c, type);
 36 |     if (m.empty())
 37 |         return m;
 38 | 
 39 |     return m.reshape(w, h, c);
 40 | }
 41 | 
 42 | #if NCNN_STDIO
 43 | ModelBinFromStdio::ModelBinFromStdio(FILE* _binfp) : binfp(_binfp)
 44 | {
 45 | }
 46 | 
 47 | Mat ModelBinFromStdio::load(int w, int type) const
 48 | {
 49 |     if (!binfp)
 50 |         return Mat();
 51 | 
 52 |     if (type == 0)
 53 |     {
 54 |         int nread;
 55 | 
 56 |         union
 57 |         {
 58 |             struct
 59 |             {
 60 |                 unsigned char f0;
 61 |                 unsigned char f1;
 62 |                 unsigned char f2;
 63 |                 unsigned char f3;
 64 |             };
 65 |             unsigned int tag;
 66 |         } flag_struct;
 67 | 
 68 |         nread = fread(&flag_struct, sizeof(flag_struct), 1, binfp);
 69 |         if (nread != 1)
 70 |         {
 71 |             fprintf(stderr, "ModelBin read flag_struct failed %d\n", nread);
 72 |             return Mat();
 73 |         }
 74 | 
 75 |         unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
 76 | 
 77 |         if (flag_struct.tag == 0x01306B47)
 78 |         {
 79 |             // half-precision data
 80 |             int align_data_size = alignSize(w * sizeof(unsigned short), 4);
 81 |             std::vector<unsigned short> float16_weights;
 82 |             float16_weights.resize(align_data_size);
 83 |             nread = fread(float16_weights.data(), align_data_size, 1, binfp);
 84 |             if (nread != 1)
 85 |             {
 86 |                 fprintf(stderr, "ModelBin read float16_weights failed %d\n", nread);
 87 |                 return Mat();
 88 |             }
 89 | 
 90 |             return Mat::from_float16(float16_weights.data(), w);
 91 |         }
 92 |         else if (flag_struct.tag == 0x000D4B38)
 93 |         {
 94 |             // int8 data
 95 |             int align_data_size = alignSize(w, 4);
 96 |             std::vector<signed char> int8_weights;
 97 |             int8_weights.resize(align_data_size);
 98 |             nread = fread(int8_weights.data(), align_data_size, 1, binfp);
 99 |             if (nread != 1)
100 |             {
101 |                 fprintf(stderr, "ModelBin read int8_weights failed %d\n", nread);
102 |                 return Mat();
103 |             }
104 | 
105 |             Mat m(w, (size_t)1u);
106 |             if (m.empty())
107 |                 return m;
108 | 
109 |             memcpy(m.data, int8_weights.data(), w);
110 | 
111 |             return m;
112 |         }
113 |         else if (flag_struct.tag == 0x0002C056)
114 |         {
115 |             Mat m(w);
116 |             if (m.empty())
117 |                 return m;
118 | 
119 |             // raw data with extra scaling
120 |             nread = fread(m, sizeof(float), w, binfp);
121 |             if (nread != 1)
122 |             {
123 |                 fprintf(stderr, "ModelBin read weight_data failed %d\n", nread);
124 |                 return Mat();
125 |             }
126 | 
127 |             return m;
128 |         }
129 | 
130 |         Mat m(w);
131 |         if (m.empty())
132 |             return m;
133 | 
134 |         if (flag != 0)
135 |         {
136 |             // quantized data
137 |             float quantization_value[256];
138 |             nread = fread(quantization_value, 256 * sizeof(float), 1, binfp);
139 |             if (nread != 1)
140 |             {
141 |                 fprintf(stderr, "ModelBin read quantization_value failed %d\n", nread);
142 |                 return Mat();
143 |             }
144 | 
145 |             int align_weight_data_size = alignSize(w * sizeof(unsigned char), 4);
146 |             std::vector<unsigned char> index_array;
147 |             index_array.resize(align_weight_data_size);
148 |             nread = fread(index_array.data(), align_weight_data_size, 1, binfp);
149 |             if (nread != 1)
150 |             {
151 |                 fprintf(stderr, "ModelBin read index_array failed %d\n", nread);
152 |                 return Mat();
153 |             }
154 | 
155 |             float* ptr = m;
156 |             for (int i = 0; i < w; i++)
157 |             {
158 |                 ptr[i] = quantization_value[ index_array[i] ];
159 |             }
160 |         }
161 |         else if (flag_struct.f0 == 0)
162 |         {
163 |             // raw data
164 |             nread = fread(m, w * sizeof(float), 1, binfp);
165 |             if (nread != 1)
166 |             {
167 |                 fprintf(stderr, "ModelBin read weight_data failed %d\n", nread);
168 |                 return Mat();
169 |             }
170 |         }
171 | 
172 |         return m;
173 |     }
174 |     else if (type == 1)
175 |     {
176 |         Mat m(w);
177 |         if (m.empty())
178 |             return m;
179 | 
180 |         // raw data
181 |         int nread = fread(m, w * sizeof(float), 1, binfp);
182 |         if (nread != 1)
183 |         {
184 |             fprintf(stderr, "ModelBin read weight_data failed %d\n", nread);
185 |             return Mat();
186 |         }
187 | 
188 |         return m;
189 |     }
190 |     else
191 |     {
192 |         fprintf(stderr, "ModelBin load type %d not implemented\n", type);
193 |         return Mat();
194 |     }
195 | 
196 |     return Mat();
197 | }
198 | #endif // NCNN_STDIO
199 | 
200 | ModelBinFromMemory::ModelBinFromMemory(const unsigned char*& _mem) : mem(_mem)
201 | {
202 | }
203 | 
204 | Mat ModelBinFromMemory::load(int w, int type) const
205 | {
206 |     if (!mem)
207 |         return Mat();
208 | 
209 |     if (type == 0)
210 |     {
211 |         union
212 |         {
213 |             struct
214 |             {
215 |                 unsigned char f0;
216 |                 unsigned char f1;
217 |                 unsigned char f2;
218 |                 unsigned char f3;
219 |             };
220 |             unsigned int tag;
221 |         } flag_struct;
222 | 
223 |         memcpy(&flag_struct, mem, sizeof(flag_struct));
224 |         mem += sizeof(flag_struct);
225 | 
226 |         unsigned int flag = flag_struct.f0 + flag_struct.f1 + flag_struct.f2 + flag_struct.f3;
227 | 
228 |         if (flag_struct.tag == 0x01306B47)
229 |         {
230 |             // half-precision data
231 |             Mat m = Mat::from_float16((unsigned short*)mem, w);
232 |             mem += alignSize(w * sizeof(unsigned short), 4);
233 |             return m;
234 |         }
235 |         else if (flag_struct.tag == 0x000D4B38)
236 |         {
237 |             // int8 data
238 |             Mat m = Mat(w, (signed char*)mem, 1u);
239 |             mem += alignSize(w, 4);
240 |             return m;
241 |         }
242 |         else if (flag_struct.tag == 0x0002C056)
243 |         {
244 |             // raw data with extra scaling
245 |             Mat m = Mat(w, (float*)mem);
246 |             mem += w * sizeof(float);
247 |             return m;
248 |         }
249 | 
250 |         if (flag != 0)
251 |         {
252 |             // quantized data
253 |             const float* quantization_value = (const float*)mem;
254 |             mem += 256 * sizeof(float);
255 | 
256 |             const unsigned char* index_array = (const unsigned char*)mem;
257 |             mem += alignSize(w * sizeof(unsigned char), 4);
258 | 
259 |             Mat m(w);
260 |             if (m.empty())
261 |                 return m;
262 | 
263 |             float* ptr = m;
264 |             for (int i = 0; i < w; i++)
265 |             {
266 |                 ptr[i] = quantization_value[ index_array[i] ];
267 |             }
268 | 
269 |             return m;
270 |         }
271 |         else if (flag_struct.f0 == 0)
272 |         {
273 |             // raw data
274 |             Mat m = Mat(w, (float*)mem);
275 |             mem += w * sizeof(float);
276 |             return m;
277 |         }
278 |     }
279 |     else if (type == 1)
280 |     {
281 |         // raw data
282 |         Mat m = Mat(w, (float*)mem);
283 |         mem += w * sizeof(float);
284 |         return m;
285 |     }
286 |     else
287 |     {
288 |         fprintf(stderr, "ModelBin load type %d not implemented\n", type);
289 |         return Mat();
290 |     }
291 | 
292 |     return Mat();
293 | }
294 | 
295 | ModelBinFromMatArray::ModelBinFromMatArray(const Mat* _weights) : weights(_weights)
296 | {
297 | }
298 | 
299 | Mat ModelBinFromMatArray::load(int /*w*/, int /*type*/) const
300 | {
301 |     if (!weights)
302 |         return Mat();
303 | 
304 |     Mat m = weights[0];
305 |     weights++;
306 |     return m;
307 | }
308 | 
309 | } // namespace ncnn
310 | 


--------------------------------------------------------------------------------
/src/ncnn/modelbin.h:
--------------------------------------------------------------------------------
 1 | // Tencent is pleased to support the open source community by making ncnn available.
 2 | //
 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 4 | //
 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | // in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | // https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | // Unless required by applicable law or agreed to in writing, software distributed
11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | // specific language governing permissions and limitations under the License.
14 | 
15 | #ifndef NCNN_MODELBIN_H
16 | #define NCNN_MODELBIN_H
17 | 
18 | #include <stdio.h>
19 | #include "mat.h"
20 | #include "platform.h"
21 | 
22 | namespace ncnn {
23 | 
24 | class Net;
25 | class ModelBin
26 | {
27 | public:
28 |     // element type
29 |     // 0 = auto
30 |     // 1 = float32
31 |     // 2 = float16
32 |     // 3 = int8
33 |     // load vec
34 |     virtual Mat load(int w, int type) const = 0;
35 |     // load image
36 |     virtual Mat load(int w, int h, int type) const;
37 |     // load dim
38 |     virtual Mat load(int w, int h, int c, int type) const;
39 | };
40 | 
41 | #if NCNN_STDIO
42 | class ModelBinFromStdio : public ModelBin
43 | {
44 | public:
45 |     // construct from file
46 |     ModelBinFromStdio(FILE* binfp);
47 | 
48 |     virtual Mat load(int w, int type) const;
49 | 
50 | protected:
51 |     FILE* binfp;
52 | };
53 | #endif // NCNN_STDIO
54 | 
55 | class ModelBinFromMemory : public ModelBin
56 | {
57 | public:
58 |     // construct from external memory
59 |     ModelBinFromMemory(const unsigned char*& mem);
60 | 
61 |     virtual Mat load(int w, int type) const;
62 | 
63 | protected:
64 |     const unsigned char*& mem;
65 | };
66 | 
67 | class ModelBinFromMatArray : public ModelBin
68 | {
69 | public:
70 |     // construct from weight blob array
71 |     ModelBinFromMatArray(const Mat* weights);
72 | 
73 |     virtual Mat load(int w, int type) const;
74 | 
75 | protected:
76 |     mutable const Mat* weights;
77 | };
78 | 
79 | } // namespace ncnn
80 | 
81 | #endif // NCNN_MODELBIN_H
82 | 


--------------------------------------------------------------------------------
/src/ncnn/paramdict.cpp:
--------------------------------------------------------------------------------
  1 | // Tencent is pleased to support the open source community by making ncnn available.
  2 | //
  3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
  4 | //
  5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | // in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | // https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | // Unless required by applicable law or agreed to in writing, software distributed
 11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | // specific language governing permissions and limitations under the License.
 14 | 
 15 | #include <ctype.h>
 16 | #include <stdarg.h>
 17 | #include <stdio.h>
 18 | #include "paramdict.h"
 19 | #include "platform.h"
 20 | 
 21 | namespace ncnn {
 22 | 
 23 | ParamDict::ParamDict()
 24 | {
 25 |     use_winograd_convolution = 1;
 26 |     use_sgemm_convolution = 1;
 27 |     use_int8_inference = 1;
 28 |     use_vulkan_compute = 0;
 29 | 
 30 |     clear();
 31 | }
 32 | 
 33 | int ParamDict::get(int id, int def) const
 34 | {
 35 |     return params[id].loaded ? params[id].i : def;
 36 | }
 37 | 
 38 | float ParamDict::get(int id, float def) const
 39 | {
 40 |     return params[id].loaded ? params[id].f : def;
 41 | }
 42 | 
 43 | Mat ParamDict::get(int id, const Mat& def) const
 44 | {
 45 |     return params[id].loaded ? params[id].v : def;
 46 | }
 47 | 
 48 | void ParamDict::set(int id, int i)
 49 | {
 50 |     params[id].loaded = 1;
 51 |     params[id].i = i;
 52 | }
 53 | 
 54 | void ParamDict::set(int id, float f)
 55 | {
 56 |     params[id].loaded = 1;
 57 |     params[id].f = f;
 58 | }
 59 | 
 60 | void ParamDict::set(int id, const Mat& v)
 61 | {
 62 |     params[id].loaded = 1;
 63 |     params[id].v = v;
 64 | }
 65 | 
 66 | void ParamDict::clear()
 67 | {
 68 |     for (int i = 0; i < NCNN_MAX_PARAM_COUNT; i++)
 69 |     {
 70 |         params[i].loaded = 0;
 71 |         params[i].v = Mat();
 72 |     }
 73 | }
 74 | 
 75 | #if NCNN_STDIO
 76 | #if NCNN_STRING
 77 | static bool vstr_is_float(const char vstr[16])
 78 | {
 79 |     // look ahead for determine isfloat
 80 |     for (int j=0; j<16; j++)
 81 |     {
 82 |         if (vstr[j] == '\0')
 83 |             break;
 84 | 
 85 |         if (vstr[j] == '.' || tolower(vstr[j]) == 'e')
 86 |             return true;
 87 |     }
 88 | 
 89 |     return false;
 90 | }
 91 | 
 92 | int ParamDict::load_param(FILE* fp)
 93 | {
 94 |     clear();
 95 | 
 96 | //     0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
 97 | 
 98 |     // parse each key=value pair
 99 |     int id = 0;
100 |     while (fscanf(fp, "%d=", &id) == 1)
101 |     {
102 |         bool is_array = id <= -23300;
103 |         if (is_array)
104 |         {
105 |             id = -id - 23300;
106 |         }
107 | 
108 |         if (is_array)
109 |         {
110 |             int len = 0;
111 |             int nscan = fscanf(fp, "%d", &len);
112 |             if (nscan != 1)
113 |             {
114 |                 fprintf(stderr, "ParamDict read array length fail\n");
115 |                 return -1;
116 |             }
117 | 
118 |             params[id].v.create(len);
119 | 
120 |             for (int j = 0; j < len; j++)
121 |             {
122 |                 char vstr[16];
123 |                 nscan = fscanf(fp, ",%15[^,\n ]", vstr);
124 |                 if (nscan != 1)
125 |                 {
126 |                     fprintf(stderr, "ParamDict read array element fail\n");
127 |                     return -1;
128 |                 }
129 | 
130 |                 bool is_float = vstr_is_float(vstr);
131 | 
132 |                 if (is_float)
133 |                 {
134 |                     float* ptr = params[id].v;
135 |                     nscan = sscanf(vstr, "%f", &ptr[j]);
136 |                 }
137 |                 else
138 |                 {
139 |                     int* ptr = params[id].v;
140 |                     nscan = sscanf(vstr, "%d", &ptr[j]);
141 |                 }
142 |                 if (nscan != 1)
143 |                 {
144 |                     fprintf(stderr, "ParamDict parse array element fail\n");
145 |                     return -1;
146 |                 }
147 |             }
148 |         }
149 |         else
150 |         {
151 |             char vstr[16];
152 |             int nscan = fscanf(fp, "%15s", vstr);
153 |             if (nscan != 1)
154 |             {
155 |                 fprintf(stderr, "ParamDict read value fail\n");
156 |                 return -1;
157 |             }
158 | 
159 |             bool is_float = vstr_is_float(vstr);
160 | 
161 |             if (is_float)
162 |                 nscan = sscanf(vstr, "%f", &params[id].f);
163 |             else
164 |                 nscan = sscanf(vstr, "%d", &params[id].i);
165 |             if (nscan != 1)
166 |             {
167 |                 fprintf(stderr, "ParamDict parse value fail\n");
168 |                 return -1;
169 |             }
170 |         }
171 |         params[id].loaded = 1;
172 |     }
173 |     return 0;
174 | }
175 | 
176 | #if _MSC_VER
177 | static inline int mem_sscanf_with_n(int* _internal_nconsumed_ptr, const char*& ptr, const char* format, ...)
178 | {
179 |     *_internal_nconsumed_ptr = 0;
180 | 
181 |     va_list args;
182 |     va_start(args, format);
183 | 
184 |     int _n = vsscanf(ptr, format, args);
185 | 
186 |     va_end(args);
187 | 
188 |     ptr += *_internal_nconsumed_ptr;
189 | 
190 |     return *_internal_nconsumed_ptr > 0 ? _n : 0;
191 | }
192 | #define mem_sscanf(ptr, format, ...)  mem_sscanf_with_n(&_internal_nconsumed, ptr, format "%n", __VA_ARGS__, &_internal_nconsumed)
193 | #else
194 | // return value from macro requires gcc extension https://gcc.gnu.org/onlinedocs/gcc/Statement-Exprs.html
195 | #define mem_sscanf(ptr, format, ...)  ({int _b=0; int _n = sscanf(ptr, format "%n", __VA_ARGS__, &_b); ptr+=_b;_b>0?_n:0;})
196 | #endif // _MSC_VER
197 | 
198 | int ParamDict::load_param_mem(const char*& mem)
199 | {
200 | #if _MSC_VER
201 |     int _internal_nconsumed;
202 | #endif
203 | 
204 |     clear();
205 | 
206 | //     0=100 1=1.250000 -23303=5,0.1,0.2,0.4,0.8,1.0
207 | 
208 |     // parse each key=value pair
209 |     int id = 0;
210 |     while (mem_sscanf(mem, "%d=", &id) == 1)
211 |     {
212 |         bool is_array = id <= -23300;
213 |         if (is_array)
214 |         {
215 |             id = -id - 23300;
216 |         }
217 | 
218 |         if (is_array)
219 |         {
220 |             int len = 0;
221 |             int nscan = mem_sscanf(mem, "%d", &len);
222 |             if (nscan != 1)
223 |             {
224 |                 fprintf(stderr, "ParamDict read array length fail\n");
225 |                 return -1;
226 |             }
227 | 
228 |             params[id].v.create(len);
229 | 
230 |             for (int j = 0; j < len; j++)
231 |             {
232 |                 char vstr[16];
233 |                 nscan = mem_sscanf(mem, ",%15[^,\n ]", vstr);
234 |                 if (nscan != 1)
235 |                 {
236 |                     fprintf(stderr, "ParamDict read array element fail\n");
237 |                     return -1;
238 |                 }
239 | 
240 |                 bool is_float = vstr_is_float(vstr);
241 | 
242 |                 if (is_float)
243 |                 {
244 |                     float* ptr = params[id].v;
245 |                     nscan = sscanf(vstr, "%f", &ptr[j]);
246 |                 }
247 |                 else
248 |                 {
249 |                     int* ptr = params[id].v;
250 |                     nscan = sscanf(vstr, "%d", &ptr[j]);
251 |                 }
252 |                 if (nscan != 1)
253 |                 {
254 |                     fprintf(stderr, "ParamDict parse array element fail\n");
255 |                     return -1;
256 |                 }
257 |             }
258 |         }
259 |         else
260 |         {
261 |             char vstr[16];
262 |             int nscan = mem_sscanf(mem, "%15s", vstr);
263 |             if (nscan != 1)
264 |             {
265 |                 fprintf(stderr, "ParamDict read value fail\n");
266 |                 return -1;
267 |             }
268 | 
269 |             bool is_float = vstr_is_float(vstr);
270 | 
271 |             if (is_float)
272 |                 nscan = sscanf(vstr, "%f", &params[id].f);
273 |             else
274 |                 nscan = sscanf(vstr, "%d", &params[id].i);
275 |             if (nscan != 1)
276 |             {
277 |                 fprintf(stderr, "ParamDict parse value fail\n");
278 |                 return -1;
279 |             }
280 |         }
281 | 
282 |         params[id].loaded = 1;
283 |     }
284 |     return 0;
285 | }
286 | #endif // NCNN_STRING
287 | 
288 | int ParamDict::load_param_bin(FILE* fp)
289 | {
290 |     clear();
291 | 
292 | //     binary 0
293 | //     binary 100
294 | //     binary 1
295 | //     binary 1.250000
296 | //     binary 3 | array_bit
297 | //     binary 5
298 | //     binary 0.1
299 | //     binary 0.2
300 | //     binary 0.4
301 | //     binary 0.8
302 | //     binary 1.0
303 | //     binary -233(EOP)
304 | 
305 |     int id = 0;
306 |     fread(&id, sizeof(int), 1, fp);
307 | 
308 |     while (id != -233)
309 |     {
310 |         bool is_array = id <= -23300;
311 |         if (is_array)
312 |         {
313 |             id = -id - 23300;
314 |         }
315 | 
316 |         if (is_array)
317 |         {
318 |             int len = 0;
319 |             fread(&len, sizeof(int), 1, fp);
320 | 
321 |             params[id].v.create(len);
322 | 
323 |             float* ptr = params[id].v;
324 |             fread(ptr, sizeof(float), len, fp);
325 |         }
326 |         else
327 |         {
328 |             fread(&params[id].f, sizeof(float), 1, fp);
329 |         }
330 | 
331 |         params[id].loaded = 1;
332 | 
333 |         fread(&id, sizeof(int), 1, fp);
334 |     }
335 | 
336 |     return 0;
337 | }
338 | #endif // NCNN_STDIO
339 | 
340 | int ParamDict::load_param(const unsigned char*& mem)
341 | {
342 |     clear();
343 | 
344 |     int id = *(int*)(mem);
345 |     mem += 4;
346 | 
347 |     while (id != -233)
348 |     {
349 |         bool is_array = id <= -23300;
350 |         if (is_array)
351 |         {
352 |             id = -id - 23300;
353 |         }
354 | 
355 |         if (is_array)
356 |         {
357 |             int len = *(int*)(mem);
358 |             mem += 4;
359 | 
360 |             params[id].v.create(len);
361 | 
362 |             memcpy(params[id].v.data, mem, len * 4);
363 |             mem += len * 4;
364 |         }
365 |         else
366 |         {
367 |             params[id].f = *(float*)(mem);
368 |             mem += 4;
369 |         }
370 | 
371 |         params[id].loaded = 1;
372 | 
373 |         id = *(int*)(mem);
374 |         mem += 4;
375 |     }
376 | 
377 |     return 0;
378 | }
379 | 
380 | } // namespace ncnn
381 | 


--------------------------------------------------------------------------------
/src/ncnn/paramdict.h:
--------------------------------------------------------------------------------
 1 | // Tencent is pleased to support the open source community by making ncnn available.
 2 | //
 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 4 | //
 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | // in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | // https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | // Unless required by applicable law or agreed to in writing, software distributed
11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | // specific language governing permissions and limitations under the License.
14 | 
15 | #ifndef NCNN_PARAMDICT_H
16 | #define NCNN_PARAMDICT_H
17 | 
18 | #include <stdio.h>
19 | #include "mat.h"
20 | #include "platform.h"
21 | 
22 | // at most 20 parameters
23 | #define NCNN_MAX_PARAM_COUNT 20
24 | 
25 | namespace ncnn {
26 | 
27 | class Net;
28 | class ParamDict
29 | {
30 | public:
31 |     // empty
32 |     ParamDict();
33 | 
34 |     // get int
35 |     int get(int id, int def) const;
36 |     // get float
37 |     float get(int id, float def) const;
38 |     // get array
39 |     Mat get(int id, const Mat& def) const;
40 | 
41 |     // set int
42 |     void set(int id, int i);
43 |     // set float
44 |     void set(int id, float f);
45 |     // set array
46 |     void set(int id, const Mat& v);
47 | 
48 | public:
49 |     int use_winograd_convolution;
50 |     int use_sgemm_convolution;
51 |     int use_int8_inference;
52 |     int use_vulkan_compute;
53 | 
54 | public:
55 |     // friend class Net;
56 | 
57 |     void clear();
58 | 
59 | #if NCNN_STDIO
60 | #if NCNN_STRING
61 |     int load_param(FILE* fp);
62 |     int load_param_mem(const char*& mem);
63 | #endif // NCNN_STRING
64 |     int load_param_bin(FILE* fp);
65 | #endif // NCNN_STDIO
66 |     int load_param(const unsigned char*& mem);
67 | 
68 | protected:
69 |     struct
70 |     {
71 |         int loaded;
72 |         union { int i; float f; };
73 |         Mat v;
74 |     } params[NCNN_MAX_PARAM_COUNT];
75 | };
76 | 
77 | } // namespace ncnn
78 | 
79 | #endif // NCNN_PARAMDICT_H
80 | 


--------------------------------------------------------------------------------
/src/ncnn/platform.h:
--------------------------------------------------------------------------------
 1 | // Tencent is pleased to support the open source community by making ncnn available.
 2 | //
 3 | // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
 4 | //
 5 | // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | // in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | // https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | // Unless required by applicable law or agreed to in writing, software distributed
11 | // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | // specific language governing permissions and limitations under the License.
14 | 
15 | #ifndef NCNN_PLATFORM_H
16 | #define NCNN_PLATFORM_H
17 | 
18 | #define NCNN_STDIO 1
19 | #define NCNN_STRING 1
20 | #define NCNN_OPENCV 0
21 | #define NCNN_BENCHMARK 0
22 | #define NCNN_PIXEL 1
23 | #define NCNN_PIXEL_ROTATE 1
24 | #define NCNN_VULKAN 0
25 | #define NCNN_REQUANT 0
26 | #define NCNN_IM2COL_SGEMM 0
27 | #define COMPILE_WITH_FEATHERCNN
28 | #endif // NCNN_PLATFORM_H
29 | 


--------------------------------------------------------------------------------
/src/net.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include "layer.h"
18 | #include "rt_param.h"
19 | #include <vector>
20 | #include <iostream>
21 | #include <fstream>
22 | #include <vector>
23 | #include <random>
24 | #include <sstream>
25 | 
26 | #include <ncnn/mat.h>
27 | 
28 | namespace feather
29 | {
30 | class Net
31 | {
32 |     public:
33 |         Net();
34 | 
35 |         ~Net();
36 | 
37 |         int LoadParam(const char * param_path);
38 |         int LoadParam(FILE* fp);
39 |         int LoadWeights(const char * weights_path);
40 |         int LoadWeights(FILE* fp);
41 |         
42 |         // int FeedInput(const char* input_name, const int w, const int h, const int c, const float* input_data);
43 |         
44 |         int FeedInput(const char* input_name, ncnn::Mat& in);
45 | 
46 |         int Forward();
47 |           
48 |         int Extract(std::string blob_name, float** output_ptr, int* n, int *c, int* h, int* w);
49 | 
50 |         int Extract(std::string blob_name, ncnn::Mat& out);
51 |         
52 |         int BuildBlobMap();
53 |         
54 |         std::map<std::string, Blob<float> *> blob_map;
55 |     
56 |     private:
57 |         int InitLayers();
58 |         int Reshape();
59 |         RuntimeParameter<float> *rt_param;
60 |         std::vector<Layer *> layers;
61 | 
62 |         /* Flag varibles indicating Net status.
63 |          * 
64 |          * _weights_loaded: if Net has loaded the weights.
65 |          * _net_initialized: if the weights are already initialized.
66 |          */
67 |         int _param_loaded;
68 |         int _weights_loaded;
69 |         int _net_initialized;
70 | };
71 | }; // namespace feather
72 | 


--------------------------------------------------------------------------------
/src/rt_param.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | /*
16 |  * For runtime parameters
17 |  */
18 | 
19 | #pragma once
20 | 
21 | #include "blob.h"
22 | #include "mempool.h"
23 | #include "utils.h"
24 | #include <map>
25 | 
26 | template<typename Dtype>
27 | class RuntimeParameter
28 | {
29 |     public:
30 |         RuntimeParameter() : _common_mempool(NULL),
31 |             _num_threads(1)
32 |         {
33 |         }
34 |         RuntimeParameter(CommonMemPool<Dtype> *common_mempool, size_t num_threads)
35 |             : _common_mempool(common_mempool),
36 |               _num_threads(num_threads)
37 |         {
38 |         }
39 |         ~RuntimeParameter()
40 |         {
41 |         }
42 | 
43 |         CommonMemPool<Dtype> *common_mempool() const
44 |         {
45 |             return _common_mempool;
46 |         }
47 |         size_t num_threads() const
48 |         {
49 |             return _num_threads;
50 |         }
51 | 
52 |     private:
53 |         CommonMemPool<Dtype> *_common_mempool;
54 |         size_t _num_threads;
55 | };
56 | 


--------------------------------------------------------------------------------
/src/utils.cpp:
--------------------------------------------------------------------------------
  1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
  2 | 
  3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
  4 | 
  5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
  6 | //in compliance with the License. You may obtain a copy of the License at
  7 | //
  8 | //https://opensource.org/licenses/BSD-3-Clause
  9 | //
 10 | //Unless required by applicable law or agreed to in writing, software distributed
 11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
 12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
 13 | //specific language governing permissions and limitations under the License.
 14 | 
 15 | #include "utils.h"
 16 | // #include "booster/helper.h"
 17 | #include <cstring>
 18 | #include <vector>
 19 | #include <cstdlib>
 20 | #include <iostream>
 21 | #include <sstream>
 22 | #include <fstream>
 23 | #include <string>
 24 | 
 25 | using namespace std;
 26 | 
 27 | int ChkParamHeader(FILE* fp)
 28 | {
 29 |     fseek(fp, 0, SEEK_SET);
 30 |     int magic = 0;
 31 |     int nbr = fscanf(fp, "%d", &magic);
 32 |     if (nbr != 1)
 33 |     {
 34 |         fprintf(stderr, "issue with param file\n");
 35 |         return -1;
 36 |     }
 37 |     if (magic != 7767517)
 38 |     {
 39 |         fprintf(stderr, "param is too old, please regenerate\n");
 40 |         return -1;
 41 |     }
 42 |     return 0;
 43 | }
 44 | 
 45 | int min(int a, int b)
 46 | {
 47 |     return (a < b) ? a : b;
 48 | }
 49 | 
 50 | #if (defined(__linux__)  && !(defined(__aarch64__)))|| defined(__APPLE_CC__)
 51 | #else
 52 | void* _mm_malloc(size_t sz, size_t align)
 53 | {
 54 |     void *ptr;
 55 | #if (defined __APPLE__) || (defined _WIN32)
 56 |     return malloc(sz);
 57 | #else
 58 |     int alloc_result = posix_memalign(&ptr, align, sz);
 59 |     if (alloc_result != 0)
 60 |     {
 61 |         return NULL;
 62 |     }
 63 |     return ptr;
 64 | #endif
 65 | }
 66 | 
 67 | void _mm_free(void* ptr)
 68 | {
 69 |     if (NULL != ptr)
 70 |     {
 71 |         free(ptr);
 72 |         ptr = NULL;
 73 |     }
 74 | }
 75 | #endif
 76 | 
 77 | void StringTool::SplitString(const std::string &input, const std::string &delim, std::vector<std::string> &parts)
 78 | {
 79 |     for (char *s = strtok((char *)input.data(), (char *)delim.data()); s; s = strtok(NULL, (char *)delim.data()))
 80 |     {
 81 |         if (s != NULL)
 82 |         {
 83 |             parts.push_back(s);
 84 |         }
 85 |     }
 86 | }
 87 | 
 88 | void StringTool::RelaceString(std::string &input, const std::string &delim, const std::string& repstr)
 89 | {
 90 |     size_t pos = input.find(delim);
 91 |     while (pos != std::string::npos)
 92 |     {
 93 |         // Replace this occurrence of Sub String
 94 |         input.replace(pos, delim.size(), repstr);
 95 |         // Get the next occurrence from the current position
 96 |         pos = input.find(delim, pos + delim.size());
 97 |     }
 98 | }
 99 | 
100 | #ifdef FEATHER_OPENCL
101 | bool judge_android7_opencl()
102 | {
103 |     //libOpenCL.so
104 |     //android7.0 sdk api 24
105 |     char sdk[93] = "";
106 |     __system_property_get("ro.build.version.sdk", sdk);
107 |     if (std::atoi(sdk) < 24)
108 |     {
109 |         LOGI("[device] sdk [%d] < 24\n", std::atoi(sdk));
110 |         return true;
111 |     }
112 | 
113 |     bool flage = false;
114 |     std::string lib_name1 = "libOpenCL.so";
115 |     std::string lib_name2 = "libGLES_mali.so";
116 |     std::vector<std::string> libraries_list =
117 |     {
118 |         "/vendor/etc/public.libraries.txt",
119 |         "/system/etc/public.libraries.txt",
120 |     };
121 |     for (int i = 0; i < libraries_list.size(); i++)
122 |     {
123 |         std::ifstream out;
124 |         std::string line;
125 |         out.open(libraries_list[i].c_str());
126 |         while (!out.eof())
127 |         {
128 |             std::getline(out, line);
129 |             if (line.find(lib_name1) != line.npos || line.find(lib_name2) != line.npos)
130 |             {
131 |                 LOGI("[public] %s:%s", libraries_list[i].c_str(), line.c_str());
132 |                 flage = true;
133 |                 break;
134 |             }
135 | 
136 |         }
137 |         out.close();
138 |     }
139 |     if(flage == false)
140 |         return flage;
141 | 
142 |     flage = false;
143 |     const std::vector<std::string> libpaths =
144 |     {
145 |         "libOpenCL.so",
146 | #if defined(__aarch64__)
147 |         // Qualcomm Adreno with Android
148 |         "/system/vendor/lib64/libOpenCL.so",
149 |         "/system/lib64/libOpenCL.so",
150 |         // Mali with Android
151 |         "/system/vendor/lib64/egl/libGLES_mali.so",
152 |         "/system/lib64/egl/libGLES_mali.so",
153 |         // Typical Linux board
154 |         "/usr/lib/aarch64-linux-gnu/libOpenCL.so",
155 | #else
156 |         // Qualcomm Adreno with Android
157 |         "/system/vendor/lib/libOpenCL.so",
158 |         "/system/lib/libOpenCL.so",
159 |         // Mali with Android
160 |         "/system/vendor/lib/egl/libGLES_mali.so",
161 |         "/system/lib/egl/libGLES_mali.so",
162 |         // Typical Linux board
163 |         "/usr/lib/arm-linux-gnueabihf/libOpenCL.so",
164 | #endif
165 |     };
166 |     for (int i = 0; i < libpaths.size(); i++)
167 |     {
168 |         ifstream f(libpaths[i].c_str());
169 |         if (f.good())
170 |         {
171 |             flage = true;
172 |             LOGI("[libpaths]:%s", libpaths[i].c_str());
173 |             break;
174 |         }
175 |     }
176 |     return flage;
177 | }
178 | #endif
179 | 
180 | unsigned short hs_floatToHalf(float f)
181 | {
182 |     union
183 |     {
184 |         float d;
185 |         unsigned int i;
186 |     } u = { f };
187 |     int s = (u.i >> 16) & 0x8000;
188 |     int e = ((u.i >> 23) & 0xff) - 112;
189 |     int m =          u.i & 0x7fffff;
190 |     if (e <= 0)
191 |     {
192 |         if (e < -10) return s; /* underflowed */
193 |         /* force leading 1 and round */
194 |         m |= 0x800000;
195 |         int t = 14 - e;
196 |         int a = (1 << (t - 1)) - 1;
197 |         int b = (m >> t) & 1;
198 |         return s | ((m + a + b) >> t);
199 |     }
200 |     if (e == 143)
201 |     {
202 |         if (m == 0) return s | 0x7c00; /* +/- infinity */
203 | 
204 |         /* NaN, m == 0 forces us to set at least one bit and not become an infinity */
205 |         m >>= 13;
206 |         return s | 0x7c00 | m | (m == 0);
207 |     }
208 | 
209 |     /* round the normalized float */
210 |     m = m + 0xfff + ((m >> 13) & 1);
211 | 
212 |     /* significand overflow */
213 |     if (m & 0x800000)
214 |     {
215 |         m =  0;
216 |         e += 1;
217 |     }
218 | 
219 |     /* exponent overflow */
220 |     if (e > 30) return s | 0x7c00;
221 | 
222 |     return s | (e << 10) | (m >> 13);
223 | }
224 | 
225 | int hs_halfToFloatRep(unsigned short c)
226 | {
227 |     int s = (c >> 15) & 0x001;
228 |     int e = (c >> 10) & 0x01f;
229 |     int m =         c & 0x3ff;
230 |     if (e == 0)
231 |     {
232 |         if (m == 0) /* +/- 0 */ return s << 31;
233 |         /* denormalized, renormalize it */
234 |         while (!(m & 0x400))
235 |         {
236 |             m <<= 1;
237 |             e -=  1;
238 |         }
239 |         e += 1;
240 |         m &= ~0x400;
241 |     }
242 |     else if (e == 31) return (s << 31) | 0x7f800000 | (m << 13);   /* NaN or +/- infinity */
243 |     e += 112;
244 |     m <<= 13;
245 |     return (s << 31) | (e << 23) | m;
246 | }
247 | 
248 | float hs_halfToFloat(unsigned short c)
249 | {
250 |     union
251 |     {
252 |         float d;
253 |         unsigned int i;
254 |     } u;
255 |     u.i = hs_halfToFloatRep(c);
256 |     return u.d;
257 | }
258 | 


--------------------------------------------------------------------------------
/src/utils.h:
--------------------------------------------------------------------------------
 1 | //Tencent is pleased to support the open source community by making FeatherCNN available.
 2 | 
 3 | //Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
 4 | 
 5 | //Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
 6 | //in compliance with the License. You may obtain a copy of the License at
 7 | //
 8 | //https://opensource.org/licenses/BSD-3-Clause
 9 | //
10 | //Unless required by applicable law or agreed to in writing, software distributed
11 | //under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 | //CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 | //specific language governing permissions and limitations under the License.
14 | 
15 | #pragma once
16 | 
17 | #include <string>
18 | #include <cstring>
19 | #include <vector>
20 | #include <cstdlib>
21 | #ifdef _WIN32
22 | 	#define HAVE_STRUCT_TIMESPEC
23 | #endif
24 | 
25 | 
26 | #if 0
27 | #include <android/log.h>
28 | #define LOGI(...) __android_log_print(ANDROID_LOG_INFO,  "FeatherLib", __VA_ARGS__)
29 | #define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG, "FeatherLib", __VA_ARGS__)
30 | #define LOGE(...) __android_log_print(ANDROID_LOG_ERROR, "FeatherLib", __VA_ARGS__)
31 | #else
32 | #include <stdio.h>
33 | #define LOGI(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n");
34 | #define LOGD(...) fprintf(stdout, __VA_ARGS__);fprintf(stdout,"\n");
35 | #define LOGE(...) fprintf(stderr, __VA_ARGS__);fprintf(stderr,"\n");
36 | #endif
37 | 
38 | 
39 | typedef unsigned short half;
40 | 
41 | class StringTool
42 | {
43 |     public:
44 |         static void SplitString(const std::string &input, const std::string &delim, std::vector<std::string> &parts);
45 |         static void RelaceString(std::string &input, const std::string &delim, const std::string& repstr);
46 | };
47 | 
48 | int ChkParamHeader(FILE* fp);
49 | 
50 | int min(int a, int b);
51 | 
52 | #if (defined(__linux__) && !defined(__aarch64__)) || defined(__APPLE_CC__)
53 | #include <mm_malloc.h>
54 | #else
55 | void* _mm_malloc(size_t sz, size_t align);
56 | void _mm_free(void* ptr);
57 | #endif
58 | 
59 | unsigned short hs_floatToHalf(float f);
60 | int hs_halfToFloatRep(unsigned short c);
61 | float hs_halfToFloat(unsigned short c);
62 | 


--------------------------------------------------------------------------------