├── .gitignore
├── README.md
├── src
│   ├── common.cpp
│   ├── CMakeLists.txt
│   ├── benchmark.cpp
│   ├── libdnn.cpp
│   ├── device.cpp
│   ├── libdnn_tuner.cpp
│   └── libdnn_pool.cpp
├── cmake
│   ├── Templates
│   │   ├── greentea_libdnn_config.h.in
│   │   ├── GreenteaLibDNNConfig.cmake.in
│   │   └── cmake-uninstall.cmake.in
│   ├── Dependencies.cmake
│   ├── Cuda.cmake
│   ├── Modules
│   │   ├── FindvecLib.cmake
│   │   ├── FindViennaCL.cmake
│   │   └── FindOpenCL.cmake
│   ├── Misc.cmake
│   ├── ConfigGen.cmake
│   ├── Summary.cmake
│   ├── Targets.cmake
│   └── Utils.cmake
├── include
│   ├── benchmark.hpp
│   ├── device.hpp
│   ├── common.hpp
│   ├── libdnn_tuner.hpp
│   └── libdnn.hpp
├── LICENSE
└── CMakeLists.txt

/.gitignore:
--------------------------------------------------------------------------------
.project
.cproject
build
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Greentea LibDNN
Greentea LibDNN - a universal convolution implementation supporting CUDA and OpenCL
--------------------------------------------------------------------------------
/src/common.cpp:
--------------------------------------------------------------------------------
#include "common.hpp"

namespace greentea {

#ifdef USE_OPENCL

viennacl::ocl::handle<cl_mem> WrapHandle(cl_mem in,
                                         viennacl::ocl::context *ctx) {
  if (in != nullptr) {
    // Valid cl_mem object, wrap to ViennaCL and return handle.
    viennacl::ocl::handle<cl_mem> memhandle(in, *ctx);
    memhandle.inc();
    return memhandle;
  } else {
    // Trick to pass nullptr via ViennaCL into OpenCL kernels.
    viennacl::ocl::handle<cl_mem> memhandle;
    return memhandle;
  }
}

#endif


}  // namespace greentea
--------------------------------------------------------------------------------
/cmake/Templates/greentea_libdnn_config.h.in:
--------------------------------------------------------------------------------
#ifndef GREENTEA_LIBDNN_CONFIG_HPP_
#define GREENTEA_LIBDNN_CONFIG_HPP_

/* Version */
#define GREENTEA_VERSION "${GREENTEA_TARGET_VERSION}"

/* Sources directory */
#define SOURCE_FOLDER "${PROJECT_SOURCE_DIR}"

/* Binaries directory */
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"

/* 64 bit indexing */
#cmakedefine USE_INDEX_64

/* NVIDIA CUDA */
#cmakedefine HAVE_CUDA
#cmakedefine USE_CUDA

/* OpenCL kernels */
#cmakedefine HAVE_OPENCL
#cmakedefine USE_OPENCL
#cmakedefine VIENNACL_WITH_OPENCL

#define CMAKE_SOURCE_DIR "src/"
#define CMAKE_EXT ""

#endif  // GREENTEA_LIBDNN_CONFIG_HPP_
--------------------------------------------------------------------------------
/cmake/Templates/GreenteaLibDNNConfig.cmake.in:
--------------------------------------------------------------------------------
# Config file for the Greentea LibDNN package.
#
# After successful configuration the following variables
# will be defined:
#
#   GREENTEA_INCLUDE_DIRS - Greentea include directories
#   GREENTEA_LIBRARIES    - libraries to link against
#   GREENTEA_LIBRARY_DIR  - the directory containing the library
#   GREENTEA_FOUND        - boolean variable telling us if the
#                           package was found
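#
# A usage sketch for a downstream project (the target "myapp" and its
# sources are hypothetical):
#
#   find_package(GreenteaLibDNN REQUIRED)
#   include_directories(${GREENTEA_INCLUDE_DIRS})
#   add_executable(myapp main.cpp)
#   target_link_libraries(myapp ${GREENTEA_LIBRARIES})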

set(GREENTEA_VERSION @PROJECT_VERSION@)

@PACKAGE_INIT@

if(NOT TARGET @PROJECT_LIBRARY_TARGET_NAME@)
  include("${CMAKE_CURRENT_LIST_DIR}/@CMAKE_TARGETS_FILE@")
endif()

set_and_check(GREENTEA_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIR@")
set_and_check(GREENTEA_LIBRARY_DIR "@PACKAGE_LIB_INSTALL_DIR@")

set(GREENTEA_FOUND TRUE)
--------------------------------------------------------------------------------
/cmake/Dependencies.cmake:
--------------------------------------------------------------------------------
# This list is required for static linking and exported to GreenteaLibDNNConfig.cmake
set(GREENTEA_LINKER_LIBS "")

# ---[ CUDA
include(cmake/Cuda.cmake)
if(NOT HAVE_CUDA)
  if(NOT USE_CUDA)
    message(STATUS "CUDA is disabled. Building without it...")
  else()
    set(USE_CUDA OFF)
    message(WARNING "CUDA was not detected by CMake. Building without it...")
  endif()
endif()

# ---[ OpenCL & ViennaCL
if(USE_OPENCL)
  find_package(OpenCL QUIET)
  if(NOT HAVE_OPENCL)
    message(FATAL_ERROR "OpenCL is required for the OpenCL backend but was not found.")
  endif()
  find_package(ViennaCL)
  if(NOT HAVE_VIENNACL)
    message(FATAL_ERROR "ViennaCL is required for the OpenCL backend but was not found.")
  endif()
  include_directories(SYSTEM ${VIENNACL_INCLUDE_DIRS})
  list(APPEND GREENTEA_LINKER_LIBS ${VIENNACL_LIBRARIES})
  set(VIENNACL_WITH_OPENCL ${VIENNACL_WITH_OPENCL})
endif()
--------------------------------------------------------------------------------
/cmake/Cuda.cmake:
--------------------------------------------------------------------------------
if(NOT USE_CUDA)
  return()
endif()

find_package(CUDA 7.5 QUIET)
find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility, which doesn't search for curand

if(NOT CUDA_FOUND)
  return()
endif()

set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND GREENTEA_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
                                 ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

# Handle clang/libc++ issue
if(APPLE)
  greentea_detect_darwin_version(OSX_VERSION)

  # OSX 10.9 and higher uses clang/libc++ by default, which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    greentea_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()
"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 4 | 5 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 6 | string(REGEX REPLACE "\n" ";" files "${files}") 7 | foreach(file ${files}) 8 | message(STATUS "Uninstalling $ENV{DESTDIR}${file}") 9 | if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 10 | exec_program( 11 | "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" 12 | OUTPUT_VARIABLE rm_out 13 | RETURN_VALUE rm_retval 14 | ) 15 | if(NOT "${rm_retval}" STREQUAL 0) 16 | message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") 17 | endif(NOT "${rm_retval}" STREQUAL 0) 18 | else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 19 | message(STATUS "File $ENV{DESTDIR}${file} does not exist.") 20 | endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 21 | endforeach(file) 22 | -------------------------------------------------------------------------------- /cmake/Modules/FindvecLib.cmake: -------------------------------------------------------------------------------- 1 | # Find the vecLib libraries as part of Accelerate.framework or as standalone framework 2 | # 3 | # The following are set after configuration is done: 4 | # VECLIB_FOUND 5 | # vecLib_INCLUDE_DIR 6 | # vecLib_LINKER_LIBS 7 | 8 | 9 | if(NOT APPLE) 10 | return() 11 | endif() 12 | 13 | set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Headers") 14 | 15 | find_path(vecLib_INCLUDE_DIR vecLibTypes.h 16 | DOC "vecLib include directory" 17 | PATHS /System/Library/${__veclib_include_suffix} 18 | /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} 19 | /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/) 20 | 21 | include(FindPackageHandleStandardArgs) 22 | find_package_handle_standard_args(vecLib DEFAULT_MSG vecLib_INCLUDE_DIR) 23 | 24 | if(VECLIB_FOUND) 25 | if(vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") 26 | set(vecLib_LINKER_LIBS -lcblas "-framework vecLib") 27 | message(STATUS "Found standalone vecLib.framework") 28 | else() 29 | set(vecLib_LINKER_LIBS -lcblas "-framework Accelerate") 30 | message(STATUS "Found vecLib as part of Accelerate.framework") 31 | endif() 32 | 33 | mark_as_advanced(vecLib_INCLUDE_DIR) 34 | endif() 35 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # --[ Greenea LibDNN library 2 | 3 | # TODO(naibaf7): Check if it's needed or not since with 4 | # this macro we get `hdrs` and `srcs` in one, but below 5 | # we will need only `hdrs` to be copied in the install prefix. 6 | 7 | # creates 'srcs' lists 8 | #greentea_pickup_greentea_sources(${PROJECT_SOURCE_DIR}) 9 | 10 | file(GLOB_RECURSE srcs "${PROJECT_SOURCE_DIR}/src/*.cpp") 11 | file(GLOB_RECURSE hdrs "${PROJECT_SOURCE_DIR}/include/*.hpp") 12 | 13 | add_library(${PROJECT_LIBRARY_TARGET_NAME} ${srcs} ${hdrs}) 14 | 15 | # configure the library target 16 | target_include_directories( 17 | ${PROJECT_LIBRARY_TARGET_NAME} PUBLIC 18 | $ 19 | $) 20 | 21 | target_link_libraries( 22 | ${PROJECT_LIBRARY_TARGET_NAME} ${GREENTEA_LINKER_LIBS}) 23 | 24 | # TODO(naibaf7): Is it needed? 
# greentea_default_properties(greentea_libdnn)

set_target_properties(${PROJECT_LIBRARY_TARGET_NAME} PROPERTIES
                      VERSION ${PROJECT_VERSION}
                      SOVERSION ${PROJECT_VERSION})

# ---[ Install and export library

install(FILES ${hdrs} "${CMAKE_BINARY_DIR}/greentea_libdnn_config.h"
        DESTINATION ${INCLUDE_INSTALL_DIR})

install(TARGETS ${PROJECT_LIBRARY_TARGET_NAME}
        EXPORT ${CMAKE_TARGETS_NAME}
        RUNTIME DESTINATION lib
        ARCHIVE DESTINATION lib
        LIBRARY DESTINATION lib)
--------------------------------------------------------------------------------
/cmake/Modules/FindViennaCL.cmake:
--------------------------------------------------------------------------------
SET(VIENNACL_WITH_OPENCL TRUE)

SET(VIENNACL_INCLUDE_SEARCH_PATHS
  .
  ..
  ../ViennaCL
  ../viennacl-dev
  /usr/include
  /usr/local/include
  /opt/VIENNACL
  $ENV{VIENNACL_HOME}
)

SET(VIENNACL_FOUND OFF)

FIND_PATH(VIENNACL_INCLUDE_DIR NAMES viennacl/version.hpp PATHS ${VIENNACL_INCLUDE_SEARCH_PATHS} DOC "Include for ViennaCL")

# Check include files
IF(VIENNACL_INCLUDE_DIR)
  SET(VIENNACL_FOUND ON)
ELSE()
  MESSAGE(STATUS "Could not find ViennaCL include. Turning VIENNACL_FOUND off")
ENDIF()

IF (VIENNACL_FOUND)
  IF (NOT VIENNACL_FIND_QUIETLY)
    MESSAGE(STATUS "Found ViennaCL include: ${VIENNACL_INCLUDE_DIR}")
  ENDIF (NOT VIENNACL_FIND_QUIETLY)
ELSE (VIENNACL_FOUND)
  IF (VIENNACL_FIND_REQUIRED)
    MESSAGE(FATAL_ERROR "Could not find ViennaCL")
  ENDIF (VIENNACL_FIND_REQUIRED)
ENDIF (VIENNACL_FOUND)

IF(VIENNACL_WITH_OPENCL)
  find_package(OpenCL)
ENDIF(VIENNACL_WITH_OPENCL)

LIST( APPEND VIENNACL_INCLUDE_DIRS ${VIENNACL_INCLUDE_DIR} ${OPENCL_INCLUDE_DIRS} )
LIST( APPEND VIENNACL_LIBRARIES ${OPENCL_LIBRARIES} )
LIST( REMOVE_DUPLICATES VIENNACL_INCLUDE_DIRS )
LIST( REMOVE_DUPLICATES VIENNACL_LIBRARIES )

IF(VIENNACL_FOUND)
  SET( HAVE_VIENNACL TRUE )
  message(STATUS "ViennaCL detected: " ${VIENNACL_INCLUDE_DIRS})
ENDIF(VIENNACL_FOUND)

MARK_AS_ADVANCED(
  VIENNACL_INCLUDE_DIR
  VIENNACL_INCLUDE_DIRS
  VIENNACL_LIBRARIES
)
--------------------------------------------------------------------------------
/include/benchmark.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_BENCHMARK_HPP_
#define GREENTEA_BENCHMARK_HPP_

#include <chrono>

#include "common.hpp"
#include "device.hpp"

namespace greentea {

class Timer {
 public:
  Timer(device* dev_ptr);
  virtual ~Timer();
  virtual void Start();
  virtual void Stop();
  virtual float MilliSeconds();
  virtual float MicroSeconds();
  virtual float Seconds();

  inline bool initted() { return initted_; }
  inline bool running() { return running_; }
  inline bool has_run_at_least_once() { return has_run_at_least_once_; }

 protected:
  void Init();

  device* dev_ptr_;
  bool initted_;
  bool running_;
  bool has_run_at_least_once_;
#ifdef USE_CUDA
  cudaEvent_t start_gpu_cuda_;
  cudaEvent_t stop_gpu_cuda_;
#endif  // USE_CUDA
#ifdef USE_OPENCL
  cl_event start_gpu_cl_;
  cl_event stop_gpu_cl_;
#endif  // USE_OPENCL
  std::chrono::time_point<std::chrono::high_resolution_clock> start_cpu_;
  std::chrono::time_point<std::chrono::high_resolution_clock> stop_cpu_;
  float elapsed_milliseconds_;
  float elapsed_microseconds_;
};
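
// A minimal usage sketch (assuming an initialized greentea::device `dev`;
// the enqueued GPU work itself is hypothetical):
//
//   Timer timer(&dev);
//   timer.Start();
//   // ... enqueue and run GPU work ...
//   timer.Stop();
//   float ms = timer.MilliSeconds();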

class CPUTimer : public Timer {
 public:
  explicit CPUTimer(device* dev_ptr);
  virtual ~CPUTimer() {}
  virtual void Start();
  virtual void Stop();
  virtual float MilliSeconds();
  virtual float MicroSeconds();
};

}  // namespace greentea

#endif  // GREENTEA_BENCHMARK_HPP_
--------------------------------------------------------------------------------
/include/device.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_DEVICE_HPP_
#define GREENTEA_DEVICE_HPP_

#include <string>
#include <vector>

#include "common.hpp"


using std::vector;

namespace greentea {

class device {
 public:
  explicit device();
  explicit device(int id, int list_id, Backend backend);
  Backend backend() const;
  int id() const;
  int list_id() const;
  int current_queue_id();
  int workgroup_size(int id);

#ifdef USE_OPENCL
  static void setupViennaCLContext(int id,
                                   const cl_context ctx,
                                   const cl_device_id dev,
                                   const cl_command_queue queue);

  viennacl::ocl::program& program();
  void SetProgram();
  bool is_host_unified();
#endif  // USE_OPENCL

  int num_queues();
  void SwitchQueue(int id);
  void FinishQueues();

  void Init();

  uint_tp memory_usage();
  uint_tp peak_memory_usage();
  std::string name();
  void IncreaseMemoryUsage(uint_tp bytes);
  void DecreaseMemoryUsage(uint_tp bytes);
  void ResetPeakMemoryUsage();
  bool CheckCapability(std::string cap);
  bool CheckVendor(std::string vendor);
  bool CheckType(std::string type);

 private:
  int current_queue_id_;
  std::vector<int> workgroup_sizes_;
  int id_;
  int list_id_;
  Backend backend_;
  uint_tp memory_usage_;
  uint_tp peak_memory_usage_;
  bool host_unified_;
  std::string name_;
#ifdef USE_OPENCL
  viennacl::ocl::program ocl_program_;
#endif  // USE_OPENCL
};
}  // namespace greentea

#endif  // GREENTEA_DEVICE_HPP_
--------------------------------------------------------------------------------
/include/common.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_COMMON_HPP_
#define GREENTEA_COMMON_HPP_

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "greentea_libdnn_config.h"

// #define LIBDNN_DEBUG 1
// #define VIENNACL_DEBUG_ALL 1

#ifdef USE_OPENCL
#define VIENNACL_PROFILING_ENABLED
#include "viennacl/backend/opencl.hpp"
#include "viennacl/ocl/backend.hpp"
#include "viennacl/ocl/context.hpp"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/platform.hpp"
#endif  // USE_OPENCL

#ifdef USE_CUDA
#include "cuda.h"
#include "nvrtc.h"
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#endif  // USE_CUDA

#ifndef GREENTEA_QUEUE_COUNT
#define GREENTEA_QUEUE_COUNT 1
#endif

#ifndef CUDA_NUM_THREADS
#define CUDA_NUM_THREADS 1
#endif

#ifdef USE_OPENCL
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif  // VIENNACL_WITH_OPENCL
#endif  // USE_OPENCL

#ifdef USE_INDEX_64
#define int_tp int64_t
#define uint_tp uint64_t
#else
#define int_tp int32_t
#define uint_tp uint32_t
#endif
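
// Illustration (hypothetical check): the index typedefs above follow the
// indexing mode, e.g.
//   static_assert(sizeof(int_tp) == 8, "64 bit indexing");
// holds when USE_INDEX_64 is defined, while the default build uses
// 32 bit indices (sizeof(int_tp) == 4).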

namespace greentea {

#ifdef USE_OPENCL
viennacl::ocl::handle<cl_mem> WrapHandle(cl_mem in,
                                         viennacl::ocl::context *ctx);
#endif

enum Backend {
  BACKEND_CPU,
  BACKEND_CUDA,
  BACKEND_OpenCL
};

template<typename T, typename U>
struct is_same {
  static const bool value = false;
};

template<typename T>
struct is_same<T, T> {
  static const bool value = true;
};

}  // namespace greentea

#endif  // GREENTEA_COMMON_HPP_
--------------------------------------------------------------------------------
/cmake/Misc.cmake:
--------------------------------------------------------------------------------
if(FALSE)
# ---[ Configuration types
set(CMAKE_CONFIGURATION_TYPES "Debug;Release"
    CACHE STRING "Possible configurations" FORCE)
mark_as_advanced(CMAKE_CONFIGURATION_TYPES)

if(DEFINED CMAKE_BUILD_TYPE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY
               STRINGS ${CMAKE_CONFIGURATION_TYPES})
endif()

# --[ If the user doesn't specify a build type, assume release
if("${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(CMAKE_BUILD_TYPE Release)
endif()

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  set(CMAKE_COMPILER_IS_CLANGXX TRUE)
endif()

# ---[ Solution folders
greentea_option(USE_PROJECT_FOLDERS "IDE Solution folders" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )

if(USE_PROJECT_FOLDERS)
  set_property(GLOBAL PROPERTY USE_FOLDERS ON)
  set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets")
endif()

# ---[ Install options
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/install"
      CACHE PATH "Default install path" FORCE)
endif()

if(FALSE)

# ---[ RPATH settings
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Use link paths for shared library rpath")
set(CMAKE_MACOSX_RPATH TRUE)

list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_system_dir)
if(${__is_system_dir} STREQUAL -1)
  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
endif()

# ---[ Funny target
if(UNIX OR APPLE)
  add_custom_target(symlink_to_build COMMAND "ln" "-sf" "${PROJECT_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/build"
                    COMMENT "Adding symlink: /build -> ${PROJECT_BINARY_DIR}" )
endif()

# ---[ Set debug postfix
set(GREENTEA_DEBUG_POSTFIX "-d")

set(GREENTEA_POSTFIX "")
if(CMAKE_BUILD_TYPE MATCHES "Debug")
  set(GREENTEA_POSTFIX ${GREENTEA_DEBUG_POSTFIX})
endif()

endif(FALSE)
endif(FALSE)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
COPYRIGHT

All contributions by Fabian David Tschopp:
Copyright (c) 2016-2017 Fabian David Tschopp
All rights reserved.

All other contributions:
Copyright (c) 2016-2017, the respective contributors
All rights reserved.

LibDNN uses a shared copyright model: each contributor holds copyright over
their contributions to LibDNN. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.

LICENSE

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTION AGREEMENT

By contributing to the LibDNN repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.
--------------------------------------------------------------------------------
/cmake/ConfigGen.cmake:
--------------------------------------------------------------------------------

################################################################################################
# Helper function to fetch greentea includes which will be passed to dependent projects
# Usage:
#   greentea_get_current_includes(<includes_list_variable>)
function(greentea_get_current_includes includes_variable)
  get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
  greentea_convert_absolute_paths(current_includes)

  # remove at most one ${PROJECT_BINARY_DIR} include added for greentea_config.h
  list(FIND current_includes ${PROJECT_BINARY_DIR} __index)
  list(REMOVE_AT current_includes ${__index})

  # remove python/numpy includes (since they are not required for client libs)
  set(__toremove "")
  foreach(__i ${current_includes})
    if(${__i} MATCHES "python")
      list(APPEND __toremove ${__i})
    endif()
  endforeach()
  if(__toremove)
    list(REMOVE_ITEM current_includes ${__toremove})
  endif()

  greentea_list_unique(current_includes)
  set(${includes_variable} ${current_includes} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function to get all list items that begin with given prefix
# Usage:
#   greentea_get_items_with_prefix(<prefix> <list_variable> <output_variable>)
function(greentea_get_items_with_prefix prefix list_variable output_variable)
  set(__result "")
  foreach(__e ${${list_variable}})
    if(__e MATCHES "^${prefix}.*")
      list(APPEND __result ${__e})
    endif()
  endforeach()
  set(${output_variable} ${__result} PARENT_SCOPE)
endfunction()
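
# A usage sketch (values are illustrative):
#   set(__dirs "/usr/include;/opt/include;/usr/lib")
#   greentea_get_items_with_prefix("/usr" __dirs __usr_items)
#   # __usr_items now holds "/usr/include;/usr/lib"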

################################################################################################
# Function for generating Greentea build- and install-tree export config files
# Usage:
#   generate_export_configs()
function(generate_export_configs)
  set(install_cmake_suffix "share/Greentea")

  # ---[ Configure build-tree GreenteaConfig.cmake file ]---
  greentea_get_current_includes(GREENTEA_INCLUDE_DIRS)

  set(Greentea_DEFINITIONS "")
  if(NOT HAVE_OPENCL)
    set(HAVE_OPENCL FALSE)
  endif()

  if(NOT HAVE_CUDA)
    set(HAVE_CUDA FALSE)
  endif()

  # Add targets to the build-tree export set
  export(TARGETS greentea_libdnn
         FILE "${PROJECT_BINARY_DIR}/GreenteaLibDNNTargets.cmake")
  export(PACKAGE Greentea)

  # ---[ Configure install-tree GreenteaConfig.cmake file ]---

  # remove source and build dir includes
  greentea_get_items_with_prefix(
      ${PROJECT_SOURCE_DIR} GREENTEA_INCLUDE_DIRS __insource)
  greentea_get_items_with_prefix(
      ${PROJECT_BINARY_DIR} GREENTEA_INCLUDE_DIRS __inbinary)
  list(REMOVE_ITEM GREENTEA_INCLUDE_DIRS ${__insource} ${__inbinary})

  # add `install` include folder
  set(lines
      "get_filename_component(__greentea_include \"\${Greentea_CMAKE_DIR}/../../include\" ABSOLUTE)\n"
      "list(APPEND GREENTEA_INCLUDE_DIRS \${__greentea_include})\n"
      "unset(__greentea_include)\n")
  string(REPLACE ";" "" GREENTEA_INSTALL_INCLUDE_DIR_APPEND_COMMAND ${lines})

  configure_file("cmake/Templates/GreenteaLibDNNConfig.cmake.in"
                 "${PROJECT_BINARY_DIR}/cmake/GreenteaLibDNNConfig.cmake" @ONLY)

  install(FILES "${CMAKE_BINARY_DIR}/greentea_libdnn_config.h"
          DESTINATION include/greentea/include)

  # Install the GreenteaConfig.cmake and export set to use with install-tree
  install(FILES "${PROJECT_BINARY_DIR}/cmake/GreenteaLibDNNConfig.cmake"
          DESTINATION ${install_cmake_suffix})

  install(EXPORT GreenteaLibDNNTargets
          DESTINATION ${install_cmake_suffix})

endfunction()
--------------------------------------------------------------------------------
/cmake/Modules/FindOpenCL.cmake:
--------------------------------------------------------------------------------
# This file taken from FindOpenCL project @ http://gitorious.com/findopencl
#
# - Try to find OpenCL
# This module tries to find an OpenCL implementation on your system. It supports
# AMD / ATI, Apple and NVIDIA implementations.
#
# Once done this will define
#   OPENCL_FOUND        - system has OpenCL
#   OPENCL_INCLUDE_DIRS - the OpenCL include directory
#   OPENCL_LIBRARIES    - link these to use OpenCL
#
# WIN32 should work, but is untested

FIND_PACKAGE( PackageHandleStandardArgs )

SET (OPENCL_VERSION_STRING "1.1.0")
SET (OPENCL_VERSION_MAJOR 1)
SET (OPENCL_VERSION_MINOR 1)
SET (OPENCL_VERSION_PATCH 0)

IF (APPLE)

  FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
  FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
  FIND_PATH(OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")

ELSE (APPLE)

  IF (WIN32)

    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h)
    FIND_PATH(OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp)

    # The AMD SDK currently installs both x86 and x86_64 libraries
    # This is only a hack to find out the architecture
    IF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" )
      SET(OPENCL_LIB_DIR "$ENV{AMDAPPSDKROOT}/lib/x86_64")
    ELSE (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64")
      SET(OPENCL_LIB_DIR "$ENV{AMDAPPSDKROOT}/lib/x86")
    ENDIF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" )

    # find out if the user asked for a 64-bit build, and use the corresponding
    # 64 or 32 bit NVIDIA library paths to the search:
    STRING(REGEX MATCH "Win64" ISWIN64 ${CMAKE_GENERATOR})
    IF("${ISWIN64}" STREQUAL "Win64")
      FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/x64)
    ELSE("${ISWIN64}" STREQUAL "Win64")
      FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/Win32)
    ENDIF("${ISWIN64}" STREQUAL "Win64")

    GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)

    # On Win32 search relative to the library
    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)
    FIND_PATH(OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)

  ELSE (WIN32)

    # Unix style platforms
    FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
      ENV LD_LIBRARY_PATH
    )

    GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
    GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)

    # The AMD SDK currently does not place its headers
    # in /usr/include, therefore also search relative
    # to the library
    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include")
    FIND_PATH(OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include")

  ENDIF (WIN32)

ENDIF (APPLE)

FIND_PACKAGE_HANDLE_STANDARD_ARGS( OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )

IF( OPENCL_CPP_INCLUDE_DIRS )
  SET( OPENCL_HAS_CPP_BINDINGS TRUE )
  LIST( APPEND OPENCL_INCLUDE_DIRS ${OPENCL_CPP_INCLUDE_DIRS} )
  # This is often the same, so clean up
  LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
ENDIF( OPENCL_CPP_INCLUDE_DIRS )

IF( OPENCL_FOUND )
  SET( HAVE_OPENCL TRUE )
  message(STATUS "OpenCL detected: " ${OPENCL_LIB_DIR})
ENDIF( OPENCL_FOUND )

MARK_AS_ADVANCED(
  OPENCL_INCLUDE_DIRS
)
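# A usage sketch (the target "mytarget" is hypothetical), mirroring how
# cmake/Dependencies.cmake consumes this module:
#   find_package(OpenCL)
#   if(OPENCL_FOUND)
#     include_directories(SYSTEM ${OPENCL_INCLUDE_DIRS})
#     target_link_libraries(mytarget ${OPENCL_LIBRARIES})
#   endif()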
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.8.7)

# enables setting VERSION in the `project` command
if(POLICY CMP0048)
  cmake_policy(SET CMP0048 NEW)
endif()

# ---[ Greentea project
project(Greentea-LibDNN VERSION 0.1.0 LANGUAGES C CXX)

if(POLICY CMP0046)
  cmake_policy(SET CMP0046 NEW)
endif()

if(POLICY CMP0054)
  cmake_policy(SET CMP0054 NEW)
endif()

string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER)
string(REPLACE "-" "_" PROJECT_NAME_UNDERSCORE ${PROJECT_NAME_LOWER})
string(REPLACE "-" "" PROJECT_NAME_MERGE ${PROJECT_NAME})

set(PROJECT_EXPORT_NAME ${PROJECT_NAME_MERGE})
set(PROJECT_LIBRARY_TARGET_NAME ${PROJECT_NAME_UNDERSCORE})

set(INCLUDE_INSTALL_DIR include/greentea
    CACHE PATH "Install dir for headers")
set(PACKAGE_INSTALL_DIR share/${PROJECT_EXPORT_NAME}
    CACHE PATH "Install dir for cmake package config files")
set(LIB_INSTALL_DIR lib
    CACHE PATH "Install dir for shared libraries")

set(CMAKE_CONFIG_FILE "${PROJECT_EXPORT_NAME}Config.cmake")
set(CMAKE_CONFIG_VERSION_FILE "${PROJECT_EXPORT_NAME}ConfigVersion.cmake")

set(CMAKE_TARGETS_NAME "${PROJECT_EXPORT_NAME}Targets")
set(CMAKE_TARGETS_FILE "${CMAKE_TARGETS_NAME}.cmake")

# ---[ Using cmake scripts and modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

include(ExternalProject)

include(cmake/Utils.cmake)
include(cmake/Targets.cmake)
include(cmake/Misc.cmake)
include(cmake/Summary.cmake)
include(cmake/ConfigGen.cmake)

# ---[ Options
greentea_option(USE_INDEX_64 "Build Greentea LibDNN with 64 bit indexing" OFF)
greentea_option(USE_CUDA "Build Greentea LibDNN with CUDA support" ON)
greentea_option(USE_OPENCL "Build Greentea LibDNN with OpenCL support" ON)
greentea_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
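
# A configuration sketch (hypothetical shell session) exercising the
# options above:
#   mkdir build && cd build
#   cmake -DUSE_CUDA=ON -DUSE_OPENCL=ON -DUSE_INDEX_64=OFF ..
#   make && make install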

# ---[ Dependencies
include(cmake/Dependencies.cmake)

# ---[ Flags
if(UNIX OR APPLE)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11 -DCMAKE_BUILD -Wno-unused-variable")
endif()

# TODO(naibaf7): Is it needed?
greentea_set_greentea_link()

if(USE_libstdcpp)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++ -std=c++11")
  message("-- Warning: forcing libstdc++ (controlled by USE_libstdcpp option in cmake)")
endif()

add_definitions(-DGTEST_USE_OWN_TR1_TUPLE)

# ---[ Warnings
greentea_warnings_disable(CMAKE_CXX_FLAGS -Wno-sign-compare -Wno-uninitialized)

# ---[ Config generation
configure_file(cmake/Templates/greentea_libdnn_config.h.in
               "${PROJECT_BINARY_DIR}/greentea_libdnn_config.h")

# ---[ Includes
set(GREENTEA_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
include_directories(${GREENTEA_INCLUDE_DIR} ${PROJECT_BINARY_DIR})
#include_directories(BEFORE src) # This is needed for gtest.

# ---[ Subdirectories
add_subdirectory(src)

# ---[ Configuration summary
greentea_print_configuration_summary()

# ---[ Export configs generation
# generate_export_configs()

include(CMakePackageConfigHelpers)
configure_package_config_file(
    "cmake/Templates/${CMAKE_CONFIG_FILE}.in"
    ${CMAKE_BINARY_DIR}/${CMAKE_CONFIG_FILE}
    PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR
    INSTALL_DESTINATION ${PACKAGE_INSTALL_DIR})

write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CONFIG_VERSION_FILE}
    VERSION ${PROJECT_VERSION}
    COMPATIBILITY SameMajorVersion)

# uninstall target
configure_file(
    "${PROJECT_SOURCE_DIR}/cmake/Templates/cmake-uninstall.cmake.in"
    "${CMAKE_CURRENT_BINARY_DIR}/cmake-uninstall.cmake"
    IMMEDIATE @ONLY)

add_custom_target(uninstall COMMAND ${CMAKE_COMMAND}
                  -P ${CMAKE_CURRENT_BINARY_DIR}/cmake-uninstall.cmake)

# ---[ Install and export package

install(EXPORT ${CMAKE_TARGETS_NAME}
        FILE ${CMAKE_TARGETS_FILE}
        DESTINATION ${PACKAGE_INSTALL_DIR})

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CONFIG_FILE}
              ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CONFIG_VERSION_FILE}
        DESTINATION ${PACKAGE_INSTALL_DIR} COMPONENT share)
--------------------------------------------------------------------------------
/cmake/Summary.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Greentea status report function.
# Automatically aligns the right column and selects text based on condition.

function(greentea_status text)
  set(status_cond)
  set(status_then)
  set(status_else)

  set(status_current_name "cond")
  foreach(arg ${ARGN})
    if(arg STREQUAL "THEN")
      set(status_current_name "then")
    elseif(arg STREQUAL "ELSE")
      set(status_current_name "else")
    else()
      list(APPEND status_${status_current_name} ${arg})
    endif()
  endforeach()

  if(DEFINED status_cond)
    set(status_placeholder_length 23)
    string(RANDOM LENGTH ${status_placeholder_length} ALPHABET " " status_placeholder)
    string(LENGTH "${text}" status_text_length)
    if(status_text_length LESS status_placeholder_length)
      string(SUBSTRING "${text}${status_placeholder}" 0 ${status_placeholder_length} status_text)
    elseif(DEFINED status_then OR DEFINED status_else)
      message(STATUS "${text}")
      set(status_text "${status_placeholder}")
    else()
      set(status_text "${text}")
    endif()

    if(DEFINED status_then OR DEFINED status_else)
      if(${status_cond})
        string(REPLACE ";" " " status_then "${status_then}")
        string(REGEX REPLACE "^[ \t]+" "" status_then "${status_then}")
        message(STATUS "${status_text} ${status_then}")
      else()
        string(REPLACE ";" " " status_else "${status_else}")
        string(REGEX REPLACE "^[ \t]+" "" status_else "${status_else}")
        message(STATUS "${status_text} ${status_else}")
      endif()
    else()
      string(REPLACE ";" " " status_cond "${status_cond}")
      string(REGEX REPLACE "^[ \t]+" "" status_cond "${status_cond}")
      message(STATUS "${status_text} ${status_cond}")
    endif()
  else()
    message(STATUS "${text}")
  endif()
endfunction()
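
# A usage sketch of the conditional form (mirrors the summary below):
#   greentea_status("  OpenCL :" HAVE_OPENCL THEN "Yes" ELSE "No")
# prints the left column padded to ${status_placeholder_length} characters,
# followed by "Yes" or "No" depending on HAVE_OPENCL.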

################################################################################################
# Function for fetching Greentea LibDNN version from git and headers
# Usage:
#   greentea_extract_greentea_libdnn_version()
function(greentea_extract_greentea_libdnn_version)
  set(GREENTEA_LIBDNN_GIT_VERSION "unknown")
  find_package(Git)
  if(GIT_FOUND)
    execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --always --dirty
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE
                    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
                    OUTPUT_VARIABLE GREENTEA_LIBDNN_GIT_VERSION
                    RESULT_VARIABLE __git_result)
    if(NOT ${__git_result} EQUAL 0)
      set(GREENTEA_LIBDNN_GIT_VERSION "unknown")
    endif()
  endif()

  set(GREENTEA_LIBDNN_GIT_VERSION ${GREENTEA_LIBDNN_GIT_VERSION} PARENT_SCOPE)


  greentea_parse_header(${GREENTEA_INCLUDE_DIR}/version.hpp GREENTEA_LIBDNN_VERSION_LINES GREENTEA_LIBDNN_MAJOR GREENTEA_LIBDNN_MINOR GREENTEA_LIBDNN_PATCH)
  set(GREENTEA_LIBDNN_VERSION "${GREENTEA_LIBDNN_MAJOR}.${GREENTEA_LIBDNN_MINOR}.${GREENTEA_LIBDNN_PATCH}" PARENT_SCOPE)

endfunction()


################################################################################################
# Prints accumulated Greentea LibDNN configuration summary
# Usage:
#   greentea_print_configuration_summary()

function(greentea_print_configuration_summary)
  greentea_extract_greentea_libdnn_version()
  set(GREENTEA_VERSION ${GREENTEA_VERSION} PARENT_SCOPE)

  greentea_merge_flag_lists(__flags_rel CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS)
  greentea_merge_flag_lists(__flags_deb CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS)

  greentea_status("")
  greentea_status("******************* Greentea LibDNN Configuration Summary *******************")
  greentea_status("General:")
  greentea_status("  Version           : ${GREENTEA_TARGET_VERSION}")
  greentea_status("  Git               : ${GREENTEA_GIT_VERSION}")
  greentea_status("  System            : ${CMAKE_SYSTEM_NAME}")
  greentea_status("  C++ compiler      : ${CMAKE_CXX_COMPILER}")
  greentea_status("  Release CXX flags : ${__flags_rel}")
  greentea_status("  Debug CXX flags   : ${__flags_deb}")
  greentea_status("  Build type        : ${CMAKE_BUILD_TYPE}")
  greentea_status("")
  greentea_status("  BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}")
  greentea_status("")
  greentea_status("Dependencies:")
  greentea_status("  OpenCL            : " HAVE_OPENCL THEN "Yes" ELSE "No")
  greentea_status("  ViennaCL          : " HAVE_VIENNACL THEN "Yes" ELSE "No")
  greentea_status("  CUDA              : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No")
  greentea_status("")
  greentea_status("Install:")
  greentea_status("  Install path      : ${CMAKE_INSTALL_PREFIX}")
  greentea_status("")
endfunction()
--------------------------------------------------------------------------------
/cmake/Targets.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Defines the global GREENTEA_LINK flag. This flag is required to prevent the linker from
# excluding objects which are not addressed directly but are registered via static constructors
macro(greentea_set_greentea_link)
  if(BUILD_SHARED_LIBS)
    set(GREENTEA_LINK greentea_libdnn)
  else()
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      set(GREENTEA_LINK -Wl,-force_load greentea_libdnn)
    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      set(GREENTEA_LINK -Wl,--whole-archive greentea_libdnn -Wl,--no-whole-archive)
    endif()
  endif()
endmacro()
################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
#   greentea_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
function(greentea_source_group group)
  cmake_parse_arguments(GREENTEA_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(GREENTEA_SOURCE_GROUP_GLOB)
    file(GLOB srcs1 ${GREENTEA_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${srcs1})
  endif()

  if(GREENTEA_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${GREENTEA_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${srcs2})
  endif()
endfunction()

################################################################################################
# Collecting sources from globbing and appending to output list variable
# Usage:
#   greentea_collect_sources(<output_variable> GLOB[_RECURSE] <globbing_expression>)
function(greentea_collect_sources variable)
  cmake_parse_arguments(GREENTEA_COLLECT_SOURCES "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(GREENTEA_COLLECT_SOURCES_GLOB)
    file(GLOB srcs1 ${GREENTEA_COLLECT_SOURCES_GLOB})
    set(${variable} ${${variable}} ${srcs1})
  endif()

  if(GREENTEA_COLLECT_SOURCES_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${GREENTEA_COLLECT_SOURCES_GLOB_RECURSE})
    set(${variable} ${${variable}} ${srcs2})
  endif()
endfunction()

################################################################################################
# Short command for getting greentea sources (assuming standard Greentea code tree)
# Usage:
#   greentea_pickup_greentea_sources(<root>)
function(greentea_pickup_greentea_sources root)
  # put all files in source groups (visible as subfolders in many IDEs)
  greentea_source_group("Include" GLOB "${root}/include/*.h*")
  greentea_source_group("Include" GLOB "${PROJECT_BINARY_DIR}/greentea_config.h*")
  greentea_source_group("Source" GLOB "${root}/src/*.cpp")


  # collect files
  file(GLOB_RECURSE hdrs ${root}/include/*.h*)
  file(GLOB_RECURSE srcs ${root}/src/*.cpp)

  # adding headers to make them visible in some IDEs (Qt, VS, Xcode)
  list(APPEND srcs ${hdrs} ${PROJECT_BINARY_DIR}/greentea_libdnn_config.h)
  list(APPEND test_srcs ${test_hdrs})

  # convert to absolute paths
  greentea_convert_absolute_paths(srcs)

  # propagate to parent scope
  set(srcs ${srcs} PARENT_SCOPE)
endfunction()

################################################################################################
# Short command for setting default target properties
# Usage:
#   greentea_default_properties(<target>)
function(greentea_default_properties target)
  set_target_properties(${target} PROPERTIES
      DEBUG_POSTFIX ${GREENTEA_DEBUG_POSTFIX}
      ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
      LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
      RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
  # make sure we build all external dependencies first
  if (DEFINED external_project_dependencies)
    add_dependencies(${target} ${external_project_dependencies})
  endif()
endfunction()

################################################################################################
# Short command for setting runtime directory for build target
# Usage:
#   greentea_set_runtime_directory(<target> <dir>)
function(greentea_set_runtime_directory target dir)
  set_target_properties(${target} PROPERTIES
      RUNTIME_OUTPUT_DIRECTORY "${dir}")
endfunction()

################################################################################################
# Short command for setting solution folder property for target
# Usage:
#   greentea_set_solution_folder(<target> <folder>)
function(greentea_set_solution_folder target folder)
  if(USE_PROJECT_FOLDERS)
    set_target_properties(${target} PROPERTIES FOLDER "${folder}")
  endif()
endfunction()

################################################################################################
# Reads lines from input file, prepends source directory to each line and writes to output file
# Usage:
#   greentea_configure_testdatafile(<file>)
function(greentea_configure_testdatafile file)
  file(STRINGS ${file} __lines)
  set(result "")
  foreach(line ${__lines})
    set(result "${result}${PROJECT_SOURCE_DIR}/${line}\n")
  endforeach()
  file(WRITE ${file}.gen.cmake ${result})
endfunction()

--------------------------------------------------------------------------------
/src/benchmark.cpp:
--------------------------------------------------------------------------------
#include "benchmark.hpp"
#include "common.hpp"
#include "device.hpp"

namespace greentea {

Timer::Timer(device* dev_ptr)
    : dev_ptr_(dev_ptr)
    , initted_(false)
    , running_(false)
    , has_run_at_least_once_(false) { Init(); }

Timer::~Timer() {
#ifdef USE_CUDA
  if (dev_ptr_->backend() == BACKEND_CUDA) {
    (cudaEventDestroy(start_gpu_cuda_));
    (cudaEventDestroy(stop_gpu_cuda_));
  }
#endif  // USE_CUDA
#ifdef USE_OPENCL
  if (dev_ptr_->backend() == BACKEND_OpenCL) {
    clWaitForEvents(1, &start_gpu_cl_);
    clWaitForEvents(1, &stop_gpu_cl_);
    clReleaseEvent(start_gpu_cl_);
    clReleaseEvent(stop_gpu_cl_);
  }
#endif  // USE_OPENCL
}

void Timer::Start() {
  if (!running()) {
#ifdef USE_CUDA
    if (dev_ptr_->backend() == BACKEND_CUDA) {
      (cudaEventRecord(start_gpu_cuda_, 0));
    }
#endif  // USE_CUDA
#ifdef USE_OPENCL
    if (dev_ptr_->backend() == BACKEND_OpenCL) {
      clWaitForEvents(1, &start_gpu_cl_);
      clReleaseEvent(start_gpu_cl_);
      viennacl::ocl::context &ctx = viennacl::ocl::get_context(
          dev_ptr_->id());
      viennacl::ocl::program &program = dev_ptr_->program();
      viennacl::ocl::kernel &kernel = program.get_kernel("null_kernel_float");
      // TODO(naibaf7): the compiler shows a deprecated-declaration warning;
      // use `clEnqueueNDRangeKernel` instead
      // https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueTask.html
      clEnqueueTask(ctx.get_queue().handle().get(), kernel.handle().get(), 0,
                    NULL, &start_gpu_cl_);
      clFinish(ctx.get_queue().handle().get());
    }
#endif
    running_ = true;
    has_run_at_least_once_ = true;
  }
}

void Timer::Stop() {
  if (running()) {
#ifdef USE_CUDA
    if (dev_ptr_->backend() == BACKEND_CUDA) {
      (cudaEventRecord(stop_gpu_cuda_, 0));
      (cudaEventSynchronize(stop_gpu_cuda_));
    }
#endif  // USE_CUDA
#ifdef USE_OPENCL
    if (dev_ptr_->backend() == BACKEND_OpenCL) {
      clWaitForEvents(1, &stop_gpu_cl_);
      clReleaseEvent(stop_gpu_cl_);
      viennacl::ocl::context &ctx = viennacl::ocl::get_context(
          dev_ptr_->id());
      viennacl::ocl::program &program = dev_ptr_->program();
      viennacl::ocl::kernel &kernel = program.get_kernel("null_kernel_float");
      clEnqueueTask(ctx.get_queue().handle().get(), kernel.handle().get(), 0,
                    NULL, &stop_gpu_cl_);
      clFinish(ctx.get_queue().handle().get());
    }
#endif
    running_ = false;
  }
}

float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (dev_ptr_->backend() == BACKEND_CUDA) {
    (cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_cuda_,
                          stop_gpu_cuda_));
    // CUDA only measures milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  }
#endif  // USE_CUDA
#ifdef USE_OPENCL
  if (dev_ptr_->backend() == BACKEND_OpenCL) {
    cl_ulong startTime = 0, stopTime = 0;
    clWaitForEvents(1, &stop_gpu_cl_);
    clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
                            sizeof startTime, &startTime, NULL);
    clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
                            sizeof stopTime, &stopTime, NULL);
    double us = static_cast<double>(stopTime - startTime) / 1000.0;
    elapsed_microseconds_ = static_cast<float>(us);
  }
#endif
  return elapsed_microseconds_;
}

float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (dev_ptr_->backend() == BACKEND_CUDA) {
    (cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_cuda_,
                          stop_gpu_cuda_));
  }
#endif  // USE_CUDA
#ifdef USE_OPENCL
  if (dev_ptr_->backend() == BACKEND_OpenCL) {
    cl_ulong startTime = 0, stopTime = 0;
    clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
                            sizeof startTime, &startTime, NULL);
    clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
                            sizeof stopTime, &stopTime, NULL);
    double ms = static_cast<double>(stopTime - startTime) / 1000000.0;
    elapsed_milliseconds_ = static_cast<float>(ms);
  }
#endif
  return elapsed_milliseconds_;
}

float Timer::Seconds() {
  return MilliSeconds() / 1000.;
}

void Timer::Init() {
  if (!initted()) {
#ifdef USE_CUDA
    if (dev_ptr_->backend() == BACKEND_CUDA) {
      (cudaEventCreate(&start_gpu_cuda_));
      (cudaEventCreate(&stop_gpu_cuda_));
    }
#endif  // USE_CUDA
#ifdef USE_OPENCL
    if (dev_ptr_->backend() == BACKEND_OpenCL) {
      start_gpu_cl_ = 0;
      stop_gpu_cl_ = 0;
    }
#endif
    initted_ = true;
  }
}

CPUTimer::CPUTimer(device* dev_ptr) : Timer(dev_ptr) {
  this->initted_ = true;
  this->running_ = false;
  this->has_run_at_least_once_ = false;
}

void CPUTimer::Start() {
  if (!running()) {
    this->start_cpu_ = std::chrono::high_resolution_clock::now();
    this->running_ = true;
    this->has_run_at_least_once_ = true;
  }
}

void CPUTimer::Stop() {
  if (running()) {
    this->stop_cpu_ = std::chrono::high_resolution_clock::now();
    this->running_ = false;
  }
}

float CPUTimer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
  this->elapsed_milliseconds_ =
      std::chrono::duration_cast<std::chrono::milliseconds>(
          this->stop_cpu_ - this->start_cpu_).count();
  return this->elapsed_milliseconds_;
}

float CPUTimer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
  this->elapsed_microseconds_ =
      std::chrono::duration_cast<std::chrono::microseconds>(
          this->stop_cpu_ - this->start_cpu_).count();
  return this->elapsed_microseconds_;
}

}  // namespace greentea
--------------------------------------------------------------------------------
/include/libdnn_tuner.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_LIBDNN_TUNER_HPP_
#define GREENTEA_LIBDNN_TUNER_HPP_

#include <functional>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <type_traits>
#include <vector>

#include "common.hpp"

namespace greentea {

typedef enum {
  LIBDNN_TUNER_METHOD_ALL = 0,
  LIBDNN_TUNER_METHOD_ANNEALING = 1,
} libdnnTunerMethod_t;

typedef enum {
  LIBDNN_TUNER_PARAM_STAT_OK = 0,
  LIBDNN_TUNER_PARAM_STAT_OVERFLOW = 1,
  LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION = 2,
} libdnnTunerParamStatus_t;

class LibDNNTuner;

class LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraint(LibDNNTuner* tuner,
                        std::vector<std::string> con_params,
                        std::vector<std::string> con_adapt) :
    tuner_(tuner), con_params_(con_params), con_adapt_(con_adapt) {
  }
  virtual bool evaluate() = 0;
 protected:
  LibDNNTuner* tuner_;
  std::vector<std::string> con_params_;
  std::vector<std::string> con_adapt_;
};

class LibDNNTunerConstraintBool : public LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraintBool(LibDNNTuner* tuner,
                            std::vector<std::string> con_params,
                            std::vector<std::string> con_adapt,
                            std::function<bool(std::vector<bool>)> func) :
    LibDNNTunerConstraint(tuner, con_params, con_adapt),
    func_(func) {
  }
  bool evaluate();
 protected:
  std::function<bool(std::vector<bool>)> func_;
};

class LibDNNTunerConstraintReal : public LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraintReal(LibDNNTuner* tuner,
                            std::vector<std::string> con_params,
                            std::vector<std::string> con_adapt,
                            std::function<bool(std::vector<double>)> func) :
    LibDNNTunerConstraint(tuner, con_params, con_adapt),
    func_(func) {
  }
  bool evaluate();
 protected:
  std::function<bool(std::vector<double>)> func_;
};
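
// A sketch of the kind of predicate these constraint classes wrap
// (illustrative only):
//   [](std::vector<int64_t> args) -> bool {
//     return args[0] % args[1] == 0;  // e.g. tile size divides block size
//   }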

class LibDNNTunerConstraintInt : public LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraintInt(LibDNNTuner* tuner,
                           std::vector<std::string> con_params,
                           std::vector<std::string> con_adapt,
                           std::function<bool(std::vector<int64_t>)> func) :
    LibDNNTunerConstraint(tuner, con_params, con_adapt),
    func_(func) {
  }
  bool evaluate();
 protected:
  std::function<bool(std::vector<int64_t>)> func_;
};

class LibDNNTunerParam {
 public:
  LibDNNTunerParam(LibDNNTuner* tuner, std::string name, int_tp def_idx) :
    constraints_(), tuner_(tuner), name_(name),
    curr_idx_(def_idx), def_idx_(def_idx)
  {}
  LibDNNTunerParam(LibDNNTuner* tuner, LibDNNTunerParam& other) :  // NOLINT
    constraints_(other.constraints_), tuner_(tuner),
    name_(other.name_), curr_idx_(other.curr_idx_), def_idx_(other.def_idx_)
  {}

  virtual int_tp count_values() = 0;
  virtual std::shared_ptr<LibDNNTunerParam> clone() = 0;

  std::string get_name();

  libdnnTunerParamStatus_t advance(int_tp offset);

  int_tp get_curr_idx();
  int_tp get_def_idx();
  void set_curr_idx(int_tp curr_idx);
  void set_def_idx(int_tp def_idx);
  void update(std::shared_ptr<LibDNNTunerParam> other);
  void add_constraint(std::shared_ptr<LibDNNTunerConstraint> constraint);

 protected:
  LibDNNTuner* tuner_;
  std::string name_;
  int_tp curr_idx_;
  int_tp def_idx_;
  std::vector<std::shared_ptr<LibDNNTunerConstraint>> constraints_;
};

class LibDNNTunerParamInt: public LibDNNTunerParam {
 public:
  LibDNNTunerParamInt(LibDNNTuner* tuner,
                      std::string name, std::vector<int64_t> values,
                      int_tp def_idx) :
    LibDNNTunerParam(tuner, name, def_idx) {
    values_ = values;
  }
  LibDNNTunerParamInt(LibDNNTunerParamInt& other) :  // NOLINT
    LibDNNTunerParam(other), values_(other.values_) {
  }
  int64_t get_value();
  const std::vector<int64_t>& get_values();
  int_tp count_values();
  std::shared_ptr<LibDNNTunerParam> clone();
 protected:
  std::vector<int64_t> values_;
};

class LibDNNTunerParamBool: public LibDNNTunerParam {
 public:
  LibDNNTunerParamBool(LibDNNTuner* tuner,
                       std::string name, std::vector<bool> values,
                       int_tp def_idx) :
    LibDNNTunerParam(tuner, name, def_idx) {
    values_ = values;
  }
  LibDNNTunerParamBool(LibDNNTunerParamBool& other) :  // NOLINT
    LibDNNTunerParam(other), values_(other.values_) {
  }
  bool get_value();
  const std::vector<bool>& get_values();
  int_tp count_values();
  virtual std::shared_ptr<LibDNNTunerParam> clone();
 protected:
  std::vector<bool> values_;
};

class LibDNNTunerParamReal: public LibDNNTunerParam {
 public:
  LibDNNTunerParamReal(LibDNNTuner* tuner,
                       std::string name, std::vector<double> values,
                       int_tp def_idx) :
    LibDNNTunerParam(tuner, name, def_idx) {
    values_ = values;
  }
  LibDNNTunerParamReal(LibDNNTunerParamReal& other) :  // NOLINT
    LibDNNTunerParam(other), values_(other.values_) {
  }
  double get_value();
  const std::vector<double>& get_values();
  int_tp count_values();
  virtual std::shared_ptr<LibDNNTunerParam> clone();
 protected:
  std::vector<double> values_;
};



class LibDNNTunerSnapshot {
 public:
  LibDNNTunerSnapshot(double score,
                      std::vector<std::shared_ptr<LibDNNTunerParam>>* params) :
    score_(score) {
    for (int i = 0; i < params->size(); ++i) {
      std::shared_ptr<LibDNNTunerParam> param((*params)[i]->clone());
      params_.push_back(param);
    }
  }
  double get_score();
  std::vector<std::shared_ptr<LibDNNTunerParam>>* get_params();
 protected:
  double score_;
  std::vector<std::shared_ptr<LibDNNTunerParam>> params_;
};

class LibDNNTunerSnapshotCompare {
 public:
  explicit LibDNNTunerSnapshotCompare(const bool& revparam = false)
  { reverse_ = revparam; }
  bool operator() (std::shared_ptr<LibDNNTunerSnapshot>& lhs,  // NOLINT
                   std::shared_ptr<LibDNNTunerSnapshot>& rhs) const {  // NOLINT
    if (reverse_)
      return (lhs->get_score() > rhs->get_score());
    else
      return (lhs->get_score() < rhs->get_score());
  }
 private:
  bool reverse_;
};
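
// A usage sketch of the tuner facade below (names and routines are
// illustrative; real setup/benchmark routines are supplied by the
// LibDNN kernel generators):
//
//   LibDNNTuner tuner;
//   tuner.add_range_param<int64_t>("workgroup_x", 8, 4, 16, 4);
//   tuner.set_benchmark_routine([&]() -> double { return run_kernel_ms(); });
//   tuner.Tune(LIBDNN_TUNER_METHOD_ANNEALING);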
206 |
207 | class LibDNNTuner {
208 |  public:
209 |   explicit LibDNNTuner() :
210 |     constraints_(), params_() {
211 |   }
212 |
213 |   void Tune(libdnnTunerMethod_t method);
214 |
215 |   std::string Serialize();
216 |
217 |   void Restore(std::string json);
218 |
219 |   void Snapshot(double score);
220 |   void RestoreSnapshot(std::shared_ptr<LibDNNTunerSnapshot> snapshot);
221 |
222 |   void set_setup_routine(std::function<bool()> fun);
223 |
224 |   void set_benchmark_routine(std::function<double()> fun);
225 |
226 |   void add_boolean_param(std::string name, bool def_value, bool inverse);
227 |   void add_boolean_param(const char* name, bool def_value, bool inverse);
228 |
229 |   template<class T>
230 |   void add_range_param(std::string name, T def_value, T min, T max, T step);
231 |   template<class T>
232 |   void add_range_param(const char* name, T def_value, T min, T max, T step);
233 |
234 |   template<class T>
235 |   void add_set_param(std::string name, T def_value, std::vector<T> values);
236 |   template<class T>
237 |   void add_set_param(const char* name, T def_value, std::vector<T> values);
238 |
239 |   template<class T>
240 |   void add_constraint(std::vector<std::string> con_params,
241 |                       std::vector<std::string> con_adapt,
242 |                       std::function<bool(std::vector<T>)> con_func);
243 |
244 |   template<class T>
245 |   void add_constraint(std::vector<const char*> con_params,
246 |                       std::vector<const char*> con_adapt,
247 |                       std::function<bool(std::vector<T>)> con_func);
248 |
249 |   template<class T>
250 |   void add_constraint(std::vector<std::string> con_params,
251 |                       std::vector<const char*> con_adapt,
252 |                       std::function<bool(std::vector<T>)> con_func);
253 |
254 |
255 |   template<class T>
256 |   void add_constraint(std::vector<const char*> con_params,
257 |                       std::vector<std::string> con_adapt,
258 |                       std::function<bool(std::vector<T>)> con_func);
259 |
260 |   template<class T>
261 |   T get_param(std::string name);
262 |   template<class T>
263 |   T get_param(const char* name);
264 |
265 |  protected:
266 |   void snapshot();
267 |
268 |  private:
269 |   std::function<bool()> setup_routine_;
270 |   std::function<double()> benchmark_routine_;
271 |
272 |   std::priority_queue<std::shared_ptr<LibDNNTunerSnapshot>,
273 |                       std::vector<std::shared_ptr<LibDNNTunerSnapshot>>,
274 |                       LibDNNTunerSnapshotCompare> snapshot_queue_;
275 |
276 |   std::vector<std::shared_ptr<LibDNNTunerSnapshot>> snapshots_;
277 |
278 |   std::vector<std::shared_ptr<LibDNNTunerConstraint> > constraints_;
279 |   std::vector<std::shared_ptr<LibDNNTunerParam> > params_;
280 |   std::map<std::string, std::shared_ptr<LibDNNTunerParam>> param_map_;
281 | };
282 |
283 | }  // namespace greentea
284 |
285 | #endif  // GREENTEA_TUNER_HPP_
286 |
-------------------------------------------------------------------------------- /include/libdnn.hpp: --------------------------------------------------------------------------------
1 | #ifndef GREENTEA_LIBDNN_HPP_
2 | #define GREENTEA_LIBDNN_HPP_
3 |
4 | #include <functional>
5 | #include <memory>
6 | #include <string>
7 | #include <vector>
8 |
9 | #include "device.hpp"
10 | #include "libdnn_tuner.hpp"
11 |
12 | #ifdef USE_OPENCL
13 | #include "viennacl/backend/opencl.hpp"
14 | #include "viennacl/ocl/backend.hpp"
15 | #include "viennacl/ocl/context.hpp"
16 | #include "viennacl/ocl/device.hpp"
17 | #include "viennacl/ocl/platform.hpp"
18 | #endif  // USE_OPENCL
19 |
20 | #ifdef USE_CUDA
21 | #include <cuda_runtime.h>
22 | #include "cuda.h"
23 | #include "nvrtc.h"
24 | #endif  // USE_CUDA
25 |
26 | namespace greentea {
27 |
28 | typedef enum {
29 |   // Stack the batch update into one GEMM block
30 |   // (deterministic, 1 kernel call)
31 |   // Serializes the batch and may therefore underuse
32 |   // the GPU's compute units.
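// ---------------------------------------------------------------------------
// Editor's note (illustration only, not part of the original source): choosing
// between the weight-gradient algorithms below is a determinism/speed
// trade-off. A sketch, using the LibDNNConvConfig struct declared later in
// this header:
//
//   LibDNNConvConfig config;
//   config.wgalgo = LIBDNN_CONVOLUTION_WG_ALGO_DIRECT;  // reproducible gradients
//   // or, on devices with working global atomics:
//   config.wgalgo = LIBDNN_CONVOLUTION_WG_ALGO_ATOMIC;  // faster, non-deterministic
// ---------------------------------------------------------------------------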
33 |   LIBDNN_CONVOLUTION_WG_ALGO_DIRECT = 0,
34 |   // Use multiple GEMM blocks in parallel and update weights atomically
35 |   // (non-deterministic, 1 kernel call, not supported on all devices)
36 |   // Parallelizes the batch and therefore achieves higher GPU utilization.
37 |   LIBDNN_CONVOLUTION_WG_ALGO_ATOMIC = 1,
38 |   // Use multiple GEMM blocks and an intermediate buffer
39 |   // to reduce weight updates
40 |   // (deterministic, >= 2 kernel calls)
41 |   // Parallelizes the batch and therefore achieves higher GPU utilization.
42 |   // NOT IMPLEMENTED YET
43 |   LIBDNN_CONVOLUTION_WG_ALGO_REDUCTION = 2
44 | } libdnnConvolutionWeightAlgo_t;
45 |
46 | typedef enum {
47 |   // Transform data before GEMM (load, im2col, gemm, store)
48 |   // This method is suitable for convolutions with similar
49 |   // spatial input == output sizes, but can become inefficient
50 |   // if input >> output (with large strides and kernels).
51 |   LIBDNN_CONVOLUTION_BW_ALGO_IM2COL = 0,
52 |   // Transform data after GEMM (load, gemm, col2im, store)
53 |   // Sometimes faster than the im2col method, but uses
54 |   // atomic operations and is not deterministic.
55 |   LIBDNN_CONVOLUTION_BW_ALGO_COL2IM_ATOMIC = 1
56 | } libdnnConvolutionBackwardAlgo_t;
57 |
58 | typedef enum {
59 |   LIBDNN_POOLING_METHOD_MAX = 0,
60 |   LIBDNN_POOLING_METHOD_AVE = 1,
61 |   LIBDNN_POOLING_METHOD_STO = 2
62 | } libdnnPoolingMethod_t;
63 |
64 | typedef enum {
65 |   LIBDNN_POOLING_BW_ALGO_DIRECT = 0,
66 |   LIBDNN_POOLING_BW_ALGO_ATOMIC = 1
67 | } libdnnPoolingBackwardAlgo_t;
68 |
69 | struct LibDNNConvConfig {
70 |   LibDNNConvConfig() :
71 |     in_shape(3, 1),
72 |     out_shape(3, 1),
73 |     kernel(1, 1),
74 |     pad(0, 0),
75 |     stride(1, 1),
76 |     dilation(1, 1)
77 |   {}
78 |   device* dev_ptr = nullptr;
79 |   std::vector<int_tp> in_shape;
80 |   std::vector<int_tp> out_shape;
81 |   std::vector<int_tp> kernel;
82 |   std::vector<int_tp> pad;
83 |   std::vector<int_tp> stride;
84 |   std::vector<int_tp> dilation;
85 |   int_tp group = 1;
86 |   bool bias_term = false;
87 |   bool fast_unsafe_math = false;
88 |   bool weights_backward = true;
89 |   bool bias_backward = true;
90 |   libdnnConvolutionWeightAlgo_t wgalgo =
91 |       LIBDNN_CONVOLUTION_WG_ALGO_ATOMIC;
92 |   libdnnConvolutionBackwardAlgo_t bwalgo =
93 |       LIBDNN_CONVOLUTION_BW_ALGO_COL2IM_ATOMIC;
94 |   std::function<void(void**, uint_tp, int_tp)>
95 |       memory_allocator = nullptr;
96 | };
97 |
98 | template<typename Dtype>
99 | class LibDNN {
100 |  protected:
101 |   explicit LibDNN();
102 |   virtual void GenerateKernels() = 0;
103 |   virtual std::string string_identifier() = 0;
104 |   std::string generate_header();
105 |   std::string generate_common_defs();
106 |   bool CompileKernels();
107 |   void AllocateMemory(void** ptr, uint_tp size, int_tp flags);
108 |   void SetMemory(Dtype* memory, int_tp count, int_tp offset, Dtype value);
109 | #ifdef USE_OPENCL
110 |   viennacl::ocl::program CompileKernelsOpenCL(viennacl::ocl::context *ctx);
111 | #endif  // USE_OPENCL
112 | #ifdef USE_CUDA
113 |   nvrtcProgram CompileKernelsCuda();
114 | #endif  // USE_CUDA
115 |
116 |   template<class T>
117 |   inline void add_def(std::stringstream& ss,  // NOLINT
118 |                       const char* name, T value) {
119 |     ss << "#ifdef " << name << std::endl;
120 |     ss << "#undef " << name << std::endl;
121 |     ss << "#endif" << std::endl;
122 |     if (std::is_same<T, float>::value) {
123 |       ss << "#define " << name << " (float) " << std::setprecision(32) << value
124 |          << std::endl;
125 |     } else if (std::is_same<T, double>::value) {
126 |       ss << "#define " << name << " (double) " << std::setprecision(32) << value
127 |          << std::endl;
128 |     } else {
129 |       ss << "#define " << name << " " << value << std::endl;
130 |     }
131 |   }
132 |
133 |   template<class T>
134 |   inline void 
add_def(std::stringstream& ss, // NOLINT 135 | const std::string name, T value) { 136 | add_def(ss, name.c_str(), value); 137 | } 138 | 139 | device* dev_ptr_; 140 | 141 | #ifdef USE_OPENCL 142 | viennacl::ocl::program ocl_program_; 143 | #endif // USE_OPENCL 144 | 145 | #ifdef USE_CUDA 146 | nvrtcProgram cuda_program_; 147 | CUmodule cuda_module_; 148 | #endif // USE_CUDA 149 | 150 | std::string kernel_; 151 | 152 | bool fast_unsafe_math_; 153 | }; 154 | 155 | template 156 | class LibDNNConv : public LibDNN { 157 | public: 158 | explicit LibDNNConv(LibDNNConvConfig config); 159 | void Forward(const Dtype* bottom_data, const Dtype* weight, 160 | const Dtype* bias, 161 | Dtype* top_data, int_tp batch_size); 162 | void Backward(bool prop_down_data, bool prop_down_weights, 163 | const Dtype* top_data, const Dtype* top_diff, 164 | const Dtype* weight, Dtype* weight_diff, 165 | const Dtype* bias, Dtype* bias_diff, 166 | const Dtype* bottom_data, Dtype* bottom_diff, 167 | int_tp batch_size); 168 | 169 | void Tune(Dtype* top_data, Dtype* top_diff, 170 | Dtype* weight, Dtype* weight_diff, 171 | Dtype* bias, Dtype* bias_diff, 172 | Dtype* bottom_data, Dtype* bottom_diff, 173 | int_tp batch_size); 174 | 175 | const LibDNNConvConfig get_config(); 176 | 177 | protected: 178 | void GenerateKernels(); 179 | std::string string_identifier(); 180 | std::string generate_fw_defs(); 181 | std::string generate_bw_defs(); 182 | std::string generate_wg_defs(); 183 | std::string generate_gemm_core(std::shared_ptr tuner, 184 | bool dterm); 185 | std::string generate_accreg_init(std::shared_ptr tuner, 186 | bool dterm, bool load); 187 | std::string generate_fw_kernels(std::string name); 188 | std::string generate_bw_kernels(std::string name); 189 | std::string generate_wg_kernels(std::string name); 190 | 191 | private: 192 | LibDNNConvConfig config_; 193 | 194 | // Autotuners 195 | std::shared_ptr fw_tuner_; 196 | std::shared_ptr bw_tuner_; 197 | std::shared_ptr wg_tuner_; 198 | 199 | // Forward GEMM sizes 200 | int_tp M_FW_; 201 | int_tp MG_FW_; 202 | int_tp N_FW_; 203 | int_tp K_FW_; 204 | int_tp KG_FW_; 205 | 206 | // Backward GEMM sizes 207 | int_tp M_BW_; 208 | int_tp MG_BW_; 209 | int_tp N_BW_; 210 | int_tp K_BW_; 211 | int_tp KG_BW_; 212 | 213 | // Weight GEMM sizes 214 | int_tp M_WG_; 215 | int_tp MG_WG_; 216 | int_tp N_WG_; 217 | int_tp NG_WG_; 218 | int_tp K_WG_; 219 | 220 | // Convolution parameters 221 | int_tp num_axes_; 222 | int_tp fmaps_in_; 223 | int_tp fmaps_out_; 224 | int_tp group_; 225 | 226 | std::vector pad_; 227 | std::vector stride_; 228 | std::vector dilation_; 229 | std::vector kernel_shape_; 230 | std::vector im_in_shape_; 231 | std::vector im_out_shape_; 232 | 233 | // Compile and method flags 234 | bool weights_backward_; 235 | bool bias_backward_; 236 | bool bias_term_; 237 | bool skip_range_check_; 238 | Dtype bias_multiplier_; 239 | libdnnConvolutionWeightAlgo_t wgalgo_; 240 | libdnnConvolutionBackwardAlgo_t bwalgo_; 241 | }; 242 | 243 | struct LibDNNPoolConfig { 244 | LibDNNPoolConfig() : 245 | in_shape(3, 1), 246 | out_shape(3, 1), 247 | kernel(1, 1), 248 | pad(0, 0), 249 | stride(1, 1), 250 | dilation(1, 1) 251 | {} 252 | device* dev_ptr = nullptr; 253 | std::vector in_shape; 254 | std::vector out_shape; 255 | std::vector kernel; 256 | std::vector pad; 257 | std::vector stride; 258 | std::vector dilation; 259 | bool use_top_mask = false; 260 | bool fast_unsafe_math = false; 261 | libdnnPoolingMethod_t pool_method = LIBDNN_POOLING_METHOD_MAX; 262 | libdnnPoolingBackwardAlgo_t 
bwalgo = LIBDNN_POOLING_BW_ALGO_ATOMIC;
263 |   bool global_pooling = false;
264 |   std::function<void(void**, uint_tp, int_tp)>
265 |       memory_allocator = nullptr;
266 | };
267 |
268 | template<typename Dtype>
269 | class LibDNNPool : public LibDNN<Dtype> {
270 |  public:
271 |   explicit LibDNNPool(LibDNNPoolConfig config);
272 |   void Forward(const Dtype* bottom_data, Dtype* top_data,
273 |                int_tp channels, int_tp batch_size,
274 |                bool test_mode, int_tp* mask,
275 |                Dtype* top_mask, Dtype* rand_idx);
276 |   void Backward(const Dtype* top_diff, Dtype* bottom_diff,
277 |                 int_tp channels, int_tp batch_size,
278 |                 const int_tp* mask, const Dtype* top_mask,
279 |                 const Dtype* rand_idx);
280 |
281 |   const LibDNNPoolConfig get_config();
282 |
283 |  protected:
284 |   void GenerateKernels();
285 |   std::string string_identifier();
286 |   std::string generate_fw_defs();
287 |   std::string generate_bw_defs();
288 |   std::string generate_fw_kernels(std::string name, bool test_mode);
289 |   std::string generate_fwtr_kernels(std::string name);
290 |   std::string generate_fwte_kernels(std::string name);
291 |   std::string generate_bw_kernels(std::string name);
292 |
293 |  private:
294 |   LibDNNPoolConfig config_;
295 |
296 |   // Autotuners
297 |   std::shared_ptr<LibDNNTuner> fw_tuner_;
298 |   std::shared_ptr<LibDNNTuner> bw_tuner_;
299 |
300 |   // Pooling parameters
301 |   int_tp num_axes_;
302 |
303 |   std::vector<int_tp> pad_;
304 |   std::vector<int_tp> stride_;
305 |   std::vector<int_tp> dilation_;
306 |   std::vector<int_tp> kernel_shape_;
307 |   std::vector<int_tp> im_in_shape_;
308 |   std::vector<int_tp> im_out_shape_;
309 |
310 |   // Working memory for stochastic and max pooling
311 |   int_tp* mask_ = nullptr;
312 |   Dtype* rand_idx_ = nullptr;
313 |
314 |   // Compile and method flags
315 |   bool skip_range_check_;
316 |   libdnnPoolingMethod_t pool_method_;
317 |   libdnnPoolingBackwardAlgo_t bwalgo_;
318 |   bool use_top_mask_;
319 | };
320 |
321 | }  // namespace greentea
322 |
323 | #endif  // GREENTEA_LIBDNN_HPP_
324 |
-------------------------------------------------------------------------------- /src/libdnn.cpp: --------------------------------------------------------------------------------
1 | #include <string>
2 | #include <vector>
3 |
4 | #include "common.hpp"
5 | #include "device.hpp"
6 | #include "libdnn.hpp"
7 | #include "benchmark.hpp"
8 |
9 | namespace greentea {
10 |
11 | template<typename Dtype>
12 | LibDNN<Dtype>::LibDNN() {
13 | }
14 |
15 | template<typename Dtype>
16 | std::string LibDNN<Dtype>::generate_header() {
17 |   std::stringstream ss;
18 |
19 |   if (dev_ptr_->backend() == BACKEND_OpenCL) {
20 |     if (std::is_same<Dtype, double>::value) {
21 |       // Test/enable KHR 64 bit (double)
22 |       ss << "#if defined(cl_khr_fp64)" << std::endl;
23 |       ss << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
24 |       ss << "#define DOUBLE_SUPPORT_AVAILABLE" << std::endl;
25 |
26 |       // Test/enable AMD 64 bit (double)
27 |       ss << "#elif defined(cl_amd_fp64)" << std::endl;
28 |       ss << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
29 |       ss << "#define DOUBLE_SUPPORT_AVAILABLE" << std::endl;
30 |       ss << "#endif" << std::endl;
31 |     }
32 |
33 |     // Test/enable 32 bit atomics
34 |     ss << "#if defined(cl_khr_int32_base_atomics)" << std::endl;
35 |     ss << "#pragma OPENCL EXTENSION cl_khr_int32_base_atomics : enable"
36 |        << std::endl;
37 |     ss << "#define ATOMICS_32_AVAILABLE" << std::endl;
38 |     ss << "#endif" << std::endl;
39 |     ss << "#if defined(cl_khr_global_int32_base_atomics)" << std::endl;
40 |     ss << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable"
41 |        << std::endl;
42 |     ss << "#define ATOMICS_32_AVAILABLE" << std::endl;
43 |     ss << "#endif" << std::endl;
44 |
45 |     // 64 bit integers
46 |     if (sizeof(int_tp) 
== 8 || std::is_same::value) { 47 | // Test/enable 64 bit atomics 48 | ss << "#if defined(cl_khr_int64_base_atomics)" << std::endl; 49 | ss << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" 50 | << std::endl; 51 | ss << "#define ATOMICS_64_AVAILABLE" << std::endl; 52 | ss << "#endif" << std::endl; 53 | } 54 | } 55 | 56 | if (std::is_same::value) { 57 | ss << "#define Dtype double" << std::endl; 58 | ss << "#define Dtype1 double" << std::endl; 59 | // double2, double4, double8, double16 60 | for (int_tp i = 2; i <= 16; i *= 2) { 61 | ss << "#define Dtype" << i << " double" << i << std::endl; 62 | } 63 | } else { 64 | ss << "#define Dtype float" << std::endl; 65 | ss << "#define Dtype1 float" << std::endl; 66 | // float2, float4, float8, float16 67 | for (int_tp i = 2; i <= 16; i *= 2) { 68 | ss << "#define Dtype" << i << " float" << i << std::endl; 69 | } 70 | } 71 | 72 | std::vector elems4({ 73 | "x", "y", "z", "w" }); 74 | std::vector elems16({ 75 | "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", 76 | "s8", "s9", "sA", "sB", "sC", "sD", "sE", "sF" }); 77 | 78 | for (int_tp i = 1; i <= 16; i *= 2) { 79 | for (int_tp j = 0; j < i; ++j) { 80 | if (i == 1) { 81 | ss << "#define VEC_" << i << "_" << j << "(X)" << " X" << std::endl; 82 | } else if (i < 8) { 83 | ss << "#define VEC_" << i << "_" << j << "(X)" << " X." << elems4[j] 84 | << std::endl; 85 | } else { 86 | ss << "#define VEC_" << i << "_" << j << "(X)" << " X." << elems16[j] 87 | << std::endl; 88 | } 89 | } 90 | } 91 | 92 | if (sizeof(int_tp) == 8) { 93 | ss << "#define int_tp long" << std::endl; 94 | ss << "#define uint_tp unsigned long" << std::endl; 95 | ss << "#define int_tpc long" << std::endl; 96 | ss << "#define uint_tpc unsigned long" << std::endl; 97 | } else { 98 | ss << "#define int_tp int" << std::endl; 99 | ss << "#define uint_tp unsigned int" << std::endl; 100 | ss << "#define int_tpc int" << std::endl; 101 | ss << "#define uint_tpc unsigned int" << std::endl; 102 | } 103 | 104 | if (dev_ptr_->backend() == BACKEND_CUDA) { 105 | // Prepare definitions for OpenCL => CUDA cross compile 106 | // Mainly from: http://www.cedricnugteren.nl/tutorial.php?page=10 107 | ss << "#define __kernel __placeholder__" << std::endl; 108 | ss << "#define __global" << std::endl; 109 | ss << "#define __placeholder__ extern \"C\" __global__" << std::endl; 110 | ss << "#define __local __shared__" << std::endl; 111 | ss << "#define __restricted __restricted__" << std::endl; 112 | ss << "#define barrier(x) __syncthreads()" << std::endl; 113 | 114 | ss << "#define FLT_MIN 1.175494350822287507969e-38f" 115 | << std::endl; 116 | ss << "#define FLT_MAX 340282346638528859811704183484516925440.0f" 117 | << std::endl; 118 | 119 | ss << "__device__ int get_local_id(int x) {" << std::endl; 120 | ss << "if (x == 0) return threadIdx.x;" << std::endl; 121 | ss << "if (x == 1) return threadIdx.y;" << std::endl; 122 | ss << "if (x == 2) return threadIdx.z;" << std::endl; 123 | ss << "return 0;" << std::endl; 124 | ss << "}" << std::endl; 125 | 126 | ss << "__device__ int get_group_id(int x) {" << std::endl; 127 | ss << "if (x == 0) return blockIdx.x;" << std::endl; 128 | ss << "if (x == 1) return blockIdx.y;" << std::endl; 129 | ss << "if (x == 2) return blockIdx.z;" << std::endl; 130 | ss << "return 0;" << std::endl; 131 | ss << "}" << std::endl; 132 | 133 | ss << "__device__ int get_global_id(int x) {" << std::endl; 134 | ss << "if (x == 0) return blockIdx.x * blockDim.x" << " + threadIdx.x;" 135 | << std::endl; 136 | ss << "if (x == 1) return 
blockIdx.y * blockDim.y" << " + threadIdx.y;" 137 | << std::endl; 138 | ss << "if (x == 2) return blockIdx.z * blockDim.z" << " + threadIdx.z;" 139 | << std::endl; 140 | ss << "return 0;" << std::endl; 141 | ss << "}" << std::endl; 142 | 143 | ss << "__device__ int get_global_size(int x) {" << std::endl; 144 | ss << "if (x == 0) return blockDim.x * gridDim.x;" << std::endl; 145 | ss << "if (x == 1) return blockDim.y * gridDim.y;" << std::endl; 146 | ss << "if (x == 2) return blockDim.z * gridDim.z;" << std::endl; 147 | ss << "return 0;" << std::endl; 148 | ss << "}" << std::endl; 149 | } 150 | 151 | std::vector atomic_funcs({ "Add", "Sub", "Mul", "Div" }); 152 | std::vector atomic_ops({ "+", "-", "*", "/" }); 153 | 154 | // Atomic operations 155 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 156 | // OpenCL atomics, derived from: 157 | // https://streamcomputing.eu/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/ 158 | if (std::is_same::value) { 159 | ss << "#ifdef ATOMICS_64_AVAILABLE" << std::endl; 160 | } else { 161 | ss << "#ifdef ATOMICS_32_AVAILABLE" << std::endl; 162 | } 163 | for (int i = 0; i < atomic_funcs.size(); ++i) { 164 | ss << "inline void atomic" << atomic_funcs[i]; 165 | ss << "(volatile __global Dtype* source, const Dtype operand) {" 166 | << std::endl; 167 | ss << "union {" << std::endl; 168 | if (std::is_same::value) { 169 | ss << "unsigned long intVal;" << std::endl; 170 | } else { 171 | ss << "unsigned int intVal;" << std::endl; 172 | } 173 | ss << "Dtype floatVal;" << std::endl; 174 | ss << "} next, expected, current;" << std::endl; 175 | ss << "current.floatVal = *source;" << std::endl; 176 | ss << "do {" << std::endl; 177 | ss << "expected.floatVal = current.floatVal;" << std::endl; 178 | ss << "next.floatVal = expected.floatVal " << atomic_ops[i] << " operand;" 179 | << std::endl; 180 | ss << "current.intVal = "; 181 | if (std::is_same::value) { 182 | ss << "atom_cmpxchg((volatile __global unsigned long *)"; 183 | } else { 184 | ss << "atomic_cmpxchg((volatile __global unsigned int *)"; 185 | } 186 | ss << "source, expected.intVal, next.intVal);" << std::endl; 187 | ss << "} while (current.intVal != expected.intVal);" << std::endl; 188 | ss << "}" << std::endl; 189 | } 190 | if (std::is_same::value) { 191 | ss << "#endif" << std::endl; 192 | } else { 193 | ss << "#endif" << std::endl; 194 | } 195 | } 196 | 197 | // Memory set 198 | ss << "__kernel void fill_memory(const int_tp n, const Dtype alpha," 199 | << "__global Dtype* x, const int_tp offx) {" << std::endl; 200 | ss << "for (int_tp index = get_global_id(0); index < n; " 201 | << "index += get_global_size(0)) {" << std::endl; 202 | ss << "x[index + offx] = alpha;" << std::endl; 203 | ss << "}" << std::endl; 204 | ss << "}" << std::endl; 205 | 206 | return ss.str(); 207 | } 208 | 209 | 210 | template 211 | bool LibDNN::CompileKernels() { 212 | std::string code_ext = ""; 213 | 214 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 215 | code_ext = ".cl"; 216 | } 217 | if (dev_ptr_->backend() == BACKEND_CUDA) { 218 | code_ext = ".cu"; 219 | } 220 | 221 | #ifdef LIBDNN_DEBUG 222 | FILE* fp = fopen((".libdnn_debug/" + string_identifier() + code_ext).c_str(), 223 | "wb"); 224 | fwrite(kernel_.c_str(), sizeof(char), kernel_.length(), fp); 225 | fclose(fp); 226 | #endif // LIBDNN_DEBUG 227 | 228 | #ifdef USE_OPENCL 229 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 230 | CompileKernelsOpenCL(&(viennacl::ocl::get_context(dev_ptr_->id()))); 231 | } 232 | #endif // USE_OPENCL 233 | #ifdef USE_CUDA 234 | if 
(dev_ptr_->backend() == BACKEND_CUDA) { 235 | CompileKernelsCuda(); 236 | } 237 | #endif // USE_CUDA 238 | return true; 239 | } 240 | 241 | #ifdef USE_OPENCL 242 | template 243 | viennacl::ocl::program LibDNN::CompileKernelsOpenCL( 244 | viennacl::ocl::context *ctx) { 245 | 246 | std::string build_opts = ""; 247 | 248 | if (fast_unsafe_math_) { 249 | build_opts += "-cl-fast-relaxed-math -cl-mad-enable "; 250 | } 251 | 252 | if (is_same::value) { 253 | build_opts += "-cl-single-precision-constant "; 254 | } 255 | 256 | ctx->build_options(build_opts); 257 | 258 | ocl_program_ = ctx->add_program(kernel_.c_str(), "kernel_program"); 259 | 260 | #ifdef LIBDNN_DEBUG 261 | size_t bin_sz; 262 | clGetProgramInfo(ocl_program_.handle().get(), 263 | CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_sz, NULL); 264 | unsigned char *bin = (unsigned char *)malloc(bin_sz); // NOLINT 265 | clGetProgramInfo(ocl_program_.handle().get(), 266 | CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL); 267 | FILE* fp = fopen((".libdnn_debug/" + string_identifier() + ".clptx").c_str(), 268 | "wb"); 269 | fwrite(bin, sizeof(char), bin_sz, fp); 270 | fclose(fp); 271 | free(bin); // NOLINT 272 | #endif // LIBDNN_DEBUG 273 | 274 | return ocl_program_; 275 | } 276 | #endif // USE_OPENCL 277 | 278 | #ifdef USE_CUDA 279 | template 280 | nvrtcProgram LibDNN::CompileKernelsCuda() { 281 | nvrtcCreateProgram(&cuda_program_, kernel_.c_str(), NULL, 0, NULL, NULL); 282 | 283 | std::vector build_opts; 284 | 285 | cudaDeviceProp prop; 286 | cudaGetDeviceProperties(&prop, dev_ptr_->id()); 287 | 288 | std::string arch_opt = "--gpu-architecture=compute_" 289 | + std::to_string(prop.major) + std::to_string(prop.minor); 290 | std::string stdcpp_opt = "--std=c++11"; 291 | std::string fum_opt = "--use_fast_math"; 292 | 293 | build_opts.push_back(arch_opt.c_str()); 294 | build_opts.push_back(stdcpp_opt.c_str()); 295 | if (fast_unsafe_math_) { 296 | build_opts.push_back(fum_opt.c_str()); 297 | } 298 | nvrtcCompileProgram(cuda_program_, build_opts.size(), &build_opts[0]); 299 | 300 | size_t ptxSize; 301 | nvrtcGetPTXSize(cuda_program_, &ptxSize); 302 | char *ptx = new char[ptxSize]; 303 | nvrtcGetPTX(cuda_program_, ptx); 304 | 305 | cuModuleLoadDataEx(&cuda_module_, ptx, 0, 0, 0); 306 | 307 | #ifdef LIBDNN_DEBUG 308 | size_t log_size; 309 | nvrtcGetProgramLogSize(cuda_program_, &log_size); 310 | std::vector log(log_size); 311 | nvrtcGetProgramLog(cuda_program_, log.data()); 312 | 313 | std::cout << "CUDA compile log:" << std::endl; 314 | std::cout << log.data() << std::endl; 315 | 316 | FILE* fp = fopen((".libdnn_debug/" + string_identifier() + ".cuptx").c_str(), 317 | "wb"); 318 | fwrite(ptx, sizeof(char), ptxSize, fp); 319 | fclose(fp); 320 | free(ptx); 321 | #endif // LIBDNN_DEBUG 322 | 323 | return cuda_program_; 324 | } 325 | #endif // USE_CUDA 326 | 327 | template 328 | void LibDNN::AllocateMemory(void** ptr, uint_tp size, int_tp flags) { 329 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 330 | #ifdef USE_OPENCL 331 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_ptr_->id()); 332 | *ptr = (void*)clCreateBuffer(ctx.handle().get(), // NOLINT 333 | flags, 334 | size, nullptr, nullptr); 335 | #endif // USE_OPENCL 336 | } else { 337 | #ifdef USE_CUDA 338 | cudaMalloc(ptr, size); 339 | #endif // USE_CUDA 340 | } 341 | } 342 | 343 | template 344 | void LibDNN::SetMemory(Dtype* memory, int_tp count, int_tp offset, 345 | Dtype value) { 346 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 347 | #ifdef USE_OPENCL 348 | 
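// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the original source): the OpenCL
// branch below launches the "fill_memory" kernel that generate_header() emits.
// The global work size is padded to the next multiple of the work-group size,
// e.g. count = 1000 with wgs = 256 yields ((1000 - 1) / 256 + 1) * 256 = 1024
// work items; the kernel's own bounds check (index < n) skips the excess.
// ---------------------------------------------------------------------------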
viennacl::ocl::kernel &kernel = ocl_program_.get_kernel("fill_memory"); 349 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_ptr_->id()); 350 | 351 | int wgs = dev_ptr_->workgroup_size(0); 352 | 353 | kernel.local_work_size(0, wgs); 354 | kernel.local_work_size(1, 1); 355 | kernel.local_work_size(2, 1); 356 | 357 | kernel.global_work_size(0, ((count - 1) / wgs + 1) * wgs); 358 | kernel.global_work_size(1, 1); 359 | kernel.global_work_size(2, 1); 360 | 361 | viennacl::ocl::enqueue( 362 | kernel(count, value, WrapHandle((cl_mem) memory, &ctx), offset), 363 | ctx.get_queue()); 364 | #endif // USE_OPENCL 365 | } else { 366 | #ifdef USE_CUDA 367 | CUfunction kernel; 368 | cuModuleGetFunction(&kernel, cuda_module_, "fill_memory"); 369 | 370 | void *args[] = { &count, &value, &memory, &offset }; 371 | cuLaunchKernel(kernel, (count + 512 - 1) / 512, // Grid X 372 | 1, // Grid Y 373 | 1, // Grid Z 374 | 512, 1, 1, // Local 375 | 0, NULL, args, 0); // Arguments 376 | #endif // USE_CUDA 377 | } 378 | } 379 | 380 | 381 | template class LibDNN; 382 | template class LibDNN; 383 | 384 | } // namespace greentea 385 | -------------------------------------------------------------------------------- /cmake/Utils.cmake: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Command alias for debugging messages 3 | # Usage: 4 | # dmsg() 5 | function(dmsg) 6 | message(STATUS ${ARGN}) 7 | endfunction() 8 | 9 | ################################################################################################ 10 | # Removes duplicates from list(s) 11 | # Usage: 12 | # greentea_list_unique( [] [...]) 13 | macro(greentea_list_unique) 14 | foreach(__lst ${ARGN}) 15 | if(${__lst}) 16 | list(REMOVE_DUPLICATES ${__lst}) 17 | endif() 18 | endforeach() 19 | endmacro() 20 | 21 | ################################################################################################ 22 | # Clears variables from list 23 | # Usage: 24 | # greentea_clear_vars() 25 | macro(greentea_clear_vars) 26 | foreach(_var ${ARGN}) 27 | unset(${_var}) 28 | endforeach() 29 | endmacro() 30 | 31 | ################################################################################################ 32 | # Removes duplicates from string 33 | # Usage: 34 | # greentea_string_unique() 35 | function(greentea_string_unique __string) 36 | if(${__string}) 37 | set(__list ${${__string}}) 38 | separate_arguments(__list) 39 | list(REMOVE_DUPLICATES __list) 40 | foreach(__e ${__list}) 41 | set(__str "${__str} ${__e}") 42 | endforeach() 43 | set(${__string} ${__str} PARENT_SCOPE) 44 | endif() 45 | endfunction() 46 | 47 | ################################################################################################ 48 | # Prints list element per line 49 | # Usage: 50 | # greentea_print_list() 51 | function(greentea_print_list) 52 | foreach(e ${ARGN}) 53 | message(STATUS ${e}) 54 | endforeach() 55 | endfunction() 56 | 57 | ################################################################################################ 58 | # Function merging lists of compiler flags to single string. 59 | # Usage: 60 | # greentea_merge_flag_lists(out_variable [] [] ...) 
61 | function(greentea_merge_flag_lists out_var) 62 | set(__result "") 63 | foreach(__list ${ARGN}) 64 | foreach(__flag ${${__list}}) 65 | string(STRIP ${__flag} __flag) 66 | set(__result "${__result} ${__flag}") 67 | endforeach() 68 | endforeach() 69 | string(STRIP ${__result} __result) 70 | set(${out_var} ${__result} PARENT_SCOPE) 71 | endfunction() 72 | 73 | ################################################################################################ 74 | # Converts all paths in list to absolute 75 | # Usage: 76 | # greentea_convert_absolute_paths() 77 | function(greentea_convert_absolute_paths variable) 78 | set(__dlist "") 79 | foreach(__s ${${variable}}) 80 | get_filename_component(__abspath ${__s} ABSOLUTE) 81 | list(APPEND __list ${__abspath}) 82 | endforeach() 83 | set(${variable} ${__list} PARENT_SCOPE) 84 | endfunction() 85 | 86 | ################################################################################################ 87 | # Reads set of version defines from the header file 88 | # Usage: 89 | # greentea_parse_header( ..) 90 | macro(greentea_parse_header FILENAME FILE_VAR) 91 | set(vars_regex "") 92 | set(__parnet_scope OFF) 93 | set(__add_cache OFF) 94 | foreach(name ${ARGN}) 95 | if("${name}" STREQUAL "PARENT_SCOPE") 96 | set(__parnet_scope ON) 97 | elseif("${name}" STREQUAL "CACHE") 98 | set(__add_cache ON) 99 | elseif(vars_regex) 100 | set(vars_regex "${vars_regex}|${name}") 101 | else() 102 | set(vars_regex "${name}") 103 | endif() 104 | endforeach() 105 | if(EXISTS "${FILENAME}") 106 | file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" ) 107 | else() 108 | unset(${FILE_VAR}) 109 | endif() 110 | foreach(name ${ARGN}) 111 | if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE") 112 | if(${FILE_VAR}) 113 | if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*") 114 | string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}") 115 | else() 116 | set(${name} "") 117 | endif() 118 | if(__add_cache) 119 | set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE) 120 | elseif(__parnet_scope) 121 | set(${name} "${${name}}" PARENT_SCOPE) 122 | endif() 123 | else() 124 | unset(${name} CACHE) 125 | endif() 126 | endif() 127 | endforeach() 128 | endmacro() 129 | 130 | ################################################################################################ 131 | # Reads single version define from the header file and parses it 132 | # Usage: 133 | # greentea_parse_header_single_define( ) 134 | function(greentea_parse_header_single_define LIBNAME HDR_PATH VARNAME) 135 | set(${LIBNAME}_H "") 136 | if(EXISTS "${HDR_PATH}") 137 | file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1) 138 | endif() 139 | 140 | if(${LIBNAME}_H) 141 | string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}") 142 | string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}") 143 | string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}") 144 | set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) 145 | set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) 146 | set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) 147 | set(${LIBNAME}_VERSION_STRING 
"${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) 148 | 149 | # append a TWEAK version if it exists: 150 | set(${LIBNAME}_VERSION_TWEAK "") 151 | if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") 152 | set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) 153 | endif() 154 | if(${LIBNAME}_VERSION_TWEAK) 155 | set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) 156 | else() 157 | set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) 158 | endif() 159 | endif() 160 | endfunction() 161 | 162 | ######################################################################################################## 163 | # An option that the user can select. Can accept condition to control when option is available for user. 164 | # Usage: 165 | # greentea_option( "doc string" [IF ]) 166 | function(greentea_option variable description value) 167 | set(__value ${value}) 168 | set(__condition "") 169 | set(__varname "__value") 170 | foreach(arg ${ARGN}) 171 | if(arg STREQUAL "IF" OR arg STREQUAL "if") 172 | set(__varname "__condition") 173 | else() 174 | list(APPEND ${__varname} ${arg}) 175 | endif() 176 | endforeach() 177 | unset(__varname) 178 | if("${__condition}" STREQUAL "") 179 | set(__condition 2 GREATER 1) 180 | endif() 181 | 182 | if(${__condition}) 183 | if("${__value}" MATCHES ";") 184 | if(${__value}) 185 | option(${variable} "${description}" ON) 186 | else() 187 | option(${variable} "${description}" OFF) 188 | endif() 189 | elseif(DEFINED ${__value}) 190 | if(${__value}) 191 | option(${variable} "${description}" ON) 192 | else() 193 | option(${variable} "${description}" OFF) 194 | endif() 195 | else() 196 | option(${variable} "${description}" ${__value}) 197 | endif() 198 | else() 199 | unset(${variable} CACHE) 200 | endif() 201 | endfunction() 202 | 203 | ################################################################################################ 204 | # Utility macro for comparing two lists. Used for CMake debugging purposes 205 | # Usage: 206 | # greentea_compare_lists( [description]) 207 | function(greentea_compare_lists list1 list2 desc) 208 | set(__list1 ${${list1}}) 209 | set(__list2 ${${list2}}) 210 | list(SORT __list1) 211 | list(SORT __list2) 212 | list(LENGTH __list1 __len1) 213 | list(LENGTH __list2 __len2) 214 | 215 | if(NOT ${__len1} EQUAL ${__len2}) 216 | message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") 217 | endif() 218 | 219 | foreach(__i RANGE 1 ${__len1}) 220 | math(EXPR __index "${__i}- 1") 221 | list(GET __list1 ${__index} __item1) 222 | list(GET __list2 ${__index} __item2) 223 | if(NOT ${__item1} STREQUAL ${__item2}) 224 | message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. 
${desc}") 225 | endif() 226 | endforeach() 227 | endfunction() 228 | 229 | ################################################################################################ 230 | # Command for disabling warnings for different platforms (see below for gcc and VisualStudio) 231 | # Usage: 232 | # greentea_warnings_disable( -Wshadow /wd4996 ..,) 233 | macro(greentea_warnings_disable) 234 | set(_flag_vars "") 235 | set(_msvc_warnings "") 236 | set(_gxx_warnings "") 237 | 238 | foreach(arg ${ARGN}) 239 | if(arg MATCHES "^CMAKE_") 240 | list(APPEND _flag_vars ${arg}) 241 | elseif(arg MATCHES "^/wd") 242 | list(APPEND _msvc_warnings ${arg}) 243 | elseif(arg MATCHES "^-W") 244 | list(APPEND _gxx_warnings ${arg}) 245 | endif() 246 | endforeach() 247 | 248 | if(NOT _flag_vars) 249 | set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) 250 | endif() 251 | 252 | if(MSVC AND _msvc_warnings) 253 | foreach(var ${_flag_vars}) 254 | foreach(warning ${_msvc_warnings}) 255 | set(${var} "${${var}} ${warning}") 256 | endforeach() 257 | endforeach() 258 | elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) 259 | foreach(var ${_flag_vars}) 260 | foreach(warning ${_gxx_warnings}) 261 | if(NOT warning MATCHES "^-Wno-") 262 | string(REPLACE "${warning}" "" ${var} "${${var}}") 263 | string(REPLACE "-W" "-Wno-" warning "${warning}") 264 | endif() 265 | set(${var} "${${var}} ${warning}") 266 | endforeach() 267 | endforeach() 268 | endif() 269 | greentea_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) 270 | endmacro() 271 | 272 | ################################################################################################ 273 | # Helper function get current definitions 274 | # Usage: 275 | # greentea_get_current_definitions() 276 | function(greentea_get_current_definitions definitions_var) 277 | get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) 278 | set(result "") 279 | 280 | foreach(d ${current_definitions}) 281 | list(APPEND result -D${d}) 282 | endforeach() 283 | 284 | greentea_list_unique(result) 285 | set(${definitions_var} ${result} PARENT_SCOPE) 286 | endfunction() 287 | 288 | ################################################################################################ 289 | # Helper function get current includes/definitions 290 | # Usage: 291 | # greentea_get_current_cflags() 292 | function(greentea_get_current_cflags cflags_var) 293 | get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) 294 | greentea_convert_absolute_paths(current_includes) 295 | greentea_get_current_definitions(cflags) 296 | 297 | foreach(i ${current_includes}) 298 | list(APPEND cflags "-I${i}") 299 | endforeach() 300 | 301 | greentea_list_unique(cflags) 302 | set(${cflags_var} ${cflags} PARENT_SCOPE) 303 | endfunction() 304 | 305 | ################################################################################################ 306 | # Helper function to parse current linker libs into link directories, libflags and osx frameworks 307 | # Usage: 308 | # greentea_parse_linker_libs( ) 309 | function(greentea_parse_linker_libs greentea_LINKER_LIBS_variable folders_var flags_var frameworks_var) 310 | 311 | set(__unspec "") 312 | set(__debug "") 313 | set(__optimized "") 314 | set(__framework "") 315 | set(__varname "__unspec") 316 | 317 | # split libs into debug, optimized, unspecified and frameworks 318 | foreach(list_elem ${${greentea_LINKER_LIBS_variable}}) 319 | if(list_elem STREQUAL "debug") 320 | set(__varname "__debug") 321 | elseif(list_elem STREQUAL "optimized") 322 
| set(__varname "__optimized") 323 | elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") 324 | list(APPEND __framework -framework ${CMAKE_MATCH_1}) 325 | else() 326 | list(APPEND ${__varname} ${list_elem}) 327 | set(__varname "__unspec") 328 | endif() 329 | endforeach() 330 | 331 | # attach debug or optimized libs to unspecified according to current configuration 332 | if(CMAKE_BUILD_TYPE MATCHES "Debug") 333 | set(__libs ${__unspec} ${__debug}) 334 | else() 335 | set(__libs ${__unspec} ${__optimized}) 336 | endif() 337 | 338 | set(libflags "") 339 | set(folders "") 340 | 341 | # convert linker libraries list to link flags 342 | foreach(lib ${__libs}) 343 | if(TARGET ${lib}) 344 | list(APPEND folders $) 345 | list(APPEND libflags -l${lib}) 346 | elseif(lib MATCHES "^-l.*") 347 | list(APPEND libflags ${lib}) 348 | elseif(IS_ABSOLUTE ${lib}) 349 | get_filename_component(folder ${lib} PATH) 350 | get_filename_component(filename ${lib} NAME) 351 | string(REGEX REPLACE "\\.[^.]*$" "" filename_without_shortest_ext ${filename}) 352 | 353 | string(REGEX MATCH "^lib(.*)" __match ${filename_without_shortest_ext}) 354 | list(APPEND libflags -l${CMAKE_MATCH_1}) 355 | list(APPEND folders ${folder}) 356 | else() 357 | message(FATAL_ERROR "Logic error. Need to update cmake script") 358 | endif() 359 | endforeach() 360 | 361 | greentea_list_unique(libflags folders) 362 | 363 | set(${folders_var} ${folders} PARENT_SCOPE) 364 | set(${flags_var} ${libflags} PARENT_SCOPE) 365 | set(${frameworks_var} ${__framework} PARENT_SCOPE) 366 | endfunction() 367 | 368 | ################################################################################################ 369 | # Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... 370 | # Usage: 371 | # greentea_detect_darwin_version() 372 | function(greentea_detect_darwin_version output_var) 373 | if(APPLE) 374 | execute_process(COMMAND /usr/bin/sw_vers -productVersion 375 | RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out 376 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 377 | 378 | set(${output_var} ${__sw_vers_out} PARENT_SCOPE) 379 | else() 380 | set(${output_var} "" PARENT_SCOPE) 381 | endif() 382 | endfunction() 383 | -------------------------------------------------------------------------------- /src/device.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "device.hpp" 6 | 7 | namespace greentea { 8 | 9 | #ifdef USE_OPENCL 10 | void device::setupViennaCLContext( 11 | int id, cl_context ctx, 12 | cl_device_id dev, 13 | cl_command_queue queue) { 14 | viennacl::ocl::setup_context(id, ctx, dev, queue); 15 | } 16 | #endif 17 | 18 | device::device() 19 | : current_queue_id_(0) 20 | , workgroup_sizes_(3, 0) 21 | , id_(0) 22 | , list_id_(0) 23 | , backend_(Backend::BACKEND_CPU) 24 | , memory_usage_(0) 25 | , peak_memory_usage_(0) 26 | , host_unified_(false) 27 | , name_(""){} 28 | 29 | device::device(int id, 30 | int list_id, 31 | Backend backend) 32 | : current_queue_id_(0) 33 | , workgroup_sizes_(3, 0) 34 | , id_(id) 35 | , list_id_(list_id) 36 | , backend_(backend) 37 | , memory_usage_(0) 38 | , peak_memory_usage_(0) 39 | , host_unified_(false) 40 | , name_(""){} 41 | 42 | void device::Init() { 43 | #ifndef CPU_ONLY 44 | if (backend_ == BACKEND_CUDA) { 45 | #ifdef USE_CUDA 46 | workgroup_sizes_[0] = CUDA_NUM_THREADS; 47 | workgroup_sizes_[1] = CUDA_NUM_THREADS; 48 | workgroup_sizes_[2] = CUDA_NUM_THREADS; 49 | #endif // USE_CUDA 50 | } else { 51 | 
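// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the original source): a host application
// typically prepares a device along these lines before passing it to
// LibDNNConv/LibDNNPool through the dev_ptr config field (for OpenCL, a
// ViennaCL context has to be registered first, e.g. via setupViennaCLContext):
//
//   greentea::device dev(0, 0, greentea::Backend::BACKEND_OpenCL);
//   dev.Init();                       // fills workgroup_sizes_ below
//   int wgs = dev.workgroup_size(0);  // max work items in dimension 0
// ---------------------------------------------------------------------------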
#ifdef USE_OPENCL 52 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 53 | 54 | std::vector temp(3); 55 | clGetDeviceInfo(ctx.devices()[0].id(), 56 | CL_DEVICE_MAX_WORK_ITEM_SIZES, 57 | 3 * sizeof(size_t), &temp[0], NULL); 58 | workgroup_sizes_[0] = temp[0]; 59 | workgroup_sizes_[1] = temp[1]; 60 | workgroup_sizes_[2] = temp[2]; 61 | cl_bool host_unified; 62 | clGetDeviceInfo(ctx.devices()[0].id(), 63 | CL_DEVICE_HOST_UNIFIED_MEMORY, 64 | sizeof(cl_bool), &host_unified, NULL); 65 | 66 | host_unified_ = host_unified; 67 | #endif // USE_OPENCL 68 | } 69 | #endif // !CPU_ONLY 70 | } 71 | 72 | Backend device::backend() const { 73 | return backend_; 74 | } 75 | 76 | int device::id() const { 77 | return id_; 78 | } 79 | 80 | int device::list_id() const { 81 | return list_id_; 82 | } 83 | 84 | int device::workgroup_size(int id) { 85 | return workgroup_sizes_[id % 3]; 86 | } 87 | 88 | #ifdef USE_OPENCL 89 | viennacl::ocl::program& device::program() { 90 | return ocl_program_; 91 | } 92 | #endif 93 | 94 | int device::num_queues() { 95 | if (backend_ == BACKEND_CUDA) { 96 | #ifdef USE_CUDA 97 | return 1; 98 | #endif // USE_CUDA 99 | } else { 100 | #ifdef USE_OPENCL 101 | return 1; 102 | #endif // USE_OPENCL 103 | } 104 | return 1; 105 | } 106 | 107 | int device::current_queue_id() { 108 | return current_queue_id_; 109 | } 110 | 111 | void device::SwitchQueue(const int id) { 112 | if (backend_ == BACKEND_CUDA) { 113 | #ifdef USE_CUDA 114 | (void) id; 115 | #endif // USE_CUDA 116 | } else { 117 | #ifdef USE_OPENCL 118 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 119 | ctx.switch_queue(id % num_queues()); 120 | current_queue_id_ = id % num_queues(); 121 | #endif // USE_OPENCL 122 | } 123 | } 124 | 125 | void device::FinishQueues() { 126 | if (backend_ == BACKEND_CUDA) { 127 | #ifdef USE_CUDA 128 | #endif // USE_CUDA 129 | } else { 130 | #ifdef USE_OPENCL 131 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 132 | for (int i = 0; i < num_queues(); ++i) { 133 | ctx.switch_queue(i); 134 | ctx.get_queue().finish(); 135 | } 136 | ctx.switch_queue(0); 137 | current_queue_id_ = 0; 138 | #endif // USE_OPENCL 139 | } 140 | } 141 | 142 | uint_tp device::memory_usage() { 143 | return memory_usage_; 144 | } 145 | 146 | uint_tp device::peak_memory_usage() { 147 | return peak_memory_usage_; 148 | } 149 | 150 | std::string device::name() { 151 | if (name_ == "") { 152 | if (backend_ == BACKEND_OpenCL) { 153 | #ifdef USE_OPENCL 154 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 155 | 156 | size_t size; 157 | size_t max_size = 1024 * 1024; 158 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_NAME, 159 | 0, NULL, &size); 160 | 161 | // Cap at 1 MB to capture faulty OpenCL implementations (nVidia) 162 | std::vector exts(std::min(size, max_size)); 163 | 164 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_NAME, 165 | std::min(size, max_size), &(exts[0]), NULL); 166 | 167 | std::string extsstr(&(exts[0])); 168 | std::replace(extsstr.begin(), extsstr.end(), ' ', '_'); 169 | name_ = extsstr; 170 | #endif // USE_OPENCL 171 | } else { 172 | #ifdef USE_CUDA 173 | cudaDeviceProp prop; 174 | cudaGetDeviceProperties(&prop, id_); 175 | std::string extsstr(&prop.name[0]); 176 | std::replace(extsstr.begin(), extsstr.end(), ' ', '_'); 177 | name_ = extsstr; 178 | #endif // USE_CUDA 179 | } 180 | } 181 | return name_; 182 | } 183 | 184 | void device::IncreaseMemoryUsage(uint_tp bytes) { 185 | memory_usage_ += bytes; 186 | if (memory_usage_ > peak_memory_usage_) { 
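// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the original source): memory_usage_
// is a running total and peak_memory_usage_ its high-water mark; the branch
// below records a new peak. Callers are expected to pair the helpers around
// allocations, for example:
//
//   dev->IncreaseMemoryUsage(size);  // right after clCreateBuffer()/cudaMalloc()
//   // ... use the buffer ...
//   dev->DecreaseMemoryUsage(size);  // when the buffer is released
// ---------------------------------------------------------------------------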
187 | peak_memory_usage_ = memory_usage_; 188 | } 189 | } 190 | 191 | void device::DecreaseMemoryUsage(uint_tp bytes) { 192 | memory_usage_ -= bytes; 193 | } 194 | 195 | void device::ResetPeakMemoryUsage() { 196 | peak_memory_usage_ = memory_usage_; 197 | } 198 | 199 | bool device::CheckCapability(std::string cap) { 200 | if (backend_ == BACKEND_OpenCL) { 201 | #ifdef USE_OPENCL 202 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 203 | 204 | size_t size; 205 | size_t max_size = 1024 * 1024; 206 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_EXTENSIONS, 207 | 0, NULL, &size); 208 | 209 | // Cap at 1 MB to capture faulty OpenCL implementations (nVidia) 210 | std::vector exts(std::min(size, max_size)); 211 | 212 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_EXTENSIONS, 213 | size, &(exts[0]), NULL); 214 | 215 | std::string extsstr(&(exts[0])); 216 | return extsstr.find(cap) != std::string::npos; 217 | #endif 218 | } 219 | return false; 220 | } 221 | 222 | bool device::CheckVendor(std::string vendor) { 223 | if (backend_ == Backend::BACKEND_CUDA) { 224 | if (vendor.compare("NVIDIA") == 0) 225 | return true; 226 | } 227 | #ifdef USE_OPENCL 228 | else if (backend_ == BACKEND_OpenCL) { 229 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 230 | const viennacl::ocl::device &device = ctx.current_device(); 231 | 232 | if (device.vendor().find(vendor) != std::string::npos) 233 | return true; 234 | } 235 | #endif 236 | 237 | return false; 238 | } 239 | 240 | #ifdef USE_OPENCL 241 | bool device::is_host_unified() { 242 | return host_unified_; 243 | } 244 | 245 | const char* clGetErrorString(cl_int error) { 246 | switch (error) { 247 | case 0: return "CL_SUCCESS"; 248 | case -1: return "CL_DEVICE_NOT_FOUND"; 249 | case -2: return "CL_DEVICE_NOT_AVAILABLE"; 250 | case -3: return "CL_COMPILER_NOT_AVAILABLE"; 251 | case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; 252 | case -5: return "CL_OUT_OF_RESOURCES"; 253 | case -6: return "CL_OUT_OF_HOST_MEMORY"; 254 | case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; 255 | case -8: return "CL_MEM_COPY_OVERLAP"; 256 | case -9: return "CL_IMAGE_FORMAT_MISMATCH"; 257 | case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; 258 | case -11: return "CL_BUILD_PROGRAM_FAILURE"; 259 | case -12: return "CL_MAP_FAILURE"; 260 | case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; 261 | case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; 262 | case -15: return "CL_COMPILE_PROGRAM_FAILURE"; 263 | case -16: return "CL_LINKER_NOT_AVAILABLE"; 264 | case -17: return "CL_LINK_PROGRAM_FAILURE"; 265 | case -18: return "CL_DEVICE_PARTITION_FAILED"; 266 | case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; 267 | case -30: return "CL_INVALID_VALUE"; 268 | case -31: return "CL_INVALID_DEVICE_TYPE"; 269 | case -32: return "CL_INVALID_PLATFORM"; 270 | case -33: return "CL_INVALID_DEVICE"; 271 | case -34: return "CL_INVALID_CONTEXT"; 272 | case -35: return "CL_INVALID_QUEUE_PROPERTIES"; 273 | case -36: return "CL_INVALID_COMMAND_QUEUE"; 274 | case -37: return "CL_INVALID_HOST_PTR"; 275 | case -38: return "CL_INVALID_MEM_OBJECT"; 276 | case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 277 | case -40: return "CL_INVALID_IMAGE_SIZE"; 278 | case -41: return "CL_INVALID_SAMPLER"; 279 | case -42: return "CL_INVALID_BINARY"; 280 | case -43: return "CL_INVALID_BUILD_OPTIONS"; 281 | case -44: return "CL_INVALID_PROGRAM"; 282 | case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; 283 | case -46: return "CL_INVALID_KERNEL_NAME"; 284 | case -47: 
return "CL_INVALID_KERNEL_DEFINITION"; 285 | case -48: return "CL_INVALID_KERNEL"; 286 | case -49: return "CL_INVALID_ARG_INDEX"; 287 | case -50: return "CL_INVALID_ARG_VALUE"; 288 | case -51: return "CL_INVALID_ARG_SIZE"; 289 | case -52: return "CL_INVALID_KERNEL_ARGS"; 290 | case -53: return "CL_INVALID_WORK_DIMENSION"; 291 | case -54: return "CL_INVALID_WORK_GROUP_SIZE"; 292 | case -55: return "CL_INVALID_WORK_ITEM_SIZE"; 293 | case -56: return "CL_INVALID_GLOBAL_OFFSET"; 294 | case -57: return "CL_INVALID_EVENT_WAIT_LIST"; 295 | case -58: return "CL_INVALID_EVENT"; 296 | case -59: return "CL_INVALID_OPERATION"; 297 | case -60: return "CL_INVALID_GL_OBJECT"; 298 | case -61: return "CL_INVALID_BUFFER_SIZE"; 299 | case -62: return "CL_INVALID_MIP_LEVEL"; 300 | case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; 301 | case -64: return "CL_INVALID_PROPERTY"; 302 | case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; 303 | case -66: return "CL_INVALID_COMPILER_OPTIONS"; 304 | case -67: return "CL_INVALID_LINKER_OPTIONS"; 305 | case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; 306 | case -69: return "CL_INVALID_PIPE_SIZE"; 307 | case -70: return "CL_INVALID_DEVICE_QUEUE"; 308 | case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; 309 | case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; 310 | case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; 311 | case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; 312 | case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; 313 | case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; 314 | case -1024: return "clBLAS: Functionality is not implemented"; 315 | case -1023: return "clBLAS: Library is not initialized yet"; 316 | case -1022: return "clBLAS: Matrix A is not a valid memory object"; 317 | case -1021: return "clBLAS: Matrix B is not a valid memory object"; 318 | case -1020: return "clBLAS: Matrix C is not a valid memory object"; 319 | case -1019: return "clBLAS: Vector X is not a valid memory object"; 320 | case -1018: return "clBLAS: Vector Y is not a valid memory object"; 321 | case -1017: return "clBLAS: An input dimension (M:N:K) is invalid"; 322 | case -1016: return "clBLAS: Leading dimension A must not be less than the " 323 | "size of the first dimension"; 324 | case -1015: return "clBLAS: Leading dimension B must not be less than the " 325 | "size of the second dimension"; 326 | case -1014: return "clBLAS: Leading dimension C must not be less than the " 327 | "size of the third dimension"; 328 | case -1013: return "clBLAS: The increment for a vector X must not be 0"; 329 | case -1012: return "clBLAS: The increment for a vector Y must not be 0"; 330 | case -1011: return "clBLAS: The memory object for Matrix A is too small"; 331 | case -1010: return "clBLAS: The memory object for Matrix B is too small"; 332 | case -1009: return "clBLAS: The memory object for Matrix C is too small"; 333 | case -1008: return "clBLAS: The memory object for Vector X is too small"; 334 | case -1007: return "clBLAS: The memory object for Vector Y is too small"; 335 | default: return "Unknown OpenCL error"; 336 | } 337 | } 338 | 339 | #ifdef USE_FFT 340 | const char* clfftGetErrorString(clfftStatus status) { 341 | switch (status) { 342 | case CLFFT_SUCCESS: 343 | return "CLFFT_SUCCESS"; 344 | case CLFFT_INVALID_PLAN: 345 | return "CLFFT_INVALID_PLAN"; 346 | case CLFFT_INVALID_GLOBAL_WORK_SIZE: 347 | return "CLFFT_INVALID_GLOBAL_WORK_SIZE"; 348 | case CLFFT_INVALID_MIP_LEVEL: 349 | return "CLFFT_INVALID_MIP_LEVEL"; 350 | case CLFFT_INVALID_BUFFER_SIZE: 351 | 
return "CLFFT_INVALID_BUFFER_SIZE"; 352 | case CLFFT_INVALID_GL_OBJECT: 353 | return "CLFFT_INVALID_GL_OBJECT"; 354 | case CLFFT_INVALID_OPERATION: 355 | return "CLFFT_INVALID_OPERATION"; 356 | case CLFFT_INVALID_EVENT: 357 | return "CLFFT_INVALID_EVENT"; 358 | case CLFFT_INVALID_EVENT_WAIT_LIST: 359 | return "CLFFT_INVALID_EVENT_WAIT_LIST"; 360 | case CLFFT_INVALID_GLOBAL_OFFSET: 361 | return "CLFFT_INVALID_GLOBAL_OFFSET"; 362 | case CLFFT_INVALID_WORK_ITEM_SIZE: 363 | return "CLFFT_INVALID_WORK_ITEM_SIZE"; 364 | case CLFFT_INVALID_WORK_GROUP_SIZE: 365 | return "CLFFT_INVALID_WORK_GROUP_SIZE"; 366 | case CLFFT_INVALID_WORK_DIMENSION: 367 | return "CLFFT_INVALID_WORK_DIMENSION"; 368 | case CLFFT_INVALID_KERNEL_ARGS: 369 | return "CLFFT_INVALID_KERNEL_ARGS"; 370 | case CLFFT_INVALID_ARG_SIZE: 371 | return "CLFFT_INVALID_ARG_SIZE"; 372 | case CLFFT_INVALID_ARG_VALUE: 373 | return "CLFFT_INVALID_ARG_VALUE"; 374 | case CLFFT_INVALID_ARG_INDEX: 375 | return "CLFFT_INVALID_ARG_INDEX"; 376 | case CLFFT_INVALID_KERNEL: 377 | return "CLFFT_INVALID_KERNEL"; 378 | case CLFFT_INVALID_KERNEL_DEFINITION: 379 | return "CLFFT_INVALID_KERNEL_DEFINITION"; 380 | case CLFFT_INVALID_KERNEL_NAME: 381 | return "CLFFT_INVALID_KERNEL_NAME"; 382 | case CLFFT_INVALID_PROGRAM_EXECUTABLE: 383 | return "CLFFT_INVALID_PROGRAM_EXECUTABLE"; 384 | case CLFFT_INVALID_PROGRAM: 385 | return "CLFFT_INVALID_PROGRAM"; 386 | case CLFFT_INVALID_BUILD_OPTIONS: 387 | return "CLFFT_INVALID_BUILD_OPTIONS"; 388 | case CLFFT_INVALID_BINARY: 389 | return "CLFFT_INVALID_BINARY"; 390 | case CLFFT_INVALID_SAMPLER: 391 | return "CLFFT_INVALID_SAMPLER"; 392 | case CLFFT_INVALID_IMAGE_SIZE: 393 | return "CLFFT_INVALID_IMAGE_SIZE"; 394 | case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR: 395 | return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 396 | case CLFFT_INVALID_MEM_OBJECT: 397 | return "CLFFT_INVALID_MEM_OBJECT"; 398 | case CLFFT_INVALID_HOST_PTR: 399 | return "CLFFT_INVALID_HOST_PTR"; 400 | case CLFFT_INVALID_COMMAND_QUEUE: 401 | return "CLFFT_INVALID_COMMAND_QUEUE"; 402 | case CLFFT_INVALID_QUEUE_PROPERTIES: 403 | return "CLFFT_INVALID_QUEUE_PROPERTIES"; 404 | case CLFFT_INVALID_CONTEXT: 405 | return "CLFFT_INVALID_CONTEXT"; 406 | case CLFFT_INVALID_DEVICE: 407 | return "CLFFT_INVALID_DEVICE"; 408 | case CLFFT_INVALID_PLATFORM: 409 | return "CLFFT_INVALID_PLATFORM"; 410 | case CLFFT_INVALID_DEVICE_TYPE: 411 | return "CLFFT_INVALID_DEVICE_TYPE"; 412 | case CLFFT_INVALID_VALUE: 413 | return "CLFFT_INVALID_VALUE"; 414 | case CLFFT_MAP_FAILURE: 415 | return "CLFFT_MAP_FAILURE"; 416 | case CLFFT_BUILD_PROGRAM_FAILURE: 417 | return "CLFFT_BUILD_PROGRAM_FAILURE"; 418 | case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED: 419 | return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED"; 420 | case CLFFT_IMAGE_FORMAT_MISMATCH: 421 | return "CLFFT_IMAGE_FORMAT_MISMATCH"; 422 | case CLFFT_MEM_COPY_OVERLAP: 423 | return "CLFFT_MEM_COPY_OVERLAP"; 424 | case CLFFT_PROFILING_INFO_NOT_AVAILABLE: 425 | return "CLFFT_PROFILING_INFO_NOT_AVAILABLE"; 426 | case CLFFT_OUT_OF_HOST_MEMORY: 427 | return "CLFFT_OUT_OF_HOST_MEMORY"; 428 | case CLFFT_OUT_OF_RESOURCES: 429 | return "CLFFT_OUT_OF_RESOURCES"; 430 | case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE: 431 | return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE"; 432 | case CLFFT_COMPILER_NOT_AVAILABLE: 433 | return "CLFFT_COMPILER_NOT_AVAILABLE"; 434 | case CLFFT_DEVICE_NOT_AVAILABLE: 435 | return "CLFFT_DEVICE_NOT_AVAILABLE"; 436 | case CLFFT_DEVICE_NOT_FOUND: 437 | return "CLFFT_DEVICE_NOT_FOUND"; 438 | case CLFFT_BUGCHECK: 439 | return "CLFFT_BUGCHECK"; 440 
| case CLFFT_NOTIMPLEMENTED:
441 |       return "CLFFT_NOTIMPLEMENTED";
442 |     case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
443 |       return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
444 |     case CLFFT_FILE_NOT_FOUND:
445 |       return "CLFFT_FILE_NOT_FOUND";
446 |     case CLFFT_FILE_CREATE_FAILURE:
447 |       return "CLFFT_FILE_CREATE_FAILURE";
448 |     case CLFFT_VERSION_MISMATCH:
449 |       return "CLFFT_VERSION_MISMATCH";
450 |     case CLFFT_DEVICE_NO_DOUBLE:
451 |       return "CLFFT_DEVICE_NO_DOUBLE";
452 |     case CLFFT_DEVICE_MISMATCH:
453 |       return "CLFFT_DEVICE_MISMATCH";
454 |     default:
455 |       return "CLFFT_UNKNOWN_ERROR";
456 |   }
457 | }
458 | #endif  // USE_FFT
459 |
460 | #endif  // USE_OPENCL
461 |
462 | }  // namespace greentea
463 |
-------------------------------------------------------------------------------- /src/libdnn_tuner.cpp: --------------------------------------------------------------------------------
1 | #include <algorithm>
2 | #include <iostream>
3 | #include <random>
4 | #include <string>
5 | #include <vector>
6 | #include "common.hpp"
7 | #include "device.hpp"
8 | #include "libdnn_tuner.hpp"
9 |
10 | namespace greentea {
11 |
12 | void LibDNNTuner::set_setup_routine(std::function<bool()> fun) {
13 |   this->setup_routine_ = fun;
14 | }
15 |
16 | void LibDNNTuner::set_benchmark_routine(std::function<double()> fun) {
17 |   this->benchmark_routine_ = fun;
18 | }
19 |
20 | void LibDNNTuner::Tune(libdnnTunerMethod_t method) {
21 |   bool setup_success = setup_routine_();
22 |   int_tp current_param = 0;
23 |   double baseline_score = 0;
24 |   double best_score = 0;
25 |   for (int i = 0; i < 5; ++i) {
26 |     baseline_score += benchmark_routine_();
27 |   }
28 |   baseline_score /= 5;
29 |   best_score = baseline_score;
30 |
31 |   if (method == LIBDNN_TUNER_METHOD_ALL) {
32 |     while (true) {
33 |       bool setup_success = setup_routine_();
34 |       if (setup_success) {
35 |         double score = benchmark_routine_();
36 |         if (score > best_score) {
37 |           best_score = score;
38 |         }
39 |         std::cout << "Score: "
40 |                   << (100.0 / baseline_score) * score << "% (best: "
41 |                   << (100.0 / baseline_score) * best_score << "%)" << std::endl;
42 |       }
43 |
44 |       bool overflow = false;
45 |       while (true) {
46 |         overflow = params_[current_param]->advance(1);
47 |         if (overflow) {
48 |           // Parameter is at its default value again;
49 |           // switch to the next parameter.
50 |           ++current_param;
51 |           if (current_param >= params_.size()) {
52 |             // Went through all parameters, stop.
53 |             break;
54 |           }
55 |         } else {
56 |           // The current parameter has changed to a new value, stop.
57 |           break;
58 |         }
59 |       }
60 |       if (current_param >= params_.size()) {
61 |         // Went through all parameters, stop.
62 |         break;
63 |       }
64 |       current_param = 0;
65 |     }
66 |   }
67 |   if (method == LIBDNN_TUNER_METHOD_ANNEALING) {
68 |     double temp = 1.0;
69 |     double temp_min = 0.01;
70 |     double alpha = 0.95;
71 |     double old_score = baseline_score;
72 |
73 |     std::random_device rd;
74 |     std::mt19937 rng(rd());
75 |     std::uniform_int_distribution<int_tp> uni(0, params_.size() - 1);
76 |     std::uniform_int_distribution<int_tp> adv(1, 3);
77 |     std::uniform_int_distribution<int_tp> dir(0, 1);
78 |     std::uniform_real_distribution<double> aprn(0.0, 1.0);
79 |
80 |     // Initial state snapshot
81 |     Snapshot(baseline_score);
82 |
83 |     while (temp > temp_min) {
84 |       for (int i = 0; i < 100; ++i) {
85 |         int next_param = uni(rng);
86 |         libdnnTunerParamStatus_t status;
87 |         while (true) {
88 |           status = params_[next_param]->advance((dir(rng) == 0 ? -1 : 1) * adv(rng));  // sign applies to the whole step
89 |           if (status != LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION) {
90 |             break;
91 |           }
92 |         }
93 |         std::cout << "Changing parameter: " << params_[next_param]->get_name()
94 |                   << ", new index: "
95 |                   << params_[next_param]->get_curr_idx()
134 |
135 | void LibDNNTuner::Snapshot(double score) {
136 | std::shared_ptr<LibDNNTunerSnapshot>
137 | snapshot(new LibDNNTunerSnapshot(score, &params_));
138 | snapshots_.push_back(snapshot);
139 | snapshot_queue_.push(snapshot);
140 | }
141 |
142 | void LibDNNTuner::RestoreSnapshot(
143 | std::shared_ptr<LibDNNTunerSnapshot> snapshot) {
144 | std::vector<std::shared_ptr<LibDNNTunerParam>>* params =
145 | snapshot->get_params();
146 | for (int i = 0; i < params_.size(); ++i) {
147 | params_[i]->update((*params)[i]);
148 | }
149 | }
150 |
151 | template<class T>
152 | void LibDNNTuner::add_range_param(std::string name,
153 | T def_value, T min, T max, T step) {
154 | std::vector<T> values;
155 |
156 | T value = static_cast<T>(def_value);
157 |
158 | T vmin = std::min(max, min);
159 | T vmax = std::max(max, min);
160 |
161 | values.push_back(value);
162 |
163 | while (value >= vmin) {
164 | value -= step;
165 | if (value <= vmax && value >= vmin) {
166 | values.insert(values.begin(), value);
167 | }
168 | }
169 |
170 | value = static_cast<T>(def_value);
171 |
172 | while (value <= vmax) {
173 | value += step;
174 | if (value >= vmin && value <= vmax) {
175 | values.push_back(value);
176 | }
177 | }
178 |
179 | add_set_param(name, def_value, values);
180 | }
181 | template void LibDNNTuner::add_range_param(std::string name, float def_value,
182 | float min, float max, float step);
183 | template void LibDNNTuner::add_range_param(std::string name, double def_value,
184 | double min, double max, double step);
185 | template void LibDNNTuner::add_range_param(std::string name, int32_t def_value,
186 | int32_t min, int32_t max, int32_t step);
187 | template void LibDNNTuner::add_range_param(std::string name, int64_t def_value,
188 | int64_t min, int64_t max, int64_t step);
189 |
190 | template<class T>
191 | void LibDNNTuner::add_range_param(const char* name,
192 | T def_value, T min, T max, T step) {
193 | std::string str(name);
194 | add_range_param(str, def_value, min, max, step);
195 | }
196 | template void LibDNNTuner::add_range_param(const char* name, float def_value,
197 | float min, float max, float step);
198 | template void LibDNNTuner::add_range_param(const char* name, double def_value,
199 | double min, double max, double step);
200 | template void LibDNNTuner::add_range_param(const char* name, int32_t def_value,
201 | int32_t min, int32_t max, int32_t step);
202 | template void LibDNNTuner::add_range_param(const char* name, int64_t def_value,
203 | int64_t min, int64_t max, int64_t step);
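// ---[ Editor's note: illustrative, not part of the original source.
// add_range_param() expands a [min, max] range with a fixed step into an
// ordered candidate set that always contains the default, then forwards it
// to add_set_param(). For example, the registration used by the pooling
// tuners in src/libdnn_pool.cpp,
//
//   tuner.add_range_param<int_tp>("LW0", 8, 4, 16, 4);
//
// produces the candidate set {4, 8, 12, 16} with the default value 8 at
// index 1 as the starting point for advance().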
204 |
205 |
206 | template<class T>
207 | void LibDNNTuner::add_set_param(std::string name,
208 | T def_value, std::vector<T> values) {
209 | if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
210 | std::vector<double> set_values;
211 | int_tp def_idx = -1;
212 | for (int_tp i = 0; i < values.size(); ++i) {
213 | set_values.push_back(values[i]);
214 | if (def_value == values[i]) {
215 | def_idx = i;
216 | }
217 | }
218 | if (def_idx == -1) {
219 | def_idx = set_values.size();
220 | set_values.push_back(def_value);
221 | }
222 | std::shared_ptr<LibDNNTunerParam> param(
223 | new LibDNNTunerParamReal(this, name, set_values, def_idx));
224 | params_.push_back(param);
225 | param_map_.insert(std::pair<std::string,
226 | std::shared_ptr<LibDNNTunerParam>>(name, param));
227 | }
228 |
229 | if (std::is_same<T, bool>::value) {
230 | std::vector<bool> set_values;
231 | int_tp def_idx = -1;
232 | for (int_tp i = 0; i < values.size(); ++i) {
233 | set_values.push_back(values[i]);
234 | if (def_value == values[i]) {
235 | def_idx = i;
236 | }
237 | }
238 | if (def_idx == -1) {
239 | def_idx = set_values.size();
240 | set_values.push_back(def_value);
241 | }
242 | std::shared_ptr<LibDNNTunerParam> param(
243 | new LibDNNTunerParamBool(this, name, set_values, def_idx));
244 | params_.push_back(param);
245 | param_map_.insert(std::pair<std::string,
246 | std::shared_ptr<LibDNNTunerParam>>(name, param));
247 | }
248 |
249 | if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value) {
250 | std::vector<int64_t> set_values;
251 | int_tp def_idx = -1;
252 | for (int_tp i = 0; i < values.size(); ++i) {
253 | set_values.push_back(values[i]);
254 | if (def_value == values[i]) {
255 | def_idx = i;
256 | }
257 | }
258 | if (def_idx == -1) {
259 | def_idx = set_values.size();
260 | set_values.push_back(def_value);
261 | }
262 | std::shared_ptr<LibDNNTunerParam>
263 | param(new LibDNNTunerParamInt(this, name, set_values, def_idx));
264 | params_.push_back(param);
265 | param_map_.insert(std::pair<std::string,
266 | std::shared_ptr<LibDNNTunerParam>>(name, param));
267 | }
268 | }
269 | template void LibDNNTuner::add_set_param(std::string name,
270 | float def_value, std::vector<float> values);
271 | template void LibDNNTuner::add_set_param(std::string name,
272 | double def_value, std::vector<double> values);
273 | template void LibDNNTuner::add_set_param(std::string name,
274 | int32_t def_value, std::vector<int32_t> values);
275 | template void LibDNNTuner::add_set_param(std::string name,
276 | int64_t def_value, std::vector<int64_t> values);
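// ---[ Editor's note: illustrative, not part of the original source.
// add_set_param() dispatches on T at compile time and stores the candidates
// in a LibDNNTunerParamReal, LibDNNTunerParamBool or LibDNNTunerParamInt;
// a default value missing from the set is appended, so every parameter has
// a valid default index. A hypothetical registration:
//
//   LibDNNTuner tuner;
//   tuner.add_set_param<int64_t>("VEC_WIDTH", 4, {1, 2, 4, 8});
//   // stored as a LibDNNTunerParamInt with def_idx == 2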
277 |
278 | template<>
279 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
280 | std::vector<std::string> con_adapt,
281 | std::function<bool(std::vector<bool>)> con_func) {
282 | std::shared_ptr<LibDNNTunerConstraint> constraint;
283 | constraint = std::shared_ptr<LibDNNTunerConstraint>(
284 | new LibDNNTunerConstraintBool(
285 | this, con_params, con_adapt, con_func));
286 | constraints_.push_back(constraint);
287 | for (int_tp i = 0; i < con_params.size(); ++i) {
288 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(con_params[i]);
289 | param->add_constraint(constraint);
290 | }
291 | }
292 | template<>
293 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
294 | std::vector<std::string> con_adapt,
295 | std::function<bool(std::vector<double>)> con_func) {
296 | std::shared_ptr<LibDNNTunerConstraint> constraint;
297 | constraint = std::shared_ptr<LibDNNTunerConstraint>(
298 | new LibDNNTunerConstraintReal(
299 | this, con_params, con_adapt, con_func));
300 | constraints_.push_back(constraint);
301 | for (int_tp i = 0; i < con_params.size(); ++i) {
302 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(con_params[i]);
303 | param->add_constraint(constraint);
304 | }
305 | }
306 | template<>
307 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
308 | std::vector<std::string> con_adapt,
309 | std::function<bool(std::vector<int64_t>)> con_func) {
310 | std::shared_ptr<LibDNNTunerConstraint> constraint;
311 | constraint = std::shared_ptr<LibDNNTunerConstraint>(
312 | new LibDNNTunerConstraintInt(
313 | this, con_params, con_adapt, con_func));
314 | constraints_.push_back(constraint);
315 | for (int_tp i = 0; i < con_params.size(); ++i) {
316 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(con_params[i]);
317 | param->add_constraint(constraint);
318 | }
319 | }
320 |
321 | template<class T>
322 | void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
323 | std::vector<const char*> con_adapt,
324 | std::function<bool(std::vector<T>)> con_func) {
325 | std::vector<std::string> con_params_str;
326 | std::vector<std::string> con_adapt_str;
327 |
328 | for (int_tp i = 0; i < con_params.size(); ++i) {
329 | std::string str(con_params[i]);
330 | con_params_str.push_back(str);
331 | }
332 |
333 | for (int_tp i = 0; i < con_adapt.size(); ++i) {
334 | std::string str(con_adapt[i]);
335 | con_adapt_str.push_back(str);
336 | }
337 |
338 | add_constraint(con_params_str, con_adapt_str, con_func);
339 | }
340 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
341 | std::vector<const char*> con_adapt,
342 | std::function<bool(std::vector<bool>)> con_func);
343 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
344 | std::vector<const char*> con_adapt,
345 | std::function<bool(std::vector<double>)> con_func);
346 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
347 | std::vector<const char*> con_adapt,
348 | std::function<bool(std::vector<int64_t>)> con_func);
349 |
350 | template<class T>
351 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
352 | std::vector<const char*> con_adapt,
353 | std::function<bool(std::vector<T>)> con_func) {
354 | std::vector<std::string> con_params_str;
355 | std::vector<std::string> con_adapt_str;
356 |
357 | for (int_tp i = 0; i < con_params.size(); ++i) {
358 | std::string str(con_params[i]);
359 | con_params_str.push_back(str);
360 | }
361 |
362 | for (int_tp i = 0; i < con_adapt.size(); ++i) {
363 | std::string str(con_adapt[i]);
364 | con_adapt_str.push_back(str);
365 | }
366 | add_constraint(con_params_str, con_adapt_str, con_func); }
367 | template void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
368 | std::vector<const char*> con_adapt,
369 | std::function<bool(std::vector<bool>)> con_func);
370 | template void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
371 | std::vector<const char*> con_adapt,
372 | std::function<bool(std::vector<double>)> con_func);
373 | template void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
374 | std::vector<const char*> con_adapt,
375 | std::function<bool(std::vector<int64_t>)> con_func);
376 |
377 | template<class T>
378 | void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
379 | std::vector<std::string> con_adapt,
380 | std::function<bool(std::vector<T>)> con_func) {
381 | std::vector<std::string> con_params_str;
382 | std::vector<std::string> con_adapt_str;
383 |
384 | for (int_tp i = 0; i < con_params.size(); ++i) {
385 | std::string str(con_params[i]);
386 | con_params_str.push_back(str);
387 | }
388 |
389 | for (int_tp i = 0; i < con_adapt.size(); ++i) {
390 | std::string str(con_adapt[i]);
391 | con_adapt_str.push_back(str);
392 | }
393 | add_constraint(con_params_str, con_adapt_str, con_func); }
394 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
395 | std::vector<std::string> con_adapt,
396 | std::function<bool(std::vector<bool>)> con_func);
397 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
398 | std::vector<std::string> con_adapt,
399 | std::function<bool(std::vector<double>)> con_func);
400 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
401 | std::vector<std::string> con_adapt,
402 | std::function<bool(std::vector<int64_t>)> con_func);
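// ---[ Editor's note: illustrative, not part of the original source.
// Constraints couple several parameters: advance() re-evaluates them after
// every index change and reports LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION when
// one fails, which makes the annealing loop in Tune() draw a new offset.
// A hypothetical constraint keeping the combined local work size of the
// two parameters registered by the pooling tuners within a device limit
// (256 is a placeholder value):
//
//   tuner.add_constraint<int64_t>(
//       std::vector<std::string>({"LW0", "LW1"}),
//       std::vector<std::string>({"LW0", "LW1"}),
//       [](std::vector<int64_t> v) {
//         return v[0] * v[1] <= 256;
//       });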
403 |
404 | template<class T>
405 | void LibDNNTuner::add_set_param(const char* name,
406 | T def_value, std::vector<T> values) {
407 | std::string str(name);
408 | add_set_param(str, def_value, values);
409 | }
410 | template void LibDNNTuner::add_set_param(const char* name,
411 | float def_value, std::vector<float> values);
412 | template void LibDNNTuner::add_set_param(const char* name,
413 | double def_value, std::vector<double> values);
414 | template void LibDNNTuner::add_set_param(const char* name,
415 | int32_t def_value, std::vector<int32_t> values);
416 | template void LibDNNTuner::add_set_param(const char* name,
417 | int64_t def_value, std::vector<int64_t> values);
418 |
419 | void LibDNNTuner::add_boolean_param(std::string name,
420 | bool def_value, bool inverse) {
421 | std::vector<bool> set_values;
422 | set_values.push_back(def_value);
423 | if (inverse) {
424 | set_values.push_back(!def_value);
425 | }
426 | std::shared_ptr<LibDNNTunerParam> param(
427 | new LibDNNTunerParamBool(this, name, set_values, 0));
428 | params_.push_back(param);
429 | param_map_.insert(std::pair<std::string,
430 | std::shared_ptr<LibDNNTunerParam>>(name, param));
431 | }
432 |
433 | void LibDNNTuner::add_boolean_param(const char* name,
434 | bool def_value, bool inverse) {
435 | std::string str(name);
436 | add_boolean_param(str, def_value, inverse);
437 | }
438 |
439 |
440 | template<class T>
441 | T LibDNNTuner::get_param(std::string name) {
442 | T value = 0;
443 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(name);
444 |
445 | std::shared_ptr<LibDNNTunerParamBool> param_bool =
446 | std::dynamic_pointer_cast<LibDNNTunerParamBool>(param);
447 | if (param_bool.get() != nullptr) {
448 | value = static_cast<T>(param_bool->get_value());
449 | return value;
450 | }
451 |
452 | std::shared_ptr<LibDNNTunerParamInt> param_int =
453 | std::dynamic_pointer_cast<LibDNNTunerParamInt>(param);
454 | if (param_int.get() != nullptr) {
455 | value = static_cast<T>(param_int->get_value());
456 | return value;
457 | }
458 |
459 | std::shared_ptr<LibDNNTunerParamReal> param_real =
460 | std::dynamic_pointer_cast<LibDNNTunerParamReal>(param);
461 | if (param_real.get() != nullptr) {
462 | value = static_cast<T>(param_real->get_value());
463 | return value;
464 | }
465 |
466 | return value;
467 | }
468 | template float LibDNNTuner::get_param<float>(std::string name);
469 | template double LibDNNTuner::get_param<double>(std::string name);
470 | template int32_t LibDNNTuner::get_param<int32_t>(std::string name);
471 | template int64_t LibDNNTuner::get_param<int64_t>(std::string name);
472 | template bool LibDNNTuner::get_param<bool>(std::string name);
473 |
474 | template<class T>
475 | T LibDNNTuner::get_param(const char* name) {
476 | std::string str(name);
477 | return get_param<T>(str);
478 | }
479 | template float LibDNNTuner::get_param<float>(const char* name);
480 | template double LibDNNTuner::get_param<double>(const char* name);
481 | template int32_t LibDNNTuner::get_param<int32_t>(const char* name);
482 | template int64_t LibDNNTuner::get_param<int64_t>(const char* name);
483 | template bool LibDNNTuner::get_param<bool>(const char* name);
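// ---[ Editor's note: illustrative, not part of the original source.
// T cannot be deduced from the arguments of get_param(), so callers name
// the type explicitly and the stored bool/int/real value is cast to it,
// as the pooling implementation does for its local work sizes:
//
//   int_tp lw0 = tuner.get_param<int_tp>("LW0");
//   bool flag = tuner.get_param<bool>("SOME_SWITCH");  // hypothetical name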
484 |
485 | std::string LibDNNTunerParam::get_name() {
486 | return name_;
487 | }
488 |
489 | libdnnTunerParamStatus_t LibDNNTunerParam::advance(int_tp offset) {
490 | for (int i = 0; i < std::abs(offset); ++i) {
491 | if (offset > 0) {
492 | ++curr_idx_;
493 | } else {
494 | --curr_idx_;
495 | }
496 | if (curr_idx_ >= count_values()) {
497 | curr_idx_ = 0;
498 | }
499 | if (curr_idx_ < 0) {
500 | curr_idx_ = count_values() - 1;
501 | }
502 | }
503 | if (curr_idx_ == def_idx_) {
504 | return LIBDNN_TUNER_PARAM_STAT_OVERFLOW;
505 | }
506 |
507 | bool constraints_ok = true;
508 | for (int i = 0; i < constraints_.size(); ++i) {
509 | constraints_ok &= constraints_[i]->evaluate();
510 | }
511 |
512 | if (constraints_ok) {
513 | return LIBDNN_TUNER_PARAM_STAT_OK;
514 | } else {
515 | return LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION;
516 | }
517 | }
518 |
519 | int_tp LibDNNTunerParam::get_curr_idx() {
520 | return curr_idx_;
521 | }
522 |
523 | int_tp LibDNNTunerParam::get_def_idx() {
524 | return def_idx_;
525 | }
526 |
527 | void LibDNNTunerParam::set_curr_idx(int_tp curr_idx) {
528 | curr_idx_ = curr_idx;
529 | }
530 |
531 | void LibDNNTunerParam::set_def_idx(int_tp def_idx) {
532 | def_idx_ = def_idx;
533 | }
534 |
535 | void LibDNNTunerParam::add_constraint(
536 | std::shared_ptr<LibDNNTunerConstraint> constraint) {
537 | constraints_.push_back(constraint);
538 | }
539 |
540 | double LibDNNTunerSnapshot::get_score() {
541 | return score_;
542 | }
543 |
544 | std::vector<std::shared_ptr<LibDNNTunerParam>>*
545 | LibDNNTunerSnapshot::get_params() {
546 | return &params_;
547 | }
548 |
549 |
550 | int_tp LibDNNTunerParamInt::count_values() {
551 | return values_.size();
552 | }
553 | int_tp LibDNNTunerParamReal::count_values() {
554 | return values_.size();
555 | }
556 | int_tp LibDNNTunerParamBool::count_values() {
557 | return values_.size();
558 | }
559 |
560 | int64_t LibDNNTunerParamInt::get_value() {
561 | // std::cout << name_ << ", value: " << values_[curr_idx_] << std::endl;
562 | return values_[curr_idx_];
563 | }
564 | double LibDNNTunerParamReal::get_value() {
565 | // std::cout << name_ << ", value: " << values_[curr_idx_] << std::endl;
566 | return values_[curr_idx_];
567 | }
568 | bool LibDNNTunerParamBool::get_value() {
569 | // std::cout << name_ << ", value: " << values_[curr_idx_] << std::endl;
570 | return values_[curr_idx_];
571 | }
572 |
573 | const std::vector<int64_t>& LibDNNTunerParamInt::get_values() {
574 | return values_;
575 | }
576 | const std::vector<double>& LibDNNTunerParamReal::get_values() {
577 | return values_;
578 | }
579 | const std::vector<bool>& LibDNNTunerParamBool::get_values() {
580 | return values_;
581 | }
582 |
583 |
584 | std::shared_ptr<LibDNNTunerParam> LibDNNTunerParamInt::clone() {
585 | return std::shared_ptr<LibDNNTunerParam>
586 | (new LibDNNTunerParamInt(*this));
587 | }
588 |
589 | std::shared_ptr<LibDNNTunerParam> LibDNNTunerParamReal::clone() {
590 | return std::shared_ptr<LibDNNTunerParam>
591 | (new LibDNNTunerParamReal(*this));
592 | }
593 |
594 | std::shared_ptr<LibDNNTunerParam> LibDNNTunerParamBool::clone() {
595 | return std::shared_ptr<LibDNNTunerParam>
596 | (new LibDNNTunerParamBool(*this));
597 | }
598 |
599 |
600 | void LibDNNTunerParam::update(std::shared_ptr<LibDNNTunerParam> other) {
601 | curr_idx_ = other->get_curr_idx();
602 | def_idx_ = other->get_def_idx();
603 | }
604 |
605 | bool LibDNNTunerConstraintBool::evaluate() {
606 | std::vector<bool> values;
607 |
608 | for (int_tp i = 0; i < con_params_.size(); ++i) {
609 | values.push_back(tuner_->get_param<bool>(con_params_[i]));
610 | }
611 |
612 | return func_(values);
613 | }
614 |
615 | bool LibDNNTunerConstraintInt::evaluate() {
616 | std::vector<int64_t> values;
617 |
618 | for (int_tp i = 0; i < con_params_.size(); ++i) {
619 | values.push_back(tuner_->get_param<int64_t>(con_params_[i]));
620 | }
621 |
622 | return func_(values);
623 | }
624 |
625 | bool LibDNNTunerConstraintReal::evaluate() {
626 | std::vector<double> values;
627 |
628 | for (int_tp i = 0; i < con_params_.size(); ++i) {
629 | values.push_back(tuner_->get_param<double>(con_params_[i]));
630 | }
631 |
632 | return func_(values);
633 | }
634 |
635 | }  // namespace greentea
636 |
--------------------------------------------------------------------------------
/src/libdnn_pool.cpp:
--------------------------------------------------------------------------------
1 | #include <functional>
2 | #include <numeric>
3 | #include <sstream>
4 | #include <type_traits>
5 |
6 | #include "common.hpp"
7 | #include "device.hpp"
8 | #include "libdnn.hpp"
9 | #include "benchmark.hpp"
10 |
11 | // #define LIBDNN_DEBUG 1
12 |
13 | namespace greentea {
14 |
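// ---[ Editor's note: illustrative sketch, not part of the original source.
// A LibDNNPoolConfig describes one pooling layer; the constructor below
// derives the per-axis shapes from it and registers the forward/backward
// tuner parameters. A hypothetical 2x2 max-pooling setup (dev must be a
// valid greentea device pointer; the vector-typed fields are assumed):
//
//   LibDNNPoolConfig config;
//   config.dev_ptr = dev;
//   config.in_shape = std::vector<int_tp>({1, 32, 64, 64});   // N, C, H, W
//   config.out_shape = std::vector<int_tp>({1, 32, 32, 32});
//   config.kernel = std::vector<int_tp>({2, 2});
//   config.pad = std::vector<int_tp>({0, 0});
//   config.stride = std::vector<int_tp>({2, 2});
//   config.dilation = std::vector<int_tp>({1, 1});
//   config.pool_method = LIBDNN_POOLING_METHOD_MAX;
//   config.bwalgo = LIBDNN_POOLING_BW_ALGO_DIRECT;
//   config.use_top_mask = false;
//   config.fast_unsafe_math = true;
//   LibDNNPool<float> pool(config);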
15 | template<typename Dtype>
16 | LibDNNPool<Dtype>::LibDNNPool(LibDNNPoolConfig config) {
17 | config_ = config;
18 | LibDNN<Dtype>::dev_ptr_ = config.dev_ptr;
19 | LibDNN<Dtype>::fast_unsafe_math_ = config.fast_unsafe_math;
20 | int_tp dims = config.in_shape.size();
21 | int_tp spatial_dims = config.kernel.size();
22 |
23 | num_axes_ = spatial_dims;
24 |
25 | pool_method_ = config.pool_method;
26 | bwalgo_ = config.bwalgo;
27 | use_top_mask_ = config.use_top_mask;
28 |
29 | skip_range_check_ = true;
30 |
31 | for (int_tp i = 0; i < spatial_dims; ++i) {
32 | kernel_shape_.push_back(config.kernel[i]);
33 | pad_.push_back(config.pad[i]);
34 | if (pad_[i] > 0) {
35 | skip_range_check_ = false;
36 | }
37 | stride_.push_back(config.stride[i]);
38 | dilation_.push_back(config.dilation[i]);
39 | im_in_shape_.push_back(config.in_shape[dims - spatial_dims + i]);
40 | im_out_shape_.push_back(config.out_shape[dims - spatial_dims + i]);
41 | }
42 |
43 | fw_tuner_ = std::shared_ptr<LibDNNTuner>(new LibDNNTuner());
44 | bw_tuner_ = std::shared_ptr<LibDNNTuner>(new LibDNNTuner());
45 |
46 | fw_tuner_->add_range_param<int_tp>("LW0", 8, 4, 16, 4);
47 | bw_tuner_->add_range_param<int_tp>("LW0", 8, 4, 16, 4);
48 | fw_tuner_->add_range_param<int_tp>("LW1", 8, 4, 16, 4);
49 | bw_tuner_->add_range_param<int_tp>("LW1", 8, 4, 16, 4);
50 |
51 |
52 | GenerateKernels();
53 | LibDNN<Dtype>::CompileKernels();
54 | }
55 |
56 | template<typename Dtype>
57 | const LibDNNPoolConfig LibDNNPool<Dtype>::get_config() {
58 | return config_;
59 | }
60 |
61 |
62 | template<typename Dtype>
63 | std::string LibDNNPool<Dtype>::string_identifier() {
64 | std::stringstream ss;
65 | ss << "POOL_";
66 | switch (pool_method_) {
67 | case LIBDNN_POOLING_METHOD_MAX:
68 | ss << "MAX_";
69 | break;
70 | case LIBDNN_POOLING_METHOD_AVE:
71 | ss << "AVE_";
72 | break;
73 | case LIBDNN_POOLING_METHOD_STO:
74 | ss << "STO_";
75 | break;
76 | }
77 | if (std::is_same<Dtype, double>::value) {
78 | ss << "double_";
79 | } else {
80 | ss << "float_";
81 | }
82 | // Device name
83 | ss << LibDNN<Dtype>::dev_ptr_->name();
84 | ss << "_";
85 | ss << num_axes_ << "D_";
86 | ss << "IN[";
87 | for (int_tp i = 0; i < im_in_shape_.size(); ++i) {
88 | ss << im_in_shape_[i];
89 | if (i < im_in_shape_.size() - 1) {
90 | ss << ",";
91 | }
92 | }
93 | ss << "]_OUT[";
94 | for (int_tp i = 0; i < im_out_shape_.size(); ++i) {
95 | ss << im_out_shape_[i];
96 | if (i < im_out_shape_.size() - 1) {
97 | ss << ",";
98 | }
99 | }
100 | ss << "]_K[";
101 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) {
102 | ss << kernel_shape_[i];
103 | if (i < kernel_shape_.size() - 1) {
104 | ss << ",";
105 | }
106 | }
107 | ss << "]_S[";
108 | for (int_tp i = 0; i < stride_.size(); ++i) {
109 | ss << stride_[i];
110 | if (i < stride_.size() - 1) {
111 | ss << ",";
112 | }
113 | }
114 | ss << "]_P[";
115 | for (int_tp i = 0; i < pad_.size(); ++i) {
116 | ss << pad_[i];
117 | if (i < pad_.size() - 1) {
118 | ss << ",";
119 | }
120 | }
121 | ss << "]_D[";
122 | for (int_tp i = 0; i < dilation_.size(); ++i) {
123 | ss << dilation_[i];
124 | if (i < dilation_.size() - 1) {
125 | ss << ",";
126 | }
127 | }
128 | ss << "]";
129 | return ss.str();
130 | }
131 |
132 | template<typename Dtype>
133 | std::string LibDNNPool<Dtype>::generate_fw_defs() {
134 | std::stringstream ss;
135 |
136 | // Number of spatial axes
137 | LibDNN<Dtype>::add_def(ss, "v_nax", num_axes_);
138 |
139 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) {
140 | LibDNN<Dtype>::add_def(ss, "v_k_" + std::to_string(i), kernel_shape_[i]);
141 | }
142 | for (int_tp i = 0; i < pad_.size(); ++i) {
143 | LibDNN<Dtype>::add_def(ss, "v_p_" + std::to_string(i), pad_[i]);
144 | }
145 | for (int_tp i = 0; i < stride_.size(); ++i) {
146 | LibDNN<Dtype>::add_def(ss, "v_s_" + std::to_string(i), stride_[i]);
147 | }
148 | for (int_tp i = 0; i < dilation_.size(); ++i) {
149 | LibDNN<Dtype>::add_def(ss, "v_d_" + std::to_string(i), dilation_[i]);
150 | }
151 |
152 | int_tp imsi = 1;
153 | int_tp imso = 1;
154 | for (int_tp i = 0; i <
im_in_shape_.size(); ++i) { 155 | LibDNN::add_def(ss, "v_imsi_" + std::to_string(i), im_in_shape_[i]); 156 | imsi *= im_in_shape_[i]; 157 | LibDNN::add_def(ss, "v_imso_" + std::to_string(i), im_out_shape_[i]); 158 | imso *= im_out_shape_[i]; 159 | } 160 | LibDNN::add_def(ss, "v_imsi", imsi); 161 | LibDNN::add_def(ss, "v_imso", imso); 162 | 163 | return ss.str(); 164 | } 165 | 166 | 167 | template 168 | std::string LibDNNPool::generate_bw_defs() { 169 | std::stringstream ss; 170 | 171 | // Number of spatial axes 172 | LibDNN::add_def(ss, "v_nax", num_axes_); 173 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) { 174 | LibDNN::add_def(ss, "v_k_" + std::to_string(i), kernel_shape_[i]); 175 | } 176 | for (int_tp i = 0; i < pad_.size(); ++i) { 177 | LibDNN::add_def(ss, "v_p_" + std::to_string(i), pad_[i]); 178 | } 179 | for (int_tp i = 0; i < stride_.size(); ++i) { 180 | LibDNN::add_def(ss, "v_s_" + std::to_string(i), stride_[i]); 181 | } 182 | for (int_tp i = 0; i < dilation_.size(); ++i) { 183 | LibDNN::add_def(ss, "v_d_" + std::to_string(i), dilation_[i]); 184 | } 185 | 186 | int_tp imsi = 1; 187 | int_tp imso = 1; 188 | for (int_tp i = 0; i < im_in_shape_.size(); ++i) { 189 | LibDNN::add_def(ss, "v_imsi_" + std::to_string(i), im_in_shape_[i]); 190 | imsi *= im_in_shape_[i]; 191 | LibDNN::add_def(ss, "v_imso_" + std::to_string(i), im_out_shape_[i]); 192 | imso *= im_out_shape_[i]; 193 | } 194 | LibDNN::add_def(ss, "v_imsi", imsi); 195 | LibDNN::add_def(ss, "v_imso", imso); 196 | 197 | return ss.str(); 198 | } 199 | 200 | template 201 | std::string LibDNNPool::generate_fw_kernels(std::string name, 202 | bool test_mode) { 203 | std::stringstream ss; 204 | 205 | ss << "__kernel void " + name + "("; 206 | ss << "__global const Dtype* __restrict bottom_data, "; 207 | ss << "__global Dtype* __restrict top_data, "; 208 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 209 | if (use_top_mask_) { 210 | ss << "__global Dtype* __restrict top_mask, "; 211 | } else { 212 | ss << "__global int_tp* __restrict mask, "; 213 | } 214 | } 215 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO && !test_mode) { 216 | ss << "__global Dtype* __restrict rand_idx, "; 217 | } 218 | ss << "int_tp channels, "; 219 | ss << "int_tp batch_size"; 220 | ss << ") {" << std::endl; 221 | 222 | ss << "int_tp out_idx = get_global_id(0);" << std::endl; 223 | ss << "if (get_global_id(1) >= channels * batch_size) {return;}" << std::endl; 224 | ss << "int_tp idx_0 = get_global_id(0);" << std::endl; 225 | for (int_tp i = num_axes_ - 1; i >= 1; --i) { 226 | ss << "int_tp idx_" << i << " = (idx_0 % v_imso_" << i << ");" << std::endl; 227 | ss << "idx_" << i << " = idx_" << i 228 | << " * v_s_" << i << " - v_p_" << i << ";" << std::endl; 229 | ss << "idx_0 /= v_imso_" << i << ";" << std::endl; 230 | } 231 | ss << "if (idx_0 >= v_imso_0) {return;}" << std::endl; 232 | ss << "idx_0 = idx_0 * v_s_0 - v_p_0;" << std::endl; 233 | ss << "int_tp in_idx = idx_0;" << std::endl; 234 | for (int_tp i = 1; i < num_axes_; ++i) { 235 | ss << "in_idx = in_idx * v_imsi_" << i 236 | << " + " << "idx_" << i << ";" << std::endl; 237 | } 238 | ss << "__global const Dtype* in_ptr = bottom_data + " 239 | << "get_global_id(1) * v_imsi + in_idx;" << std::endl; 240 | ss << "__global Dtype* out_ptr = top_data + " 241 | << "get_global_id(1) * v_imso;" << std::endl; 242 | 243 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 244 | if (use_top_mask_) { 245 | ss << "__global Dtype* mask_ptr = top_mask + get_global_id(1) * v_imso;" 246 | << std::endl; 247 | } else { 
248 | ss << "__global int_tp* mask_ptr = mask + get_global_id(1) * v_imso;" 249 | << std::endl; 250 | } 251 | ss << "Dtype val = -FLT_MAX;" << std::endl; 252 | ss << "int_tp maxidx = -1;" << std::endl; 253 | } 254 | 255 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 256 | ss << "Dtype val = 0;" << std::endl; 257 | } 258 | 259 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 260 | if (test_mode) { 261 | ss << "Dtype cumsum = FLT_MIN;" << std::endl; 262 | ss << "Dtype cumvalues = 0;" << std::endl; 263 | } else { 264 | ss << "__global Dtype* rand_ptr = rand_idx + get_global_id(1) * v_imso;" 265 | << std::endl; 266 | ss << "Dtype val = 0;" << std::endl; 267 | ss << "Dtype cumsum = 0;" << std::endl; 268 | ss << "int_tp stoidx = -1;" << std::endl; 269 | } 270 | } 271 | 272 | std::vector d_iter; 273 | int_tp curr_idx = 0; 274 | 275 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) { 276 | d_iter.push_back(0); 277 | } 278 | 279 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 280 | int_tp ave = std::accumulate(kernel_shape_.begin(), 281 | kernel_shape_.end(), 282 | 1, std::multiplies()); 283 | ss << "int_tp ave = " << ave << ";" << std::endl; 284 | } 285 | 286 | for (int_tp sto_idx = 0; 287 | sto_idx < ((pool_method_ == LIBDNN_POOLING_METHOD_STO && !test_mode) 288 | ? 2 : 1); ++sto_idx) { 289 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO && sto_idx == 1) { 290 | ss << "Dtype thres = rand_ptr[out_idx] * cumsum;" << std::endl; 291 | ss << "cumsum = 0;" << std::endl; 292 | } 293 | // Loop over the kernel 294 | bool incremented; 295 | do { 296 | int_tp kernel_offset = 0; 297 | int_tp size_prod = 1; 298 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 299 | kernel_offset += size_prod * d_iter[i] * dilation_[i]; 300 | size_prod *= im_in_shape_[i]; 301 | } 302 | 303 | bool max_guard = false; 304 | bool pad_guard = false; 305 | bool overspill_guard = false; 306 | for (int_tp i = 0; i < num_axes_; ++i) { 307 | if (d_iter[i] * dilation_[i] < pad_[i]) { 308 | pad_guard = true; 309 | } 310 | if (d_iter[i] * dilation_[i] >= 311 | ((kernel_shape_[i] - 1) * dilation_[i] + 1) - pad_[i] || 312 | (im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 313 | * dilation_[i] - pad_[i] >= im_in_shape_[i] ) { 314 | pad_guard = true; 315 | } 316 | if ((im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 317 | * dilation_[i] - pad_[i] >= im_in_shape_[i]) { 318 | overspill_guard = true; 319 | } 320 | } 321 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 322 | max_guard = true; 323 | } 324 | 325 | if (max_guard || pad_guard || overspill_guard) { 326 | ss << "if ("; 327 | } 328 | if (pad_guard || overspill_guard) { 329 | for (int_tp i = 0; i < num_axes_; ++i) { 330 | if (d_iter[i] * dilation_[i] < pad_[i]) { 331 | ss << "idx_" << i << " >= -" << (d_iter[i] * dilation_[i]) 332 | << " && "; 333 | } 334 | if ((d_iter[i] * dilation_[i] >= ((kernel_shape_[i] - 1) 335 | * dilation_[i] + 1) - pad_[i]) || 336 | ((im_out_shape_[i] - 1) * stride_[i] 337 | + d_iter[i] * dilation_[i] - pad_[i] 338 | >= im_in_shape_[i])) { 339 | ss << "idx_" << i << " < v_imsi_" << i << " - " 340 | << (d_iter[i] * dilation_[i]) << " && "; 341 | } 342 | } 343 | } 344 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 345 | if (max_guard || pad_guard || overspill_guard) { 346 | ss << "in_ptr[" << kernel_offset << "] > val) {" << std::endl; 347 | } 348 | ss << "maxidx = in_idx + " << kernel_offset << ";" << std::endl; 349 | ss << "val = in_ptr[" << kernel_offset << "];" << std::endl; 350 | if (max_guard || pad_guard || overspill_guard) { 351 | ss << "}" << 
std::endl; 352 | } 353 | } 354 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 355 | if (pad_guard || overspill_guard) { 356 | ss << "true) {" << std::endl; 357 | } 358 | ss << "val += in_ptr[" << kernel_offset << "];" << std::endl; 359 | if (pad_guard || overspill_guard) { 360 | ss << "}" << std::endl; 361 | } 362 | if (overspill_guard) { 363 | ss << "if ("; 364 | for (int_tp i = 0; i < num_axes_; ++i) { 365 | if ((im_out_shape_[i] - 1) * stride_[i] 366 | + d_iter[i] * dilation_[i] - pad_[i] 367 | >= im_in_shape_[i]) { 368 | ss << "idx_" << i << " + " << d_iter[i] * dilation_[i] 369 | << " >= v_imsi_" << i << " + " 370 | << pad_[i] << " || "; 371 | } 372 | } 373 | ss << "false) {--ave;}" << std::endl; 374 | } 375 | } 376 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 377 | if (pad_guard || overspill_guard) { 378 | ss << "true) {" << std::endl; 379 | } 380 | ss << "cumsum += in_ptr[" << kernel_offset << "];" << std::endl; 381 | if (test_mode) { 382 | ss << "cumvalues += in_ptr[" << kernel_offset << "]" 383 | << " * in_ptr[" << kernel_offset << "];" << std::endl; 384 | } else { 385 | if (sto_idx == 1) { 386 | // Second pass 387 | ss << "if (cumsum > thres) {" << std::endl; 388 | ss << "stoidx = in_idx + " << kernel_offset << ";" << std::endl; 389 | ss << "val = in_ptr[" << kernel_offset << "];" << std::endl; 390 | ss << "thres = FLT_MAX;" << std::endl; 391 | ss << "}" << std::endl; 392 | } 393 | } 394 | if (pad_guard || overspill_guard) { 395 | ss << "}" << std::endl; 396 | } 397 | } 398 | 399 | incremented = false; 400 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 401 | if (d_iter[i] >= kernel_shape_[i] - 1) { 402 | d_iter[i] = 0; 403 | } else { 404 | d_iter[i] += 1; 405 | incremented = true; 406 | break; 407 | } 408 | } 409 | } while (incremented); 410 | } 411 | 412 | // Write out the pooling result 413 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 414 | ss << "out_ptr[out_idx] = val / ((Dtype)ave);" << std::endl; 415 | } 416 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 417 | ss << "out_ptr[out_idx] = val;" << std::endl; 418 | ss << "mask_ptr[out_idx] = (Dtype)maxidx;" << std::endl; 419 | } 420 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 421 | if (test_mode) { 422 | ss << "out_ptr[out_idx] = cumvalues / cumsum;" << std::endl; 423 | } else { 424 | ss << "out_ptr[out_idx] = val;" << std::endl; 425 | ss << "rand_ptr[out_idx] = (Dtype)stoidx;" << std::endl; 426 | } 427 | } 428 | 429 | ss << "}" << std::endl; // Kernel 430 | return ss.str(); 431 | } 432 | 433 | template 434 | std::string LibDNNPool::generate_fwtr_kernels(std::string name) { 435 | std::stringstream ss; 436 | ss << generate_fw_kernels(name, false); 437 | return ss.str(); 438 | } 439 | 440 | template 441 | std::string LibDNNPool::generate_fwte_kernels(std::string name) { 442 | std::stringstream ss; 443 | ss << generate_fw_kernels(name, true); 444 | return ss.str(); 445 | } 446 | 447 | 448 | 449 | template 450 | std::string LibDNNPool::generate_bw_kernels(std::string name) { 451 | std::stringstream ss; 452 | 453 | ss << "__kernel void " + name + "("; 454 | ss << "__global const Dtype* __restrict top_diff, "; 455 | ss << "__global Dtype* __restrict bottom_diff, "; 456 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 457 | if (use_top_mask_) { 458 | ss << "__global const Dtype* __restrict top_mask, "; 459 | } else { 460 | ss << "__global const int_tp* __restrict mask, "; 461 | } 462 | } 463 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 464 | ss << "__global const Dtype* __restrict rand_idx, "; 465 | 
} 466 | ss << "int_tp channels, "; 467 | ss << "int_tp batch_size"; 468 | ss << ") {" << std::endl; 469 | if (bwalgo_ == LIBDNN_POOLING_BW_ALGO_ATOMIC) { 470 | // Atomic kernel 471 | ss << "int_tp in_idx = get_global_id(0);" << std::endl; 472 | ss << "if (get_global_id(1) >= channels * batch_size) {return;}" 473 | << std::endl; 474 | ss << "int_tp idx_0 = get_global_id(0);" << std::endl; 475 | for (int_tp i = num_axes_ - 1; i >= 1; --i) { 476 | ss << "int_tp idx_" << i << " = (idx_0 % v_imso_" << i << ");" 477 | << std::endl; 478 | ss << "idx_" << i << " = idx_" << i << " * v_s_" 479 | << i << " - v_p_" << i << ";" << std::endl; 480 | ss << "idx_0 /= v_imso_" << i << ";" << std::endl; 481 | } 482 | ss << "if (idx_0 >= v_imso_0) {return;}" << std::endl; 483 | 484 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 485 | ss << "idx_0 = idx_0 * v_s_0 - v_p_0;" << std::endl; 486 | ss << "int_tp out_idx = idx_0;" << std::endl; 487 | for (int_tp i = 1; i < num_axes_; ++i) { 488 | ss << "out_idx = out_idx * v_imsi_" << i 489 | << " + " << "idx_" << i << ";" << std::endl; 490 | } 491 | ss << "__global Dtype* out_ptr = bottom_diff " 492 | << "+ get_global_id(1) * v_imsi + out_idx;" << std::endl; 493 | } else { 494 | ss << "__global Dtype* out_ptr = bottom_diff " 495 | << "+ get_global_id(1) * v_imsi;" << std::endl; 496 | } 497 | ss << "__global const Dtype* in_ptr = top_diff " 498 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 499 | 500 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 501 | if (use_top_mask_) { 502 | ss << "__global const Dtype* mask_ptr = top_mask " 503 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 504 | } else { 505 | ss << "__global const int_tp* mask_ptr = mask " 506 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 507 | } 508 | } 509 | 510 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 511 | ss << "__global const Dtype* rand_ptr = rand_idx " 512 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 513 | } 514 | 515 | std::vector d_iter; 516 | 517 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) { 518 | d_iter.push_back(0); 519 | } 520 | 521 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 522 | int_tp ave = std::accumulate(kernel_shape_.begin(), 523 | kernel_shape_.end(), 524 | 1, std::multiplies()); 525 | ss << "int_tp ave = " << ave << ";" << std::endl; 526 | ss << "Dtype val = in_ptr[0];" << std::endl; 527 | } 528 | 529 | for (int_tp ave_idx = 0; 530 | ave_idx < ((pool_method_ == LIBDNN_POOLING_METHOD_AVE) 531 | ? 
2 : 0); ++ave_idx) { 532 | if (ave_idx == 1) { 533 | ss << "val /= ((Dtype)ave);" << std::endl; 534 | } 535 | // Loop over the kernel 536 | bool incremented; 537 | do { 538 | int_tp kernel_offset = 0; 539 | int_tp size_prod = 1; 540 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 541 | kernel_offset += size_prod * d_iter[i] * dilation_[i]; 542 | size_prod *= im_in_shape_[i]; 543 | } 544 | 545 | bool pad_guard = false; 546 | bool overspill_guard = false; 547 | for (int_tp i = 0; i < num_axes_; ++i) { 548 | if (d_iter[i] * dilation_[i] < pad_[i]) { 549 | pad_guard = true; 550 | } 551 | if (d_iter[i] * dilation_[i] >= 552 | ((kernel_shape_[i] - 1) * dilation_[i] + 1) - pad_[i] || 553 | (im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 554 | * dilation_[i] - pad_[i] >= im_in_shape_[i] ) { 555 | pad_guard = true; 556 | } 557 | if ((im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 558 | * dilation_[i] - pad_[i] >= im_in_shape_[i]) { 559 | overspill_guard = true; 560 | } 561 | } 562 | 563 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 564 | ss << "if ("; 565 | } 566 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 567 | for (int_tp i = 0; i < num_axes_; ++i) { 568 | if (d_iter[i] * dilation_[i] < pad_[i]) { 569 | ss << "idx_" << i << " >= -" << (d_iter[i] * dilation_[i]) 570 | << " && "; 571 | } 572 | if ((d_iter[i] * dilation_[i] >= ((kernel_shape_[i] - 1) 573 | * dilation_[i] + 1) - pad_[i]) || 574 | ((im_out_shape_[i] - 1) * stride_[i] 575 | + d_iter[i] * dilation_[i] - pad_[i] 576 | >= im_in_shape_[i])) { 577 | ss << "idx_" << i << " < v_imsi_" << i << " - " 578 | << (d_iter[i] * dilation_[i]) << " && "; 579 | } 580 | } 581 | } 582 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 583 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 584 | ss << "true) {" << std::endl; 585 | } 586 | if (ave_idx == 1) { 587 | ss << "atomicAdd((&out_ptr[" << kernel_offset << "]), val);" 588 | << std::endl; 589 | } 590 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 591 | ss << "}" << std::endl; 592 | } 593 | if (overspill_guard && ave_idx == 0) { 594 | ss << "if ("; 595 | for (int_tp i = 0; i < num_axes_; ++i) { 596 | if ((im_out_shape_[i] - 1) * stride_[i] 597 | + d_iter[i] * dilation_[i] - pad_[i] 598 | >= im_in_shape_[i]) { 599 | ss << "idx_" << i << " + " << d_iter[i] * dilation_[i] 600 | << " >= v_imsi_" << i << " + " 601 | << pad_[i] << " || "; 602 | } 603 | } 604 | ss << "false) {--ave;}" << std::endl; 605 | } 606 | } 607 | 608 | incremented = false; 609 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 610 | if (d_iter[i] >= kernel_shape_[i] - 1) { 611 | d_iter[i] = 0; 612 | } else { 613 | d_iter[i] += 1; 614 | incremented = true; 615 | break; 616 | } 617 | } 618 | } while (incremented); 619 | } 620 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 621 | ss << "if (mask_ptr[0] >= 0 && mask_ptr[0] < v_imsi) {" << std::endl; 622 | ss << "atomicAdd(&out_ptr[(int_tp)(mask_ptr[0])], " 623 | << "in_ptr[0]);" << std::endl; 624 | ss << "}" << std::endl; 625 | } 626 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 627 | ss << "if (mask_ptr[0] >= 0 && mask_ptr[0] < v_imsi) {" << std::endl; 628 | ss << "atomicAdd(&out_ptr[(int_tp)(rand_ptr[0])], " 629 | << "in_ptr[0]);" << std::endl; 630 | ss << "}" << std::endl; 631 | } 632 | 633 | } else { 634 | // Direct, deterministic kernel 635 | ss << "int_tp d_start[" << num_axes_ << "];" << std::endl; 636 | ss << "int_tp d_end[" << num_axes_ << "];" << std::endl; 637 | ss << "int_tp d_iter[" << num_axes_ << "];" << std::endl; 638 | 639 | 
ss << "int_tp out_idx = get_global_id(0);" << std::endl; 640 | ss << "int_tp idx_0 = get_global_id(0);" << std::endl; 641 | ss << "if (get_global_id(1) >= channels * batch_size) {return;}" 642 | << std::endl; 643 | 644 | for (int_tp i = num_axes_ - 1; i >= 1; --i) { 645 | ss << "int_tp idx_" << i << " = (idx_0 % v_imsi_" << i << ");" 646 | << std::endl; 647 | ss << "idx_0 /= v_imsi_" << i << ";" << std::endl; 648 | } 649 | ss << "if (idx_0 >= v_imsi_0) {return;}" << std::endl; 650 | 651 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 652 | ss << "__global Dtype* out_ptr = bottom_diff " 653 | << "+ get_global_id(1) * v_imsi + out_idx;" << std::endl; 654 | } else { 655 | ss << "__global Dtype* out_ptr = bottom_diff " 656 | << "+ get_global_id(1) * v_imsi + out_idx;" << std::endl; 657 | } 658 | ss << "__global const Dtype* in_ptr = top_diff " 659 | << "+ get_global_id(1) * v_imso;" << std::endl; 660 | 661 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 662 | if (use_top_mask_) { 663 | ss << "__global const Dtype* mask_ptr = top_mask " 664 | << "+ get_global_id(1) * v_imso;" << std::endl; 665 | } else { 666 | ss << "__global const int_tp* mask_ptr = mask " 667 | << "+ get_global_id(1) * v_imso;" << std::endl; 668 | } 669 | } 670 | 671 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 672 | ss << "__global const Dtype* rand_ptr = rand_idx " 673 | << "+ get_global_id(1) * v_imso;" << std::endl; 674 | } 675 | 676 | for (int_tp i = 0; i < num_axes_; ++i) { 677 | ss << "d_start[" << i << "] = (idx_" << i << " + v_p_" << i << " < " 678 | << "((v_k_" << i << " - 1) * v_d_" << i << " + 1)) ? 0 : (idx_" << i 679 | << " + v_p_" << i 680 | << " - ((v_k_" << i << " - 1) * v_d_" << i << " + 1))" 681 | << " / v_s_" << i << " + 1;" << std::endl; 682 | ss << "d_end[" << i << "] = min(v_imso_" << i << " - 1, " 683 | << "(idx_" << i << " + v_p_" << i << ")" 684 | << " / v_s_" << i << ");" << std::endl; 685 | ss << "d_iter[" << i << "] = d_start[" << i << "];" << std::endl; 686 | ss << "if (d_start[" << i << "] > d_end[" << i << "]) {" << std::endl; 687 | ss << "out_ptr[0] = 0;" << std::endl; 688 | ss << "return;" << std::endl; 689 | ss << "}" << std::endl; 690 | } 691 | 692 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 693 | ss << "int_tp av_start[" << num_axes_ << "];" << std::endl; 694 | ss << "int_tp av_end[" << num_axes_ << "];" << std::endl; 695 | } 696 | // ss << "printf(\"%f\\n\", (float)ave);" << std::endl; 697 | ss << "Dtype gradient = 0.0;" << std::endl; 698 | ss << "bool incremented;" << std::endl; 699 | ss << "do {" << std::endl; 700 | ss << "int_tp offset = 0;" << std::endl; 701 | for (int_tp i = 0; i < num_axes_; ++i) { 702 | ss << "offset += d_iter[" << i << "];" << std::endl; 703 | if (i < num_axes_ - 1) { 704 | ss << "offset *= v_imso_" << (i + 1) << ";" << std::endl; 705 | } 706 | } 707 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 708 | ss << "int_tp ave = 1;" << std::endl; 709 | for (int_tp i = 0; i < num_axes_; ++i) { 710 | ss << "av_start[" << i << "] = d_iter[" << i << "] * v_s_" << i 711 | << " - v_p_" << i << ";" << std::endl; 712 | ss << "av_end[" << i << "] = min(av_start[" << i << "] + ((v_k_" 713 | << i << " - 1) * v_d_" 714 | << i << " + 1), v_imsi_" << i << " + v_p_" << i << ");" 715 | << std::endl; 716 | ss << "ave *= ((av_end[" << i << "] - av_start[" << i << "] - 1) / v_d_" 717 | << i << " + 1);" 718 | << std::endl; 719 | } 720 | } 721 | // Dilation filters 722 | bool has_dilation = false; 723 | for (int_tp i = 0; i < num_axes_; ++i) { 724 | if (dilation_[i] > 
1) { 725 | has_dilation = true; 726 | } 727 | } 728 | if (has_dilation && 729 | (pool_method_ == LIBDNN_POOLING_METHOD_AVE || 730 | pool_method_ == LIBDNN_POOLING_METHOD_STO)) { 731 | ss << "if ("; 732 | for (int i = 0; i < num_axes_; ++i) { 733 | ss << "idx_" << i << " >= av_start[" << i << "] && "; 734 | ss << "idx_" << i << " < av_end[" << i << "] && "; 735 | ss << "(idx_" << i <<" - av_start[" << i << "]) % v_d_" << i << " == 0" 736 | << " && "; 737 | } 738 | ss << "true) {" << std::endl; 739 | } 740 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 741 | ss << "if ((int_tp)mask_ptr[offset] == out_idx) {" << std::endl; 742 | } else if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 743 | ss << "if ((int_tp)rand_ptr[offset] == out_idx) {" << std::endl; 744 | } else { 745 | ss << "{" << std::endl; 746 | } 747 | ss << "gradient += in_ptr[offset]"; 748 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 749 | ss << " / (Dtype)ave;" << std::endl; 750 | } else { 751 | ss << ";" << std::endl; 752 | } 753 | ss << "}" << std::endl; 754 | if (has_dilation && 755 | (pool_method_ == LIBDNN_POOLING_METHOD_AVE || 756 | pool_method_ == LIBDNN_POOLING_METHOD_STO)) { 757 | ss << "}" << std::endl; 758 | } 759 | // Increment 760 | ss << "incremented = false;" << std::endl; 761 | ss << "for (int_tp i = v_nax - 1; i >= 0; --i) {" << std::endl; 762 | ss << "if (d_iter[i] >= d_end[i]) {" << std::endl; 763 | ss << "d_iter[i] = d_start[i];" << std::endl; 764 | ss << "} else {" << std::endl; 765 | ss << "++d_iter[i];" << std::endl; 766 | ss << "incremented = true;" << std::endl; 767 | ss << "break;" << std::endl; 768 | ss << "}}} while (incremented);" << std::endl; 769 | 770 | ss << "out_ptr[0] = gradient;" << std::endl; 771 | } // Deterministic kernel 772 | ss << "}" << std::endl; // Kernel 773 | 774 | return ss.str(); 775 | } 776 | 777 | template 778 | void LibDNNPool::GenerateKernels() { 779 | std::stringstream ss; 780 | 781 | ss << LibDNN::generate_header(); 782 | ss << generate_fw_defs(); 783 | ss << generate_fwtr_kernels("pool_forward_train"); 784 | ss << generate_fwte_kernels("pool_forward_test"); 785 | ss << generate_bw_defs(); 786 | ss << generate_bw_kernels("pool_backward"); 787 | 788 | // Write complete kernel string 789 | LibDNN::kernel_ = ss.str(); 790 | } 791 | 792 | template 793 | void LibDNNPool::Forward(const Dtype* bottom_data, 794 | Dtype* top_data, 795 | int_tp channels, 796 | int_tp batch_size, 797 | bool test_mode, 798 | int_tp* mask, 799 | Dtype* top_mask, 800 | Dtype* rand_idx) { 801 | int_tp imsi = std::accumulate(im_in_shape_.begin(), im_in_shape_.end(), 802 | 1, std::multiplies()); 803 | int_tp imso = std::accumulate(im_out_shape_.begin(), im_out_shape_.end(), 804 | 1, std::multiplies()); 805 | 806 | int_tp lw0 = fw_tuner_->get_param("LW0"); 807 | int_tp lw1 = fw_tuner_->get_param("LW1"); 808 | 809 | #ifdef USE_OPENCL 810 | if (LibDNN::dev_ptr_->backend() == BACKEND_OpenCL) { 811 | viennacl::ocl::kernel &kernel = 812 | LibDNN::ocl_program_.get_kernel( 813 | test_mode ? 
"pool_forward_test" : "pool_forward_train"); 814 | viennacl::ocl::context &ctx = 815 | viennacl::ocl::get_context(LibDNN::dev_ptr_->id()); 816 | 817 | kernel.local_work_size(0, lw0); 818 | kernel.local_work_size(1, lw1); 819 | kernel.local_work_size(2, 1); 820 | 821 | kernel.global_work_size(0, ((imso - 1) / lw0 + 1) * lw0); 822 | kernel.global_work_size(1, ((channels * batch_size - 1) / lw1 + 1) * lw1); 823 | kernel.global_work_size(2, 1); 824 | 825 | switch (pool_method_) { 826 | case LIBDNN_POOLING_METHOD_MAX: 827 | if (use_top_mask_) { 828 | viennacl::ocl::enqueue( 829 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 830 | WrapHandle((cl_mem) top_data, &ctx), 831 | WrapHandle((cl_mem) top_mask, &ctx), 832 | channels, 833 | batch_size), 834 | ctx.get_queue()); 835 | } else { 836 | viennacl::ocl::enqueue( 837 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 838 | WrapHandle((cl_mem) top_data, &ctx), 839 | WrapHandle((cl_mem) mask, &ctx), 840 | channels, 841 | batch_size), 842 | ctx.get_queue()); 843 | } 844 | break; 845 | case LIBDNN_POOLING_METHOD_AVE: 846 | viennacl::ocl::enqueue( 847 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 848 | WrapHandle((cl_mem) top_data, &ctx), 849 | channels, 850 | batch_size), 851 | ctx.get_queue()); 852 | break; 853 | case LIBDNN_POOLING_METHOD_STO: 854 | viennacl::ocl::enqueue( 855 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 856 | WrapHandle((cl_mem) top_data, &ctx), 857 | WrapHandle((cl_mem) rand_idx, &ctx), 858 | channels, 859 | batch_size), 860 | ctx.get_queue()); 861 | break; 862 | } 863 | } 864 | #endif // USE_OPENCL 865 | 866 | #ifdef USE_CUDA 867 | if (LibDNN::dev_ptr_->backend() == BACKEND_CUDA) { 868 | CUfunction kernel; 869 | cuModuleGetFunction(&kernel, LibDNN::cuda_module_, 870 | test_mode ? "pool_forward_test" : "pool_forward_train"); 871 | 872 | switch (pool_method_) { 873 | case LIBDNN_POOLING_METHOD_MAX: { 874 | if (use_top_mask_) { 875 | void *args[] = { &bottom_data, &top_data, &top_mask, 876 | &channels, &batch_size }; 877 | cuLaunchKernel(kernel, 878 | (imso - 1) / lw0 + 1, // Grid X 879 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 880 | 1, // Grid Z 881 | lw0, lw1, 1, // Local 882 | 0, NULL, args, 0); // Arguments 883 | } else { 884 | void *args[] = { &bottom_data, &top_data, &mask, 885 | &channels, &batch_size }; 886 | cuLaunchKernel(kernel, 887 | (imso - 1) / lw0 + 1, // Grid X 888 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 889 | 1, // Grid Z 890 | lw0, lw1, 1, // Local 891 | 0, NULL, args, 0); // Arguments 892 | } 893 | break; 894 | } 895 | case LIBDNN_POOLING_METHOD_AVE: { 896 | void *args[] = { &bottom_data, &top_data, 897 | &channels, &batch_size }; 898 | cuLaunchKernel(kernel, 899 | (imso - 1) / lw0 + 1, // Grid X 900 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 901 | 1, // Grid Z 902 | lw0, lw1, 1, // Local 903 | 0, NULL, args, 0); // Arguments 904 | break; 905 | } 906 | case LIBDNN_POOLING_METHOD_STO: { 907 | void *args[] = { &bottom_data, &top_data, &rand_idx, 908 | &channels, &batch_size }; 909 | cuLaunchKernel(kernel, 910 | (imso - 1) / lw0 + 1, // Grid X 911 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 912 | 1, // Grid Z 913 | lw0, lw1, 1, // Local 914 | 0, NULL, args, 0); // Arguments 915 | break; 916 | } 917 | } 918 | cuCtxSynchronize(); 919 | } 920 | #endif // USE_CUDA 921 | } 922 | 923 | 924 | template 925 | void LibDNNPool::Backward(const Dtype* top_diff, 926 | Dtype* bottom_diff, 927 | int_tp channels, 928 | int_tp batch_size, 929 | const int_tp* mask, 930 | const Dtype* top_mask, 
931 | const Dtype* rand_idx) { 932 | int_tp ims = batch_size * channels; 933 | for (int_tp i = 0; i < im_in_shape_.size(); ++i) { 934 | ims *= im_in_shape_[i]; 935 | } 936 | LibDNN::SetMemory(bottom_diff, ims, 0, (Dtype) 0); 937 | 938 | int_tp imsi = std::accumulate(im_in_shape_.begin(), im_in_shape_.end(), 939 | 1, std::multiplies()); 940 | int_tp imso = std::accumulate(im_out_shape_.begin(), im_out_shape_.end(), 941 | 1, std::multiplies()); 942 | 943 | int_tp imsw = 0; 944 | if (bwalgo_ == LIBDNN_POOLING_BW_ALGO_DIRECT) { 945 | // Direct kernel iterates over input size 946 | imsw = imsi; 947 | } else { 948 | // Atomic kernel iterates over output size 949 | imsw = imso; 950 | } 951 | 952 | int_tp lw0 = bw_tuner_->get_param("LW0"); 953 | int_tp lw1 = bw_tuner_->get_param("LW1"); 954 | 955 | #ifdef USE_OPENCL 956 | if (LibDNN::dev_ptr_->backend() == BACKEND_OpenCL) { 957 | viennacl::ocl::kernel &kernel = 958 | LibDNN::ocl_program_.get_kernel("pool_backward"); 959 | viennacl::ocl::context &ctx = 960 | viennacl::ocl::get_context(LibDNN::dev_ptr_->id()); 961 | 962 | kernel.local_work_size(0, lw0); 963 | kernel.local_work_size(1, lw1); 964 | kernel.local_work_size(2, 1); 965 | 966 | kernel.global_work_size(0, ((imsw - 1) / lw0 + 1) * lw0); 967 | kernel.global_work_size(1, ((channels * batch_size - 1) / lw1 + 1) * lw1); 968 | kernel.global_work_size(2, 1); 969 | 970 | switch (pool_method_) { 971 | case LIBDNN_POOLING_METHOD_MAX: 972 | if (use_top_mask_) { 973 | viennacl::ocl::enqueue( 974 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 975 | WrapHandle((cl_mem) bottom_diff, &ctx), 976 | WrapHandle((cl_mem) top_mask, &ctx), 977 | channels, 978 | batch_size), 979 | ctx.get_queue()); 980 | } else { 981 | viennacl::ocl::enqueue( 982 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 983 | WrapHandle((cl_mem) bottom_diff, &ctx), 984 | WrapHandle((cl_mem) mask, &ctx), 985 | channels, 986 | batch_size), 987 | ctx.get_queue()); 988 | } 989 | break; 990 | case LIBDNN_POOLING_METHOD_AVE: 991 | viennacl::ocl::enqueue( 992 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 993 | WrapHandle((cl_mem) bottom_diff, &ctx), 994 | channels, 995 | batch_size), 996 | ctx.get_queue()); 997 | break; 998 | case LIBDNN_POOLING_METHOD_STO: 999 | viennacl::ocl::enqueue( 1000 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 1001 | WrapHandle((cl_mem) bottom_diff, &ctx), 1002 | WrapHandle((cl_mem) rand_idx, &ctx), 1003 | channels, 1004 | batch_size), 1005 | ctx.get_queue()); 1006 | break; 1007 | } 1008 | } 1009 | #endif // USE_OPENCL 1010 | 1011 | #ifdef USE_CUDA 1012 | if (LibDNN::dev_ptr_->backend() == BACKEND_CUDA) { 1013 | CUfunction kernel; 1014 | cuModuleGetFunction(&kernel, LibDNN::cuda_module_, "pool_backward"); 1015 | 1016 | switch (pool_method_) { 1017 | case LIBDNN_POOLING_METHOD_MAX: { 1018 | if (use_top_mask_) { 1019 | void *args[] = { &top_diff, &bottom_diff, &top_mask, 1020 | &channels, &batch_size }; 1021 | cuLaunchKernel(kernel, 1022 | (imsw - 1) / lw0 + 1, // Grid X 1023 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 1024 | 1, // Grid Z 1025 | lw0, lw1, 1, // Local 1026 | 0, NULL, args, 0); // Arguments 1027 | } else { 1028 | void *args[] = { &top_diff, &bottom_diff, &mask, 1029 | &channels, &batch_size }; 1030 | cuLaunchKernel(kernel, 1031 | (imsw - 1) / lw0 + 1, // Grid X 1032 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 1033 | 1, // Grid Z 1034 | lw0, lw1, 1, // Local 1035 | 0, NULL, args, 0); // Arguments 1036 | } 1037 | break; 1038 | } 1039 | case LIBDNN_POOLING_METHOD_AVE: { 1040 | void *args[] = { 
&top_diff, &bottom_diff,
1041 | &channels, &batch_size };
1042 | cuLaunchKernel(kernel,
1043 | (imsw - 1) / lw0 + 1, // Grid X
1044 | (channels * batch_size - 1) / lw1 + 1, // Grid Y
1045 | 1, // Grid Z
1046 | lw0, lw1, 1, // Local
1047 | 0, NULL, args, 0); // Arguments
1048 | break;
1049 | }
1050 | case LIBDNN_POOLING_METHOD_STO: {
1051 | void *args[] = { &top_diff, &bottom_diff, &rand_idx,
1052 | &channels, &batch_size };
1053 | cuLaunchKernel(kernel,
1054 | (imsw - 1) / lw0 + 1, // Grid X
1055 | (channels * batch_size - 1) / lw1 + 1, // Grid Y
1056 | 1, // Grid Z
1057 | lw0, lw1, 1, // Local
1058 | 0, NULL, args, 0); // Arguments
1059 | break;
1060 | }
1061 | }
1062 | cuCtxSynchronize();
1063 | }
1064 | #endif // USE_CUDA
1065 | }
1066 |
1067 | template class LibDNNPool<float>;
1068 | template class LibDNNPool<double>;
1069 |
1070 | }  // namespace greentea
1071 |
--------------------------------------------------------------------------------
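Editor's appendix (illustrative, not part of the repository): a forward/backward
call sequence matching the Forward() and Backward() signatures above. Buffer
names and sizes are placeholders, and the pointers must reference device memory
sized to the shapes of the config sketched earlier.

  // Hypothetical buffers for the 2x2 max-pooling config (N=1, C=32).
  LibDNNPool<float> pool(config);
  pool.Forward(bottom_data, top_data,
               /*channels=*/32, /*batch_size=*/1,
               /*test_mode=*/false,
               mask, /*top_mask=*/nullptr, /*rand_idx=*/nullptr);
  // ...fill top_diff with the gradient w.r.t. top_data...
  pool.Backward(top_diff, bottom_diff,
                /*channels=*/32, /*batch_size=*/1,
                mask, /*top_mask=*/nullptr, /*rand_idx=*/nullptr);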