├── .gitignore
├── README.md
├── src
│   ├── common.cpp
│   ├── CMakeLists.txt
│   ├── benchmark.cpp
│   ├── libdnn.cpp
│   ├── device.cpp
│   ├── libdnn_tuner.cpp
│   └── libdnn_pool.cpp
├── cmake
│   ├── Templates
│   │   ├── greentea_libdnn_config.h.in
│   │   ├── GreenteaLibDNNConfig.cmake.in
│   │   └── cmake-uninstall.cmake.in
│   ├── Dependencies.cmake
│   ├── Cuda.cmake
│   ├── Modules
│   │   ├── FindvecLib.cmake
│   │   ├── FindViennaCL.cmake
│   │   └── FindOpenCL.cmake
│   ├── Misc.cmake
│   ├── ConfigGen.cmake
│   ├── Summary.cmake
│   ├── Targets.cmake
│   └── Utils.cmake
├── include
│   ├── benchmark.hpp
│   ├── device.hpp
│   ├── common.hpp
│   ├── libdnn_tuner.hpp
│   └── libdnn.hpp
├── LICENSE
└── CMakeLists.txt

/.gitignore:
--------------------------------------------------------------------------------
.project
.cproject
build
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Greentea LibDNN
Greentea LibDNN - a universal convolution implementation supporting CUDA and OpenCL
--------------------------------------------------------------------------------
/src/common.cpp:
--------------------------------------------------------------------------------
#include "common.hpp"

namespace greentea {

#ifdef USE_OPENCL

viennacl::ocl::handle<cl_mem> WrapHandle(cl_mem in,
                                         viennacl::ocl::context *ctx) {
  if (in != nullptr) {
    // Valid cl_mem object, wrap to ViennaCL and return handle.
    viennacl::ocl::handle<cl_mem> memhandle(in, *ctx);
    memhandle.inc();
    return memhandle;
  } else {
    // Trick to pass nullptr via ViennaCL into OpenCL kernels.
    viennacl::ocl::handle<cl_mem> memhandle;
    return memhandle;
  }
}

#endif


}  // namespace greentea
--------------------------------------------------------------------------------
/cmake/Templates/greentea_libdnn_config.h.in:
--------------------------------------------------------------------------------
#ifndef GREENTEA_LIBDNN_CONFIG_HPP_
#define GREENTEA_LIBDNN_CONFIG_HPP_

/* Version */
#define GREENTEA_VERSION "${GREENTEA_TARGET_VERSION}"

/* Sources directory */
#define SOURCE_FOLDER "${PROJECT_SOURCE_DIR}"

/* Binaries directory */
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"

/* 64 bit indexing */
#cmakedefine USE_INDEX_64

/* NVIDIA CUDA */
#cmakedefine HAVE_CUDA
#cmakedefine USE_CUDA

/* OpenCL kernels */
#cmakedefine HAVE_OPENCL
#cmakedefine USE_OPENCL
#cmakedefine VIENNACL_WITH_OPENCL

#define CMAKE_SOURCE_DIR "src/"
#define CMAKE_EXT ""

#endif  // GREENTEA_LIBDNN_CONFIG_HPP_
--------------------------------------------------------------------------------
/cmake/Templates/GreenteaLibDNNConfig.cmake.in:
--------------------------------------------------------------------------------
# Config file for the Greentea LibDNN package.
#
# After successful configuration the following variables
# will be defined:
#
#   GREENTEA_INCLUDE_DIRS - Greentea include directories
#   GREENTEA_LIBRARIES    - libraries to link against
#   GREENTEA_LIBRARY_DIR  - the directory containing the library
#   GREENTEA_FOUND        - boolean variable telling us if the
#                           package was found
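#
# A usage sketch for a downstream project (the target "myapp" and its
# sources are hypothetical):
#
#   find_package(GreenteaLibDNN REQUIRED)
#   include_directories(${GREENTEA_INCLUDE_DIRS})
#   add_executable(myapp main.cpp)
#   target_link_libraries(myapp ${GREENTEA_LIBRARIES})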

set(GREENTEA_VERSION @PROJECT_VERSION@)

@PACKAGE_INIT@

if(NOT TARGET @PROJECT_LIBRARY_TARGET_NAME@)
  include("${CMAKE_CURRENT_LIST_DIR}/@CMAKE_TARGETS_FILE@")
endif()

set_and_check(GREENTEA_INCLUDE_DIRS "@PACKAGE_INCLUDE_INSTALL_DIR@")
set_and_check(GREENTEA_LIBRARY_DIR "@PACKAGE_LIB_INSTALL_DIR@")

set(GREENTEA_FOUND TRUE)
--------------------------------------------------------------------------------
/cmake/Dependencies.cmake:
--------------------------------------------------------------------------------
# This list is required for static linking and exported to GreenteaLibDNNConfig.cmake
set(GREENTEA_LINKER_LIBS "")

# ---[ CUDA
include(cmake/Cuda.cmake)
if(NOT HAVE_CUDA)
  if(NOT USE_CUDA)
    message(STATUS "CUDA is disabled. Building without it...")
  else()
    set(USE_CUDA OFF)
    message(WARNING "CUDA was not detected by CMake. Building without it...")
  endif()
endif()

# ---[ OpenCL & ViennaCL
if(USE_OPENCL)
  find_package(OpenCL QUIET)
  if(NOT HAVE_OPENCL)
    message(FATAL_ERROR "OpenCL is required for the OpenCL backend but was not found.")
  endif()
  find_package(ViennaCL)
  if(NOT HAVE_VIENNACL)
    message(FATAL_ERROR "ViennaCL is required for the OpenCL backend but was not found.")
  endif()
  include_directories(SYSTEM ${VIENNACL_INCLUDE_DIRS})
  list(APPEND GREENTEA_LINKER_LIBS ${VIENNACL_LIBRARIES})
  set(VIENNACL_WITH_OPENCL ${VIENNACL_WITH_OPENCL})
endif()
--------------------------------------------------------------------------------
/cmake/Cuda.cmake:
--------------------------------------------------------------------------------
if(NOT USE_CUDA)
  return()
endif()

find_package(CUDA 7.5 QUIET)
find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility, which doesn't search for curand

if(NOT CUDA_FOUND)
  return()
endif()

set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
include_directories(SYSTEM ${CUDA_INCLUDE_DIRS})
list(APPEND GREENTEA_LINKER_LIBS ${CUDA_CUDART_LIBRARY}
                                 ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

# Handle clang/libc++ issue
if(APPLE)
  greentea_detect_darwin_version(OSX_VERSION)

  # OSX 10.9 and higher uses clang/libc++ by default, which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    greentea_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()
"@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt") 4 | 5 | file(READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files) 6 | string(REGEX REPLACE "\n" ";" files "${files}") 7 | foreach(file ${files}) 8 | message(STATUS "Uninstalling $ENV{DESTDIR}${file}") 9 | if(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 10 | exec_program( 11 | "@CMAKE_COMMAND@" ARGS "-E remove \"$ENV{DESTDIR}${file}\"" 12 | OUTPUT_VARIABLE rm_out 13 | RETURN_VALUE rm_retval 14 | ) 15 | if(NOT "${rm_retval}" STREQUAL 0) 16 | message(FATAL_ERROR "Problem when removing $ENV{DESTDIR}${file}") 17 | endif(NOT "${rm_retval}" STREQUAL 0) 18 | else(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 19 | message(STATUS "File $ENV{DESTDIR}${file} does not exist.") 20 | endif(IS_SYMLINK "$ENV{DESTDIR}${file}" OR EXISTS "$ENV{DESTDIR}${file}") 21 | endforeach(file) 22 | -------------------------------------------------------------------------------- /cmake/Modules/FindvecLib.cmake: -------------------------------------------------------------------------------- 1 | # Find the vecLib libraries as part of Accelerate.framework or as standalone framework 2 | # 3 | # The following are set after configuration is done: 4 | # VECLIB_FOUND 5 | # vecLib_INCLUDE_DIR 6 | # vecLib_LINKER_LIBS 7 | 8 | 9 | if(NOT APPLE) 10 | return() 11 | endif() 12 | 13 | set(__veclib_include_suffix "Frameworks/vecLib.framework/Versions/Current/Headers") 14 | 15 | find_path(vecLib_INCLUDE_DIR vecLibTypes.h 16 | DOC "vecLib include directory" 17 | PATHS /System/Library/${__veclib_include_suffix} 18 | /System/Library/Frameworks/Accelerate.framework/Versions/Current/${__veclib_include_suffix} 19 | /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk/System/Library/Frameworks/Accelerate.framework/Versions/Current/Frameworks/vecLib.framework/Headers/) 20 | 21 | include(FindPackageHandleStandardArgs) 22 | find_package_handle_standard_args(vecLib DEFAULT_MSG vecLib_INCLUDE_DIR) 23 | 24 | if(VECLIB_FOUND) 25 | if(vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") 26 | set(vecLib_LINKER_LIBS -lcblas "-framework vecLib") 27 | message(STATUS "Found standalone vecLib.framework") 28 | else() 29 | set(vecLib_LINKER_LIBS -lcblas "-framework Accelerate") 30 | message(STATUS "Found vecLib as part of Accelerate.framework") 31 | endif() 32 | 33 | mark_as_advanced(vecLib_INCLUDE_DIR) 34 | endif() 35 | -------------------------------------------------------------------------------- /src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # --[ Greenea LibDNN library 2 | 3 | # TODO(naibaf7): Check if it's needed or not since with 4 | # this macro we get `hdrs` and `srcs` in one, but below 5 | # we will need only `hdrs` to be copied in the install prefix. 6 | 7 | # creates 'srcs' lists 8 | #greentea_pickup_greentea_sources(${PROJECT_SOURCE_DIR}) 9 | 10 | file(GLOB_RECURSE srcs "${PROJECT_SOURCE_DIR}/src/*.cpp") 11 | file(GLOB_RECURSE hdrs "${PROJECT_SOURCE_DIR}/include/*.hpp") 12 | 13 | add_library(${PROJECT_LIBRARY_TARGET_NAME} ${srcs} ${hdrs}) 14 | 15 | # configure the library target 16 | target_include_directories( 17 | ${PROJECT_LIBRARY_TARGET_NAME} PUBLIC 18 | $ 19 | $) 20 | 21 | target_link_libraries( 22 | ${PROJECT_LIBRARY_TARGET_NAME} ${GREENTEA_LINKER_LIBS}) 23 | 24 | # TODO(naibaf7): Is it needed? 
# greentea_default_properties(greentea_libdnn)

set_target_properties(${PROJECT_LIBRARY_TARGET_NAME} PROPERTIES
                      VERSION ${PROJECT_VERSION}
                      SOVERSION ${PROJECT_VERSION})

# ---[ Install and export library

install(FILES ${hdrs} "${CMAKE_BINARY_DIR}/greentea_libdnn_config.h"
        DESTINATION ${INCLUDE_INSTALL_DIR})

install(TARGETS ${PROJECT_LIBRARY_TARGET_NAME}
        EXPORT ${CMAKE_TARGETS_NAME}
        RUNTIME DESTINATION lib
        ARCHIVE DESTINATION lib
        LIBRARY DESTINATION lib)
--------------------------------------------------------------------------------
/cmake/Modules/FindViennaCL.cmake:
--------------------------------------------------------------------------------
SET(VIENNACL_WITH_OPENCL TRUE)

SET(VIENNACL_INCLUDE_SEARCH_PATHS
  .
  ..
  ../ViennaCL
  ../viennacl-dev
  /usr/include
  /usr/local/include
  /opt/VIENNACL
  $ENV{VIENNACL_HOME}
)

SET(VIENNACL_FOUND OFF)

FIND_PATH(VIENNACL_INCLUDE_DIR NAMES viennacl/version.hpp PATHS ${VIENNACL_INCLUDE_SEARCH_PATHS} DOC "Include for ViennaCL")

# Check include files
IF(VIENNACL_INCLUDE_DIR)
  SET(VIENNACL_FOUND ON)
ELSE()
  MESSAGE(STATUS "Could not find ViennaCL include. Turning VIENNACL_FOUND off")
ENDIF()

IF (VIENNACL_FOUND)
  IF (NOT VIENNACL_FIND_QUIETLY)
    MESSAGE(STATUS "Found ViennaCL include: ${VIENNACL_INCLUDE_DIR}")
  ENDIF (NOT VIENNACL_FIND_QUIETLY)
ELSE (VIENNACL_FOUND)
  IF (VIENNACL_FIND_REQUIRED)
    MESSAGE(FATAL_ERROR "Could not find ViennaCL")
  ENDIF (VIENNACL_FIND_REQUIRED)
ENDIF (VIENNACL_FOUND)

IF(VIENNACL_WITH_OPENCL)
  find_package(OpenCL)
ENDIF(VIENNACL_WITH_OPENCL)

LIST( APPEND VIENNACL_INCLUDE_DIRS ${VIENNACL_INCLUDE_DIR} ${OPENCL_INCLUDE_DIRS} )
LIST( APPEND VIENNACL_LIBRARIES ${OPENCL_LIBRARIES} )
LIST( REMOVE_DUPLICATES VIENNACL_INCLUDE_DIRS )
LIST( REMOVE_DUPLICATES VIENNACL_LIBRARIES )

IF(VIENNACL_FOUND)
  SET( HAVE_VIENNACL TRUE )
  message(STATUS "ViennaCL detected: " ${VIENNACL_INCLUDE_DIRS})
ENDIF(VIENNACL_FOUND)

MARK_AS_ADVANCED(
  VIENNACL_INCLUDE_DIR
  VIENNACL_INCLUDE_DIRS
  VIENNACL_LIBRARIES
)
--------------------------------------------------------------------------------
/include/benchmark.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_BENCHMARK_HPP_
#define GREENTEA_BENCHMARK_HPP_

#include <chrono>

#include "common.hpp"
#include "device.hpp"

namespace greentea {

class Timer {
 public:
  Timer(device* dev_ptr);
  virtual ~Timer();
  virtual void Start();
  virtual void Stop();
  virtual float MilliSeconds();
  virtual float MicroSeconds();
  virtual float Seconds();

  inline bool initted() { return initted_; }
  inline bool running() { return running_; }
  inline bool has_run_at_least_once() { return has_run_at_least_once_; }

 protected:
  void Init();

  device* dev_ptr_;
  bool initted_;
  bool running_;
  bool has_run_at_least_once_;
#ifdef USE_CUDA
  cudaEvent_t start_gpu_cuda_;
  cudaEvent_t stop_gpu_cuda_;
#endif  // USE_CUDA
#ifdef USE_OPENCL
  cl_event start_gpu_cl_;
  cl_event stop_gpu_cl_;
#endif  // USE_OPENCL
  std::chrono::time_point<std::chrono::high_resolution_clock> start_cpu_;
  std::chrono::time_point<std::chrono::high_resolution_clock> stop_cpu_;
  float elapsed_milliseconds_;
  float elapsed_microseconds_;
};
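
// A minimal usage sketch (assuming an initialized greentea::device `dev`;
// the enqueued GPU work itself is hypothetical):
//
//   Timer timer(&dev);
//   timer.Start();
//   // ... enqueue and run GPU work ...
//   timer.Stop();
//   float ms = timer.MilliSeconds();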

class CPUTimer : public Timer {
 public:
  explicit CPUTimer(device* dev_ptr);
  virtual ~CPUTimer() {}
  virtual void Start();
  virtual void Stop();
  virtual float MilliSeconds();
  virtual float MicroSeconds();
};

}  // namespace greentea

#endif  // GREENTEA_BENCHMARK_HPP_
--------------------------------------------------------------------------------
/include/device.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_DEVICE_HPP_
#define GREENTEA_DEVICE_HPP_

#include <string>
#include <vector>

#include "common.hpp"


using std::vector;

namespace greentea {

class device {
 public:
  explicit device();
  explicit device(int id, int list_id, Backend backend);
  Backend backend() const;
  int id() const;
  int list_id() const;
  int current_queue_id();
  int workgroup_size(int id);

#ifdef USE_OPENCL
  static void setupViennaCLContext(int id,
                                   const cl_context ctx,
                                   const cl_device_id dev,
                                   const cl_command_queue queue);

  viennacl::ocl::program& program();
  void SetProgram();
  bool is_host_unified();
#endif  // USE_OPENCL

  int num_queues();
  void SwitchQueue(int id);
  void FinishQueues();

  void Init();

  uint_tp memory_usage();
  uint_tp peak_memory_usage();
  std::string name();
  void IncreaseMemoryUsage(uint_tp bytes);
  void DecreaseMemoryUsage(uint_tp bytes);
  void ResetPeakMemoryUsage();
  bool CheckCapability(std::string cap);
  bool CheckVendor(std::string vendor);
  bool CheckType(std::string type);

 private:
  int current_queue_id_;
  std::vector<int> workgroup_sizes_;
  int id_;
  int list_id_;
  Backend backend_;
  uint_tp memory_usage_;
  uint_tp peak_memory_usage_;
  bool host_unified_;
  std::string name_;
#ifdef USE_OPENCL
  viennacl::ocl::program ocl_program_;
#endif  // USE_OPENCL
};
}  // namespace greentea

#endif  // GREENTEA_DEVICE_HPP_
--------------------------------------------------------------------------------
/include/common.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_COMMON_HPP_
#define GREENTEA_COMMON_HPP_

#include <cstdint>
#include <memory>
#include <string>
#include <vector>

#include "greentea_libdnn_config.h"

// #define LIBDNN_DEBUG 1
// #define VIENNACL_DEBUG_ALL 1

#ifdef USE_OPENCL
#define VIENNACL_PROFILING_ENABLED
#include "viennacl/backend/opencl.hpp"
#include "viennacl/ocl/backend.hpp"
#include "viennacl/ocl/context.hpp"
#include "viennacl/ocl/device.hpp"
#include "viennacl/ocl/platform.hpp"
#endif  // USE_OPENCL

#ifdef USE_CUDA
#include "cuda.h"
#include "nvrtc.h"
#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <curand.h>
#endif  // USE_CUDA

#ifndef GREENTEA_QUEUE_COUNT
#define GREENTEA_QUEUE_COUNT 1
#endif

#ifndef CUDA_NUM_THREADS
#define CUDA_NUM_THREADS 1
#endif

#ifdef USE_OPENCL
#ifndef VIENNACL_WITH_OPENCL
#define VIENNACL_WITH_OPENCL
#endif  // VIENNACL_WITH_OPENCL
#endif  // USE_OPENCL

#ifdef USE_INDEX_64
#define int_tp int64_t
#define uint_tp uint64_t
#else
#define int_tp int32_t
#define uint_tp uint32_t
#endif
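
// Illustration (hypothetical check): the index typedefs above follow the
// indexing mode, e.g.
//   static_assert(sizeof(int_tp) == 8, "64 bit indexing");
// holds when USE_INDEX_64 is defined, while the default build uses
// 32 bit indices (sizeof(int_tp) == 4).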

namespace greentea {

#ifdef USE_OPENCL
viennacl::ocl::handle<cl_mem> WrapHandle(cl_mem in,
                                         viennacl::ocl::context *ctx);
#endif

enum Backend {
  BACKEND_CPU,
  BACKEND_CUDA,
  BACKEND_OpenCL
};

template<typename T, typename U>
struct is_same {
  static const bool value = false;
};

template<typename T>
struct is_same<T, T> {
  static const bool value = true;
};

}  // namespace greentea

#endif  // GREENTEA_COMMON_HPP_
--------------------------------------------------------------------------------
/cmake/Misc.cmake:
--------------------------------------------------------------------------------
if(FALSE)
# ---[ Configuration types
set(CMAKE_CONFIGURATION_TYPES "Debug;Release"
    CACHE STRING "Possible configurations" FORCE)
mark_as_advanced(CMAKE_CONFIGURATION_TYPES)

if(DEFINED CMAKE_BUILD_TYPE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY
               STRINGS ${CMAKE_CONFIGURATION_TYPES})
endif()

# --[ If the user doesn't specify a build type, assume release
if("${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(CMAKE_BUILD_TYPE Release)
endif()

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  set(CMAKE_COMPILER_IS_CLANGXX TRUE)
endif()

# ---[ Solution folders
greentea_option(USE_PROJECT_FOLDERS "IDE Solution folders" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )

if(USE_PROJECT_FOLDERS)
  set_property(GLOBAL PROPERTY USE_FOLDERS ON)
  set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets")
endif()

# ---[ Install options
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
  set(CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/install"
      CACHE PATH "Default install path" FORCE)
endif()

if(FALSE)

# ---[ RPATH settings
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Use link paths for shared library rpath")
set(CMAKE_MACOSX_RPATH TRUE)

list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_system_dir)
if(${__is_system_dir} STREQUAL -1)
  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
endif()

# ---[ Funny target
if(UNIX OR APPLE)
  add_custom_target(symlink_to_build COMMAND "ln" "-sf" "${PROJECT_BINARY_DIR}" "${PROJECT_SOURCE_DIR}/build"
                    COMMENT "Adding symlink: /build -> ${PROJECT_BINARY_DIR}" )
endif()

# ---[ Set debug postfix
set(GREENTEA_DEBUG_POSTFIX "-d")

set(GREENTEA_POSTFIX "")
if(CMAKE_BUILD_TYPE MATCHES "Debug")
  set(GREENTEA_POSTFIX ${GREENTEA_DEBUG_POSTFIX})
endif()

endif(FALSE)
endif(FALSE)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
COPYRIGHT

All contributions by Fabian David Tschopp:
Copyright (c) 2016-2017 Fabian David Tschopp
All rights reserved.

All other contributions:
Copyright (c) 2016-2017, the respective contributors
All rights reserved.

LibDNN uses a shared copyright model: each contributor holds copyright over
their contributions to LibDNN. The project versioning records all such
contribution and copyright details. If a contributor wants to further mark
their specific copyright on a particular contribution, they should indicate
their copyright solely in the commit message of the change when it is
committed.

LICENSE

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:

1. Redistributions of source code must retain the above copyright notice, this
   list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
   this list of conditions and the following disclaimer in the documentation
   and/or other materials provided with the distribution.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

CONTRIBUTION AGREEMENT

By contributing to the LibDNN repository through pull-request, comment,
or otherwise, the contributor releases their content to the
license and copyright terms herein.
--------------------------------------------------------------------------------
/cmake/ConfigGen.cmake:
--------------------------------------------------------------------------------

################################################################################################
# Helper function to fetch greentea includes which will be passed to dependent projects
# Usage:
#   greentea_get_current_includes(<includes_list_variable>)
function(greentea_get_current_includes includes_variable)
  get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES)
  greentea_convert_absolute_paths(current_includes)

  # remove at most one ${PROJECT_BINARY_DIR} include added for greentea_config.h
  list(FIND current_includes ${PROJECT_BINARY_DIR} __index)
  list(REMOVE_AT current_includes ${__index})

  # remove python/numpy includes (since they are not required for client libs)
  set(__toremove "")
  foreach(__i ${current_includes})
    if(${__i} MATCHES "python")
      list(APPEND __toremove ${__i})
    endif()
  endforeach()
  if(__toremove)
    list(REMOVE_ITEM current_includes ${__toremove})
  endif()

  greentea_list_unique(current_includes)
  set(${includes_variable} ${current_includes} PARENT_SCOPE)
endfunction()

################################################################################################
# Helper function to get all list items that begin with given prefix
# Usage:
#   greentea_get_items_with_prefix(<prefix> <list_variable> <output_variable>)
function(greentea_get_items_with_prefix prefix list_variable output_variable)
  set(__result "")
  foreach(__e ${${list_variable}})
    if(__e MATCHES "^${prefix}.*")
      list(APPEND __result ${__e})
    endif()
  endforeach()
  set(${output_variable} ${__result} PARENT_SCOPE)
endfunction()
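
# A usage sketch (values are illustrative):
#   set(__dirs "/usr/include;/opt/include;/usr/lib")
#   greentea_get_items_with_prefix("/usr" __dirs __usr_items)
#   # __usr_items now holds "/usr/include;/usr/lib"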

################################################################################################
# Function for generating Greentea build- and install-tree export config files
# Usage:
#   generate_export_configs()
function(generate_export_configs)
  set(install_cmake_suffix "share/Greentea")

  # ---[ Configure build-tree GreenteaConfig.cmake file ]---
  greentea_get_current_includes(GREENTEA_INCLUDE_DIRS)

  set(Greentea_DEFINITIONS "")
  if(NOT HAVE_OPENCL)
    set(HAVE_OPENCL FALSE)
  endif()

  if(NOT HAVE_CUDA)
    set(HAVE_CUDA FALSE)
  endif()

  # Add targets to the build-tree export set
  export(TARGETS greentea_libdnn
         FILE "${PROJECT_BINARY_DIR}/GreenteaLibDNNTargets.cmake")
  export(PACKAGE Greentea)

  # ---[ Configure install-tree GreenteaConfig.cmake file ]---

  # remove source and build dir includes
  greentea_get_items_with_prefix(
      ${PROJECT_SOURCE_DIR} GREENTEA_INCLUDE_DIRS __insource)
  greentea_get_items_with_prefix(
      ${PROJECT_BINARY_DIR} GREENTEA_INCLUDE_DIRS __inbinary)
  list(REMOVE_ITEM GREENTEA_INCLUDE_DIRS ${__insource} ${__inbinary})

  # add `install` include folder
  set(lines
      "get_filename_component(__greentea_include \"\${Greentea_CMAKE_DIR}/../../include\" ABSOLUTE)\n"
      "list(APPEND GREENTEA_INCLUDE_DIRS \${__greentea_include})\n"
      "unset(__greentea_include)\n")
  string(REPLACE ";" "" GREENTEA_INSTALL_INCLUDE_DIR_APPEND_COMMAND ${lines})

  configure_file("cmake/Templates/GreenteaLibDNNConfig.cmake.in"
                 "${PROJECT_BINARY_DIR}/cmake/GreenteaLibDNNConfig.cmake" @ONLY)

  install(FILES "${CMAKE_BINARY_DIR}/greentea_libdnn_config.h"
          DESTINATION include/greentea/include)

  # Install the GreenteaConfig.cmake and export set to use with install-tree
  install(FILES "${PROJECT_BINARY_DIR}/cmake/GreenteaLibDNNConfig.cmake"
          DESTINATION ${install_cmake_suffix})

  install(EXPORT GreenteaLibDNNTargets
          DESTINATION ${install_cmake_suffix})

endfunction()
--------------------------------------------------------------------------------
/cmake/Modules/FindOpenCL.cmake:
--------------------------------------------------------------------------------
# This file taken from FindOpenCL project @ http://gitorious.com/findopencl
#
# - Try to find OpenCL
# This module tries to find an OpenCL implementation on your system. It supports
# AMD / ATI, Apple and NVIDIA implementations.
#
# Once done this will define
#   OPENCL_FOUND        - system has OpenCL
#   OPENCL_INCLUDE_DIRS - the OpenCL include directory
#   OPENCL_LIBRARIES    - link these to use OpenCL
#
# WIN32 should work, but is untested

FIND_PACKAGE( PackageHandleStandardArgs )

SET (OPENCL_VERSION_STRING "1.1.0")
SET (OPENCL_VERSION_MAJOR 1)
SET (OPENCL_VERSION_MINOR 1)
SET (OPENCL_VERSION_PATCH 0)

IF (APPLE)

  FIND_LIBRARY(OPENCL_LIBRARIES OpenCL DOC "OpenCL lib for OSX")
  FIND_PATH(OPENCL_INCLUDE_DIRS OpenCL/cl.h DOC "Include for OpenCL on OSX")
  FIND_PATH(OPENCL_CPP_INCLUDE_DIRS OpenCL/cl.hpp DOC "Include for OpenCL CPP bindings on OSX")

ELSE (APPLE)

  IF (WIN32)

    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h)
    FIND_PATH(OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp)

    # The AMD SDK currently installs both x86 and x86_64 libraries
    # This is only a hack to find out the architecture
    IF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" )
      SET(OPENCL_LIB_DIR "$ENV{AMDAPPSDKROOT}/lib/x86_64")
    ELSE (${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64")
      SET(OPENCL_LIB_DIR "$ENV{AMDAPPSDKROOT}/lib/x86")
    ENDIF( ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "AMD64" )

    # find out if the user asked for a 64-bit build, and use the corresponding
    # 64 or 32 bit NVIDIA library paths to the search:
    STRING(REGEX MATCH "Win64" ISWIN64 ${CMAKE_GENERATOR})
    IF("${ISWIN64}" STREQUAL "Win64")
      FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/x64)
    ELSE("${ISWIN64}" STREQUAL "Win64")
      FIND_LIBRARY(OPENCL_LIBRARIES OpenCL.lib ${OPENCL_LIB_DIR} $ENV{CUDA_LIB_PATH} $ENV{CUDA_PATH}/lib/Win32)
    ENDIF("${ISWIN64}" STREQUAL "Win64")

    GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)

    # On Win32 search relative to the library
    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)
    FIND_PATH(OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS "${_OPENCL_INC_CAND}" $ENV{CUDA_INC_PATH} $ENV{CUDA_PATH}/include)

  ELSE (WIN32)

    # Unix style platforms
    FIND_LIBRARY(OPENCL_LIBRARIES OpenCL
      ENV LD_LIBRARY_PATH
    )

    GET_FILENAME_COMPONENT(OPENCL_LIB_DIR ${OPENCL_LIBRARIES} PATH)
    GET_FILENAME_COMPONENT(_OPENCL_INC_CAND ${OPENCL_LIB_DIR}/../../include ABSOLUTE)

    # The AMD SDK currently does not place its headers
    # in /usr/include, therefore also search relative
    # to the library
    FIND_PATH(OPENCL_INCLUDE_DIRS CL/cl.h PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include")
    FIND_PATH(OPENCL_CPP_INCLUDE_DIRS CL/cl.hpp PATHS ${_OPENCL_INC_CAND} "/usr/local/cuda/include")

  ENDIF (WIN32)

ENDIF (APPLE)

FIND_PACKAGE_HANDLE_STANDARD_ARGS( OpenCL DEFAULT_MSG OPENCL_LIBRARIES OPENCL_INCLUDE_DIRS )

IF( OPENCL_CPP_INCLUDE_DIRS )
  SET( OPENCL_HAS_CPP_BINDINGS TRUE )
  LIST( APPEND OPENCL_INCLUDE_DIRS ${OPENCL_CPP_INCLUDE_DIRS} )
  # This is often the same, so clean up
  LIST( REMOVE_DUPLICATES OPENCL_INCLUDE_DIRS )
ENDIF( OPENCL_CPP_INCLUDE_DIRS )

IF( OPENCL_FOUND )
  SET( HAVE_OPENCL TRUE )
  message(STATUS "OpenCL detected: " ${OPENCL_LIB_DIR})
ENDIF( OPENCL_FOUND )

MARK_AS_ADVANCED(
  OPENCL_INCLUDE_DIRS
)
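# A usage sketch (the target "mytarget" is hypothetical), mirroring how
# cmake/Dependencies.cmake consumes this module:
#   find_package(OpenCL)
#   if(OPENCL_FOUND)
#     include_directories(SYSTEM ${OPENCL_INCLUDE_DIRS})
#     target_link_libraries(mytarget ${OPENCL_LIBRARIES})
#   endif()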
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
cmake_minimum_required(VERSION 2.8.7)

# enables setting VERSION in the `project` command
if(POLICY CMP0048)
  cmake_policy(SET CMP0048 NEW)
endif()

# ---[ Greentea project
project(Greentea-LibDNN VERSION 0.1.0 LANGUAGES C CXX)

if(POLICY CMP0046)
  cmake_policy(SET CMP0046 NEW)
endif()

if(POLICY CMP0054)
  cmake_policy(SET CMP0054 NEW)
endif()

string(TOLOWER ${PROJECT_NAME} PROJECT_NAME_LOWER)
string(REPLACE "-" "_" PROJECT_NAME_UNDERSCORE ${PROJECT_NAME_LOWER})
string(REPLACE "-" "" PROJECT_NAME_MERGE ${PROJECT_NAME})

set(PROJECT_EXPORT_NAME ${PROJECT_NAME_MERGE})
set(PROJECT_LIBRARY_TARGET_NAME ${PROJECT_NAME_UNDERSCORE})

set(INCLUDE_INSTALL_DIR include/greentea
    CACHE PATH "Install dir for headers")
set(PACKAGE_INSTALL_DIR share/${PROJECT_EXPORT_NAME}
    CACHE PATH "Install dir for cmake package config files")
set(LIB_INSTALL_DIR lib
    CACHE PATH "Install dir for shared libraries")

set(CMAKE_CONFIG_FILE "${PROJECT_EXPORT_NAME}Config.cmake")
set(CMAKE_CONFIG_VERSION_FILE "${PROJECT_EXPORT_NAME}ConfigVersion.cmake")

set(CMAKE_TARGETS_NAME "${PROJECT_EXPORT_NAME}Targets")
set(CMAKE_TARGETS_FILE "${CMAKE_TARGETS_NAME}.cmake")

# ---[ Using cmake scripts and modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

include(ExternalProject)

include(cmake/Utils.cmake)
include(cmake/Targets.cmake)
include(cmake/Misc.cmake)
include(cmake/Summary.cmake)
include(cmake/ConfigGen.cmake)

# ---[ Options
greentea_option(USE_INDEX_64 "Build Greentea LibDNN with 64 bit indexing" OFF)
greentea_option(USE_CUDA "Build Greentea LibDNN with CUDA support" ON)
greentea_option(USE_OPENCL "Build Greentea LibDNN with OpenCL support" ON)
greentea_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
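
# A configuration sketch (hypothetical shell session) exercising the
# options above:
#   mkdir build && cd build
#   cmake -DUSE_CUDA=ON -DUSE_OPENCL=ON -DUSE_INDEX_64=OFF ..
#   make && make install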

# ---[ Dependencies
include(cmake/Dependencies.cmake)

# ---[ Flags
if(UNIX OR APPLE)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -Wall -std=c++11 -DCMAKE_BUILD -Wno-unused-variable")
endif()

# TODO(naibaf7): Is it needed?
greentea_set_greentea_link()

if(USE_libstdcpp)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libstdc++ -std=c++11")
  message("-- Warning: forcing libstdc++ (controlled by USE_libstdcpp option in cmake)")
endif()

add_definitions(-DGTEST_USE_OWN_TR1_TUPLE)

# ---[ Warnings
greentea_warnings_disable(CMAKE_CXX_FLAGS -Wno-sign-compare -Wno-uninitialized)

# ---[ Config generation
configure_file(cmake/Templates/greentea_libdnn_config.h.in
               "${PROJECT_BINARY_DIR}/greentea_libdnn_config.h")

# ---[ Includes
set(GREENTEA_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include)
include_directories(${GREENTEA_INCLUDE_DIR} ${PROJECT_BINARY_DIR})
#include_directories(BEFORE src) # This is needed for gtest.

# ---[ Subdirectories
add_subdirectory(src)

# ---[ Configuration summary
greentea_print_configuration_summary()

# ---[ Export configs generation
# generate_export_configs()

include(CMakePackageConfigHelpers)
configure_package_config_file(
    "cmake/Templates/${CMAKE_CONFIG_FILE}.in"
    ${CMAKE_BINARY_DIR}/${CMAKE_CONFIG_FILE}
    PATH_VARS LIB_INSTALL_DIR INCLUDE_INSTALL_DIR
    INSTALL_DESTINATION ${PACKAGE_INSTALL_DIR})

write_basic_package_version_file(
    ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CONFIG_VERSION_FILE}
    VERSION ${PROJECT_VERSION}
    COMPATIBILITY SameMajorVersion)

# uninstall target
configure_file(
    "${PROJECT_SOURCE_DIR}/cmake/Templates/cmake-uninstall.cmake.in"
    "${CMAKE_CURRENT_BINARY_DIR}/cmake-uninstall.cmake"
    IMMEDIATE @ONLY)

add_custom_target(uninstall COMMAND ${CMAKE_COMMAND}
                  -P ${CMAKE_CURRENT_BINARY_DIR}/cmake-uninstall.cmake)

# ---[ Install and export package

install(EXPORT ${CMAKE_TARGETS_NAME}
        FILE ${CMAKE_TARGETS_FILE}
        DESTINATION ${PACKAGE_INSTALL_DIR})

install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CONFIG_FILE}
              ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_CONFIG_VERSION_FILE}
        DESTINATION ${PACKAGE_INSTALL_DIR} COMPONENT share)
--------------------------------------------------------------------------------
/cmake/Summary.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Greentea status report function.
# Automatically aligns the right column and selects text based on condition.

function(greentea_status text)
  set(status_cond)
  set(status_then)
  set(status_else)

  set(status_current_name "cond")
  foreach(arg ${ARGN})
    if(arg STREQUAL "THEN")
      set(status_current_name "then")
    elseif(arg STREQUAL "ELSE")
      set(status_current_name "else")
    else()
      list(APPEND status_${status_current_name} ${arg})
    endif()
  endforeach()

  if(DEFINED status_cond)
    set(status_placeholder_length 23)
    string(RANDOM LENGTH ${status_placeholder_length} ALPHABET " " status_placeholder)
    string(LENGTH "${text}" status_text_length)
    if(status_text_length LESS status_placeholder_length)
      string(SUBSTRING "${text}${status_placeholder}" 0 ${status_placeholder_length} status_text)
    elseif(DEFINED status_then OR DEFINED status_else)
      message(STATUS "${text}")
      set(status_text "${status_placeholder}")
    else()
      set(status_text "${text}")
    endif()

    if(DEFINED status_then OR DEFINED status_else)
      if(${status_cond})
        string(REPLACE ";" " " status_then "${status_then}")
        string(REGEX REPLACE "^[ \t]+" "" status_then "${status_then}")
        message(STATUS "${status_text} ${status_then}")
      else()
        string(REPLACE ";" " " status_else "${status_else}")
        string(REGEX REPLACE "^[ \t]+" "" status_else "${status_else}")
        message(STATUS "${status_text} ${status_else}")
      endif()
    else()
      string(REPLACE ";" " " status_cond "${status_cond}")
      string(REGEX REPLACE "^[ \t]+" "" status_cond "${status_cond}")
      message(STATUS "${status_text} ${status_cond}")
    endif()
  else()
    message(STATUS "${text}")
  endif()
endfunction()
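
# A usage sketch of the conditional form (mirrors the summary below):
#   greentea_status("  OpenCL :" HAVE_OPENCL THEN "Yes" ELSE "No")
# prints the left column padded to ${status_placeholder_length} characters,
# followed by "Yes" or "No" depending on HAVE_OPENCL.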

################################################################################################
# Function for fetching Greentea LibDNN version from git and headers
# Usage:
#   greentea_extract_greentea_libdnn_version()
function(greentea_extract_greentea_libdnn_version)
  set(GREENTEA_LIBDNN_GIT_VERSION "unknown")
  find_package(Git)
  if(GIT_FOUND)
    execute_process(COMMAND ${GIT_EXECUTABLE} describe --tags --always --dirty
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE
                    WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}"
                    OUTPUT_VARIABLE GREENTEA_LIBDNN_GIT_VERSION
                    RESULT_VARIABLE __git_result)
    if(NOT ${__git_result} EQUAL 0)
      set(GREENTEA_LIBDNN_GIT_VERSION "unknown")
    endif()
  endif()

  set(GREENTEA_LIBDNN_GIT_VERSION ${GREENTEA_LIBDNN_GIT_VERSION} PARENT_SCOPE)


  greentea_parse_header(${GREENTEA_INCLUDE_DIR}/version.hpp GREENTEA_LIBDNN_VERSION_LINES GREENTEA_LIBDNN_MAJOR GREENTEA_LIBDNN_MINOR GREENTEA_LIBDNN_PATCH)
  set(GREENTEA_LIBDNN_VERSION "${GREENTEA_LIBDNN_MAJOR}.${GREENTEA_LIBDNN_MINOR}.${GREENTEA_LIBDNN_PATCH}" PARENT_SCOPE)

endfunction()


################################################################################################
# Prints accumulated Greentea LibDNN configuration summary
# Usage:
#   greentea_print_configuration_summary()

function(greentea_print_configuration_summary)
  greentea_extract_greentea_libdnn_version()
  set(GREENTEA_VERSION ${GREENTEA_VERSION} PARENT_SCOPE)

  greentea_merge_flag_lists(__flags_rel CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS)
  greentea_merge_flag_lists(__flags_deb CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS)

  greentea_status("")
  greentea_status("******************* Greentea LibDNN Configuration Summary *******************")
  greentea_status("General:")
  greentea_status("  Version           : ${GREENTEA_TARGET_VERSION}")
  greentea_status("  Git               : ${GREENTEA_GIT_VERSION}")
  greentea_status("  System            : ${CMAKE_SYSTEM_NAME}")
  greentea_status("  C++ compiler      : ${CMAKE_CXX_COMPILER}")
  greentea_status("  Release CXX flags : ${__flags_rel}")
  greentea_status("  Debug CXX flags   : ${__flags_deb}")
  greentea_status("  Build type        : ${CMAKE_BUILD_TYPE}")
  greentea_status("")
  greentea_status("  BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}")
  greentea_status("")
  greentea_status("Dependencies:")
  greentea_status("  OpenCL            : " HAVE_OPENCL THEN "Yes" ELSE "No")
  greentea_status("  ViennaCL          : " HAVE_VIENNACL THEN "Yes" ELSE "No")
  greentea_status("  CUDA              : " HAVE_CUDA THEN "Yes (ver. ${CUDA_VERSION})" ELSE "No")
  greentea_status("")
  greentea_status("Install:")
  greentea_status("  Install path      : ${CMAKE_INSTALL_PREFIX}")
  greentea_status("")
endfunction()
--------------------------------------------------------------------------------
/cmake/Targets.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Defines the global GREENTEA_LINK flag. This flag is required to prevent the linker from
# excluding objects which are not addressed directly but are registered via static constructors
macro(greentea_set_greentea_link)
  if(BUILD_SHARED_LIBS)
    set(GREENTEA_LINK greentea_libdnn)
  else()
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      set(GREENTEA_LINK -Wl,-force_load greentea_libdnn)
    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      set(GREENTEA_LINK -Wl,--whole-archive greentea_libdnn -Wl,--no-whole-archive)
    endif()
  endif()
endmacro()
################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
#   greentea_source_group(<group> GLOB[_RECURSE] <globbing_expression>)
function(greentea_source_group group)
  cmake_parse_arguments(GREENTEA_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(GREENTEA_SOURCE_GROUP_GLOB)
    file(GLOB srcs1 ${GREENTEA_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${srcs1})
  endif()

  if(GREENTEA_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${GREENTEA_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${srcs2})
  endif()
endfunction()

################################################################################################
# Collecting sources from globbing and appending to output list variable
# Usage:
#   greentea_collect_sources(<output_variable> GLOB[_RECURSE] <globbing_expression>)
function(greentea_collect_sources variable)
  cmake_parse_arguments(GREENTEA_COLLECT_SOURCES "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(GREENTEA_COLLECT_SOURCES_GLOB)
    file(GLOB srcs1 ${GREENTEA_COLLECT_SOURCES_GLOB})
    set(${variable} ${${variable}} ${srcs1})
  endif()

  if(GREENTEA_COLLECT_SOURCES_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${GREENTEA_COLLECT_SOURCES_GLOB_RECURSE})
    set(${variable} ${${variable}} ${srcs2})
  endif()
endfunction()

################################################################################################
# Short command for getting greentea sources (assuming standard Greentea code tree)
# Usage:
#   greentea_pickup_greentea_sources(<root>)
function(greentea_pickup_greentea_sources root)
  # put all files in source groups (visible as subfolders in many IDEs)
  greentea_source_group("Include" GLOB "${root}/include/*.h*")
  greentea_source_group("Include" GLOB "${PROJECT_BINARY_DIR}/greentea_config.h*")
  greentea_source_group("Source" GLOB "${root}/src/*.cpp")


  # collect files
  file(GLOB_RECURSE hdrs ${root}/include/*.h*)
  file(GLOB_RECURSE srcs ${root}/src/*.cpp)

  # adding headers to make them visible in some IDEs (Qt, VS, Xcode)
  list(APPEND srcs ${hdrs} ${PROJECT_BINARY_DIR}/greentea_libdnn_config.h)
  list(APPEND test_srcs ${test_hdrs})

  # convert to absolute paths
  greentea_convert_absolute_paths(srcs)

  # propagate to parent scope
  set(srcs ${srcs} PARENT_SCOPE)
endfunction()

################################################################################################
# Short command for setting default target properties
# Usage:
#   greentea_default_properties(<target>)
function(greentea_default_properties target)
  set_target_properties(${target} PROPERTIES
      DEBUG_POSTFIX ${GREENTEA_DEBUG_POSTFIX}
      ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
      LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
      RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
  # make sure we build all external dependencies first
  if (DEFINED external_project_dependencies)
    add_dependencies(${target} ${external_project_dependencies})
  endif()
endfunction()

################################################################################################
# Short command for setting runtime directory for build target
# Usage:
#   greentea_set_runtime_directory(<target> <dir>)
function(greentea_set_runtime_directory target dir)
  set_target_properties(${target} PROPERTIES
      RUNTIME_OUTPUT_DIRECTORY "${dir}")
endfunction()

################################################################################################
# Short command for setting solution folder property for target
# Usage:
#   greentea_set_solution_folder(<target> <folder>)
function(greentea_set_solution_folder target folder)
  if(USE_PROJECT_FOLDERS)
    set_target_properties(${target} PROPERTIES FOLDER "${folder}")
  endif()
endfunction()

################################################################################################
# Reads lines from input file, prepends source directory to each line and writes to output file
# Usage:
#   greentea_configure_testdatafile(<file>)
function(greentea_configure_testdatafile file)
  file(STRINGS ${file} __lines)
  set(result "")
  foreach(line ${__lines})
    set(result "${result}${PROJECT_SOURCE_DIR}/${line}\n")
  endforeach()
  file(WRITE ${file}.gen.cmake ${result})
endfunction()

--------------------------------------------------------------------------------
/src/benchmark.cpp:
--------------------------------------------------------------------------------
#include "benchmark.hpp"
#include "common.hpp"
#include "device.hpp"

namespace greentea {

Timer::Timer(device* dev_ptr)
    : dev_ptr_(dev_ptr)
    , initted_(false)
    , running_(false)
    , has_run_at_least_once_(false) { Init(); }

Timer::~Timer() {
#ifdef USE_CUDA
  if (dev_ptr_->backend() == BACKEND_CUDA) {
    (cudaEventDestroy(start_gpu_cuda_));
    (cudaEventDestroy(stop_gpu_cuda_));
  }
#endif  // USE_CUDA
#ifdef USE_OPENCL
  if (dev_ptr_->backend() == BACKEND_OpenCL) {
    clWaitForEvents(1, &start_gpu_cl_);
    clWaitForEvents(1, &stop_gpu_cl_);
    clReleaseEvent(start_gpu_cl_);
    clReleaseEvent(stop_gpu_cl_);
  }
#endif  // USE_OPENCL
}

void Timer::Start() {
  if (!running()) {
#ifdef USE_CUDA
    if (dev_ptr_->backend() == BACKEND_CUDA) {
      (cudaEventRecord(start_gpu_cuda_, 0));
    }
#endif  // USE_CUDA
#ifdef USE_OPENCL
    if (dev_ptr_->backend() == BACKEND_OpenCL) {
      clWaitForEvents(1, &start_gpu_cl_);
      clReleaseEvent(start_gpu_cl_);
      viennacl::ocl::context &ctx = viennacl::ocl::get_context(
          dev_ptr_->id());
      viennacl::ocl::program &program = dev_ptr_->program();
      viennacl::ocl::kernel &kernel = program.get_kernel("null_kernel_float");
      // TODO(naibaf7): the compiler shows a deprecated-declaration warning;
      // use `clEnqueueNDRangeKernel` instead
      // https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/clEnqueueTask.html
      clEnqueueTask(ctx.get_queue().handle().get(), kernel.handle().get(), 0,
                    NULL, &start_gpu_cl_);
      clFinish(ctx.get_queue().handle().get());
    }
#endif
    running_ = true;
    has_run_at_least_once_ = true;
  }
}

void Timer::Stop() {
  if (running()) {
#ifdef USE_CUDA
    if (dev_ptr_->backend() == BACKEND_CUDA) {
      (cudaEventRecord(stop_gpu_cuda_, 0));
      (cudaEventSynchronize(stop_gpu_cuda_));
    }
#endif  // USE_CUDA
#ifdef USE_OPENCL
    if (dev_ptr_->backend() == BACKEND_OpenCL) {
      clWaitForEvents(1, &stop_gpu_cl_);
      clReleaseEvent(stop_gpu_cl_);
      viennacl::ocl::context &ctx = viennacl::ocl::get_context(
          dev_ptr_->id());
      viennacl::ocl::program &program = dev_ptr_->program();
      viennacl::ocl::kernel &kernel = program.get_kernel("null_kernel_float");
      clEnqueueTask(ctx.get_queue().handle().get(), kernel.handle().get(), 0,
                    NULL, &stop_gpu_cl_);
      clFinish(ctx.get_queue().handle().get());
    }
#endif
    running_ = false;
  }
}

float Timer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (dev_ptr_->backend() == BACKEND_CUDA) {
    (cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_cuda_,
                          stop_gpu_cuda_));
    // CUDA only measures milliseconds
    elapsed_microseconds_ = elapsed_milliseconds_ * 1000;
  }
#endif  // USE_CUDA
#ifdef USE_OPENCL
  if (dev_ptr_->backend() == BACKEND_OpenCL) {
    cl_ulong startTime = 0, stopTime = 0;
    clWaitForEvents(1, &stop_gpu_cl_);
    clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
                            sizeof startTime, &startTime, NULL);
    clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
                            sizeof stopTime, &stopTime, NULL);
    double us = static_cast<double>(stopTime - startTime) / 1000.0;
    elapsed_microseconds_ = static_cast<float>(us);
  }
#endif
  return elapsed_microseconds_;
}

float Timer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
#ifdef USE_CUDA
  if (dev_ptr_->backend() == BACKEND_CUDA) {
    (cudaEventElapsedTime(&elapsed_milliseconds_, start_gpu_cuda_,
                          stop_gpu_cuda_));
  }
#endif  // USE_CUDA
#ifdef USE_OPENCL
  if (dev_ptr_->backend() == BACKEND_OpenCL) {
    cl_ulong startTime = 0, stopTime = 0;
    clGetEventProfilingInfo(start_gpu_cl_, CL_PROFILING_COMMAND_END,
                            sizeof startTime, &startTime, NULL);
    clGetEventProfilingInfo(stop_gpu_cl_, CL_PROFILING_COMMAND_START,
                            sizeof stopTime, &stopTime, NULL);
    double ms = static_cast<double>(stopTime - startTime) / 1000000.0;
    elapsed_milliseconds_ = static_cast<float>(ms);
  }
#endif
  return elapsed_milliseconds_;
}

float Timer::Seconds() {
  return MilliSeconds() / 1000.;
}

void Timer::Init() {
  if (!initted()) {
#ifdef USE_CUDA
    if (dev_ptr_->backend() == BACKEND_CUDA) {
      (cudaEventCreate(&start_gpu_cuda_));
      (cudaEventCreate(&stop_gpu_cuda_));
    }
#endif  // USE_CUDA
#ifdef USE_OPENCL
    if (dev_ptr_->backend() == BACKEND_OpenCL) {
      start_gpu_cl_ = 0;
      stop_gpu_cl_ = 0;
    }
#endif
    initted_ = true;
  }
}

CPUTimer::CPUTimer(device* dev_ptr) : Timer(dev_ptr) {
  this->initted_ = true;
  this->running_ = false;
  this->has_run_at_least_once_ = false;
}

void CPUTimer::Start() {
  if (!running()) {
    this->start_cpu_ = std::chrono::high_resolution_clock::now();
    this->running_ = true;
    this->has_run_at_least_once_ = true;
  }
}

void CPUTimer::Stop() {
  if (running()) {
    this->stop_cpu_ = std::chrono::high_resolution_clock::now();
    this->running_ = false;
  }
}

float CPUTimer::MilliSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
  this->elapsed_milliseconds_ =
      std::chrono::duration_cast<std::chrono::milliseconds>(
          this->stop_cpu_ - this->start_cpu_).count();
  return this->elapsed_milliseconds_;
}

float CPUTimer::MicroSeconds() {
  if (!has_run_at_least_once()) {
    return 0;
  }
  if (running()) {
    Stop();
  }
  this->elapsed_microseconds_ =
      std::chrono::duration_cast<std::chrono::microseconds>(
          this->stop_cpu_ - this->start_cpu_).count();
  return this->elapsed_microseconds_;
}

}  // namespace greentea
--------------------------------------------------------------------------------
/include/libdnn_tuner.hpp:
--------------------------------------------------------------------------------
#ifndef GREENTEA_LIBDNN_TUNER_HPP_
#define GREENTEA_LIBDNN_TUNER_HPP_

#include <functional>
#include <map>
#include <memory>
#include <queue>
#include <string>
#include <type_traits>
#include <vector>

#include "common.hpp"

namespace greentea {

typedef enum {
  LIBDNN_TUNER_METHOD_ALL = 0,
  LIBDNN_TUNER_METHOD_ANNEALING = 1,
} libdnnTunerMethod_t;

typedef enum {
  LIBDNN_TUNER_PARAM_STAT_OK = 0,
  LIBDNN_TUNER_PARAM_STAT_OVERFLOW = 1,
  LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION = 2,
} libdnnTunerParamStatus_t;

class LibDNNTuner;

class LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraint(LibDNNTuner* tuner,
                        std::vector<std::string> con_params,
                        std::vector<std::string> con_adapt) :
    tuner_(tuner), con_params_(con_params), con_adapt_(con_adapt) {
  }
  virtual bool evaluate() = 0;
 protected:
  LibDNNTuner* tuner_;
  std::vector<std::string> con_params_;
  std::vector<std::string> con_adapt_;
};

class LibDNNTunerConstraintBool : public LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraintBool(LibDNNTuner* tuner,
                            std::vector<std::string> con_params,
                            std::vector<std::string> con_adapt,
                            std::function<bool(std::vector<bool>)> func) :
    LibDNNTunerConstraint(tuner, con_params, con_adapt),
    func_(func) {
  }
  bool evaluate();
 protected:
  std::function<bool(std::vector<bool>)> func_;
};

class LibDNNTunerConstraintReal : public LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraintReal(LibDNNTuner* tuner,
                            std::vector<std::string> con_params,
                            std::vector<std::string> con_adapt,
                            std::function<bool(std::vector<double>)> func) :
    LibDNNTunerConstraint(tuner, con_params, con_adapt),
    func_(func) {
  }
  bool evaluate();
 protected:
  std::function<bool(std::vector<double>)> func_;
};
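
// A sketch of the kind of predicate these constraint classes wrap
// (illustrative only):
//   [](std::vector<int64_t> args) -> bool {
//     return args[0] % args[1] == 0;  // e.g. tile size divides block size
//   }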

class LibDNNTunerConstraintInt : public LibDNNTunerConstraint {
 public:
  LibDNNTunerConstraintInt(LibDNNTuner* tuner,
                           std::vector<std::string> con_params,
                           std::vector<std::string> con_adapt,
                           std::function<bool(std::vector<int64_t>)> func) :
    LibDNNTunerConstraint(tuner, con_params, con_adapt),
    func_(func) {
  }
  bool evaluate();
 protected:
  std::function<bool(std::vector<int64_t>)> func_;
};

class LibDNNTunerParam {
 public:
  LibDNNTunerParam(LibDNNTuner* tuner, std::string name, int_tp def_idx) :
    constraints_(), tuner_(tuner), name_(name),
    curr_idx_(def_idx), def_idx_(def_idx)
  {}
  LibDNNTunerParam(LibDNNTuner* tuner, LibDNNTunerParam& other) :  // NOLINT
    constraints_(other.constraints_), tuner_(tuner),
    name_(other.name_), curr_idx_(other.curr_idx_), def_idx_(other.def_idx_)
  {}

  virtual int_tp count_values() = 0;
  virtual std::shared_ptr<LibDNNTunerParam> clone() = 0;

  std::string get_name();

  libdnnTunerParamStatus_t advance(int_tp offset);

  int_tp get_curr_idx();
  int_tp get_def_idx();
  void set_curr_idx(int_tp curr_idx);
  void set_def_idx(int_tp def_idx);
  void update(std::shared_ptr<LibDNNTunerParam> other);
  void add_constraint(std::shared_ptr<LibDNNTunerConstraint> constraint);

 protected:
  LibDNNTuner* tuner_;
  std::string name_;
  int_tp curr_idx_;
  int_tp def_idx_;
  std::vector<std::shared_ptr<LibDNNTunerConstraint>> constraints_;
};

class LibDNNTunerParamInt: public LibDNNTunerParam {
 public:
  LibDNNTunerParamInt(LibDNNTuner* tuner,
                      std::string name, std::vector<int64_t> values,
                      int_tp def_idx) :
    LibDNNTunerParam(tuner, name, def_idx) {
    values_ = values;
  }
  LibDNNTunerParamInt(LibDNNTunerParamInt& other) :  // NOLINT
    LibDNNTunerParam(other), values_(other.values_) {
  }
  int64_t get_value();
  const std::vector<int64_t>& get_values();
  int_tp count_values();
  std::shared_ptr<LibDNNTunerParam> clone();
 protected:
  std::vector<int64_t> values_;
};

class LibDNNTunerParamBool: public LibDNNTunerParam {
 public:
  LibDNNTunerParamBool(LibDNNTuner* tuner,
                       std::string name, std::vector<bool> values,
                       int_tp def_idx) :
    LibDNNTunerParam(tuner, name, def_idx) {
    values_ = values;
  }
  LibDNNTunerParamBool(LibDNNTunerParamBool& other) :  // NOLINT
    LibDNNTunerParam(other), values_(other.values_) {
  }
  bool get_value();
  const std::vector<bool>& get_values();
  int_tp count_values();
  virtual std::shared_ptr<LibDNNTunerParam> clone();
 protected:
  std::vector<bool> values_;
};

class LibDNNTunerParamReal: public LibDNNTunerParam {
 public:
  LibDNNTunerParamReal(LibDNNTuner* tuner,
                       std::string name, std::vector<double> values,
                       int_tp def_idx) :
    LibDNNTunerParam(tuner, name, def_idx) {
    values_ = values;
  }
  LibDNNTunerParamReal(LibDNNTunerParamReal& other) :  // NOLINT
    LibDNNTunerParam(other), values_(other.values_) {
  }
  double get_value();
  const std::vector<double>& get_values();
  int_tp count_values();
  virtual std::shared_ptr<LibDNNTunerParam> clone();
 protected:
  std::vector<double> values_;
};



class LibDNNTunerSnapshot {
 public:
  LibDNNTunerSnapshot(double score,
                      std::vector<std::shared_ptr<LibDNNTunerParam>>* params) :
    score_(score) {
    for (int i = 0; i < params->size(); ++i) {
      std::shared_ptr<LibDNNTunerParam> param((*params)[i]->clone());
      params_.push_back(param);
    }
  }
  double get_score();
  std::vector<std::shared_ptr<LibDNNTunerParam>>* get_params();
 protected:
  double score_;
  std::vector<std::shared_ptr<LibDNNTunerParam>> params_;
};

class LibDNNTunerSnapshotCompare {
 public:
  explicit LibDNNTunerSnapshotCompare(const bool& revparam = false)
  { reverse_ = revparam; }
  bool operator() (std::shared_ptr<LibDNNTunerSnapshot>& lhs,  // NOLINT
                   std::shared_ptr<LibDNNTunerSnapshot>& rhs) const {  // NOLINT
    if (reverse_)
      return (lhs->get_score() > rhs->get_score());
    else
      return (lhs->get_score() < rhs->get_score());
  }
 private:
  bool reverse_;
};
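
// A usage sketch of the tuner facade below (names and routines are
// illustrative; real setup/benchmark routines are supplied by the
// LibDNN kernel generators):
//
//   LibDNNTuner tuner;
//   tuner.add_range_param<int64_t>("workgroup_x", 8, 4, 16, 4);
//   tuner.set_benchmark_routine([&]() -> double { return run_kernel_ms(); });
//   tuner.Tune(LIBDNN_TUNER_METHOD_ANNEALING);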
206 |
207 | class LibDNNTuner {
208 |  public:
209 |   explicit LibDNNTuner() :
210 |     constraints_(), params_() {
211 |   }
212 |
213 |   void Tune(libdnnTunerMethod_t method);
214 |
215 |   std::string Serialize();
216 |
217 |   void Restore(std::string json);
218 |
219 |   void Snapshot(double score);
220 |   void RestoreSnapshot(std::shared_ptr<LibDNNTunerSnapshot> snapshot);
221 |
222 |   void set_setup_routine(std::function<bool()> fun);
223 |
224 |   void set_benchmark_routine(std::function<double()> fun);
225 |
226 |   void add_boolean_param(std::string name, bool def_value, bool inverse);
227 |   void add_boolean_param(const char* name, bool def_value, bool inverse);
228 |
229 |   template<class T>
230 |   void add_range_param(std::string name, T def_value, T min, T max, T step);
231 |   template<class T>
232 |   void add_range_param(const char* name, T def_value, T min, T max, T step);
233 |
234 |   template<class T>
235 |   void add_set_param(std::string name, T def_value, std::vector<T> values);
236 |   template<class T>
237 |   void add_set_param(const char* name, T def_value, std::vector<T> values);
238 |
239 |   template<class T>
240 |   void add_constraint(std::vector<std::string> con_params,
241 |                       std::vector<std::string> con_adapt,
242 |                       std::function<bool(std::vector<T>)> con_func);
243 |
244 |   template<class T>
245 |   void add_constraint(std::vector<const char*> con_params,
246 |                       std::vector<const char*> con_adapt,
247 |                       std::function<bool(std::vector<T>)> con_func);
248 |
249 |   template<class T>
250 |   void add_constraint(std::vector<std::string> con_params,
251 |                       std::vector<const char*> con_adapt,
252 |                       std::function<bool(std::vector<T>)> con_func);
253 |
254 |
255 |   template<class T>
256 |   void add_constraint(std::vector<const char*> con_params,
257 |                       std::vector<std::string> con_adapt,
258 |                       std::function<bool(std::vector<T>)> con_func);
259 |
260 |   template<class T>
261 |   T get_param(std::string name);
262 |   template<class T>
263 |   T get_param(const char* name);
264 |
265 |  protected:
266 |   void snapshot();
267 |
268 |  private:
269 |   std::function<bool()> setup_routine_;
270 |   std::function<double()> benchmark_routine_;
271 |
272 |   std::priority_queue<std::shared_ptr<LibDNNTunerSnapshot>,
273 |                       std::vector<std::shared_ptr<LibDNNTunerSnapshot>>,
274 |                       LibDNNTunerSnapshotCompare> snapshot_queue_;
275 |
276 |   std::vector<std::shared_ptr<LibDNNTunerSnapshot>> snapshots_;
277 |
278 |   std::vector<std::shared_ptr<LibDNNTunerConstraint> > constraints_;
279 |   std::vector<std::shared_ptr<LibDNNTunerParam> > params_;
280 |   std::map<std::string, std::shared_ptr<LibDNNTunerParam>> param_map_;
281 | };
282 |
283 | }  // namespace greentea
284 |
285 | #endif  // GREENTEA_TUNER_HPP_
286 |
-------------------------------------------------------------------------------- /include/libdnn.hpp: --------------------------------------------------------------------------------
1 | #ifndef GREENTEA_LIBDNN_HPP_
2 | #define GREENTEA_LIBDNN_HPP_
3 |
4 | #include <functional>
5 | #include <memory>
6 | #include <string>
7 | #include <vector>
8 |
9 | #include "device.hpp"
10 | #include "libdnn_tuner.hpp"
11 |
12 | #ifdef USE_OPENCL
13 | #include "viennacl/backend/opencl.hpp"
14 | #include "viennacl/ocl/backend.hpp"
15 | #include "viennacl/ocl/context.hpp"
16 | #include "viennacl/ocl/device.hpp"
17 | #include "viennacl/ocl/platform.hpp"
18 | #endif  // USE_OPENCL
19 |
20 | #ifdef USE_CUDA
21 | #include <cuda_runtime.h>
22 | #include "cuda.h"
23 | #include "nvrtc.h"
24 | #endif  // USE_CUDA
25 |
26 | namespace greentea {
27 |
28 | typedef enum {
29 |   // Stack the batch update into one GEMM block
30 |   // (deterministic, 1 kernel call)
31 |   // Serializes the batch and may therefore underuse
32 |   // the GPU's compute units.
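// ---------------------------------------------------------------------------
// Editor's note (illustration only, not part of the original source): choosing
// between the weight-gradient algorithms below is a determinism/speed
// trade-off. A sketch, using the LibDNNConvConfig struct declared later in
// this header:
//
//   LibDNNConvConfig config;
//   config.wgalgo = LIBDNN_CONVOLUTION_WG_ALGO_DIRECT;  // reproducible gradients
//   // or, on devices with working global atomics:
//   config.wgalgo = LIBDNN_CONVOLUTION_WG_ALGO_ATOMIC;  // faster, non-deterministic
// ---------------------------------------------------------------------------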
33 |   LIBDNN_CONVOLUTION_WG_ALGO_DIRECT = 0,
34 |   // Use multiple GEMM blocks in parallel and update weights atomically
35 |   // (non-deterministic, 1 kernel call, not supported on all devices)
36 |   // Parallelizes the batch and therefore achieves higher GPU utilization.
37 |   LIBDNN_CONVOLUTION_WG_ALGO_ATOMIC = 1,
38 |   // Use multiple GEMM blocks and an intermediate buffer
39 |   // to reduce weight updates
40 |   // (deterministic, >= 2 kernel calls)
41 |   // Parallelizes the batch and therefore achieves higher GPU utilization.
42 |   // NOT IMPLEMENTED YET
43 |   LIBDNN_CONVOLUTION_WG_ALGO_REDUCTION = 2
44 | } libdnnConvolutionWeightAlgo_t;
45 |
46 | typedef enum {
47 |   // Transform data before GEMM (load, im2col, gemm, store)
48 |   // This method is suitable for convolutions with similar
49 |   // spatial input == output sizes, but can become inefficient
50 |   // if input >> output (with large strides and kernels).
51 |   LIBDNN_CONVOLUTION_BW_ALGO_IM2COL = 0,
52 |   // Transform data after GEMM (load, gemm, col2im, store)
53 |   // Sometimes faster than the im2col method, but uses
54 |   // atomic operations and is not deterministic.
55 |   LIBDNN_CONVOLUTION_BW_ALGO_COL2IM_ATOMIC = 1
56 | } libdnnConvolutionBackwardAlgo_t;
57 |
58 | typedef enum {
59 |   LIBDNN_POOLING_METHOD_MAX = 0,
60 |   LIBDNN_POOLING_METHOD_AVE = 1,
61 |   LIBDNN_POOLING_METHOD_STO = 2
62 | } libdnnPoolingMethod_t;
63 |
64 | typedef enum {
65 |   LIBDNN_POOLING_BW_ALGO_DIRECT = 0,
66 |   LIBDNN_POOLING_BW_ALGO_ATOMIC = 1
67 | } libdnnPoolingBackwardAlgo_t;
68 |
69 | struct LibDNNConvConfig {
70 |   LibDNNConvConfig() :
71 |     in_shape(3, 1),
72 |     out_shape(3, 1),
73 |     kernel(1, 1),
74 |     pad(0, 0),
75 |     stride(1, 1),
76 |     dilation(1, 1)
77 |   {}
78 |   device* dev_ptr = nullptr;
79 |   std::vector<int_tp> in_shape;
80 |   std::vector<int_tp> out_shape;
81 |   std::vector<int_tp> kernel;
82 |   std::vector<int_tp> pad;
83 |   std::vector<int_tp> stride;
84 |   std::vector<int_tp> dilation;
85 |   int_tp group = 1;
86 |   bool bias_term = false;
87 |   bool fast_unsafe_math = false;
88 |   bool weights_backward = true;
89 |   bool bias_backward = true;
90 |   libdnnConvolutionWeightAlgo_t wgalgo =
91 |       LIBDNN_CONVOLUTION_WG_ALGO_ATOMIC;
92 |   libdnnConvolutionBackwardAlgo_t bwalgo =
93 |       LIBDNN_CONVOLUTION_BW_ALGO_COL2IM_ATOMIC;
94 |   std::function<void(void**, uint_tp, int_tp)>
95 |       memory_allocator = nullptr;
96 | };
97 |
98 | template<typename Dtype>
99 | class LibDNN {
100 |  protected:
101 |   explicit LibDNN();
102 |   virtual void GenerateKernels() = 0;
103 |   virtual std::string string_identifier() = 0;
104 |   std::string generate_header();
105 |   std::string generate_common_defs();
106 |   bool CompileKernels();
107 |   void AllocateMemory(void** ptr, uint_tp size, int_tp flags);
108 |   void SetMemory(Dtype* memory, int_tp count, int_tp offset, Dtype value);
109 | #ifdef USE_OPENCL
110 |   viennacl::ocl::program CompileKernelsOpenCL(viennacl::ocl::context *ctx);
111 | #endif  // USE_OPENCL
112 | #ifdef USE_CUDA
113 |   nvrtcProgram CompileKernelsCuda();
114 | #endif  // USE_CUDA
115 |
116 |   template<class T>
117 |   inline void add_def(std::stringstream& ss,  // NOLINT
118 |                       const char* name, T value) {
119 |     ss << "#ifdef " << name << std::endl;
120 |     ss << "#undef " << name << std::endl;
121 |     ss << "#endif" << std::endl;
122 |     if (std::is_same<T, float>::value) {
123 |       ss << "#define " << name << " (float) " << std::setprecision(32) << value
124 |          << std::endl;
125 |     } else if (std::is_same<T, double>::value) {
126 |       ss << "#define " << name << " (double) " << std::setprecision(32) << value
127 |          << std::endl;
128 |     } else {
129 |       ss << "#define " << name << " " << value << std::endl;
130 |     }
131 |   }
132 |
133 |   template<class T>
134 |   inline void 
add_def(std::stringstream& ss, // NOLINT 135 | const std::string name, T value) { 136 | add_def(ss, name.c_str(), value); 137 | } 138 | 139 | device* dev_ptr_; 140 | 141 | #ifdef USE_OPENCL 142 | viennacl::ocl::program ocl_program_; 143 | #endif // USE_OPENCL 144 | 145 | #ifdef USE_CUDA 146 | nvrtcProgram cuda_program_; 147 | CUmodule cuda_module_; 148 | #endif // USE_CUDA 149 | 150 | std::string kernel_; 151 | 152 | bool fast_unsafe_math_; 153 | }; 154 | 155 | template 156 | class LibDNNConv : public LibDNN { 157 | public: 158 | explicit LibDNNConv(LibDNNConvConfig config); 159 | void Forward(const Dtype* bottom_data, const Dtype* weight, 160 | const Dtype* bias, 161 | Dtype* top_data, int_tp batch_size); 162 | void Backward(bool prop_down_data, bool prop_down_weights, 163 | const Dtype* top_data, const Dtype* top_diff, 164 | const Dtype* weight, Dtype* weight_diff, 165 | const Dtype* bias, Dtype* bias_diff, 166 | const Dtype* bottom_data, Dtype* bottom_diff, 167 | int_tp batch_size); 168 | 169 | void Tune(Dtype* top_data, Dtype* top_diff, 170 | Dtype* weight, Dtype* weight_diff, 171 | Dtype* bias, Dtype* bias_diff, 172 | Dtype* bottom_data, Dtype* bottom_diff, 173 | int_tp batch_size); 174 | 175 | const LibDNNConvConfig get_config(); 176 | 177 | protected: 178 | void GenerateKernels(); 179 | std::string string_identifier(); 180 | std::string generate_fw_defs(); 181 | std::string generate_bw_defs(); 182 | std::string generate_wg_defs(); 183 | std::string generate_gemm_core(std::shared_ptr tuner, 184 | bool dterm); 185 | std::string generate_accreg_init(std::shared_ptr tuner, 186 | bool dterm, bool load); 187 | std::string generate_fw_kernels(std::string name); 188 | std::string generate_bw_kernels(std::string name); 189 | std::string generate_wg_kernels(std::string name); 190 | 191 | private: 192 | LibDNNConvConfig config_; 193 | 194 | // Autotuners 195 | std::shared_ptr fw_tuner_; 196 | std::shared_ptr bw_tuner_; 197 | std::shared_ptr wg_tuner_; 198 | 199 | // Forward GEMM sizes 200 | int_tp M_FW_; 201 | int_tp MG_FW_; 202 | int_tp N_FW_; 203 | int_tp K_FW_; 204 | int_tp KG_FW_; 205 | 206 | // Backward GEMM sizes 207 | int_tp M_BW_; 208 | int_tp MG_BW_; 209 | int_tp N_BW_; 210 | int_tp K_BW_; 211 | int_tp KG_BW_; 212 | 213 | // Weight GEMM sizes 214 | int_tp M_WG_; 215 | int_tp MG_WG_; 216 | int_tp N_WG_; 217 | int_tp NG_WG_; 218 | int_tp K_WG_; 219 | 220 | // Convolution parameters 221 | int_tp num_axes_; 222 | int_tp fmaps_in_; 223 | int_tp fmaps_out_; 224 | int_tp group_; 225 | 226 | std::vector pad_; 227 | std::vector stride_; 228 | std::vector dilation_; 229 | std::vector kernel_shape_; 230 | std::vector im_in_shape_; 231 | std::vector im_out_shape_; 232 | 233 | // Compile and method flags 234 | bool weights_backward_; 235 | bool bias_backward_; 236 | bool bias_term_; 237 | bool skip_range_check_; 238 | Dtype bias_multiplier_; 239 | libdnnConvolutionWeightAlgo_t wgalgo_; 240 | libdnnConvolutionBackwardAlgo_t bwalgo_; 241 | }; 242 | 243 | struct LibDNNPoolConfig { 244 | LibDNNPoolConfig() : 245 | in_shape(3, 1), 246 | out_shape(3, 1), 247 | kernel(1, 1), 248 | pad(0, 0), 249 | stride(1, 1), 250 | dilation(1, 1) 251 | {} 252 | device* dev_ptr = nullptr; 253 | std::vector in_shape; 254 | std::vector out_shape; 255 | std::vector kernel; 256 | std::vector pad; 257 | std::vector stride; 258 | std::vector dilation; 259 | bool use_top_mask = false; 260 | bool fast_unsafe_math = false; 261 | libdnnPoolingMethod_t pool_method = LIBDNN_POOLING_METHOD_MAX; 262 | libdnnPoolingBackwardAlgo_t 
bwalgo = LIBDNN_POOLING_BW_ALGO_ATOMIC;
263 |   bool global_pooling = false;
264 |   std::function<void(void**, uint_tp, int_tp)>
265 |       memory_allocator = nullptr;
266 | };
267 |
268 | template<typename Dtype>
269 | class LibDNNPool : public LibDNN<Dtype> {
270 |  public:
271 |   explicit LibDNNPool(LibDNNPoolConfig config);
272 |   void Forward(const Dtype* bottom_data, Dtype* top_data,
273 |                int_tp channels, int_tp batch_size,
274 |                bool test_mode, int_tp* mask,
275 |                Dtype* top_mask, Dtype* rand_idx);
276 |   void Backward(const Dtype* top_diff, Dtype* bottom_diff,
277 |                 int_tp channels, int_tp batch_size,
278 |                 const int_tp* mask, const Dtype* top_mask,
279 |                 const Dtype* rand_idx);
280 |
281 |   const LibDNNPoolConfig get_config();
282 |
283 |  protected:
284 |   void GenerateKernels();
285 |   std::string string_identifier();
286 |   std::string generate_fw_defs();
287 |   std::string generate_bw_defs();
288 |   std::string generate_fw_kernels(std::string name, bool test_mode);
289 |   std::string generate_fwtr_kernels(std::string name);
290 |   std::string generate_fwte_kernels(std::string name);
291 |   std::string generate_bw_kernels(std::string name);
292 |
293 |  private:
294 |   LibDNNPoolConfig config_;
295 |
296 |   // Autotuners
297 |   std::shared_ptr<LibDNNTuner> fw_tuner_;
298 |   std::shared_ptr<LibDNNTuner> bw_tuner_;
299 |
300 |   // Pooling parameters
301 |   int_tp num_axes_;
302 |
303 |   std::vector<int_tp> pad_;
304 |   std::vector<int_tp> stride_;
305 |   std::vector<int_tp> dilation_;
306 |   std::vector<int_tp> kernel_shape_;
307 |   std::vector<int_tp> im_in_shape_;
308 |   std::vector<int_tp> im_out_shape_;
309 |
310 |   // Working memory for stochastic and max pooling
311 |   int_tp* mask_ = nullptr;
312 |   Dtype* rand_idx_ = nullptr;
313 |
314 |   // Compile and method flags
315 |   bool skip_range_check_;
316 |   libdnnPoolingMethod_t pool_method_;
317 |   libdnnPoolingBackwardAlgo_t bwalgo_;
318 |   bool use_top_mask_;
319 | };
320 |
321 | }  // namespace greentea
322 |
323 | #endif  // GREENTEA_LIBDNN_HPP_
324 |
-------------------------------------------------------------------------------- /src/libdnn.cpp: --------------------------------------------------------------------------------
1 | #include <string>
2 | #include <vector>
3 |
4 | #include "common.hpp"
5 | #include "device.hpp"
6 | #include "libdnn.hpp"
7 | #include "benchmark.hpp"
8 |
9 | namespace greentea {
10 |
11 | template<typename Dtype>
12 | LibDNN<Dtype>::LibDNN() {
13 | }
14 |
15 | template<typename Dtype>
16 | std::string LibDNN<Dtype>::generate_header() {
17 |   std::stringstream ss;
18 |
19 |   if (dev_ptr_->backend() == BACKEND_OpenCL) {
20 |     if (std::is_same<Dtype, double>::value) {
21 |       // Test/enable KHR 64 bit (double)
22 |       ss << "#if defined(cl_khr_fp64)" << std::endl;
23 |       ss << "#pragma OPENCL EXTENSION cl_khr_fp64 : enable" << std::endl;
24 |       ss << "#define DOUBLE_SUPPORT_AVAILABLE" << std::endl;
25 |
26 |       // Test/enable AMD 64 bit (double)
27 |       ss << "#elif defined(cl_amd_fp64)" << std::endl;
28 |       ss << "#pragma OPENCL EXTENSION cl_amd_fp64 : enable" << std::endl;
29 |       ss << "#define DOUBLE_SUPPORT_AVAILABLE" << std::endl;
30 |       ss << "#endif" << std::endl;
31 |     }
32 |
33 |     // Test/enable 32 bit atomics
34 |     ss << "#if defined(cl_khr_int32_base_atomics)" << std::endl;
35 |     ss << "#pragma OPENCL EXTENSION cl_khr_int32_base_atomics : enable"
36 |        << std::endl;
37 |     ss << "#define ATOMICS_32_AVAILABLE" << std::endl;
38 |     ss << "#endif" << std::endl;
39 |     ss << "#if defined(cl_khr_global_int32_base_atomics)" << std::endl;
40 |     ss << "#pragma OPENCL EXTENSION cl_khr_global_int32_base_atomics : enable"
41 |        << std::endl;
42 |     ss << "#define ATOMICS_32_AVAILABLE" << std::endl;
43 |     ss << "#endif" << std::endl;
44 |
45 |     // 64 bit integers
46 |     if (sizeof(int_tp) 
== 8 || std::is_same::value) { 47 | // Test/enable 64 bit atomics 48 | ss << "#if defined(cl_khr_int64_base_atomics)" << std::endl; 49 | ss << "#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable" 50 | << std::endl; 51 | ss << "#define ATOMICS_64_AVAILABLE" << std::endl; 52 | ss << "#endif" << std::endl; 53 | } 54 | } 55 | 56 | if (std::is_same::value) { 57 | ss << "#define Dtype double" << std::endl; 58 | ss << "#define Dtype1 double" << std::endl; 59 | // double2, double4, double8, double16 60 | for (int_tp i = 2; i <= 16; i *= 2) { 61 | ss << "#define Dtype" << i << " double" << i << std::endl; 62 | } 63 | } else { 64 | ss << "#define Dtype float" << std::endl; 65 | ss << "#define Dtype1 float" << std::endl; 66 | // float2, float4, float8, float16 67 | for (int_tp i = 2; i <= 16; i *= 2) { 68 | ss << "#define Dtype" << i << " float" << i << std::endl; 69 | } 70 | } 71 | 72 | std::vector elems4({ 73 | "x", "y", "z", "w" }); 74 | std::vector elems16({ 75 | "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", 76 | "s8", "s9", "sA", "sB", "sC", "sD", "sE", "sF" }); 77 | 78 | for (int_tp i = 1; i <= 16; i *= 2) { 79 | for (int_tp j = 0; j < i; ++j) { 80 | if (i == 1) { 81 | ss << "#define VEC_" << i << "_" << j << "(X)" << " X" << std::endl; 82 | } else if (i < 8) { 83 | ss << "#define VEC_" << i << "_" << j << "(X)" << " X." << elems4[j] 84 | << std::endl; 85 | } else { 86 | ss << "#define VEC_" << i << "_" << j << "(X)" << " X." << elems16[j] 87 | << std::endl; 88 | } 89 | } 90 | } 91 | 92 | if (sizeof(int_tp) == 8) { 93 | ss << "#define int_tp long" << std::endl; 94 | ss << "#define uint_tp unsigned long" << std::endl; 95 | ss << "#define int_tpc long" << std::endl; 96 | ss << "#define uint_tpc unsigned long" << std::endl; 97 | } else { 98 | ss << "#define int_tp int" << std::endl; 99 | ss << "#define uint_tp unsigned int" << std::endl; 100 | ss << "#define int_tpc int" << std::endl; 101 | ss << "#define uint_tpc unsigned int" << std::endl; 102 | } 103 | 104 | if (dev_ptr_->backend() == BACKEND_CUDA) { 105 | // Prepare definitions for OpenCL => CUDA cross compile 106 | // Mainly from: http://www.cedricnugteren.nl/tutorial.php?page=10 107 | ss << "#define __kernel __placeholder__" << std::endl; 108 | ss << "#define __global" << std::endl; 109 | ss << "#define __placeholder__ extern \"C\" __global__" << std::endl; 110 | ss << "#define __local __shared__" << std::endl; 111 | ss << "#define __restricted __restricted__" << std::endl; 112 | ss << "#define barrier(x) __syncthreads()" << std::endl; 113 | 114 | ss << "#define FLT_MIN 1.175494350822287507969e-38f" 115 | << std::endl; 116 | ss << "#define FLT_MAX 340282346638528859811704183484516925440.0f" 117 | << std::endl; 118 | 119 | ss << "__device__ int get_local_id(int x) {" << std::endl; 120 | ss << "if (x == 0) return threadIdx.x;" << std::endl; 121 | ss << "if (x == 1) return threadIdx.y;" << std::endl; 122 | ss << "if (x == 2) return threadIdx.z;" << std::endl; 123 | ss << "return 0;" << std::endl; 124 | ss << "}" << std::endl; 125 | 126 | ss << "__device__ int get_group_id(int x) {" << std::endl; 127 | ss << "if (x == 0) return blockIdx.x;" << std::endl; 128 | ss << "if (x == 1) return blockIdx.y;" << std::endl; 129 | ss << "if (x == 2) return blockIdx.z;" << std::endl; 130 | ss << "return 0;" << std::endl; 131 | ss << "}" << std::endl; 132 | 133 | ss << "__device__ int get_global_id(int x) {" << std::endl; 134 | ss << "if (x == 0) return blockIdx.x * blockDim.x" << " + threadIdx.x;" 135 | << std::endl; 136 | ss << "if (x == 1) return 
blockIdx.y * blockDim.y" << " + threadIdx.y;" 137 | << std::endl; 138 | ss << "if (x == 2) return blockIdx.z * blockDim.z" << " + threadIdx.z;" 139 | << std::endl; 140 | ss << "return 0;" << std::endl; 141 | ss << "}" << std::endl; 142 | 143 | ss << "__device__ int get_global_size(int x) {" << std::endl; 144 | ss << "if (x == 0) return blockDim.x * gridDim.x;" << std::endl; 145 | ss << "if (x == 1) return blockDim.y * gridDim.y;" << std::endl; 146 | ss << "if (x == 2) return blockDim.z * gridDim.z;" << std::endl; 147 | ss << "return 0;" << std::endl; 148 | ss << "}" << std::endl; 149 | } 150 | 151 | std::vector atomic_funcs({ "Add", "Sub", "Mul", "Div" }); 152 | std::vector atomic_ops({ "+", "-", "*", "/" }); 153 | 154 | // Atomic operations 155 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 156 | // OpenCL atomics, derived from: 157 | // https://streamcomputing.eu/blog/2016-02-09/atomic-operations-for-floats-in-opencl-improved/ 158 | if (std::is_same::value) { 159 | ss << "#ifdef ATOMICS_64_AVAILABLE" << std::endl; 160 | } else { 161 | ss << "#ifdef ATOMICS_32_AVAILABLE" << std::endl; 162 | } 163 | for (int i = 0; i < atomic_funcs.size(); ++i) { 164 | ss << "inline void atomic" << atomic_funcs[i]; 165 | ss << "(volatile __global Dtype* source, const Dtype operand) {" 166 | << std::endl; 167 | ss << "union {" << std::endl; 168 | if (std::is_same::value) { 169 | ss << "unsigned long intVal;" << std::endl; 170 | } else { 171 | ss << "unsigned int intVal;" << std::endl; 172 | } 173 | ss << "Dtype floatVal;" << std::endl; 174 | ss << "} next, expected, current;" << std::endl; 175 | ss << "current.floatVal = *source;" << std::endl; 176 | ss << "do {" << std::endl; 177 | ss << "expected.floatVal = current.floatVal;" << std::endl; 178 | ss << "next.floatVal = expected.floatVal " << atomic_ops[i] << " operand;" 179 | << std::endl; 180 | ss << "current.intVal = "; 181 | if (std::is_same::value) { 182 | ss << "atom_cmpxchg((volatile __global unsigned long *)"; 183 | } else { 184 | ss << "atomic_cmpxchg((volatile __global unsigned int *)"; 185 | } 186 | ss << "source, expected.intVal, next.intVal);" << std::endl; 187 | ss << "} while (current.intVal != expected.intVal);" << std::endl; 188 | ss << "}" << std::endl; 189 | } 190 | if (std::is_same::value) { 191 | ss << "#endif" << std::endl; 192 | } else { 193 | ss << "#endif" << std::endl; 194 | } 195 | } 196 | 197 | // Memory set 198 | ss << "__kernel void fill_memory(const int_tp n, const Dtype alpha," 199 | << "__global Dtype* x, const int_tp offx) {" << std::endl; 200 | ss << "for (int_tp index = get_global_id(0); index < n; " 201 | << "index += get_global_size(0)) {" << std::endl; 202 | ss << "x[index + offx] = alpha;" << std::endl; 203 | ss << "}" << std::endl; 204 | ss << "}" << std::endl; 205 | 206 | return ss.str(); 207 | } 208 | 209 | 210 | template 211 | bool LibDNN::CompileKernels() { 212 | std::string code_ext = ""; 213 | 214 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 215 | code_ext = ".cl"; 216 | } 217 | if (dev_ptr_->backend() == BACKEND_CUDA) { 218 | code_ext = ".cu"; 219 | } 220 | 221 | #ifdef LIBDNN_DEBUG 222 | FILE* fp = fopen((".libdnn_debug/" + string_identifier() + code_ext).c_str(), 223 | "wb"); 224 | fwrite(kernel_.c_str(), sizeof(char), kernel_.length(), fp); 225 | fclose(fp); 226 | #endif // LIBDNN_DEBUG 227 | 228 | #ifdef USE_OPENCL 229 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 230 | CompileKernelsOpenCL(&(viennacl::ocl::get_context(dev_ptr_->id()))); 231 | } 232 | #endif // USE_OPENCL 233 | #ifdef USE_CUDA 234 | if 
(dev_ptr_->backend() == BACKEND_CUDA) { 235 | CompileKernelsCuda(); 236 | } 237 | #endif // USE_CUDA 238 | return true; 239 | } 240 | 241 | #ifdef USE_OPENCL 242 | template 243 | viennacl::ocl::program LibDNN::CompileKernelsOpenCL( 244 | viennacl::ocl::context *ctx) { 245 | 246 | std::string build_opts = ""; 247 | 248 | if (fast_unsafe_math_) { 249 | build_opts += "-cl-fast-relaxed-math -cl-mad-enable "; 250 | } 251 | 252 | if (is_same::value) { 253 | build_opts += "-cl-single-precision-constant "; 254 | } 255 | 256 | ctx->build_options(build_opts); 257 | 258 | ocl_program_ = ctx->add_program(kernel_.c_str(), "kernel_program"); 259 | 260 | #ifdef LIBDNN_DEBUG 261 | size_t bin_sz; 262 | clGetProgramInfo(ocl_program_.handle().get(), 263 | CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &bin_sz, NULL); 264 | unsigned char *bin = (unsigned char *)malloc(bin_sz); // NOLINT 265 | clGetProgramInfo(ocl_program_.handle().get(), 266 | CL_PROGRAM_BINARIES, sizeof(unsigned char *), &bin, NULL); 267 | FILE* fp = fopen((".libdnn_debug/" + string_identifier() + ".clptx").c_str(), 268 | "wb"); 269 | fwrite(bin, sizeof(char), bin_sz, fp); 270 | fclose(fp); 271 | free(bin); // NOLINT 272 | #endif // LIBDNN_DEBUG 273 | 274 | return ocl_program_; 275 | } 276 | #endif // USE_OPENCL 277 | 278 | #ifdef USE_CUDA 279 | template 280 | nvrtcProgram LibDNN::CompileKernelsCuda() { 281 | nvrtcCreateProgram(&cuda_program_, kernel_.c_str(), NULL, 0, NULL, NULL); 282 | 283 | std::vector build_opts; 284 | 285 | cudaDeviceProp prop; 286 | cudaGetDeviceProperties(&prop, dev_ptr_->id()); 287 | 288 | std::string arch_opt = "--gpu-architecture=compute_" 289 | + std::to_string(prop.major) + std::to_string(prop.minor); 290 | std::string stdcpp_opt = "--std=c++11"; 291 | std::string fum_opt = "--use_fast_math"; 292 | 293 | build_opts.push_back(arch_opt.c_str()); 294 | build_opts.push_back(stdcpp_opt.c_str()); 295 | if (fast_unsafe_math_) { 296 | build_opts.push_back(fum_opt.c_str()); 297 | } 298 | nvrtcCompileProgram(cuda_program_, build_opts.size(), &build_opts[0]); 299 | 300 | size_t ptxSize; 301 | nvrtcGetPTXSize(cuda_program_, &ptxSize); 302 | char *ptx = new char[ptxSize]; 303 | nvrtcGetPTX(cuda_program_, ptx); 304 | 305 | cuModuleLoadDataEx(&cuda_module_, ptx, 0, 0, 0); 306 | 307 | #ifdef LIBDNN_DEBUG 308 | size_t log_size; 309 | nvrtcGetProgramLogSize(cuda_program_, &log_size); 310 | std::vector log(log_size); 311 | nvrtcGetProgramLog(cuda_program_, log.data()); 312 | 313 | std::cout << "CUDA compile log:" << std::endl; 314 | std::cout << log.data() << std::endl; 315 | 316 | FILE* fp = fopen((".libdnn_debug/" + string_identifier() + ".cuptx").c_str(), 317 | "wb"); 318 | fwrite(ptx, sizeof(char), ptxSize, fp); 319 | fclose(fp); 320 | free(ptx); 321 | #endif // LIBDNN_DEBUG 322 | 323 | return cuda_program_; 324 | } 325 | #endif // USE_CUDA 326 | 327 | template 328 | void LibDNN::AllocateMemory(void** ptr, uint_tp size, int_tp flags) { 329 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 330 | #ifdef USE_OPENCL 331 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_ptr_->id()); 332 | *ptr = (void*)clCreateBuffer(ctx.handle().get(), // NOLINT 333 | flags, 334 | size, nullptr, nullptr); 335 | #endif // USE_OPENCL 336 | } else { 337 | #ifdef USE_CUDA 338 | cudaMalloc(ptr, size); 339 | #endif // USE_CUDA 340 | } 341 | } 342 | 343 | template 344 | void LibDNN::SetMemory(Dtype* memory, int_tp count, int_tp offset, 345 | Dtype value) { 346 | if (dev_ptr_->backend() == BACKEND_OpenCL) { 347 | #ifdef USE_OPENCL 348 | 
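// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the original source): the OpenCL
// branch below launches the "fill_memory" kernel that generate_header() emits.
// The global work size is padded to the next multiple of the work-group size,
// e.g. count = 1000 with wgs = 256 yields ((1000 - 1) / 256 + 1) * 256 = 1024
// work items; the kernel's own bounds check (index < n) skips the excess.
// ---------------------------------------------------------------------------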
viennacl::ocl::kernel &kernel = ocl_program_.get_kernel("fill_memory"); 349 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(dev_ptr_->id()); 350 | 351 | int wgs = dev_ptr_->workgroup_size(0); 352 | 353 | kernel.local_work_size(0, wgs); 354 | kernel.local_work_size(1, 1); 355 | kernel.local_work_size(2, 1); 356 | 357 | kernel.global_work_size(0, ((count - 1) / wgs + 1) * wgs); 358 | kernel.global_work_size(1, 1); 359 | kernel.global_work_size(2, 1); 360 | 361 | viennacl::ocl::enqueue( 362 | kernel(count, value, WrapHandle((cl_mem) memory, &ctx), offset), 363 | ctx.get_queue()); 364 | #endif // USE_OPENCL 365 | } else { 366 | #ifdef USE_CUDA 367 | CUfunction kernel; 368 | cuModuleGetFunction(&kernel, cuda_module_, "fill_memory"); 369 | 370 | void *args[] = { &count, &value, &memory, &offset }; 371 | cuLaunchKernel(kernel, (count + 512 - 1) / 512, // Grid X 372 | 1, // Grid Y 373 | 1, // Grid Z 374 | 512, 1, 1, // Local 375 | 0, NULL, args, 0); // Arguments 376 | #endif // USE_CUDA 377 | } 378 | } 379 | 380 | 381 | template class LibDNN; 382 | template class LibDNN; 383 | 384 | } // namespace greentea 385 | -------------------------------------------------------------------------------- /cmake/Utils.cmake: -------------------------------------------------------------------------------- 1 | ################################################################################################ 2 | # Command alias for debugging messages 3 | # Usage: 4 | # dmsg() 5 | function(dmsg) 6 | message(STATUS ${ARGN}) 7 | endfunction() 8 | 9 | ################################################################################################ 10 | # Removes duplicates from list(s) 11 | # Usage: 12 | # greentea_list_unique( [] [...]) 13 | macro(greentea_list_unique) 14 | foreach(__lst ${ARGN}) 15 | if(${__lst}) 16 | list(REMOVE_DUPLICATES ${__lst}) 17 | endif() 18 | endforeach() 19 | endmacro() 20 | 21 | ################################################################################################ 22 | # Clears variables from list 23 | # Usage: 24 | # greentea_clear_vars() 25 | macro(greentea_clear_vars) 26 | foreach(_var ${ARGN}) 27 | unset(${_var}) 28 | endforeach() 29 | endmacro() 30 | 31 | ################################################################################################ 32 | # Removes duplicates from string 33 | # Usage: 34 | # greentea_string_unique() 35 | function(greentea_string_unique __string) 36 | if(${__string}) 37 | set(__list ${${__string}}) 38 | separate_arguments(__list) 39 | list(REMOVE_DUPLICATES __list) 40 | foreach(__e ${__list}) 41 | set(__str "${__str} ${__e}") 42 | endforeach() 43 | set(${__string} ${__str} PARENT_SCOPE) 44 | endif() 45 | endfunction() 46 | 47 | ################################################################################################ 48 | # Prints list element per line 49 | # Usage: 50 | # greentea_print_list() 51 | function(greentea_print_list) 52 | foreach(e ${ARGN}) 53 | message(STATUS ${e}) 54 | endforeach() 55 | endfunction() 56 | 57 | ################################################################################################ 58 | # Function merging lists of compiler flags to single string. 59 | # Usage: 60 | # greentea_merge_flag_lists(out_variable [] [] ...) 
61 | function(greentea_merge_flag_lists out_var) 62 | set(__result "") 63 | foreach(__list ${ARGN}) 64 | foreach(__flag ${${__list}}) 65 | string(STRIP ${__flag} __flag) 66 | set(__result "${__result} ${__flag}") 67 | endforeach() 68 | endforeach() 69 | string(STRIP ${__result} __result) 70 | set(${out_var} ${__result} PARENT_SCOPE) 71 | endfunction() 72 | 73 | ################################################################################################ 74 | # Converts all paths in list to absolute 75 | # Usage: 76 | # greentea_convert_absolute_paths() 77 | function(greentea_convert_absolute_paths variable) 78 | set(__dlist "") 79 | foreach(__s ${${variable}}) 80 | get_filename_component(__abspath ${__s} ABSOLUTE) 81 | list(APPEND __list ${__abspath}) 82 | endforeach() 83 | set(${variable} ${__list} PARENT_SCOPE) 84 | endfunction() 85 | 86 | ################################################################################################ 87 | # Reads set of version defines from the header file 88 | # Usage: 89 | # greentea_parse_header( ..) 90 | macro(greentea_parse_header FILENAME FILE_VAR) 91 | set(vars_regex "") 92 | set(__parnet_scope OFF) 93 | set(__add_cache OFF) 94 | foreach(name ${ARGN}) 95 | if("${name}" STREQUAL "PARENT_SCOPE") 96 | set(__parnet_scope ON) 97 | elseif("${name}" STREQUAL "CACHE") 98 | set(__add_cache ON) 99 | elseif(vars_regex) 100 | set(vars_regex "${vars_regex}|${name}") 101 | else() 102 | set(vars_regex "${name}") 103 | endif() 104 | endforeach() 105 | if(EXISTS "${FILENAME}") 106 | file(STRINGS "${FILENAME}" ${FILE_VAR} REGEX "#define[ \t]+(${vars_regex})[ \t]+[0-9]+" ) 107 | else() 108 | unset(${FILE_VAR}) 109 | endif() 110 | foreach(name ${ARGN}) 111 | if(NOT "${name}" STREQUAL "PARENT_SCOPE" AND NOT "${name}" STREQUAL "CACHE") 112 | if(${FILE_VAR}) 113 | if(${FILE_VAR} MATCHES ".+[ \t]${name}[ \t]+([0-9]+).*") 114 | string(REGEX REPLACE ".+[ \t]${name}[ \t]+([0-9]+).*" "\\1" ${name} "${${FILE_VAR}}") 115 | else() 116 | set(${name} "") 117 | endif() 118 | if(__add_cache) 119 | set(${name} ${${name}} CACHE INTERNAL "${name} parsed from ${FILENAME}" FORCE) 120 | elseif(__parnet_scope) 121 | set(${name} "${${name}}" PARENT_SCOPE) 122 | endif() 123 | else() 124 | unset(${name} CACHE) 125 | endif() 126 | endif() 127 | endforeach() 128 | endmacro() 129 | 130 | ################################################################################################ 131 | # Reads single version define from the header file and parses it 132 | # Usage: 133 | # greentea_parse_header_single_define( ) 134 | function(greentea_parse_header_single_define LIBNAME HDR_PATH VARNAME) 135 | set(${LIBNAME}_H "") 136 | if(EXISTS "${HDR_PATH}") 137 | file(STRINGS "${HDR_PATH}" ${LIBNAME}_H REGEX "^#define[ \t]+${VARNAME}[ \t]+\"[^\"]*\".*$" LIMIT_COUNT 1) 138 | endif() 139 | 140 | if(${LIBNAME}_H) 141 | string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MAJOR "${${LIBNAME}_H}") 142 | string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_MINOR "${${LIBNAME}_H}") 143 | string(REGEX REPLACE "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.([0-9]+).*$" "\\1" ${LIBNAME}_VERSION_PATCH "${${LIBNAME}_H}") 144 | set(${LIBNAME}_VERSION_MAJOR ${${LIBNAME}_VERSION_MAJOR} ${ARGN} PARENT_SCOPE) 145 | set(${LIBNAME}_VERSION_MINOR ${${LIBNAME}_VERSION_MINOR} ${ARGN} PARENT_SCOPE) 146 | set(${LIBNAME}_VERSION_PATCH ${${LIBNAME}_VERSION_PATCH} ${ARGN} PARENT_SCOPE) 147 | set(${LIBNAME}_VERSION_STRING 
"${${LIBNAME}_VERSION_MAJOR}.${${LIBNAME}_VERSION_MINOR}.${${LIBNAME}_VERSION_PATCH}" PARENT_SCOPE) 148 | 149 | # append a TWEAK version if it exists: 150 | set(${LIBNAME}_VERSION_TWEAK "") 151 | if("${${LIBNAME}_H}" MATCHES "^.*[ \t]${VARNAME}[ \t]+\"[0-9]+\\.[0-9]+\\.[0-9]+\\.([0-9]+).*$") 152 | set(${LIBNAME}_VERSION_TWEAK "${CMAKE_MATCH_1}" ${ARGN} PARENT_SCOPE) 153 | endif() 154 | if(${LIBNAME}_VERSION_TWEAK) 155 | set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}.${${LIBNAME}_VERSION_TWEAK}" ${ARGN} PARENT_SCOPE) 156 | else() 157 | set(${LIBNAME}_VERSION_STRING "${${LIBNAME}_VERSION_STRING}" ${ARGN} PARENT_SCOPE) 158 | endif() 159 | endif() 160 | endfunction() 161 | 162 | ######################################################################################################## 163 | # An option that the user can select. Can accept condition to control when option is available for user. 164 | # Usage: 165 | # greentea_option( "doc string" [IF ]) 166 | function(greentea_option variable description value) 167 | set(__value ${value}) 168 | set(__condition "") 169 | set(__varname "__value") 170 | foreach(arg ${ARGN}) 171 | if(arg STREQUAL "IF" OR arg STREQUAL "if") 172 | set(__varname "__condition") 173 | else() 174 | list(APPEND ${__varname} ${arg}) 175 | endif() 176 | endforeach() 177 | unset(__varname) 178 | if("${__condition}" STREQUAL "") 179 | set(__condition 2 GREATER 1) 180 | endif() 181 | 182 | if(${__condition}) 183 | if("${__value}" MATCHES ";") 184 | if(${__value}) 185 | option(${variable} "${description}" ON) 186 | else() 187 | option(${variable} "${description}" OFF) 188 | endif() 189 | elseif(DEFINED ${__value}) 190 | if(${__value}) 191 | option(${variable} "${description}" ON) 192 | else() 193 | option(${variable} "${description}" OFF) 194 | endif() 195 | else() 196 | option(${variable} "${description}" ${__value}) 197 | endif() 198 | else() 199 | unset(${variable} CACHE) 200 | endif() 201 | endfunction() 202 | 203 | ################################################################################################ 204 | # Utility macro for comparing two lists. Used for CMake debugging purposes 205 | # Usage: 206 | # greentea_compare_lists( [description]) 207 | function(greentea_compare_lists list1 list2 desc) 208 | set(__list1 ${${list1}}) 209 | set(__list2 ${${list2}}) 210 | list(SORT __list1) 211 | list(SORT __list2) 212 | list(LENGTH __list1 __len1) 213 | list(LENGTH __list2 __len2) 214 | 215 | if(NOT ${__len1} EQUAL ${__len2}) 216 | message(FATAL_ERROR "Lists are not equal. ${__len1} != ${__len2}. ${desc}") 217 | endif() 218 | 219 | foreach(__i RANGE 1 ${__len1}) 220 | math(EXPR __index "${__i}- 1") 221 | list(GET __list1 ${__index} __item1) 222 | list(GET __list2 ${__index} __item2) 223 | if(NOT ${__item1} STREQUAL ${__item2}) 224 | message(FATAL_ERROR "Lists are not equal. Differ at element ${__index}. 
${desc}") 225 | endif() 226 | endforeach() 227 | endfunction() 228 | 229 | ################################################################################################ 230 | # Command for disabling warnings for different platforms (see below for gcc and VisualStudio) 231 | # Usage: 232 | # greentea_warnings_disable( -Wshadow /wd4996 ..,) 233 | macro(greentea_warnings_disable) 234 | set(_flag_vars "") 235 | set(_msvc_warnings "") 236 | set(_gxx_warnings "") 237 | 238 | foreach(arg ${ARGN}) 239 | if(arg MATCHES "^CMAKE_") 240 | list(APPEND _flag_vars ${arg}) 241 | elseif(arg MATCHES "^/wd") 242 | list(APPEND _msvc_warnings ${arg}) 243 | elseif(arg MATCHES "^-W") 244 | list(APPEND _gxx_warnings ${arg}) 245 | endif() 246 | endforeach() 247 | 248 | if(NOT _flag_vars) 249 | set(_flag_vars CMAKE_C_FLAGS CMAKE_CXX_FLAGS) 250 | endif() 251 | 252 | if(MSVC AND _msvc_warnings) 253 | foreach(var ${_flag_vars}) 254 | foreach(warning ${_msvc_warnings}) 255 | set(${var} "${${var}} ${warning}") 256 | endforeach() 257 | endforeach() 258 | elseif((CMAKE_COMPILER_IS_GNUCXX OR CMAKE_COMPILER_IS_CLANGXX) AND _gxx_warnings) 259 | foreach(var ${_flag_vars}) 260 | foreach(warning ${_gxx_warnings}) 261 | if(NOT warning MATCHES "^-Wno-") 262 | string(REPLACE "${warning}" "" ${var} "${${var}}") 263 | string(REPLACE "-W" "-Wno-" warning "${warning}") 264 | endif() 265 | set(${var} "${${var}} ${warning}") 266 | endforeach() 267 | endforeach() 268 | endif() 269 | greentea_clear_vars(_flag_vars _msvc_warnings _gxx_warnings) 270 | endmacro() 271 | 272 | ################################################################################################ 273 | # Helper function get current definitions 274 | # Usage: 275 | # greentea_get_current_definitions() 276 | function(greentea_get_current_definitions definitions_var) 277 | get_property(current_definitions DIRECTORY PROPERTY COMPILE_DEFINITIONS) 278 | set(result "") 279 | 280 | foreach(d ${current_definitions}) 281 | list(APPEND result -D${d}) 282 | endforeach() 283 | 284 | greentea_list_unique(result) 285 | set(${definitions_var} ${result} PARENT_SCOPE) 286 | endfunction() 287 | 288 | ################################################################################################ 289 | # Helper function get current includes/definitions 290 | # Usage: 291 | # greentea_get_current_cflags() 292 | function(greentea_get_current_cflags cflags_var) 293 | get_property(current_includes DIRECTORY PROPERTY INCLUDE_DIRECTORIES) 294 | greentea_convert_absolute_paths(current_includes) 295 | greentea_get_current_definitions(cflags) 296 | 297 | foreach(i ${current_includes}) 298 | list(APPEND cflags "-I${i}") 299 | endforeach() 300 | 301 | greentea_list_unique(cflags) 302 | set(${cflags_var} ${cflags} PARENT_SCOPE) 303 | endfunction() 304 | 305 | ################################################################################################ 306 | # Helper function to parse current linker libs into link directories, libflags and osx frameworks 307 | # Usage: 308 | # greentea_parse_linker_libs( ) 309 | function(greentea_parse_linker_libs greentea_LINKER_LIBS_variable folders_var flags_var frameworks_var) 310 | 311 | set(__unspec "") 312 | set(__debug "") 313 | set(__optimized "") 314 | set(__framework "") 315 | set(__varname "__unspec") 316 | 317 | # split libs into debug, optimized, unspecified and frameworks 318 | foreach(list_elem ${${greentea_LINKER_LIBS_variable}}) 319 | if(list_elem STREQUAL "debug") 320 | set(__varname "__debug") 321 | elseif(list_elem STREQUAL "optimized") 322 
| set(__varname "__optimized") 323 | elseif(list_elem MATCHES "^-framework[ \t]+([^ \t].*)") 324 | list(APPEND __framework -framework ${CMAKE_MATCH_1}) 325 | else() 326 | list(APPEND ${__varname} ${list_elem}) 327 | set(__varname "__unspec") 328 | endif() 329 | endforeach() 330 | 331 | # attach debug or optimized libs to unspecified according to current configuration 332 | if(CMAKE_BUILD_TYPE MATCHES "Debug") 333 | set(__libs ${__unspec} ${__debug}) 334 | else() 335 | set(__libs ${__unspec} ${__optimized}) 336 | endif() 337 | 338 | set(libflags "") 339 | set(folders "") 340 | 341 | # convert linker libraries list to link flags 342 | foreach(lib ${__libs}) 343 | if(TARGET ${lib}) 344 | list(APPEND folders $) 345 | list(APPEND libflags -l${lib}) 346 | elseif(lib MATCHES "^-l.*") 347 | list(APPEND libflags ${lib}) 348 | elseif(IS_ABSOLUTE ${lib}) 349 | get_filename_component(folder ${lib} PATH) 350 | get_filename_component(filename ${lib} NAME) 351 | string(REGEX REPLACE "\\.[^.]*$" "" filename_without_shortest_ext ${filename}) 352 | 353 | string(REGEX MATCH "^lib(.*)" __match ${filename_without_shortest_ext}) 354 | list(APPEND libflags -l${CMAKE_MATCH_1}) 355 | list(APPEND folders ${folder}) 356 | else() 357 | message(FATAL_ERROR "Logic error. Need to update cmake script") 358 | endif() 359 | endforeach() 360 | 361 | greentea_list_unique(libflags folders) 362 | 363 | set(${folders_var} ${folders} PARENT_SCOPE) 364 | set(${flags_var} ${libflags} PARENT_SCOPE) 365 | set(${frameworks_var} ${__framework} PARENT_SCOPE) 366 | endfunction() 367 | 368 | ################################################################################################ 369 | # Helper function to detect Darwin version, i.e. 10.8, 10.9, 10.10, .... 370 | # Usage: 371 | # greentea_detect_darwin_version() 372 | function(greentea_detect_darwin_version output_var) 373 | if(APPLE) 374 | execute_process(COMMAND /usr/bin/sw_vers -productVersion 375 | RESULT_VARIABLE __sw_vers OUTPUT_VARIABLE __sw_vers_out 376 | ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE) 377 | 378 | set(${output_var} ${__sw_vers_out} PARENT_SCOPE) 379 | else() 380 | set(${output_var} "" PARENT_SCOPE) 381 | endif() 382 | endfunction() 383 | -------------------------------------------------------------------------------- /src/device.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | 5 | #include "device.hpp" 6 | 7 | namespace greentea { 8 | 9 | #ifdef USE_OPENCL 10 | void device::setupViennaCLContext( 11 | int id, cl_context ctx, 12 | cl_device_id dev, 13 | cl_command_queue queue) { 14 | viennacl::ocl::setup_context(id, ctx, dev, queue); 15 | } 16 | #endif 17 | 18 | device::device() 19 | : current_queue_id_(0) 20 | , workgroup_sizes_(3, 0) 21 | , id_(0) 22 | , list_id_(0) 23 | , backend_(Backend::BACKEND_CPU) 24 | , memory_usage_(0) 25 | , peak_memory_usage_(0) 26 | , host_unified_(false) 27 | , name_(""){} 28 | 29 | device::device(int id, 30 | int list_id, 31 | Backend backend) 32 | : current_queue_id_(0) 33 | , workgroup_sizes_(3, 0) 34 | , id_(id) 35 | , list_id_(list_id) 36 | , backend_(backend) 37 | , memory_usage_(0) 38 | , peak_memory_usage_(0) 39 | , host_unified_(false) 40 | , name_(""){} 41 | 42 | void device::Init() { 43 | #ifndef CPU_ONLY 44 | if (backend_ == BACKEND_CUDA) { 45 | #ifdef USE_CUDA 46 | workgroup_sizes_[0] = CUDA_NUM_THREADS; 47 | workgroup_sizes_[1] = CUDA_NUM_THREADS; 48 | workgroup_sizes_[2] = CUDA_NUM_THREADS; 49 | #endif // USE_CUDA 50 | } else { 51 | 
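// ---------------------------------------------------------------------------
// Editor's note (sketch, not part of the original source): a host application
// typically prepares a device along these lines before passing it to
// LibDNNConv/LibDNNPool through the dev_ptr config field (for OpenCL, a
// ViennaCL context has to be registered first, e.g. via setupViennaCLContext):
//
//   greentea::device dev(0, 0, greentea::Backend::BACKEND_OpenCL);
//   dev.Init();                       // fills workgroup_sizes_ below
//   int wgs = dev.workgroup_size(0);  // max work items in dimension 0
// ---------------------------------------------------------------------------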
#ifdef USE_OPENCL 52 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 53 | 54 | std::vector temp(3); 55 | clGetDeviceInfo(ctx.devices()[0].id(), 56 | CL_DEVICE_MAX_WORK_ITEM_SIZES, 57 | 3 * sizeof(size_t), &temp[0], NULL); 58 | workgroup_sizes_[0] = temp[0]; 59 | workgroup_sizes_[1] = temp[1]; 60 | workgroup_sizes_[2] = temp[2]; 61 | cl_bool host_unified; 62 | clGetDeviceInfo(ctx.devices()[0].id(), 63 | CL_DEVICE_HOST_UNIFIED_MEMORY, 64 | sizeof(cl_bool), &host_unified, NULL); 65 | 66 | host_unified_ = host_unified; 67 | #endif // USE_OPENCL 68 | } 69 | #endif // !CPU_ONLY 70 | } 71 | 72 | Backend device::backend() const { 73 | return backend_; 74 | } 75 | 76 | int device::id() const { 77 | return id_; 78 | } 79 | 80 | int device::list_id() const { 81 | return list_id_; 82 | } 83 | 84 | int device::workgroup_size(int id) { 85 | return workgroup_sizes_[id % 3]; 86 | } 87 | 88 | #ifdef USE_OPENCL 89 | viennacl::ocl::program& device::program() { 90 | return ocl_program_; 91 | } 92 | #endif 93 | 94 | int device::num_queues() { 95 | if (backend_ == BACKEND_CUDA) { 96 | #ifdef USE_CUDA 97 | return 1; 98 | #endif // USE_CUDA 99 | } else { 100 | #ifdef USE_OPENCL 101 | return 1; 102 | #endif // USE_OPENCL 103 | } 104 | return 1; 105 | } 106 | 107 | int device::current_queue_id() { 108 | return current_queue_id_; 109 | } 110 | 111 | void device::SwitchQueue(const int id) { 112 | if (backend_ == BACKEND_CUDA) { 113 | #ifdef USE_CUDA 114 | (void) id; 115 | #endif // USE_CUDA 116 | } else { 117 | #ifdef USE_OPENCL 118 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 119 | ctx.switch_queue(id % num_queues()); 120 | current_queue_id_ = id % num_queues(); 121 | #endif // USE_OPENCL 122 | } 123 | } 124 | 125 | void device::FinishQueues() { 126 | if (backend_ == BACKEND_CUDA) { 127 | #ifdef USE_CUDA 128 | #endif // USE_CUDA 129 | } else { 130 | #ifdef USE_OPENCL 131 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 132 | for (int i = 0; i < num_queues(); ++i) { 133 | ctx.switch_queue(i); 134 | ctx.get_queue().finish(); 135 | } 136 | ctx.switch_queue(0); 137 | current_queue_id_ = 0; 138 | #endif // USE_OPENCL 139 | } 140 | } 141 | 142 | uint_tp device::memory_usage() { 143 | return memory_usage_; 144 | } 145 | 146 | uint_tp device::peak_memory_usage() { 147 | return peak_memory_usage_; 148 | } 149 | 150 | std::string device::name() { 151 | if (name_ == "") { 152 | if (backend_ == BACKEND_OpenCL) { 153 | #ifdef USE_OPENCL 154 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 155 | 156 | size_t size; 157 | size_t max_size = 1024 * 1024; 158 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_NAME, 159 | 0, NULL, &size); 160 | 161 | // Cap at 1 MB to capture faulty OpenCL implementations (nVidia) 162 | std::vector exts(std::min(size, max_size)); 163 | 164 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_NAME, 165 | std::min(size, max_size), &(exts[0]), NULL); 166 | 167 | std::string extsstr(&(exts[0])); 168 | std::replace(extsstr.begin(), extsstr.end(), ' ', '_'); 169 | name_ = extsstr; 170 | #endif // USE_OPENCL 171 | } else { 172 | #ifdef USE_CUDA 173 | cudaDeviceProp prop; 174 | cudaGetDeviceProperties(&prop, id_); 175 | std::string extsstr(&prop.name[0]); 176 | std::replace(extsstr.begin(), extsstr.end(), ' ', '_'); 177 | name_ = extsstr; 178 | #endif // USE_CUDA 179 | } 180 | } 181 | return name_; 182 | } 183 | 184 | void device::IncreaseMemoryUsage(uint_tp bytes) { 185 | memory_usage_ += bytes; 186 | if (memory_usage_ > peak_memory_usage_) { 
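// ---------------------------------------------------------------------------
// Editor's note (illustration, not part of the original source): memory_usage_
// is a running total and peak_memory_usage_ its high-water mark; the branch
// below records a new peak. Callers are expected to pair the helpers around
// allocations, for example:
//
//   dev->IncreaseMemoryUsage(size);  // right after clCreateBuffer()/cudaMalloc()
//   // ... use the buffer ...
//   dev->DecreaseMemoryUsage(size);  // when the buffer is released
// ---------------------------------------------------------------------------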
187 | peak_memory_usage_ = memory_usage_; 188 | } 189 | } 190 | 191 | void device::DecreaseMemoryUsage(uint_tp bytes) { 192 | memory_usage_ -= bytes; 193 | } 194 | 195 | void device::ResetPeakMemoryUsage() { 196 | peak_memory_usage_ = memory_usage_; 197 | } 198 | 199 | bool device::CheckCapability(std::string cap) { 200 | if (backend_ == BACKEND_OpenCL) { 201 | #ifdef USE_OPENCL 202 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 203 | 204 | size_t size; 205 | size_t max_size = 1024 * 1024; 206 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_EXTENSIONS, 207 | 0, NULL, &size); 208 | 209 | // Cap at 1 MB to capture faulty OpenCL implementations (nVidia) 210 | std::vector exts(std::min(size, max_size)); 211 | 212 | clGetDeviceInfo(ctx.devices()[0].id(), CL_DEVICE_EXTENSIONS, 213 | size, &(exts[0]), NULL); 214 | 215 | std::string extsstr(&(exts[0])); 216 | return extsstr.find(cap) != std::string::npos; 217 | #endif 218 | } 219 | return false; 220 | } 221 | 222 | bool device::CheckVendor(std::string vendor) { 223 | if (backend_ == Backend::BACKEND_CUDA) { 224 | if (vendor.compare("NVIDIA") == 0) 225 | return true; 226 | } 227 | #ifdef USE_OPENCL 228 | else if (backend_ == BACKEND_OpenCL) { 229 | viennacl::ocl::context &ctx = viennacl::ocl::get_context(id_); 230 | const viennacl::ocl::device &device = ctx.current_device(); 231 | 232 | if (device.vendor().find(vendor) != std::string::npos) 233 | return true; 234 | } 235 | #endif 236 | 237 | return false; 238 | } 239 | 240 | #ifdef USE_OPENCL 241 | bool device::is_host_unified() { 242 | return host_unified_; 243 | } 244 | 245 | const char* clGetErrorString(cl_int error) { 246 | switch (error) { 247 | case 0: return "CL_SUCCESS"; 248 | case -1: return "CL_DEVICE_NOT_FOUND"; 249 | case -2: return "CL_DEVICE_NOT_AVAILABLE"; 250 | case -3: return "CL_COMPILER_NOT_AVAILABLE"; 251 | case -4: return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; 252 | case -5: return "CL_OUT_OF_RESOURCES"; 253 | case -6: return "CL_OUT_OF_HOST_MEMORY"; 254 | case -7: return "CL_PROFILING_INFO_NOT_AVAILABLE"; 255 | case -8: return "CL_MEM_COPY_OVERLAP"; 256 | case -9: return "CL_IMAGE_FORMAT_MISMATCH"; 257 | case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; 258 | case -11: return "CL_BUILD_PROGRAM_FAILURE"; 259 | case -12: return "CL_MAP_FAILURE"; 260 | case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; 261 | case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; 262 | case -15: return "CL_COMPILE_PROGRAM_FAILURE"; 263 | case -16: return "CL_LINKER_NOT_AVAILABLE"; 264 | case -17: return "CL_LINK_PROGRAM_FAILURE"; 265 | case -18: return "CL_DEVICE_PARTITION_FAILED"; 266 | case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; 267 | case -30: return "CL_INVALID_VALUE"; 268 | case -31: return "CL_INVALID_DEVICE_TYPE"; 269 | case -32: return "CL_INVALID_PLATFORM"; 270 | case -33: return "CL_INVALID_DEVICE"; 271 | case -34: return "CL_INVALID_CONTEXT"; 272 | case -35: return "CL_INVALID_QUEUE_PROPERTIES"; 273 | case -36: return "CL_INVALID_COMMAND_QUEUE"; 274 | case -37: return "CL_INVALID_HOST_PTR"; 275 | case -38: return "CL_INVALID_MEM_OBJECT"; 276 | case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 277 | case -40: return "CL_INVALID_IMAGE_SIZE"; 278 | case -41: return "CL_INVALID_SAMPLER"; 279 | case -42: return "CL_INVALID_BINARY"; 280 | case -43: return "CL_INVALID_BUILD_OPTIONS"; 281 | case -44: return "CL_INVALID_PROGRAM"; 282 | case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; 283 | case -46: return "CL_INVALID_KERNEL_NAME"; 284 | case -47: 
return "CL_INVALID_KERNEL_DEFINITION"; 285 | case -48: return "CL_INVALID_KERNEL"; 286 | case -49: return "CL_INVALID_ARG_INDEX"; 287 | case -50: return "CL_INVALID_ARG_VALUE"; 288 | case -51: return "CL_INVALID_ARG_SIZE"; 289 | case -52: return "CL_INVALID_KERNEL_ARGS"; 290 | case -53: return "CL_INVALID_WORK_DIMENSION"; 291 | case -54: return "CL_INVALID_WORK_GROUP_SIZE"; 292 | case -55: return "CL_INVALID_WORK_ITEM_SIZE"; 293 | case -56: return "CL_INVALID_GLOBAL_OFFSET"; 294 | case -57: return "CL_INVALID_EVENT_WAIT_LIST"; 295 | case -58: return "CL_INVALID_EVENT"; 296 | case -59: return "CL_INVALID_OPERATION"; 297 | case -60: return "CL_INVALID_GL_OBJECT"; 298 | case -61: return "CL_INVALID_BUFFER_SIZE"; 299 | case -62: return "CL_INVALID_MIP_LEVEL"; 300 | case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; 301 | case -64: return "CL_INVALID_PROPERTY"; 302 | case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; 303 | case -66: return "CL_INVALID_COMPILER_OPTIONS"; 304 | case -67: return "CL_INVALID_LINKER_OPTIONS"; 305 | case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; 306 | case -69: return "CL_INVALID_PIPE_SIZE"; 307 | case -70: return "CL_INVALID_DEVICE_QUEUE"; 308 | case -1000: return "CL_INVALID_GL_SHAREGROUP_REFERENCE_KHR"; 309 | case -1001: return "CL_PLATFORM_NOT_FOUND_KHR"; 310 | case -1002: return "CL_INVALID_D3D10_DEVICE_KHR"; 311 | case -1003: return "CL_INVALID_D3D10_RESOURCE_KHR"; 312 | case -1004: return "CL_D3D10_RESOURCE_ALREADY_ACQUIRED_KHR"; 313 | case -1005: return "CL_D3D10_RESOURCE_NOT_ACQUIRED_KHR"; 314 | case -1024: return "clBLAS: Functionality is not implemented"; 315 | case -1023: return "clBLAS: Library is not initialized yet"; 316 | case -1022: return "clBLAS: Matrix A is not a valid memory object"; 317 | case -1021: return "clBLAS: Matrix B is not a valid memory object"; 318 | case -1020: return "clBLAS: Matrix C is not a valid memory object"; 319 | case -1019: return "clBLAS: Vector X is not a valid memory object"; 320 | case -1018: return "clBLAS: Vector Y is not a valid memory object"; 321 | case -1017: return "clBLAS: An input dimension (M:N:K) is invalid"; 322 | case -1016: return "clBLAS: Leading dimension A must not be less than the " 323 | "size of the first dimension"; 324 | case -1015: return "clBLAS: Leading dimension B must not be less than the " 325 | "size of the second dimension"; 326 | case -1014: return "clBLAS: Leading dimension C must not be less than the " 327 | "size of the third dimension"; 328 | case -1013: return "clBLAS: The increment for a vector X must not be 0"; 329 | case -1012: return "clBLAS: The increment for a vector Y must not be 0"; 330 | case -1011: return "clBLAS: The memory object for Matrix A is too small"; 331 | case -1010: return "clBLAS: The memory object for Matrix B is too small"; 332 | case -1009: return "clBLAS: The memory object for Matrix C is too small"; 333 | case -1008: return "clBLAS: The memory object for Vector X is too small"; 334 | case -1007: return "clBLAS: The memory object for Vector Y is too small"; 335 | default: return "Unknown OpenCL error"; 336 | } 337 | } 338 | 339 | #ifdef USE_FFT 340 | const char* clfftGetErrorString(clfftStatus status) { 341 | switch (status) { 342 | case CLFFT_SUCCESS: 343 | return "CLFFT_SUCCESS"; 344 | case CLFFT_INVALID_PLAN: 345 | return "CLFFT_INVALID_PLAN"; 346 | case CLFFT_INVALID_GLOBAL_WORK_SIZE: 347 | return "CLFFT_INVALID_GLOBAL_WORK_SIZE"; 348 | case CLFFT_INVALID_MIP_LEVEL: 349 | return "CLFFT_INVALID_MIP_LEVEL"; 350 | case CLFFT_INVALID_BUFFER_SIZE: 351 | 
return "CLFFT_INVALID_BUFFER_SIZE"; 352 | case CLFFT_INVALID_GL_OBJECT: 353 | return "CLFFT_INVALID_GL_OBJECT"; 354 | case CLFFT_INVALID_OPERATION: 355 | return "CLFFT_INVALID_OPERATION"; 356 | case CLFFT_INVALID_EVENT: 357 | return "CLFFT_INVALID_EVENT"; 358 | case CLFFT_INVALID_EVENT_WAIT_LIST: 359 | return "CLFFT_INVALID_EVENT_WAIT_LIST"; 360 | case CLFFT_INVALID_GLOBAL_OFFSET: 361 | return "CLFFT_INVALID_GLOBAL_OFFSET"; 362 | case CLFFT_INVALID_WORK_ITEM_SIZE: 363 | return "CLFFT_INVALID_WORK_ITEM_SIZE"; 364 | case CLFFT_INVALID_WORK_GROUP_SIZE: 365 | return "CLFFT_INVALID_WORK_GROUP_SIZE"; 366 | case CLFFT_INVALID_WORK_DIMENSION: 367 | return "CLFFT_INVALID_WORK_DIMENSION"; 368 | case CLFFT_INVALID_KERNEL_ARGS: 369 | return "CLFFT_INVALID_KERNEL_ARGS"; 370 | case CLFFT_INVALID_ARG_SIZE: 371 | return "CLFFT_INVALID_ARG_SIZE"; 372 | case CLFFT_INVALID_ARG_VALUE: 373 | return "CLFFT_INVALID_ARG_VALUE"; 374 | case CLFFT_INVALID_ARG_INDEX: 375 | return "CLFFT_INVALID_ARG_INDEX"; 376 | case CLFFT_INVALID_KERNEL: 377 | return "CLFFT_INVALID_KERNEL"; 378 | case CLFFT_INVALID_KERNEL_DEFINITION: 379 | return "CLFFT_INVALID_KERNEL_DEFINITION"; 380 | case CLFFT_INVALID_KERNEL_NAME: 381 | return "CLFFT_INVALID_KERNEL_NAME"; 382 | case CLFFT_INVALID_PROGRAM_EXECUTABLE: 383 | return "CLFFT_INVALID_PROGRAM_EXECUTABLE"; 384 | case CLFFT_INVALID_PROGRAM: 385 | return "CLFFT_INVALID_PROGRAM"; 386 | case CLFFT_INVALID_BUILD_OPTIONS: 387 | return "CLFFT_INVALID_BUILD_OPTIONS"; 388 | case CLFFT_INVALID_BINARY: 389 | return "CLFFT_INVALID_BINARY"; 390 | case CLFFT_INVALID_SAMPLER: 391 | return "CLFFT_INVALID_SAMPLER"; 392 | case CLFFT_INVALID_IMAGE_SIZE: 393 | return "CLFFT_INVALID_IMAGE_SIZE"; 394 | case CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR: 395 | return "CLFFT_INVALID_IMAGE_FORMAT_DESCRIPTOR"; 396 | case CLFFT_INVALID_MEM_OBJECT: 397 | return "CLFFT_INVALID_MEM_OBJECT"; 398 | case CLFFT_INVALID_HOST_PTR: 399 | return "CLFFT_INVALID_HOST_PTR"; 400 | case CLFFT_INVALID_COMMAND_QUEUE: 401 | return "CLFFT_INVALID_COMMAND_QUEUE"; 402 | case CLFFT_INVALID_QUEUE_PROPERTIES: 403 | return "CLFFT_INVALID_QUEUE_PROPERTIES"; 404 | case CLFFT_INVALID_CONTEXT: 405 | return "CLFFT_INVALID_CONTEXT"; 406 | case CLFFT_INVALID_DEVICE: 407 | return "CLFFT_INVALID_DEVICE"; 408 | case CLFFT_INVALID_PLATFORM: 409 | return "CLFFT_INVALID_PLATFORM"; 410 | case CLFFT_INVALID_DEVICE_TYPE: 411 | return "CLFFT_INVALID_DEVICE_TYPE"; 412 | case CLFFT_INVALID_VALUE: 413 | return "CLFFT_INVALID_VALUE"; 414 | case CLFFT_MAP_FAILURE: 415 | return "CLFFT_MAP_FAILURE"; 416 | case CLFFT_BUILD_PROGRAM_FAILURE: 417 | return "CLFFT_BUILD_PROGRAM_FAILURE"; 418 | case CLFFT_IMAGE_FORMAT_NOT_SUPPORTED: 419 | return "CLFFT_IMAGE_FORMAT_NOT_SUPPORTED"; 420 | case CLFFT_IMAGE_FORMAT_MISMATCH: 421 | return "CLFFT_IMAGE_FORMAT_MISMATCH"; 422 | case CLFFT_MEM_COPY_OVERLAP: 423 | return "CLFFT_MEM_COPY_OVERLAP"; 424 | case CLFFT_PROFILING_INFO_NOT_AVAILABLE: 425 | return "CLFFT_PROFILING_INFO_NOT_AVAILABLE"; 426 | case CLFFT_OUT_OF_HOST_MEMORY: 427 | return "CLFFT_OUT_OF_HOST_MEMORY"; 428 | case CLFFT_OUT_OF_RESOURCES: 429 | return "CLFFT_OUT_OF_RESOURCES"; 430 | case CLFFT_MEM_OBJECT_ALLOCATION_FAILURE: 431 | return "CLFFT_MEM_OBJECT_ALLOCATION_FAILURE"; 432 | case CLFFT_COMPILER_NOT_AVAILABLE: 433 | return "CLFFT_COMPILER_NOT_AVAILABLE"; 434 | case CLFFT_DEVICE_NOT_AVAILABLE: 435 | return "CLFFT_DEVICE_NOT_AVAILABLE"; 436 | case CLFFT_DEVICE_NOT_FOUND: 437 | return "CLFFT_DEVICE_NOT_FOUND"; 438 | case CLFFT_BUGCHECK: 439 | return "CLFFT_BUGCHECK"; 440 
| case CLFFT_NOTIMPLEMENTED:
441 |       return "CLFFT_NOTIMPLEMENTED";
442 |     case CLFFT_TRANSPOSED_NOTIMPLEMENTED:
443 |       return "CLFFT_TRANSPOSED_NOTIMPLEMENTED";
444 |     case CLFFT_FILE_NOT_FOUND:
445 |       return "CLFFT_FILE_NOT_FOUND";
446 |     case CLFFT_FILE_CREATE_FAILURE:
447 |       return "CLFFT_FILE_CREATE_FAILURE";
448 |     case CLFFT_VERSION_MISMATCH:
449 |       return "CLFFT_VERSION_MISMATCH";
450 |     case CLFFT_DEVICE_NO_DOUBLE:
451 |       return "CLFFT_DEVICE_NO_DOUBLE";
452 |     case CLFFT_DEVICE_MISMATCH:
453 |       return "CLFFT_DEVICE_MISMATCH";
454 |     default:
455 |       return "CLFFT_UNKNOWN_ERROR";
456 |   }
457 | }
458 | #endif  // USE_FFT
459 |
460 | #endif  // USE_OPENCL
461 |
462 | }  // namespace greentea
463 |
-------------------------------------------------------------------------------- /src/libdnn_tuner.cpp: --------------------------------------------------------------------------------
1 | #include <algorithm>
2 | #include <iostream>
3 | #include <random>
4 | #include <string>
5 | #include <vector>
6 | #include "common.hpp"
7 | #include "device.hpp"
8 | #include "libdnn_tuner.hpp"
9 |
10 | namespace greentea {
11 |
12 | void LibDNNTuner::set_setup_routine(std::function<bool()> fun) {
13 |   this->setup_routine_ = fun;
14 | }
15 |
16 | void LibDNNTuner::set_benchmark_routine(std::function<double()> fun) {
17 |   this->benchmark_routine_ = fun;
18 | }
19 |
20 | void LibDNNTuner::Tune(libdnnTunerMethod_t method) {
21 |   bool setup_success = setup_routine_();
22 |   int_tp current_param = 0;
23 |   double baseline_score = 0;
24 |   double best_score = 0;
25 |   for (int i = 0; i < 5; ++i) {
26 |     baseline_score += benchmark_routine_();
27 |   }
28 |   baseline_score /= 5;
29 |   best_score = baseline_score;
30 |
31 |   if (method == LIBDNN_TUNER_METHOD_ALL) {
32 |     while (true) {
33 |       bool setup_success = setup_routine_();
34 |       if (setup_success) {
35 |         double score = benchmark_routine_();
36 |         if (score > best_score) {
37 |           best_score = score;
38 |         }
39 |         std::cout << "Score: "
40 |                   << (100.0 / baseline_score) * score << "% (best: "
41 |                   << (100.0 / baseline_score) * best_score << "%)" << std::endl;
42 |       }
43 |
44 |       bool overflow = false;
45 |       while (true) {
46 |         overflow = params_[current_param]->advance(1);
47 |         if (overflow) {
48 |           // Parameter is at its default value again;
49 |           // switch to the next parameter.
50 |           ++current_param;
51 |           if (current_param >= params_.size()) {
52 |             // Went through all parameters, stop.
53 |             break;
54 |           }
55 |         } else {
56 |           // The current parameter has changed to a new value, stop.
57 |           break;
58 |         }
59 |       }
60 |       if (current_param >= params_.size()) {
61 |         // Went through all parameters, stop.
62 |         break;
63 |       }
64 |       current_param = 0;
65 |     }
66 |   }
67 |   if (method == LIBDNN_TUNER_METHOD_ANNEALING) {
68 |     double temp = 1.0;
69 |     double temp_min = 0.01;
70 |     double alpha = 0.95;
71 |     double old_score = baseline_score;
72 |
73 |     std::random_device rd;
74 |     std::mt19937 rng(rd());
75 |     std::uniform_int_distribution<int_tp> uni(0, params_.size() - 1);
76 |     std::uniform_int_distribution<int_tp> adv(1, 3);
77 |     std::uniform_int_distribution<int_tp> dir(0, 1);
78 |     std::uniform_real_distribution<double> aprn(0.0, 1.0);
79 |
80 |     // Initial state snapshot
81 |     Snapshot(baseline_score);
82 |
83 |     while (temp > temp_min) {
84 |       for (int i = 0; i < 100; ++i) {
85 |         int next_param = uni(rng);
86 |         libdnnTunerParamStatus_t status;
87 |         while (true) {
88 |           status = params_[next_param]->advance((dir(rng) == 0 ? -1 : 1) * adv(rng));  // sign applies to the whole step
89 |           if (status != LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION) {
90 |             break;
91 |           }
92 |         }
93 |         std::cout << "Changing parameter: " << params_[next_param]->get_name()
94 |                   << ", new index: "
95 |                   << params_[next_param]->get_curr_idx()
134 |
135 | void LibDNNTuner::Snapshot(double score) {
136 | std::shared_ptr<LibDNNTunerSnapshot>
137 | snapshot(new LibDNNTunerSnapshot(score, &params_));
138 | snapshots_.push_back(snapshot);
139 | snapshot_queue_.push(snapshot);
140 | }
141 |
142 | void LibDNNTuner::RestoreSnapshot(
143 | std::shared_ptr<LibDNNTunerSnapshot> snapshot) {
144 | std::vector<std::shared_ptr<LibDNNTunerParam>>* params =
145 | snapshot->get_params();
146 | for (int i = 0; i < params_.size(); ++i) {
147 | params_[i]->update((*params)[i]);
148 | }
149 | }
150 |
151 | template<class T>
152 | void LibDNNTuner::add_range_param(std::string name,
153 | T def_value, T min, T max, T step) {
154 | std::vector<T> values;
155 |
156 | T value = static_cast<T>(def_value);
157 |
158 | T vmin = std::min(max, min);
159 | T vmax = std::max(max, min);
160 |
161 | values.push_back(value);
162 |
163 | while (value >= vmin) {
164 | value -= step;
165 | if (value <= vmax && value >= vmin) {
166 | values.insert(values.begin(), value);
167 | }
168 | }
169 |
170 | value = static_cast<T>(def_value);
171 |
172 | while (value <= vmax) {
173 | value += step;
174 | if (value >= vmin && value <= vmax) {
175 | values.push_back(value);
176 | }
177 | }
178 |
179 | add_set_param(name, def_value, values);
180 | }
181 | template void LibDNNTuner::add_range_param(std::string name, float def_value,
182 | float min, float max, float step);
183 | template void LibDNNTuner::add_range_param(std::string name, double def_value,
184 | double min, double max, double step);
185 | template void LibDNNTuner::add_range_param(std::string name, int32_t def_value,
186 | int32_t min, int32_t max, int32_t step);
187 | template void LibDNNTuner::add_range_param(std::string name, int64_t def_value,
188 | int64_t min, int64_t max, int64_t step);
189 |
190 | template<class T>
191 | void LibDNNTuner::add_range_param(const char* name,
192 | T def_value, T min, T max, T step) {
193 | std::string str(name);
194 | add_range_param(str, def_value, min, max, step);
195 | }
196 | template void LibDNNTuner::add_range_param(const char* name, float def_value,
197 | float min, float max, float step);
198 | template void LibDNNTuner::add_range_param(const char* name, double def_value,
199 | double min, double max, double step);
200 | template void LibDNNTuner::add_range_param(const char* name, int32_t def_value,
201 | int32_t min, int32_t max, int32_t step);
202 | template void LibDNNTuner::add_range_param(const char* name, int64_t def_value,
203 | int64_t min, int64_t max, int64_t step);
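// ---[ Editor's note: illustrative, not part of the original source.
// add_range_param() expands a [min, max] range with a fixed step into an
// ordered candidate set that always contains the default, then forwards it
// to add_set_param(). For example, the registration used by the pooling
// tuners in src/libdnn_pool.cpp,
//
//   tuner.add_range_param<int_tp>("LW0", 8, 4, 16, 4);
//
// produces the candidate set {4, 8, 12, 16} with the default value 8 at
// index 1 as the starting point for advance().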
204 |
205 |
206 | template<class T>
207 | void LibDNNTuner::add_set_param(std::string name,
208 | T def_value, std::vector<T> values) {
209 | if (std::is_same<T, float>::value || std::is_same<T, double>::value) {
210 | std::vector<double> set_values;
211 | int_tp def_idx = -1;
212 | for (int_tp i = 0; i < values.size(); ++i) {
213 | set_values.push_back(values[i]);
214 | if (def_value == values[i]) {
215 | def_idx = i;
216 | }
217 | }
218 | if (def_idx == -1) {
219 | def_idx = set_values.size();
220 | set_values.push_back(def_value);
221 | }
222 | std::shared_ptr<LibDNNTunerParam> param(
223 | new LibDNNTunerParamReal(this, name, set_values, def_idx));
224 | params_.push_back(param);
225 | param_map_.insert(std::pair<std::string,
226 | std::shared_ptr<LibDNNTunerParam>>(name, param));
227 | }
228 |
229 | if (std::is_same<T, bool>::value) {
230 | std::vector<bool> set_values;
231 | int_tp def_idx = -1;
232 | for (int_tp i = 0; i < values.size(); ++i) {
233 | set_values.push_back(values[i]);
234 | if (def_value == values[i]) {
235 | def_idx = i;
236 | }
237 | }
238 | if (def_idx == -1) {
239 | def_idx = set_values.size();
240 | set_values.push_back(def_value);
241 | }
242 | std::shared_ptr<LibDNNTunerParam> param(
243 | new LibDNNTunerParamBool(this, name, set_values, def_idx));
244 | params_.push_back(param);
245 | param_map_.insert(std::pair<std::string,
246 | std::shared_ptr<LibDNNTunerParam>>(name, param));
247 | }
248 |
249 | if (std::is_same<T, int32_t>::value || std::is_same<T, int64_t>::value) {
250 | std::vector<int64_t> set_values;
251 | int_tp def_idx = -1;
252 | for (int_tp i = 0; i < values.size(); ++i) {
253 | set_values.push_back(values[i]);
254 | if (def_value == values[i]) {
255 | def_idx = i;
256 | }
257 | }
258 | if (def_idx == -1) {
259 | def_idx = set_values.size();
260 | set_values.push_back(def_value);
261 | }
262 | std::shared_ptr<LibDNNTunerParam>
263 | param(new LibDNNTunerParamInt(this, name, set_values, def_idx));
264 | params_.push_back(param);
265 | param_map_.insert(std::pair<std::string,
266 | std::shared_ptr<LibDNNTunerParam>>(name, param));
267 | }
268 | }
269 | template void LibDNNTuner::add_set_param(std::string name,
270 | float def_value, std::vector<float> values);
271 | template void LibDNNTuner::add_set_param(std::string name,
272 | double def_value, std::vector<double> values);
273 | template void LibDNNTuner::add_set_param(std::string name,
274 | int32_t def_value, std::vector<int32_t> values);
275 | template void LibDNNTuner::add_set_param(std::string name,
276 | int64_t def_value, std::vector<int64_t> values);
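// ---[ Editor's note: illustrative, not part of the original source.
// add_set_param() dispatches on T at compile time and stores the candidates
// in a LibDNNTunerParamReal, LibDNNTunerParamBool or LibDNNTunerParamInt;
// a default value missing from the set is appended, so every parameter has
// a valid default index. A hypothetical registration:
//
//   LibDNNTuner tuner;
//   tuner.add_set_param<int64_t>("VEC_WIDTH", 4, {1, 2, 4, 8});
//   // stored as a LibDNNTunerParamInt with def_idx == 2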
277 |
278 | template<>
279 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
280 | std::vector<std::string> con_adapt,
281 | std::function<bool(std::vector<bool>)> con_func) {
282 | std::shared_ptr<LibDNNTunerConstraint> constraint;
283 | constraint = std::shared_ptr<LibDNNTunerConstraint>(
284 | new LibDNNTunerConstraintBool(
285 | this, con_params, con_adapt, con_func));
286 | constraints_.push_back(constraint);
287 | for (int_tp i = 0; i < con_params.size(); ++i) {
288 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(con_params[i]);
289 | param->add_constraint(constraint);
290 | }
291 | }
292 | template<>
293 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
294 | std::vector<std::string> con_adapt,
295 | std::function<bool(std::vector<double>)> con_func) {
296 | std::shared_ptr<LibDNNTunerConstraint> constraint;
297 | constraint = std::shared_ptr<LibDNNTunerConstraint>(
298 | new LibDNNTunerConstraintReal(
299 | this, con_params, con_adapt, con_func));
300 | constraints_.push_back(constraint);
301 | for (int_tp i = 0; i < con_params.size(); ++i) {
302 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(con_params[i]);
303 | param->add_constraint(constraint);
304 | }
305 | }
306 | template<>
307 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
308 | std::vector<std::string> con_adapt,
309 | std::function<bool(std::vector<int64_t>)> con_func) {
310 | std::shared_ptr<LibDNNTunerConstraint> constraint;
311 | constraint = std::shared_ptr<LibDNNTunerConstraint>(
312 | new LibDNNTunerConstraintInt(
313 | this, con_params, con_adapt, con_func));
314 | constraints_.push_back(constraint);
315 | for (int_tp i = 0; i < con_params.size(); ++i) {
316 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(con_params[i]);
317 | param->add_constraint(constraint);
318 | }
319 | }
320 |
321 | template<class T>
322 | void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
323 | std::vector<const char*> con_adapt,
324 | std::function<bool(std::vector<T>)> con_func) {
325 | std::vector<std::string> con_params_str;
326 | std::vector<std::string> con_adapt_str;
327 |
328 | for (int_tp i = 0; i < con_params.size(); ++i) {
329 | std::string str(con_params[i]);
330 | con_params_str.push_back(str);
331 | }
332 |
333 | for (int_tp i = 0; i < con_adapt.size(); ++i) {
334 | std::string str(con_adapt[i]);
335 | con_adapt_str.push_back(str);
336 | }
337 |
338 | add_constraint(con_params_str, con_adapt_str, con_func);
339 | }
340 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
341 | std::vector<const char*> con_adapt,
342 | std::function<bool(std::vector<bool>)> con_func);
343 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
344 | std::vector<const char*> con_adapt,
345 | std::function<bool(std::vector<double>)> con_func);
346 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
347 | std::vector<const char*> con_adapt,
348 | std::function<bool(std::vector<int64_t>)> con_func);
349 |
350 | template<class T>
351 | void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
352 | std::vector<const char*> con_adapt,
353 | std::function<bool(std::vector<T>)> con_func) {
354 | std::vector<std::string> con_params_str;
355 | std::vector<std::string> con_adapt_str;
356 |
357 | for (int_tp i = 0; i < con_params.size(); ++i) {
358 | std::string str(con_params[i]);
359 | con_params_str.push_back(str);
360 | }
361 |
362 | for (int_tp i = 0; i < con_adapt.size(); ++i) {
363 | std::string str(con_adapt[i]);
364 | con_adapt_str.push_back(str);
365 | }
366 | add_constraint(con_params_str, con_adapt_str, con_func); }
367 | template void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
368 | std::vector<const char*> con_adapt,
369 | std::function<bool(std::vector<bool>)> con_func);
370 | template void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
371 | std::vector<const char*> con_adapt,
372 | std::function<bool(std::vector<double>)> con_func);
373 | template void LibDNNTuner::add_constraint(std::vector<std::string> con_params,
374 | std::vector<const char*> con_adapt,
375 | std::function<bool(std::vector<int64_t>)> con_func);
376 |
377 | template<class T>
378 | void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
379 | std::vector<std::string> con_adapt,
380 | std::function<bool(std::vector<T>)> con_func) {
381 | std::vector<std::string> con_params_str;
382 | std::vector<std::string> con_adapt_str;
383 |
384 | for (int_tp i = 0; i < con_params.size(); ++i) {
385 | std::string str(con_params[i]);
386 | con_params_str.push_back(str);
387 | }
388 |
389 | for (int_tp i = 0; i < con_adapt.size(); ++i) {
390 | std::string str(con_adapt[i]);
391 | con_adapt_str.push_back(str);
392 | }
393 | add_constraint(con_params_str, con_adapt_str, con_func); }
394 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
395 | std::vector<std::string> con_adapt,
396 | std::function<bool(std::vector<bool>)> con_func);
397 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
398 | std::vector<std::string> con_adapt,
399 | std::function<bool(std::vector<double>)> con_func);
400 | template void LibDNNTuner::add_constraint(std::vector<const char*> con_params,
401 | std::vector<std::string> con_adapt,
402 | std::function<bool(std::vector<int64_t>)> con_func);
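// ---[ Editor's note: illustrative, not part of the original source.
// Constraints couple several parameters: advance() re-evaluates them after
// every index change and reports LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION when
// one fails, which makes the annealing loop in Tune() draw a new offset.
// A hypothetical constraint keeping the combined local work size of the
// two parameters registered by the pooling tuners within a device limit
// (256 is a placeholder value):
//
//   tuner.add_constraint<int64_t>(
//       std::vector<std::string>({"LW0", "LW1"}),
//       std::vector<std::string>({"LW0", "LW1"}),
//       [](std::vector<int64_t> v) {
//         return v[0] * v[1] <= 256;
//       });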
403 |
404 | template<class T>
405 | void LibDNNTuner::add_set_param(const char* name,
406 | T def_value, std::vector<T> values) {
407 | std::string str(name);
408 | add_set_param(str, def_value, values);
409 | }
410 | template void LibDNNTuner::add_set_param(const char* name,
411 | float def_value, std::vector<float> values);
412 | template void LibDNNTuner::add_set_param(const char* name,
413 | double def_value, std::vector<double> values);
414 | template void LibDNNTuner::add_set_param(const char* name,
415 | int32_t def_value, std::vector<int32_t> values);
416 | template void LibDNNTuner::add_set_param(const char* name,
417 | int64_t def_value, std::vector<int64_t> values);
418 |
419 | void LibDNNTuner::add_boolean_param(std::string name,
420 | bool def_value, bool inverse) {
421 | std::vector<bool> set_values;
422 | set_values.push_back(def_value);
423 | if (inverse) {
424 | set_values.push_back(!def_value);
425 | }
426 | std::shared_ptr<LibDNNTunerParam> param(
427 | new LibDNNTunerParamBool(this, name, set_values, 0));
428 | params_.push_back(param);
429 | param_map_.insert(std::pair<std::string,
430 | std::shared_ptr<LibDNNTunerParam>>(name, param));
431 | }
432 |
433 | void LibDNNTuner::add_boolean_param(const char* name,
434 | bool def_value, bool inverse) {
435 | std::string str(name);
436 | add_boolean_param(str, def_value, inverse);
437 | }
438 |
439 |
440 | template<class T>
441 | T LibDNNTuner::get_param(std::string name) {
442 | T value = 0;
443 | std::shared_ptr<LibDNNTunerParam> param = param_map_.at(name);
444 |
445 | std::shared_ptr<LibDNNTunerParamBool> param_bool =
446 | std::dynamic_pointer_cast<LibDNNTunerParamBool>(param);
447 | if (param_bool.get() != nullptr) {
448 | value = static_cast<T>(param_bool->get_value());
449 | return value;
450 | }
451 |
452 | std::shared_ptr<LibDNNTunerParamInt> param_int =
453 | std::dynamic_pointer_cast<LibDNNTunerParamInt>(param);
454 | if (param_int.get() != nullptr) {
455 | value = static_cast<T>(param_int->get_value());
456 | return value;
457 | }
458 |
459 | std::shared_ptr<LibDNNTunerParamReal> param_real =
460 | std::dynamic_pointer_cast<LibDNNTunerParamReal>(param);
461 | if (param_real.get() != nullptr) {
462 | value = static_cast<T>(param_real->get_value());
463 | return value;
464 | }
465 |
466 | return value;
467 | }
468 | template float LibDNNTuner::get_param<float>(std::string name);
469 | template double LibDNNTuner::get_param<double>(std::string name);
470 | template int32_t LibDNNTuner::get_param<int32_t>(std::string name);
471 | template int64_t LibDNNTuner::get_param<int64_t>(std::string name);
472 | template bool LibDNNTuner::get_param<bool>(std::string name);
473 |
474 | template<class T>
475 | T LibDNNTuner::get_param(const char* name) {
476 | std::string str(name);
477 | return get_param<T>(str);
478 | }
479 | template float LibDNNTuner::get_param<float>(const char* name);
480 | template double LibDNNTuner::get_param<double>(const char* name);
481 | template int32_t LibDNNTuner::get_param<int32_t>(const char* name);
482 | template int64_t LibDNNTuner::get_param<int64_t>(const char* name);
483 | template bool LibDNNTuner::get_param<bool>(const char* name);
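// ---[ Editor's note: illustrative, not part of the original source.
// T cannot be deduced from the arguments of get_param(), so callers name
// the type explicitly and the stored bool/int/real value is cast to it,
// as the pooling implementation does for its local work sizes:
//
//   int_tp lw0 = tuner.get_param<int_tp>("LW0");
//   bool flag = tuner.get_param<bool>("SOME_SWITCH");  // hypothetical name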
484 |
485 | std::string LibDNNTunerParam::get_name() {
486 | return name_;
487 | }
488 |
489 | libdnnTunerParamStatus_t LibDNNTunerParam::advance(int_tp offset) {
490 | for (int i = 0; i < std::abs(offset); ++i) {
491 | if (offset > 0) {
492 | ++curr_idx_;
493 | } else {
494 | --curr_idx_;
495 | }
496 | if (curr_idx_ >= count_values()) {
497 | curr_idx_ = 0;
498 | }
499 | if (curr_idx_ < 0) {
500 | curr_idx_ = count_values() - 1;
501 | }
502 | }
503 | if (curr_idx_ == def_idx_) {
504 | return LIBDNN_TUNER_PARAM_STAT_OVERFLOW;
505 | }
506 |
507 | bool constraints_ok = true;
508 | for (int i = 0; i < constraints_.size(); ++i) {
509 | constraints_ok &= constraints_[i]->evaluate();
510 | }
511 |
512 | if (constraints_ok) {
513 | return LIBDNN_TUNER_PARAM_STAT_OK;
514 | } else {
515 | return LIBDNN_TUNER_PARAM_STAT_NO_SOLUTION;
516 | }
517 | }
518 |
519 | int_tp LibDNNTunerParam::get_curr_idx() {
520 | return curr_idx_;
521 | }
522 |
523 | int_tp LibDNNTunerParam::get_def_idx() {
524 | return def_idx_;
525 | }
526 |
527 | void LibDNNTunerParam::set_curr_idx(int_tp curr_idx) {
528 | curr_idx_ = curr_idx;
529 | }
530 |
531 | void LibDNNTunerParam::set_def_idx(int_tp def_idx) {
532 | def_idx_ = def_idx;
533 | }
534 |
535 | void LibDNNTunerParam::add_constraint(
536 | std::shared_ptr<LibDNNTunerConstraint> constraint) {
537 | constraints_.push_back(constraint);
538 | }
539 |
540 | double LibDNNTunerSnapshot::get_score() {
541 | return score_;
542 | }
543 |
544 | std::vector<std::shared_ptr<LibDNNTunerParam>>*
545 | LibDNNTunerSnapshot::get_params() {
546 | return &params_;
547 | }
548 |
549 |
550 | int_tp LibDNNTunerParamInt::count_values() {
551 | return values_.size();
552 | }
553 | int_tp LibDNNTunerParamReal::count_values() {
554 | return values_.size();
555 | }
556 | int_tp LibDNNTunerParamBool::count_values() {
557 | return values_.size();
558 | }
559 |
560 | int64_t LibDNNTunerParamInt::get_value() {
561 | // std::cout << name_ << ", value: " << values_[curr_idx_] << std::endl;
562 | return values_[curr_idx_];
563 | }
564 | double LibDNNTunerParamReal::get_value() {
565 | // std::cout << name_ << ", value: " << values_[curr_idx_] << std::endl;
566 | return values_[curr_idx_];
567 | }
568 | bool LibDNNTunerParamBool::get_value() {
569 | // std::cout << name_ << ", value: " << values_[curr_idx_] << std::endl;
570 | return values_[curr_idx_];
571 | }
572 |
573 | const std::vector<int64_t>& LibDNNTunerParamInt::get_values() {
574 | return values_;
575 | }
576 | const std::vector<double>& LibDNNTunerParamReal::get_values() {
577 | return values_;
578 | }
579 | const std::vector<bool>& LibDNNTunerParamBool::get_values() {
580 | return values_;
581 | }
582 |
583 |
584 | std::shared_ptr<LibDNNTunerParam> LibDNNTunerParamInt::clone() {
585 | return std::shared_ptr<LibDNNTunerParam>
586 | (new LibDNNTunerParamInt(*this));
587 | }
588 |
589 | std::shared_ptr<LibDNNTunerParam> LibDNNTunerParamReal::clone() {
590 | return std::shared_ptr<LibDNNTunerParam>
591 | (new LibDNNTunerParamReal(*this));
592 | }
593 |
594 | std::shared_ptr<LibDNNTunerParam> LibDNNTunerParamBool::clone() {
595 | return std::shared_ptr<LibDNNTunerParam>
596 | (new LibDNNTunerParamBool(*this));
597 | }
598 |
599 |
600 | void LibDNNTunerParam::update(std::shared_ptr<LibDNNTunerParam> other) {
601 | curr_idx_ = other->get_curr_idx();
602 | def_idx_ = other->get_def_idx();
603 | }
604 |
605 | bool LibDNNTunerConstraintBool::evaluate() {
606 | std::vector<bool> values;
607 |
608 | for (int_tp i = 0; i < con_params_.size(); ++i) {
609 | values.push_back(tuner_->get_param<bool>(con_params_[i]));
610 | }
611 |
612 | return func_(values);
613 | }
614 |
615 | bool LibDNNTunerConstraintInt::evaluate() {
616 | std::vector<int64_t> values;
617 |
618 | for (int_tp i = 0; i < con_params_.size(); ++i) {
619 | values.push_back(tuner_->get_param<int64_t>(con_params_[i]));
620 | }
621 |
622 | return func_(values);
623 | }
624 |
625 | bool LibDNNTunerConstraintReal::evaluate() {
626 | std::vector<double> values;
627 |
628 | for (int_tp i = 0; i < con_params_.size(); ++i) {
629 | values.push_back(tuner_->get_param<double>(con_params_[i]));
630 | }
631 |
632 | return func_(values);
633 | }
634 |
635 | }  // namespace greentea
636 |
--------------------------------------------------------------------------------
/src/libdnn_pool.cpp:
--------------------------------------------------------------------------------
1 | #include <functional>
2 | #include <numeric>
3 | #include <sstream>
4 | #include <type_traits>
5 |
6 | #include "common.hpp"
7 | #include "device.hpp"
8 | #include "libdnn.hpp"
9 | #include "benchmark.hpp"
10 |
11 | // #define LIBDNN_DEBUG 1
12 |
13 | namespace greentea {
14 |
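// ---[ Editor's note: illustrative sketch, not part of the original source.
// A LibDNNPoolConfig describes one pooling layer; the constructor below
// derives the per-axis shapes from it and registers the forward/backward
// tuner parameters. A hypothetical 2x2 max-pooling setup (dev must be a
// valid greentea device pointer; the vector-typed fields are assumed):
//
//   LibDNNPoolConfig config;
//   config.dev_ptr = dev;
//   config.in_shape = std::vector<int_tp>({1, 32, 64, 64});   // N, C, H, W
//   config.out_shape = std::vector<int_tp>({1, 32, 32, 32});
//   config.kernel = std::vector<int_tp>({2, 2});
//   config.pad = std::vector<int_tp>({0, 0});
//   config.stride = std::vector<int_tp>({2, 2});
//   config.dilation = std::vector<int_tp>({1, 1});
//   config.pool_method = LIBDNN_POOLING_METHOD_MAX;
//   config.bwalgo = LIBDNN_POOLING_BW_ALGO_DIRECT;
//   config.use_top_mask = false;
//   config.fast_unsafe_math = true;
//   LibDNNPool<float> pool(config);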
15 | template<typename Dtype>
16 | LibDNNPool<Dtype>::LibDNNPool(LibDNNPoolConfig config) {
17 | config_ = config;
18 | LibDNN<Dtype>::dev_ptr_ = config.dev_ptr;
19 | LibDNN<Dtype>::fast_unsafe_math_ = config.fast_unsafe_math;
20 | int_tp dims = config.in_shape.size();
21 | int_tp spatial_dims = config.kernel.size();
22 |
23 | num_axes_ = spatial_dims;
24 |
25 | pool_method_ = config.pool_method;
26 | bwalgo_ = config.bwalgo;
27 | use_top_mask_ = config.use_top_mask;
28 |
29 | skip_range_check_ = true;
30 |
31 | for (int_tp i = 0; i < spatial_dims; ++i) {
32 | kernel_shape_.push_back(config.kernel[i]);
33 | pad_.push_back(config.pad[i]);
34 | if (pad_[i] > 0) {
35 | skip_range_check_ = false;
36 | }
37 | stride_.push_back(config.stride[i]);
38 | dilation_.push_back(config.dilation[i]);
39 | im_in_shape_.push_back(config.in_shape[dims - spatial_dims + i]);
40 | im_out_shape_.push_back(config.out_shape[dims - spatial_dims + i]);
41 | }
42 |
43 | fw_tuner_ = std::shared_ptr<LibDNNTuner>(new LibDNNTuner());
44 | bw_tuner_ = std::shared_ptr<LibDNNTuner>(new LibDNNTuner());
45 |
46 | fw_tuner_->add_range_param<int_tp>("LW0", 8, 4, 16, 4);
47 | bw_tuner_->add_range_param<int_tp>("LW0", 8, 4, 16, 4);
48 | fw_tuner_->add_range_param<int_tp>("LW1", 8, 4, 16, 4);
49 | bw_tuner_->add_range_param<int_tp>("LW1", 8, 4, 16, 4);
50 |
51 |
52 | GenerateKernels();
53 | LibDNN<Dtype>::CompileKernels();
54 | }
55 |
56 | template<typename Dtype>
57 | const LibDNNPoolConfig LibDNNPool<Dtype>::get_config() {
58 | return config_;
59 | }
60 |
61 |
62 | template<typename Dtype>
63 | std::string LibDNNPool<Dtype>::string_identifier() {
64 | std::stringstream ss;
65 | ss << "POOL_";
66 | switch (pool_method_) {
67 | case LIBDNN_POOLING_METHOD_MAX:
68 | ss << "MAX_";
69 | break;
70 | case LIBDNN_POOLING_METHOD_AVE:
71 | ss << "AVE_";
72 | break;
73 | case LIBDNN_POOLING_METHOD_STO:
74 | ss << "STO_";
75 | break;
76 | }
77 | if (std::is_same<Dtype, double>::value) {
78 | ss << "double_";
79 | } else {
80 | ss << "float_";
81 | }
82 | // Device name
83 | ss << LibDNN<Dtype>::dev_ptr_->name();
84 | ss << "_";
85 | ss << num_axes_ << "D_";
86 | ss << "IN[";
87 | for (int_tp i = 0; i < im_in_shape_.size(); ++i) {
88 | ss << im_in_shape_[i];
89 | if (i < im_in_shape_.size() - 1) {
90 | ss << ",";
91 | }
92 | }
93 | ss << "]_OUT[";
94 | for (int_tp i = 0; i < im_out_shape_.size(); ++i) {
95 | ss << im_out_shape_[i];
96 | if (i < im_out_shape_.size() - 1) {
97 | ss << ",";
98 | }
99 | }
100 | ss << "]_K[";
101 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) {
102 | ss << kernel_shape_[i];
103 | if (i < kernel_shape_.size() - 1) {
104 | ss << ",";
105 | }
106 | }
107 | ss << "]_S[";
108 | for (int_tp i = 0; i < stride_.size(); ++i) {
109 | ss << stride_[i];
110 | if (i < stride_.size() - 1) {
111 | ss << ",";
112 | }
113 | }
114 | ss << "]_P[";
115 | for (int_tp i = 0; i < pad_.size(); ++i) {
116 | ss << pad_[i];
117 | if (i < pad_.size() - 1) {
118 | ss << ",";
119 | }
120 | }
121 | ss << "]_D[";
122 | for (int_tp i = 0; i < dilation_.size(); ++i) {
123 | ss << dilation_[i];
124 | if (i < dilation_.size() - 1) {
125 | ss << ",";
126 | }
127 | }
128 | ss << "]";
129 | return ss.str();
130 | }
131 |
132 | template<typename Dtype>
133 | std::string LibDNNPool<Dtype>::generate_fw_defs() {
134 | std::stringstream ss;
135 |
136 | // Number of spatial axes
137 | LibDNN<Dtype>::add_def(ss, "v_nax", num_axes_);
138 |
139 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) {
140 | LibDNN<Dtype>::add_def(ss, "v_k_" + std::to_string(i), kernel_shape_[i]);
141 | }
142 | for (int_tp i = 0; i < pad_.size(); ++i) {
143 | LibDNN<Dtype>::add_def(ss, "v_p_" + std::to_string(i), pad_[i]);
144 | }
145 | for (int_tp i = 0; i < stride_.size(); ++i) {
146 | LibDNN<Dtype>::add_def(ss, "v_s_" + std::to_string(i), stride_[i]);
147 | }
148 | for (int_tp i = 0; i < dilation_.size(); ++i) {
149 | LibDNN<Dtype>::add_def(ss, "v_d_" + std::to_string(i), dilation_[i]);
150 | }
151 |
152 | int_tp imsi = 1;
153 | int_tp imso = 1;
154 | for (int_tp i = 0; i <
im_in_shape_.size(); ++i) { 155 | LibDNN::add_def(ss, "v_imsi_" + std::to_string(i), im_in_shape_[i]); 156 | imsi *= im_in_shape_[i]; 157 | LibDNN::add_def(ss, "v_imso_" + std::to_string(i), im_out_shape_[i]); 158 | imso *= im_out_shape_[i]; 159 | } 160 | LibDNN::add_def(ss, "v_imsi", imsi); 161 | LibDNN::add_def(ss, "v_imso", imso); 162 | 163 | return ss.str(); 164 | } 165 | 166 | 167 | template 168 | std::string LibDNNPool::generate_bw_defs() { 169 | std::stringstream ss; 170 | 171 | // Number of spatial axes 172 | LibDNN::add_def(ss, "v_nax", num_axes_); 173 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) { 174 | LibDNN::add_def(ss, "v_k_" + std::to_string(i), kernel_shape_[i]); 175 | } 176 | for (int_tp i = 0; i < pad_.size(); ++i) { 177 | LibDNN::add_def(ss, "v_p_" + std::to_string(i), pad_[i]); 178 | } 179 | for (int_tp i = 0; i < stride_.size(); ++i) { 180 | LibDNN::add_def(ss, "v_s_" + std::to_string(i), stride_[i]); 181 | } 182 | for (int_tp i = 0; i < dilation_.size(); ++i) { 183 | LibDNN::add_def(ss, "v_d_" + std::to_string(i), dilation_[i]); 184 | } 185 | 186 | int_tp imsi = 1; 187 | int_tp imso = 1; 188 | for (int_tp i = 0; i < im_in_shape_.size(); ++i) { 189 | LibDNN::add_def(ss, "v_imsi_" + std::to_string(i), im_in_shape_[i]); 190 | imsi *= im_in_shape_[i]; 191 | LibDNN::add_def(ss, "v_imso_" + std::to_string(i), im_out_shape_[i]); 192 | imso *= im_out_shape_[i]; 193 | } 194 | LibDNN::add_def(ss, "v_imsi", imsi); 195 | LibDNN::add_def(ss, "v_imso", imso); 196 | 197 | return ss.str(); 198 | } 199 | 200 | template 201 | std::string LibDNNPool::generate_fw_kernels(std::string name, 202 | bool test_mode) { 203 | std::stringstream ss; 204 | 205 | ss << "__kernel void " + name + "("; 206 | ss << "__global const Dtype* __restrict bottom_data, "; 207 | ss << "__global Dtype* __restrict top_data, "; 208 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 209 | if (use_top_mask_) { 210 | ss << "__global Dtype* __restrict top_mask, "; 211 | } else { 212 | ss << "__global int_tp* __restrict mask, "; 213 | } 214 | } 215 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO && !test_mode) { 216 | ss << "__global Dtype* __restrict rand_idx, "; 217 | } 218 | ss << "int_tp channels, "; 219 | ss << "int_tp batch_size"; 220 | ss << ") {" << std::endl; 221 | 222 | ss << "int_tp out_idx = get_global_id(0);" << std::endl; 223 | ss << "if (get_global_id(1) >= channels * batch_size) {return;}" << std::endl; 224 | ss << "int_tp idx_0 = get_global_id(0);" << std::endl; 225 | for (int_tp i = num_axes_ - 1; i >= 1; --i) { 226 | ss << "int_tp idx_" << i << " = (idx_0 % v_imso_" << i << ");" << std::endl; 227 | ss << "idx_" << i << " = idx_" << i 228 | << " * v_s_" << i << " - v_p_" << i << ";" << std::endl; 229 | ss << "idx_0 /= v_imso_" << i << ";" << std::endl; 230 | } 231 | ss << "if (idx_0 >= v_imso_0) {return;}" << std::endl; 232 | ss << "idx_0 = idx_0 * v_s_0 - v_p_0;" << std::endl; 233 | ss << "int_tp in_idx = idx_0;" << std::endl; 234 | for (int_tp i = 1; i < num_axes_; ++i) { 235 | ss << "in_idx = in_idx * v_imsi_" << i 236 | << " + " << "idx_" << i << ";" << std::endl; 237 | } 238 | ss << "__global const Dtype* in_ptr = bottom_data + " 239 | << "get_global_id(1) * v_imsi + in_idx;" << std::endl; 240 | ss << "__global Dtype* out_ptr = top_data + " 241 | << "get_global_id(1) * v_imso;" << std::endl; 242 | 243 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 244 | if (use_top_mask_) { 245 | ss << "__global Dtype* mask_ptr = top_mask + get_global_id(1) * v_imso;" 246 | << std::endl; 247 | } else { 
248 | ss << "__global int_tp* mask_ptr = mask + get_global_id(1) * v_imso;" 249 | << std::endl; 250 | } 251 | ss << "Dtype val = -FLT_MAX;" << std::endl; 252 | ss << "int_tp maxidx = -1;" << std::endl; 253 | } 254 | 255 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 256 | ss << "Dtype val = 0;" << std::endl; 257 | } 258 | 259 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 260 | if (test_mode) { 261 | ss << "Dtype cumsum = FLT_MIN;" << std::endl; 262 | ss << "Dtype cumvalues = 0;" << std::endl; 263 | } else { 264 | ss << "__global Dtype* rand_ptr = rand_idx + get_global_id(1) * v_imso;" 265 | << std::endl; 266 | ss << "Dtype val = 0;" << std::endl; 267 | ss << "Dtype cumsum = 0;" << std::endl; 268 | ss << "int_tp stoidx = -1;" << std::endl; 269 | } 270 | } 271 | 272 | std::vector d_iter; 273 | int_tp curr_idx = 0; 274 | 275 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) { 276 | d_iter.push_back(0); 277 | } 278 | 279 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 280 | int_tp ave = std::accumulate(kernel_shape_.begin(), 281 | kernel_shape_.end(), 282 | 1, std::multiplies()); 283 | ss << "int_tp ave = " << ave << ";" << std::endl; 284 | } 285 | 286 | for (int_tp sto_idx = 0; 287 | sto_idx < ((pool_method_ == LIBDNN_POOLING_METHOD_STO && !test_mode) 288 | ? 2 : 1); ++sto_idx) { 289 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO && sto_idx == 1) { 290 | ss << "Dtype thres = rand_ptr[out_idx] * cumsum;" << std::endl; 291 | ss << "cumsum = 0;" << std::endl; 292 | } 293 | // Loop over the kernel 294 | bool incremented; 295 | do { 296 | int_tp kernel_offset = 0; 297 | int_tp size_prod = 1; 298 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 299 | kernel_offset += size_prod * d_iter[i] * dilation_[i]; 300 | size_prod *= im_in_shape_[i]; 301 | } 302 | 303 | bool max_guard = false; 304 | bool pad_guard = false; 305 | bool overspill_guard = false; 306 | for (int_tp i = 0; i < num_axes_; ++i) { 307 | if (d_iter[i] * dilation_[i] < pad_[i]) { 308 | pad_guard = true; 309 | } 310 | if (d_iter[i] * dilation_[i] >= 311 | ((kernel_shape_[i] - 1) * dilation_[i] + 1) - pad_[i] || 312 | (im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 313 | * dilation_[i] - pad_[i] >= im_in_shape_[i] ) { 314 | pad_guard = true; 315 | } 316 | if ((im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 317 | * dilation_[i] - pad_[i] >= im_in_shape_[i]) { 318 | overspill_guard = true; 319 | } 320 | } 321 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 322 | max_guard = true; 323 | } 324 | 325 | if (max_guard || pad_guard || overspill_guard) { 326 | ss << "if ("; 327 | } 328 | if (pad_guard || overspill_guard) { 329 | for (int_tp i = 0; i < num_axes_; ++i) { 330 | if (d_iter[i] * dilation_[i] < pad_[i]) { 331 | ss << "idx_" << i << " >= -" << (d_iter[i] * dilation_[i]) 332 | << " && "; 333 | } 334 | if ((d_iter[i] * dilation_[i] >= ((kernel_shape_[i] - 1) 335 | * dilation_[i] + 1) - pad_[i]) || 336 | ((im_out_shape_[i] - 1) * stride_[i] 337 | + d_iter[i] * dilation_[i] - pad_[i] 338 | >= im_in_shape_[i])) { 339 | ss << "idx_" << i << " < v_imsi_" << i << " - " 340 | << (d_iter[i] * dilation_[i]) << " && "; 341 | } 342 | } 343 | } 344 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 345 | if (max_guard || pad_guard || overspill_guard) { 346 | ss << "in_ptr[" << kernel_offset << "] > val) {" << std::endl; 347 | } 348 | ss << "maxidx = in_idx + " << kernel_offset << ";" << std::endl; 349 | ss << "val = in_ptr[" << kernel_offset << "];" << std::endl; 350 | if (max_guard || pad_guard || overspill_guard) { 351 | ss << "}" << 
std::endl; 352 | } 353 | } 354 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 355 | if (pad_guard || overspill_guard) { 356 | ss << "true) {" << std::endl; 357 | } 358 | ss << "val += in_ptr[" << kernel_offset << "];" << std::endl; 359 | if (pad_guard || overspill_guard) { 360 | ss << "}" << std::endl; 361 | } 362 | if (overspill_guard) { 363 | ss << "if ("; 364 | for (int_tp i = 0; i < num_axes_; ++i) { 365 | if ((im_out_shape_[i] - 1) * stride_[i] 366 | + d_iter[i] * dilation_[i] - pad_[i] 367 | >= im_in_shape_[i]) { 368 | ss << "idx_" << i << " + " << d_iter[i] * dilation_[i] 369 | << " >= v_imsi_" << i << " + " 370 | << pad_[i] << " || "; 371 | } 372 | } 373 | ss << "false) {--ave;}" << std::endl; 374 | } 375 | } 376 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 377 | if (pad_guard || overspill_guard) { 378 | ss << "true) {" << std::endl; 379 | } 380 | ss << "cumsum += in_ptr[" << kernel_offset << "];" << std::endl; 381 | if (test_mode) { 382 | ss << "cumvalues += in_ptr[" << kernel_offset << "]" 383 | << " * in_ptr[" << kernel_offset << "];" << std::endl; 384 | } else { 385 | if (sto_idx == 1) { 386 | // Second pass 387 | ss << "if (cumsum > thres) {" << std::endl; 388 | ss << "stoidx = in_idx + " << kernel_offset << ";" << std::endl; 389 | ss << "val = in_ptr[" << kernel_offset << "];" << std::endl; 390 | ss << "thres = FLT_MAX;" << std::endl; 391 | ss << "}" << std::endl; 392 | } 393 | } 394 | if (pad_guard || overspill_guard) { 395 | ss << "}" << std::endl; 396 | } 397 | } 398 | 399 | incremented = false; 400 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 401 | if (d_iter[i] >= kernel_shape_[i] - 1) { 402 | d_iter[i] = 0; 403 | } else { 404 | d_iter[i] += 1; 405 | incremented = true; 406 | break; 407 | } 408 | } 409 | } while (incremented); 410 | } 411 | 412 | // Write out the pooling result 413 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 414 | ss << "out_ptr[out_idx] = val / ((Dtype)ave);" << std::endl; 415 | } 416 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 417 | ss << "out_ptr[out_idx] = val;" << std::endl; 418 | ss << "mask_ptr[out_idx] = (Dtype)maxidx;" << std::endl; 419 | } 420 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 421 | if (test_mode) { 422 | ss << "out_ptr[out_idx] = cumvalues / cumsum;" << std::endl; 423 | } else { 424 | ss << "out_ptr[out_idx] = val;" << std::endl; 425 | ss << "rand_ptr[out_idx] = (Dtype)stoidx;" << std::endl; 426 | } 427 | } 428 | 429 | ss << "}" << std::endl; // Kernel 430 | return ss.str(); 431 | } 432 | 433 | template 434 | std::string LibDNNPool::generate_fwtr_kernels(std::string name) { 435 | std::stringstream ss; 436 | ss << generate_fw_kernels(name, false); 437 | return ss.str(); 438 | } 439 | 440 | template 441 | std::string LibDNNPool::generate_fwte_kernels(std::string name) { 442 | std::stringstream ss; 443 | ss << generate_fw_kernels(name, true); 444 | return ss.str(); 445 | } 446 | 447 | 448 | 449 | template 450 | std::string LibDNNPool::generate_bw_kernels(std::string name) { 451 | std::stringstream ss; 452 | 453 | ss << "__kernel void " + name + "("; 454 | ss << "__global const Dtype* __restrict top_diff, "; 455 | ss << "__global Dtype* __restrict bottom_diff, "; 456 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 457 | if (use_top_mask_) { 458 | ss << "__global const Dtype* __restrict top_mask, "; 459 | } else { 460 | ss << "__global const int_tp* __restrict mask, "; 461 | } 462 | } 463 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 464 | ss << "__global const Dtype* __restrict rand_idx, "; 465 | 
} 466 | ss << "int_tp channels, "; 467 | ss << "int_tp batch_size"; 468 | ss << ") {" << std::endl; 469 | if (bwalgo_ == LIBDNN_POOLING_BW_ALGO_ATOMIC) { 470 | // Atomic kernel 471 | ss << "int_tp in_idx = get_global_id(0);" << std::endl; 472 | ss << "if (get_global_id(1) >= channels * batch_size) {return;}" 473 | << std::endl; 474 | ss << "int_tp idx_0 = get_global_id(0);" << std::endl; 475 | for (int_tp i = num_axes_ - 1; i >= 1; --i) { 476 | ss << "int_tp idx_" << i << " = (idx_0 % v_imso_" << i << ");" 477 | << std::endl; 478 | ss << "idx_" << i << " = idx_" << i << " * v_s_" 479 | << i << " - v_p_" << i << ";" << std::endl; 480 | ss << "idx_0 /= v_imso_" << i << ";" << std::endl; 481 | } 482 | ss << "if (idx_0 >= v_imso_0) {return;}" << std::endl; 483 | 484 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 485 | ss << "idx_0 = idx_0 * v_s_0 - v_p_0;" << std::endl; 486 | ss << "int_tp out_idx = idx_0;" << std::endl; 487 | for (int_tp i = 1; i < num_axes_; ++i) { 488 | ss << "out_idx = out_idx * v_imsi_" << i 489 | << " + " << "idx_" << i << ";" << std::endl; 490 | } 491 | ss << "__global Dtype* out_ptr = bottom_diff " 492 | << "+ get_global_id(1) * v_imsi + out_idx;" << std::endl; 493 | } else { 494 | ss << "__global Dtype* out_ptr = bottom_diff " 495 | << "+ get_global_id(1) * v_imsi;" << std::endl; 496 | } 497 | ss << "__global const Dtype* in_ptr = top_diff " 498 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 499 | 500 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 501 | if (use_top_mask_) { 502 | ss << "__global const Dtype* mask_ptr = top_mask " 503 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 504 | } else { 505 | ss << "__global const int_tp* mask_ptr = mask " 506 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 507 | } 508 | } 509 | 510 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 511 | ss << "__global const Dtype* rand_ptr = rand_idx " 512 | << "+ get_global_id(1) * v_imso + in_idx;" << std::endl; 513 | } 514 | 515 | std::vector d_iter; 516 | 517 | for (int_tp i = 0; i < kernel_shape_.size(); ++i) { 518 | d_iter.push_back(0); 519 | } 520 | 521 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 522 | int_tp ave = std::accumulate(kernel_shape_.begin(), 523 | kernel_shape_.end(), 524 | 1, std::multiplies()); 525 | ss << "int_tp ave = " << ave << ";" << std::endl; 526 | ss << "Dtype val = in_ptr[0];" << std::endl; 527 | } 528 | 529 | for (int_tp ave_idx = 0; 530 | ave_idx < ((pool_method_ == LIBDNN_POOLING_METHOD_AVE) 531 | ? 
2 : 0); ++ave_idx) { 532 | if (ave_idx == 1) { 533 | ss << "val /= ((Dtype)ave);" << std::endl; 534 | } 535 | // Loop over the kernel 536 | bool incremented; 537 | do { 538 | int_tp kernel_offset = 0; 539 | int_tp size_prod = 1; 540 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 541 | kernel_offset += size_prod * d_iter[i] * dilation_[i]; 542 | size_prod *= im_in_shape_[i]; 543 | } 544 | 545 | bool pad_guard = false; 546 | bool overspill_guard = false; 547 | for (int_tp i = 0; i < num_axes_; ++i) { 548 | if (d_iter[i] * dilation_[i] < pad_[i]) { 549 | pad_guard = true; 550 | } 551 | if (d_iter[i] * dilation_[i] >= 552 | ((kernel_shape_[i] - 1) * dilation_[i] + 1) - pad_[i] || 553 | (im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 554 | * dilation_[i] - pad_[i] >= im_in_shape_[i] ) { 555 | pad_guard = true; 556 | } 557 | if ((im_out_shape_[i] - 1) * stride_[i] + d_iter[i] 558 | * dilation_[i] - pad_[i] >= im_in_shape_[i]) { 559 | overspill_guard = true; 560 | } 561 | } 562 | 563 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 564 | ss << "if ("; 565 | } 566 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 567 | for (int_tp i = 0; i < num_axes_; ++i) { 568 | if (d_iter[i] * dilation_[i] < pad_[i]) { 569 | ss << "idx_" << i << " >= -" << (d_iter[i] * dilation_[i]) 570 | << " && "; 571 | } 572 | if ((d_iter[i] * dilation_[i] >= ((kernel_shape_[i] - 1) 573 | * dilation_[i] + 1) - pad_[i]) || 574 | ((im_out_shape_[i] - 1) * stride_[i] 575 | + d_iter[i] * dilation_[i] - pad_[i] 576 | >= im_in_shape_[i])) { 577 | ss << "idx_" << i << " < v_imsi_" << i << " - " 578 | << (d_iter[i] * dilation_[i]) << " && "; 579 | } 580 | } 581 | } 582 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 583 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 584 | ss << "true) {" << std::endl; 585 | } 586 | if (ave_idx == 1) { 587 | ss << "atomicAdd((&out_ptr[" << kernel_offset << "]), val);" 588 | << std::endl; 589 | } 590 | if ((ave_idx == 1) && (pad_guard || overspill_guard)) { 591 | ss << "}" << std::endl; 592 | } 593 | if (overspill_guard && ave_idx == 0) { 594 | ss << "if ("; 595 | for (int_tp i = 0; i < num_axes_; ++i) { 596 | if ((im_out_shape_[i] - 1) * stride_[i] 597 | + d_iter[i] * dilation_[i] - pad_[i] 598 | >= im_in_shape_[i]) { 599 | ss << "idx_" << i << " + " << d_iter[i] * dilation_[i] 600 | << " >= v_imsi_" << i << " + " 601 | << pad_[i] << " || "; 602 | } 603 | } 604 | ss << "false) {--ave;}" << std::endl; 605 | } 606 | } 607 | 608 | incremented = false; 609 | for (int_tp i = num_axes_ - 1; i >= 0; --i) { 610 | if (d_iter[i] >= kernel_shape_[i] - 1) { 611 | d_iter[i] = 0; 612 | } else { 613 | d_iter[i] += 1; 614 | incremented = true; 615 | break; 616 | } 617 | } 618 | } while (incremented); 619 | } 620 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 621 | ss << "if (mask_ptr[0] >= 0 && mask_ptr[0] < v_imsi) {" << std::endl; 622 | ss << "atomicAdd(&out_ptr[(int_tp)(mask_ptr[0])], " 623 | << "in_ptr[0]);" << std::endl; 624 | ss << "}" << std::endl; 625 | } 626 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 627 | ss << "if (mask_ptr[0] >= 0 && mask_ptr[0] < v_imsi) {" << std::endl; 628 | ss << "atomicAdd(&out_ptr[(int_tp)(rand_ptr[0])], " 629 | << "in_ptr[0]);" << std::endl; 630 | ss << "}" << std::endl; 631 | } 632 | 633 | } else { 634 | // Direct, deterministic kernel 635 | ss << "int_tp d_start[" << num_axes_ << "];" << std::endl; 636 | ss << "int_tp d_end[" << num_axes_ << "];" << std::endl; 637 | ss << "int_tp d_iter[" << num_axes_ << "];" << std::endl; 638 | 639 | 
ss << "int_tp out_idx = get_global_id(0);" << std::endl; 640 | ss << "int_tp idx_0 = get_global_id(0);" << std::endl; 641 | ss << "if (get_global_id(1) >= channels * batch_size) {return;}" 642 | << std::endl; 643 | 644 | for (int_tp i = num_axes_ - 1; i >= 1; --i) { 645 | ss << "int_tp idx_" << i << " = (idx_0 % v_imsi_" << i << ");" 646 | << std::endl; 647 | ss << "idx_0 /= v_imsi_" << i << ";" << std::endl; 648 | } 649 | ss << "if (idx_0 >= v_imsi_0) {return;}" << std::endl; 650 | 651 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 652 | ss << "__global Dtype* out_ptr = bottom_diff " 653 | << "+ get_global_id(1) * v_imsi + out_idx;" << std::endl; 654 | } else { 655 | ss << "__global Dtype* out_ptr = bottom_diff " 656 | << "+ get_global_id(1) * v_imsi + out_idx;" << std::endl; 657 | } 658 | ss << "__global const Dtype* in_ptr = top_diff " 659 | << "+ get_global_id(1) * v_imso;" << std::endl; 660 | 661 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 662 | if (use_top_mask_) { 663 | ss << "__global const Dtype* mask_ptr = top_mask " 664 | << "+ get_global_id(1) * v_imso;" << std::endl; 665 | } else { 666 | ss << "__global const int_tp* mask_ptr = mask " 667 | << "+ get_global_id(1) * v_imso;" << std::endl; 668 | } 669 | } 670 | 671 | if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 672 | ss << "__global const Dtype* rand_ptr = rand_idx " 673 | << "+ get_global_id(1) * v_imso;" << std::endl; 674 | } 675 | 676 | for (int_tp i = 0; i < num_axes_; ++i) { 677 | ss << "d_start[" << i << "] = (idx_" << i << " + v_p_" << i << " < " 678 | << "((v_k_" << i << " - 1) * v_d_" << i << " + 1)) ? 0 : (idx_" << i 679 | << " + v_p_" << i 680 | << " - ((v_k_" << i << " - 1) * v_d_" << i << " + 1))" 681 | << " / v_s_" << i << " + 1;" << std::endl; 682 | ss << "d_end[" << i << "] = min(v_imso_" << i << " - 1, " 683 | << "(idx_" << i << " + v_p_" << i << ")" 684 | << " / v_s_" << i << ");" << std::endl; 685 | ss << "d_iter[" << i << "] = d_start[" << i << "];" << std::endl; 686 | ss << "if (d_start[" << i << "] > d_end[" << i << "]) {" << std::endl; 687 | ss << "out_ptr[0] = 0;" << std::endl; 688 | ss << "return;" << std::endl; 689 | ss << "}" << std::endl; 690 | } 691 | 692 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 693 | ss << "int_tp av_start[" << num_axes_ << "];" << std::endl; 694 | ss << "int_tp av_end[" << num_axes_ << "];" << std::endl; 695 | } 696 | // ss << "printf(\"%f\\n\", (float)ave);" << std::endl; 697 | ss << "Dtype gradient = 0.0;" << std::endl; 698 | ss << "bool incremented;" << std::endl; 699 | ss << "do {" << std::endl; 700 | ss << "int_tp offset = 0;" << std::endl; 701 | for (int_tp i = 0; i < num_axes_; ++i) { 702 | ss << "offset += d_iter[" << i << "];" << std::endl; 703 | if (i < num_axes_ - 1) { 704 | ss << "offset *= v_imso_" << (i + 1) << ";" << std::endl; 705 | } 706 | } 707 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 708 | ss << "int_tp ave = 1;" << std::endl; 709 | for (int_tp i = 0; i < num_axes_; ++i) { 710 | ss << "av_start[" << i << "] = d_iter[" << i << "] * v_s_" << i 711 | << " - v_p_" << i << ";" << std::endl; 712 | ss << "av_end[" << i << "] = min(av_start[" << i << "] + ((v_k_" 713 | << i << " - 1) * v_d_" 714 | << i << " + 1), v_imsi_" << i << " + v_p_" << i << ");" 715 | << std::endl; 716 | ss << "ave *= ((av_end[" << i << "] - av_start[" << i << "] - 1) / v_d_" 717 | << i << " + 1);" 718 | << std::endl; 719 | } 720 | } 721 | // Dilation filters 722 | bool has_dilation = false; 723 | for (int_tp i = 0; i < num_axes_; ++i) { 724 | if (dilation_[i] > 
1) { 725 | has_dilation = true; 726 | } 727 | } 728 | if (has_dilation && 729 | (pool_method_ == LIBDNN_POOLING_METHOD_AVE || 730 | pool_method_ == LIBDNN_POOLING_METHOD_STO)) { 731 | ss << "if ("; 732 | for (int i = 0; i < num_axes_; ++i) { 733 | ss << "idx_" << i << " >= av_start[" << i << "] && "; 734 | ss << "idx_" << i << " < av_end[" << i << "] && "; 735 | ss << "(idx_" << i <<" - av_start[" << i << "]) % v_d_" << i << " == 0" 736 | << " && "; 737 | } 738 | ss << "true) {" << std::endl; 739 | } 740 | if (pool_method_ == LIBDNN_POOLING_METHOD_MAX) { 741 | ss << "if ((int_tp)mask_ptr[offset] == out_idx) {" << std::endl; 742 | } else if (pool_method_ == LIBDNN_POOLING_METHOD_STO) { 743 | ss << "if ((int_tp)rand_ptr[offset] == out_idx) {" << std::endl; 744 | } else { 745 | ss << "{" << std::endl; 746 | } 747 | ss << "gradient += in_ptr[offset]"; 748 | if (pool_method_ == LIBDNN_POOLING_METHOD_AVE) { 749 | ss << " / (Dtype)ave;" << std::endl; 750 | } else { 751 | ss << ";" << std::endl; 752 | } 753 | ss << "}" << std::endl; 754 | if (has_dilation && 755 | (pool_method_ == LIBDNN_POOLING_METHOD_AVE || 756 | pool_method_ == LIBDNN_POOLING_METHOD_STO)) { 757 | ss << "}" << std::endl; 758 | } 759 | // Increment 760 | ss << "incremented = false;" << std::endl; 761 | ss << "for (int_tp i = v_nax - 1; i >= 0; --i) {" << std::endl; 762 | ss << "if (d_iter[i] >= d_end[i]) {" << std::endl; 763 | ss << "d_iter[i] = d_start[i];" << std::endl; 764 | ss << "} else {" << std::endl; 765 | ss << "++d_iter[i];" << std::endl; 766 | ss << "incremented = true;" << std::endl; 767 | ss << "break;" << std::endl; 768 | ss << "}}} while (incremented);" << std::endl; 769 | 770 | ss << "out_ptr[0] = gradient;" << std::endl; 771 | } // Deterministic kernel 772 | ss << "}" << std::endl; // Kernel 773 | 774 | return ss.str(); 775 | } 776 | 777 | template 778 | void LibDNNPool::GenerateKernels() { 779 | std::stringstream ss; 780 | 781 | ss << LibDNN::generate_header(); 782 | ss << generate_fw_defs(); 783 | ss << generate_fwtr_kernels("pool_forward_train"); 784 | ss << generate_fwte_kernels("pool_forward_test"); 785 | ss << generate_bw_defs(); 786 | ss << generate_bw_kernels("pool_backward"); 787 | 788 | // Write complete kernel string 789 | LibDNN::kernel_ = ss.str(); 790 | } 791 | 792 | template 793 | void LibDNNPool::Forward(const Dtype* bottom_data, 794 | Dtype* top_data, 795 | int_tp channels, 796 | int_tp batch_size, 797 | bool test_mode, 798 | int_tp* mask, 799 | Dtype* top_mask, 800 | Dtype* rand_idx) { 801 | int_tp imsi = std::accumulate(im_in_shape_.begin(), im_in_shape_.end(), 802 | 1, std::multiplies()); 803 | int_tp imso = std::accumulate(im_out_shape_.begin(), im_out_shape_.end(), 804 | 1, std::multiplies()); 805 | 806 | int_tp lw0 = fw_tuner_->get_param("LW0"); 807 | int_tp lw1 = fw_tuner_->get_param("LW1"); 808 | 809 | #ifdef USE_OPENCL 810 | if (LibDNN::dev_ptr_->backend() == BACKEND_OpenCL) { 811 | viennacl::ocl::kernel &kernel = 812 | LibDNN::ocl_program_.get_kernel( 813 | test_mode ? 
"pool_forward_test" : "pool_forward_train"); 814 | viennacl::ocl::context &ctx = 815 | viennacl::ocl::get_context(LibDNN::dev_ptr_->id()); 816 | 817 | kernel.local_work_size(0, lw0); 818 | kernel.local_work_size(1, lw1); 819 | kernel.local_work_size(2, 1); 820 | 821 | kernel.global_work_size(0, ((imso - 1) / lw0 + 1) * lw0); 822 | kernel.global_work_size(1, ((channels * batch_size - 1) / lw1 + 1) * lw1); 823 | kernel.global_work_size(2, 1); 824 | 825 | switch (pool_method_) { 826 | case LIBDNN_POOLING_METHOD_MAX: 827 | if (use_top_mask_) { 828 | viennacl::ocl::enqueue( 829 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 830 | WrapHandle((cl_mem) top_data, &ctx), 831 | WrapHandle((cl_mem) top_mask, &ctx), 832 | channels, 833 | batch_size), 834 | ctx.get_queue()); 835 | } else { 836 | viennacl::ocl::enqueue( 837 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 838 | WrapHandle((cl_mem) top_data, &ctx), 839 | WrapHandle((cl_mem) mask, &ctx), 840 | channels, 841 | batch_size), 842 | ctx.get_queue()); 843 | } 844 | break; 845 | case LIBDNN_POOLING_METHOD_AVE: 846 | viennacl::ocl::enqueue( 847 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 848 | WrapHandle((cl_mem) top_data, &ctx), 849 | channels, 850 | batch_size), 851 | ctx.get_queue()); 852 | break; 853 | case LIBDNN_POOLING_METHOD_STO: 854 | viennacl::ocl::enqueue( 855 | kernel(WrapHandle((cl_mem) bottom_data, &ctx), 856 | WrapHandle((cl_mem) top_data, &ctx), 857 | WrapHandle((cl_mem) rand_idx, &ctx), 858 | channels, 859 | batch_size), 860 | ctx.get_queue()); 861 | break; 862 | } 863 | } 864 | #endif // USE_OPENCL 865 | 866 | #ifdef USE_CUDA 867 | if (LibDNN::dev_ptr_->backend() == BACKEND_CUDA) { 868 | CUfunction kernel; 869 | cuModuleGetFunction(&kernel, LibDNN::cuda_module_, 870 | test_mode ? "pool_forward_test" : "pool_forward_train"); 871 | 872 | switch (pool_method_) { 873 | case LIBDNN_POOLING_METHOD_MAX: { 874 | if (use_top_mask_) { 875 | void *args[] = { &bottom_data, &top_data, &top_mask, 876 | &channels, &batch_size }; 877 | cuLaunchKernel(kernel, 878 | (imso - 1) / lw0 + 1, // Grid X 879 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 880 | 1, // Grid Z 881 | lw0, lw1, 1, // Local 882 | 0, NULL, args, 0); // Arguments 883 | } else { 884 | void *args[] = { &bottom_data, &top_data, &mask, 885 | &channels, &batch_size }; 886 | cuLaunchKernel(kernel, 887 | (imso - 1) / lw0 + 1, // Grid X 888 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 889 | 1, // Grid Z 890 | lw0, lw1, 1, // Local 891 | 0, NULL, args, 0); // Arguments 892 | } 893 | break; 894 | } 895 | case LIBDNN_POOLING_METHOD_AVE: { 896 | void *args[] = { &bottom_data, &top_data, 897 | &channels, &batch_size }; 898 | cuLaunchKernel(kernel, 899 | (imso - 1) / lw0 + 1, // Grid X 900 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 901 | 1, // Grid Z 902 | lw0, lw1, 1, // Local 903 | 0, NULL, args, 0); // Arguments 904 | break; 905 | } 906 | case LIBDNN_POOLING_METHOD_STO: { 907 | void *args[] = { &bottom_data, &top_data, &rand_idx, 908 | &channels, &batch_size }; 909 | cuLaunchKernel(kernel, 910 | (imso - 1) / lw0 + 1, // Grid X 911 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 912 | 1, // Grid Z 913 | lw0, lw1, 1, // Local 914 | 0, NULL, args, 0); // Arguments 915 | break; 916 | } 917 | } 918 | cuCtxSynchronize(); 919 | } 920 | #endif // USE_CUDA 921 | } 922 | 923 | 924 | template 925 | void LibDNNPool::Backward(const Dtype* top_diff, 926 | Dtype* bottom_diff, 927 | int_tp channels, 928 | int_tp batch_size, 929 | const int_tp* mask, 930 | const Dtype* top_mask, 
931 | const Dtype* rand_idx) { 932 | int_tp ims = batch_size * channels; 933 | for (int_tp i = 0; i < im_in_shape_.size(); ++i) { 934 | ims *= im_in_shape_[i]; 935 | } 936 | LibDNN::SetMemory(bottom_diff, ims, 0, (Dtype) 0); 937 | 938 | int_tp imsi = std::accumulate(im_in_shape_.begin(), im_in_shape_.end(), 939 | 1, std::multiplies()); 940 | int_tp imso = std::accumulate(im_out_shape_.begin(), im_out_shape_.end(), 941 | 1, std::multiplies()); 942 | 943 | int_tp imsw = 0; 944 | if (bwalgo_ == LIBDNN_POOLING_BW_ALGO_DIRECT) { 945 | // Direct kernel iterates over input size 946 | imsw = imsi; 947 | } else { 948 | // Atomic kernel iterates over output size 949 | imsw = imso; 950 | } 951 | 952 | int_tp lw0 = bw_tuner_->get_param("LW0"); 953 | int_tp lw1 = bw_tuner_->get_param("LW1"); 954 | 955 | #ifdef USE_OPENCL 956 | if (LibDNN::dev_ptr_->backend() == BACKEND_OpenCL) { 957 | viennacl::ocl::kernel &kernel = 958 | LibDNN::ocl_program_.get_kernel("pool_backward"); 959 | viennacl::ocl::context &ctx = 960 | viennacl::ocl::get_context(LibDNN::dev_ptr_->id()); 961 | 962 | kernel.local_work_size(0, lw0); 963 | kernel.local_work_size(1, lw1); 964 | kernel.local_work_size(2, 1); 965 | 966 | kernel.global_work_size(0, ((imsw - 1) / lw0 + 1) * lw0); 967 | kernel.global_work_size(1, ((channels * batch_size - 1) / lw1 + 1) * lw1); 968 | kernel.global_work_size(2, 1); 969 | 970 | switch (pool_method_) { 971 | case LIBDNN_POOLING_METHOD_MAX: 972 | if (use_top_mask_) { 973 | viennacl::ocl::enqueue( 974 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 975 | WrapHandle((cl_mem) bottom_diff, &ctx), 976 | WrapHandle((cl_mem) top_mask, &ctx), 977 | channels, 978 | batch_size), 979 | ctx.get_queue()); 980 | } else { 981 | viennacl::ocl::enqueue( 982 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 983 | WrapHandle((cl_mem) bottom_diff, &ctx), 984 | WrapHandle((cl_mem) mask, &ctx), 985 | channels, 986 | batch_size), 987 | ctx.get_queue()); 988 | } 989 | break; 990 | case LIBDNN_POOLING_METHOD_AVE: 991 | viennacl::ocl::enqueue( 992 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 993 | WrapHandle((cl_mem) bottom_diff, &ctx), 994 | channels, 995 | batch_size), 996 | ctx.get_queue()); 997 | break; 998 | case LIBDNN_POOLING_METHOD_STO: 999 | viennacl::ocl::enqueue( 1000 | kernel(WrapHandle((cl_mem) top_diff, &ctx), 1001 | WrapHandle((cl_mem) bottom_diff, &ctx), 1002 | WrapHandle((cl_mem) rand_idx, &ctx), 1003 | channels, 1004 | batch_size), 1005 | ctx.get_queue()); 1006 | break; 1007 | } 1008 | } 1009 | #endif // USE_OPENCL 1010 | 1011 | #ifdef USE_CUDA 1012 | if (LibDNN::dev_ptr_->backend() == BACKEND_CUDA) { 1013 | CUfunction kernel; 1014 | cuModuleGetFunction(&kernel, LibDNN::cuda_module_, "pool_backward"); 1015 | 1016 | switch (pool_method_) { 1017 | case LIBDNN_POOLING_METHOD_MAX: { 1018 | if (use_top_mask_) { 1019 | void *args[] = { &top_diff, &bottom_diff, &top_mask, 1020 | &channels, &batch_size }; 1021 | cuLaunchKernel(kernel, 1022 | (imsw - 1) / lw0 + 1, // Grid X 1023 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 1024 | 1, // Grid Z 1025 | lw0, lw1, 1, // Local 1026 | 0, NULL, args, 0); // Arguments 1027 | } else { 1028 | void *args[] = { &top_diff, &bottom_diff, &mask, 1029 | &channels, &batch_size }; 1030 | cuLaunchKernel(kernel, 1031 | (imsw - 1) / lw0 + 1, // Grid X 1032 | (channels * batch_size - 1) / lw1 + 1, // Grid Y 1033 | 1, // Grid Z 1034 | lw0, lw1, 1, // Local 1035 | 0, NULL, args, 0); // Arguments 1036 | } 1037 | break; 1038 | } 1039 | case LIBDNN_POOLING_METHOD_AVE: { 1040 | void *args[] = { 
&top_diff, &bottom_diff,
1041 | &channels, &batch_size };
1042 | cuLaunchKernel(kernel,
1043 | (imsw - 1) / lw0 + 1, // Grid X
1044 | (channels * batch_size - 1) / lw1 + 1, // Grid Y
1045 | 1, // Grid Z
1046 | lw0, lw1, 1, // Local
1047 | 0, NULL, args, 0); // Arguments
1048 | break;
1049 | }
1050 | case LIBDNN_POOLING_METHOD_STO: {
1051 | void *args[] = { &top_diff, &bottom_diff, &rand_idx,
1052 | &channels, &batch_size };
1053 | cuLaunchKernel(kernel,
1054 | (imsw - 1) / lw0 + 1, // Grid X
1055 | (channels * batch_size - 1) / lw1 + 1, // Grid Y
1056 | 1, // Grid Z
1057 | lw0, lw1, 1, // Local
1058 | 0, NULL, args, 0); // Arguments
1059 | break;
1060 | }
1061 | }
1062 | cuCtxSynchronize();
1063 | }
1064 | #endif // USE_CUDA
1065 | }
1066 |
1067 | template class LibDNNPool<float>;
1068 | template class LibDNNPool<double>;
1069 |
1070 | }  // namespace greentea
1071 |
--------------------------------------------------------------------------------
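Editor's appendix (illustrative, not part of the repository): a forward/backward
call sequence matching the Forward() and Backward() signatures above. Buffer
names and sizes are placeholders, and the pointers must reference device memory
sized to the shapes of the config sketched earlier.

  // Hypothetical buffers for the 2x2 max-pooling config (N=1, C=32).
  LibDNNPool<float> pool(config);
  pool.Forward(bottom_data, top_data,
               /*channels=*/32, /*batch_size=*/1,
               /*test_mode=*/false,
               mask, /*top_mask=*/nullptr, /*rand_idx=*/nullptr);
  // ...fill top_diff with the gradient w.r.t. top_data...
  pool.Backward(top_diff, bottom_diff,
                /*channels=*/32, /*batch_size=*/1,
                mask, /*top_mask=*/nullptr, /*rand_idx=*/nullptr);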