├── .gitmodules
├── CMakeLists.txt
├── README.md
├── cmake
│   ├── ConfigGen.cmake
│   ├── Cuda.cmake
│   ├── Dependencies.cmake
│   ├── Modules
│   │   ├── FindAtlas.cmake
│   │   └── FindOpenBLAS.cmake
│   ├── Targets.cmake
│   ├── Templates
│   │   ├── DAUConvNetConfig.cmake.in
│   │   └── dau_conv_config.h.in
│   └── Utils.cmake
├── include
│   └── dau_conv
│       ├── base_dau_conv_layer.hpp
│       ├── dau_conv_impl
│       │   ├── dau_conv_backward.hpp
│       │   ├── dau_conv_backward_core.hpp
│       │   ├── dau_conv_forward.hpp
│       │   └── dau_conv_forward_core.hpp
│       └── util
│           ├── common.hpp
│           ├── convolve.hpp
│           ├── im2col.hpp
│           ├── math_functions.hpp
│           └── mkl_alternate.hpp
├── plugins
│   └── tensorflow
│       ├── CMakeLists.txt
│       ├── MANIFEST.in.in
│       ├── build-ci
│       │   └── build-whl.sh
│       ├── dau_conv
│       │   ├── __init__.py
│       │   ├── _dau_conv_grad_op.py
│       │   ├── dau_conv.py
│       │   └── test
│       │       └── __main__.py
│       ├── docker
│       │   ├── Dockerfile
│       │   ├── Dockerfile.ubuntu18.04
│       │   ├── test_dau.sh
│       │   └── verify_dau_import.py
│       ├── scripts
│       │   └── start_main_build.sh
│       ├── setup.py.in
│       └── src
│           ├── dau_conv_grad_op.cpp
│           ├── dau_conv_layer_tensorflow.cpp
│           ├── dau_conv_layer_tensorflow.hpp
│           └── dau_conv_op.cpp
└── src
    ├── dau_conv
    │   ├── CMakeLists.txt
    │   ├── base_dau_conv_layer.cpp
    │   ├── base_dau_conv_layer.cu
    │   ├── dau_conv_impl
    │   │   ├── dau_conv_backward.cpp
    │   │   ├── dau_conv_backward_patch_16x16.cu
    │   │   ├── dau_conv_backward_patch_16x32.cu
    │   │   ├── dau_conv_backward_patch_16x64.cu
    │   │   ├── dau_conv_backward_patch_16x8.cu
    │   │   ├── dau_conv_backward_patch_1x1.cu
    │   │   ├── dau_conv_backward_patch_32x16.cu
    │   │   ├── dau_conv_backward_patch_32x32.cu
    │   │   ├── dau_conv_backward_patch_32x64.cu
    │   │   ├── dau_conv_backward_patch_32x8.cu
    │   │   ├── dau_conv_backward_patch_64x16.cu
    │   │   ├── dau_conv_backward_patch_64x32.cu
    │   │   ├── dau_conv_backward_patch_64x64.cu
    │   │   ├── dau_conv_backward_patch_64x8.cu
    │   │   ├── dau_conv_backward_patch_8x16.cu
    │   │   ├── dau_conv_backward_patch_8x32.cu
    │   │   ├── dau_conv_backward_patch_8x64.cu
    │   │   ├── dau_conv_backward_patch_8x8.cu
    │   │   ├── dau_conv_forward.cpp
    │   │   ├── dau_conv_forward_off16_s0_f1.cu
    │   │   ├── dau_conv_forward_off16_s1_f1.cu
    │   │   ├── dau_conv_forward_off32_s1_f1.cu
    │   │   ├── dau_conv_forward_off4_s0_f0.cu
    │   │   ├── dau_conv_forward_off4_s0_f1.cu
    │   │   ├── dau_conv_forward_off4_s1_f0.cu
    │   │   ├── dau_conv_forward_off4_s1_f1.cu
    │   │   ├── dau_conv_forward_off8_s0_f0.cu
    │   │   ├── dau_conv_forward_off8_s0_f1.cu
    │   │   ├── dau_conv_forward_off8_s1_f0.cu
    │   │   └── dau_conv_forward_off8_s1_f1.cu
    │   └── util
    │       ├── common.cpp
    │       ├── convolve.cpp
    │       ├── convolve.cu
    │       ├── im2col.cpp
    │       ├── math_functions.cpp
    │       └── math_functions.cu
    └── main.cpp

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "include/cub"]
	path = include/cub
	url = https://github.com/NVlabs/cub.git
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# DAU-ConvNet project exposes the following variables:
#  - DAUConvNet_OBJ_TARGET: name of object target (that can be used as dependency)
#  - DAUConvNet_OBJS: pre-compiled .o objects (.cpp and .cu files)
#  - DAUConvNet_CU_OBJS: pre-compiled .cu.o objects (only resulting CUDA objects !!)
#  - DAUConvNet_INCLUDE_DIRS: include dirs of dependencies for DAU-ConvNet
#  - DAUConvNet_LINKER_LIBS: linker libs of dependencies for DAU-ConvNet
#  - DAUConvNet_INCLUDE_DIR: include dir for DAU-ConvNet (i.e. 3rdparty/DAU-ConvNet/include)
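#
# A minimal consumption sketch (assuming this repository is vendored under
# 3rdparty/DAU-ConvNet; the target name "my_net" is hypothetical):
#
#   add_subdirectory(3rdparty/DAU-ConvNet)
#   add_library(my_net SHARED ${MY_SRCS} ${DAUConvNet_OBJS})
#   add_dependencies(my_net ${DAUConvNet_OBJ_TARGET})
#   include_directories(${DAUConvNet_INCLUDE_DIR})
#   target_link_libraries(my_net ${DAUConvNet_LINKER_LIBS})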

cmake_minimum_required(VERSION 2.8.8)
if(POLICY CMP0046)
  cmake_policy(SET CMP0046 NEW)
endif()
if(POLICY CMP0054)
  cmake_policy(SET CMP0054 NEW)
endif()

# ---[ DAU-ConvNet project
project(DAUConvNet C CXX)

set(PACKAGE_VERSION "1.0" CACHE STRING "DAU-ConvNet version number")

# ---[ Using cmake scripts and modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

include(cmake/Utils.cmake)
include(cmake/Targets.cmake)
include(cmake/ConfigGen.cmake)

# ---[ Using C++11 or newer

include(CheckCXXCompilerFlag)
CHECK_CXX_COMPILER_FLAG("-std=c++17" COMPILER_SUPPORTS_CXX17)
CHECK_CXX_COMPILER_FLAG("-std=c++14" COMPILER_SUPPORTS_CXX14)
CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)

if(COMPILER_SUPPORTS_CXX14)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
elseif(COMPILER_SUPPORTS_CXX11)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
else()
  message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
endif()

# ---[ Configuration types
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Possible configurations" FORCE)
mark_as_advanced(CMAKE_CONFIGURATION_TYPES)

if(DEFINED CMAKE_BUILD_TYPE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
endif()

# --[ If user doesn't specify build type then assume release
if("${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(CMAKE_BUILD_TYPE Release)
endif()

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  set(CMAKE_COMPILER_IS_CLANGXX TRUE)
endif()

# ---[ Solution folders
dau_conv_option(USE_PROJECT_FOLDERS "IDE Solution folders" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )

if(USE_PROJECT_FOLDERS)
  set_property(GLOBAL PROPERTY USE_FOLDERS ON)
  set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets")
endif()

# ---[ RPATH settings
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Use link paths for shared library rpath")
set(CMAKE_MACOSX_RPATH TRUE)

list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_system_dir)
if(${__is_system_dir} STREQUAL -1)
  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
endif()

# ---[ Set debug postfix
set(DAUConvNet_DEBUG_POSTFIX "-d")

set(DAUConvNet_POSTFIX "")
if(CMAKE_BUILD_TYPE MATCHES "Debug")
  set(DAUConvNet_POSTFIX ${DAUConvNet_DEBUG_POSTFIX})
endif()


# ---[ Options
dau_conv_option(BUILD_TENSORFLOW_PLUGIN "Builds TensorFlow plugin" OFF)
dau_conv_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
dau_conv_option(USE_DUMMY_CUDA_IMPL "For debugging purpose; do not compile CUDA kernels (fast compile time)" OFF)
dau_conv_option(ALLOW_INTERPOLATION_OFF "Build support for disabling interpolation in DAUs" OFF)
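#
# These options are selected at configure time; for example (a hypothetical
# out-of-source invocation):
#
#   mkdir build && cd build
#   cmake -DBUILD_TENSORFLOW_PLUGIN=ON -DBUILD_SHARED_LIBS=ON ..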
-stdlib=libstdc++") 106 | message("-- Warning: forcing libstdc++ (controlled by USE_libstdcpp option in cmake)") 107 | endif() 108 | 109 | if(${USE_DUMMY_CUDA_IMPL}) 110 | list(APPEND DAUConvNet_DEFINITIONS "-DDAU_USE_DUMMY_CUDA_IMPL") 111 | endif() 112 | 113 | if(${ALLOW_INTERPOLATION_OFF}) 114 | list(APPEND DAUConvNet_DEFINITIONS "-DDAU_ALLOW_INTERPOLATION_OFF") 115 | endif() 116 | 117 | # ---[ Warnings 118 | dau_conv_warnings_disable(CMAKE_CXX_FLAGS -Wno-sign-compare -Wno-uninitialized) 119 | 120 | # ---[ Config generation 121 | configure_file(cmake/Templates/dau_conv_config.h.in "${PROJECT_BINARY_DIR}/dau_conv_config.h") 122 | 123 | # ---[ Includes 124 | set(DAUConvNet_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) 125 | set(DAUConvNet_SRC_DIR ${PROJECT_SOURCE_DIR}/src) 126 | include_directories(${PROJECT_BINARY_DIR}) 127 | 128 | # ---[ Includes & defines for CUDA 129 | 130 | # cuda_compile() does not have per-call dependencies or include pathes 131 | # (cuda_compile() has per-call flags, but we set them here too for clarity) 132 | # 133 | # list(REMOVE_ITEM ...) invocations remove PRIVATE and PUBLIC keywords from collected definitions and include pathes 134 | if(HAVE_CUDA) 135 | # Add includes to CUB only for CUDA lower than 11.8 which does not provide CUB 136 | if("${CUDA_VERSION}" VERSION_LESS "11.8") 137 | message(STATUS "CUDA_VERSION (${CUDA_VERSION}) is less than 11.8, adding CUB to include pathes") 138 | list(APPEND DAUConvNet_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/include/cub) 139 | endif() 140 | 141 | # pass include pathes to cuda_include_directories() 142 | set(DAUConvNet_ALL_INCLUDE_DIRS ${DAUConvNet_INCLUDE_DIRS}) 143 | list(REMOVE_ITEM DAUConvNet_ALL_INCLUDE_DIRS PRIVATE PUBLIC) 144 | cuda_include_directories(${DAUConvNet_INCLUDE_DIR} ${DAUConvNet_SRC_DIR} ${DAUConvNet_ALL_INCLUDE_DIRS}) 145 | 146 | # add definitions to nvcc flags directly 147 | set(DAUConvNet_ALL_DEFINITIONS ${DAUConvNet_DEFINITIONS}) 148 | list(REMOVE_ITEM DAUConvNet_ALL_DEFINITIONS PRIVATE PUBLIC) 149 | list(APPEND CUDA_NVCC_FLAGS ${DAUConvNet_ALL_DEFINITIONS}) 150 | else() 151 | message( FATAL_ERROR "MISSING CUDA: DAU-ConvNet implementation requires CUDA") 152 | endif() 153 | 154 | # ---[ Subdirectories 155 | add_subdirectory(src/dau_conv) 156 | add_subdirectory(plugins/tensorflow) 157 | 158 | # ---[ Export configs generation 159 | dau_conv_generate_export_configs() 160 | 161 | get_directory_property(has_parent PARENT_DIRECTORY) 162 | 163 | if (has_parent) 164 | set(DAUConvNet_OBJ_TARGET ${DAUConvNet_OBJ_TARGET} PARENT_SCOPE) 165 | set(DAUConvNet_OBJS ${DAUConvNet_OBJS} PARENT_SCOPE) 166 | set(DAUConvNet_CU_OBJS ${DAUConvNet_CU_OBJS} PARENT_SCOPE) 167 | set(DAUConvNet_INCLUDE_DIRS ${DAUConvNet_INCLUDE_DIRS} PARENT_SCOPE) 168 | set(DAUConvNet_INCLUDE_DIR ${DAUConvNet_INCLUDE_DIR} PARENT_SCOPE) 169 | set(DAUConvNet_LINKER_LIBS ${DAUConvNet_LIBS} PARENT_SCOPE) 170 | endif() 171 | -------------------------------------------------------------------------------- /cmake/ConfigGen.cmake: -------------------------------------------------------------------------------- 1 | 2 | 3 | ################################################################################################ 4 | # Function for generation DAU-ConvNet build- and install- tree export config files 5 | # Usage: 6 | # dau_conv_generate_export_configs() 7 | function(dau_conv_generate_export_configs) 8 | set(install_cmake_suffix "share/DAUConvNet") 9 | 10 | if(NOT HAVE_CUDA) 11 | set(HAVE_CUDA FALSE) 12 | endif() 13 | 14 | # ---[ Configure build-tree 

  configure_file("cmake/Templates/DAUConvNetConfig.cmake.in" "${PROJECT_BINARY_DIR}/DAUConvNetConfig.cmake" @ONLY)

  # Add targets to the build-tree export set
  export(TARGETS dau-conv FILE "${PROJECT_BINARY_DIR}/DAUConvNetTargets.cmake")
  export(PACKAGE DAUConvNet)

  # ---[ Configure install-tree DAUConvNetConfig.cmake file ]---

  configure_file("cmake/Templates/DAUConvNetConfig.cmake.in" "${PROJECT_BINARY_DIR}/cmake/DAUConvNetConfig.cmake" @ONLY)

  # Install the DAUConvNetConfig.cmake and export set to use with install-tree
  install(FILES "${PROJECT_BINARY_DIR}/cmake/DAUConvNetConfig.cmake" DESTINATION ${install_cmake_suffix})

endfunction()


--------------------------------------------------------------------------------
/cmake/Cuda.cmake:
--------------------------------------------------------------------------------
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
#   dau_conv_cuda_detect_installed_gpus(out_variable)
function(dau_conv_cuda_detect_installed_gpus out_variable)
  if(NOT CUDA_gpu_detect_output)
    set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)

    file(WRITE ${__cufile} ""
      "#include <cstdio>\n"
      "int main()\n"
      "{\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device)\n"
      "  {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")

    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${__cufile}"
                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                    RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(__nvcc_res EQUAL 0)
      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architectures from cuda_detect_gpus tool" FORCE)
    endif()
  endif()

  if(NOT CUDA_gpu_detect_output)
    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
    set(${out_variable} ${cuda_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()


################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
#   dau_conv_cuda_select_nvcc_arch_flags(out_variable)
function(dau_conv_cuda_select_nvcc_arch_flags out_variable)
  # List of arch names
  set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "Lovelace" "Hopper" "All" "Manual")
  set(__archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND __archs_names "Auto")
    set(__archs_name_default "Auto")
  endif()

  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in the CMake GUI)
  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} )
  mark_as_advanced(CUDA_ARCH_NAME)

  # verify CUDA_ARCH_NAME value
  if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
    string(REPLACE ";" ", " __archs_names "${__archs_names}")
    message(FATAL_ERROR "Only ${__archs_names} architecture names are supported.")
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(CUDA_ARCH_BIN ${cuda_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
    set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
    unset(CUDA_ARCH_PTX CACHE)
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(__cuda_arch_bin "30 35")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
    set(__cuda_arch_bin "50")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
    set(__cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
    set(__cuda_arch_bin "70")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
    set(__cuda_arch_bin "75")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
    if ("${CUDA_VERSION}" VERSION_GREATER "11.5" OR "${CUDA_VERSION}" VERSION_EQUAL "11.5")
      set(__cuda_arch_bin "80 86 87")
    else()
      set(__cuda_arch_bin "80 86")
    endif()
  elseif(${CUDA_ARCH_NAME} STREQUAL "Lovelace")
    set(__cuda_arch_bin "89")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
    set(__cuda_arch_bin "90")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(__cuda_arch_bin ${cuda_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
    dau_conv_cuda_detect_installed_gpus(__cuda_arch_bin)
  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()
"" __cuda_arch_ptx "${CUDA_ARCH_PTX}") 108 | string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") 109 | string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") 110 | dau_conv_list_unique(__cuda_arch_bin __cuda_arch_ptx) 111 | 112 | set(__nvcc_flags "") 113 | set(__nvcc_archs_readable "") 114 | 115 | # Tell NVCC to add binaries for the specified GPUs 116 | foreach(__arch ${__cuda_arch_bin}) 117 | if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") 118 | # User explicitly specified PTX for the concrete BIN 119 | list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) 120 | list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) 121 | else() 122 | # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN 123 | list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) 124 | list(APPEND __nvcc_archs_readable sm_${__arch}) 125 | endif() 126 | endforeach() 127 | 128 | # Tell NVCC to add PTX intermediate code for the specified architectures 129 | foreach(__arch ${__cuda_arch_ptx}) 130 | list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) 131 | list(APPEND __nvcc_archs_readable compute_${__arch}) 132 | endforeach() 133 | 134 | string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") 135 | set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) 136 | set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) 137 | endfunction() 138 | 139 | ################################################################################################ 140 | # Short command for cuda compilation 141 | # Usage: 142 | # cuda_compile( ) 143 | macro(dau_conv_cuda_compile objlist_variable) 144 | foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) 145 | set(${var}_backup_in_cuda_compile_ "${${var}}") 146 | 147 | # we remove /EHa as it generates warnings under windows 148 | string(REPLACE "/EHa" "" ${var} "${${var}}") 149 | 150 | endforeach() 151 | 152 | if(UNIX OR APPLE) 153 | # we supprress "declared_but_not_referenced" warning since pops up way too frequently 154 | list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC -Xcudafe --diag_suppress=declared_but_not_referenced) 155 | endif() 156 | 157 | if(APPLE) 158 | list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) 159 | endif() 160 | 161 | cuda_compile(cuda_objcs ${ARGN}) 162 | 163 | foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) 164 | set(${var} "${${var}_backup_in_cuda_compile_}") 165 | unset(${var}_backup_in_cuda_compile_) 166 | endforeach() 167 | 168 | set(${objlist_variable} ${cuda_objcs}) 169 | endmacro() 170 | 171 | ################################################################################################ 172 | ### Non macro section 173 | ################################################################################################ 174 | 175 | find_package(CUDA 5.5 QUIET) 176 | find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand 177 | 178 | # Known NVIDIA GPU achitectures DAUConvNet can be compiled for. 

################################################################################################
# Short command for cuda compilation
# Usage:
#   dau_conv_cuda_compile(<objlist_variable> <cuda_files>)
macro(dau_conv_cuda_compile objlist_variable)
  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var}_backup_in_cuda_compile_ "${${var}}")

    # we remove /EHa as it generates warnings under windows
    string(REPLACE "/EHa" "" ${var} "${${var}}")

  endforeach()

  if(UNIX OR APPLE)
    # we suppress the "declared_but_not_referenced" warning since it pops up way too frequently
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC -Xcudafe --diag_suppress=declared_but_not_referenced)
  endif()

  if(APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
  endif()

  cuda_compile(cuda_objcs ${ARGN})

  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var} "${${var}_backup_in_cuda_compile_}")
    unset(${var}_backup_in_cuda_compile_)
  endforeach()

  set(${objlist_variable} ${cuda_objcs})
endmacro()

################################################################################################
### Non macro section
################################################################################################

find_package(CUDA 5.5 QUIET)
find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility, which doesn't search for curand

# Known NVIDIA GPU architectures DAUConvNet can be compiled for.
# This list will be used for the CUDA_ARCH_NAME = All option
if ("${CUDA_VERSION}" VERSION_GREATER "12.0" OR "${CUDA_VERSION}" VERSION_EQUAL "12.0")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86 87 89 90")
elseif ("${CUDA_VERSION}" VERSION_GREATER "11.8" OR "${CUDA_VERSION}" VERSION_EQUAL "11.8")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86 87 89 90")
elseif ("${CUDA_VERSION}" VERSION_GREATER "11.5" OR "${CUDA_VERSION}" VERSION_EQUAL "11.5")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86 87")
elseif ("${CUDA_VERSION}" VERSION_GREATER "11.0" OR "${CUDA_VERSION}" VERSION_EQUAL "11.0")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86")
elseif ("${CUDA_VERSION}" VERSION_GREATER "10.0" OR "${CUDA_VERSION}" VERSION_EQUAL "10.0")
  set(cuda_known_gpu_archs "50 60 61 70 75")
elseif ("${CUDA_VERSION}" VERSION_GREATER "9.0" OR "${CUDA_VERSION}" VERSION_EQUAL "9.0")
  set(cuda_known_gpu_archs "50 60 61 70")
elseif ("${CUDA_VERSION}" VERSION_GREATER "8.0" OR "${CUDA_VERSION}" VERSION_EQUAL "8.0")
  set(cuda_known_gpu_archs "50 60 61")
else()
  set(cuda_known_gpu_archs "50")
endif()


if(NOT CUDA_FOUND)
  return()
endif()

set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${CUDA_INCLUDE_DIRS})
list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${CUDA_CUDART_LIBRARY}
                                          ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

# setting nvcc arch flags
dau_conv_cuda_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")

# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
if(Boost_VERSION EQUAL 105500)
  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
  # avoid warning for CMake >= 2.8.12
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
endif()

# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
endforeach()

# setting default testing device
if(NOT CUDA_TEST_DEVICE)
  set(CUDA_TEST_DEVICE -1)
endif()

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

# Handle clang/libc++ issue
if(APPLE)
  dau_conv_detect_darwin_version(OSX_VERSION)

  # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    dau_conv_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()
--------------------------------------------------------------------------------
/cmake/Dependencies.cmake:
--------------------------------------------------------------------------------
# These lists are later turned into target properties on the main dau_conv_impl library target
set(DAUConvNet_LINKER_LIBS "")
set(DAUConvNet_INCLUDE_DIRS "")
set(DAUConvNet_DEFINITIONS "")
set(DAUConvNet_COMPILE_OPTIONS "")

# we get a strange error when DAUConvNet_DEFINITIONS is empty, so we just fill it with a gibberish definition to make the compiler happy :)
list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DDUMMYXYMMUD)

# ---[ CUDA
include(cmake/Cuda.cmake)
if(NOT HAVE_CUDA)
  if(CPU_ONLY)
    message(STATUS "-- CUDA is disabled. Building without it...")
  else()
    message(WARNING "-- CUDA is not detected by cmake. Building without it...")
  endif()

  list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DCPU_ONLY)
endif()

# ---[ BLAS
if(NOT APPLE)
  set(BLAS "Atlas" CACHE STRING "Selected BLAS library")
  set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL")

  if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas")
    find_package(Atlas REQUIRED)
    list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${Atlas_INCLUDE_DIR})
    list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${Atlas_LIBRARIES})
  elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open")
    find_package(OpenBLAS REQUIRED)
    list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${OpenBLAS_INCLUDE_DIR})
    list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${OpenBLAS_LIB})
  elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl")
    find_package(MKL REQUIRED)
    list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${MKL_INCLUDE_DIR})
    list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${MKL_LIBRARIES})
    list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_MKL)
  endif()
elseif(APPLE)
  find_package(vecLib REQUIRED)
  list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${vecLib_INCLUDE_DIR})
  list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${vecLib_LINKER_LIBS})

  if(VECLIB_FOUND)
    if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
      list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_ACCELERATE)
    endif()
  endif()
endif()
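
# The BLAS backend is selected at configure time via the BLAS cache variable,
# e.g. (hypothetical invocations):
#   cmake -DBLAS=Atlas ..   # default on non-Apple platforms
#   cmake -DBLAS=Open ..    # OpenBLAS
#   cmake -DBLAS=MKL ..     # Intel MKL (also defines USE_MKL)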
Building without it...") 17 | endif() 18 | 19 | list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DCPU_ONLY) 20 | endif() 21 | 22 | # ---[ BLAS 23 | if(NOT APPLE) 24 | set(BLAS "Atlas" CACHE STRING "Selected BLAS library") 25 | set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") 26 | 27 | if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") 28 | find_package(Atlas REQUIRED) 29 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${Atlas_INCLUDE_DIR}) 30 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${Atlas_LIBRARIES}) 31 | elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") 32 | find_package(OpenBLAS REQUIRED) 33 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${OpenBLAS_INCLUDE_DIR}) 34 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${OpenBLAS_LIB}) 35 | elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") 36 | find_package(MKL REQUIRED) 37 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${MKL_INCLUDE_DIR}) 38 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${MKL_LIBRARIES}) 39 | list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_MKL) 40 | endif() 41 | elseif(APPLE) 42 | find_package(vecLib REQUIRED) 43 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${vecLib_INCLUDE_DIR}) 44 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${vecLib_LINKER_LIBS}) 45 | 46 | if(VECLIB_FOUND) 47 | if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") 48 | list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_ACCELERATE) 49 | endif() 50 | endif() 51 | endif() 52 | -------------------------------------------------------------------------------- /cmake/Modules/FindAtlas.cmake: -------------------------------------------------------------------------------- 1 | # Find the Atlas (and Lapack) libraries 2 | # 3 | # The following variables are optionally searched for defaults 4 | # Atlas_ROOT_DIR: Base directory where all Atlas components are found 5 | # 6 | # The following are set after configuration is done: 7 | # Atlas_FOUND 8 | # Atlas_INCLUDE_DIRS 9 | # Atlas_LIBRARIES 10 | # Atlas_LIBRARYRARY_DIRS 11 | 12 | set(Atlas_INCLUDE_SEARCH_PATHS 13 | /usr/include/atlas 14 | /usr/include/atlas-base 15 | $ENV{Atlas_ROOT_DIR} 16 | $ENV{Atlas_ROOT_DIR}/include 17 | ) 18 | 19 | set(Atlas_LIB_SEARCH_PATHS 20 | /usr/lib/atlas 21 | /usr/lib/atlas-base 22 | $ENV{Atlas_ROOT_DIR} 23 | $ENV{Atlas_ROOT_DIR}/lib 24 | ) 25 | 26 | find_path(Atlas_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS}) 27 | find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS}) 28 | 29 | find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS}) 30 | find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) 31 | find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS}) 32 | 33 | set(LOOKED_FOR 34 | Atlas_CBLAS_INCLUDE_DIR 35 | Atlas_CLAPACK_INCLUDE_DIR 36 | 37 | Atlas_CBLAS_LIBRARY 38 | Atlas_BLAS_LIBRARY 39 | Atlas_LAPACK_LIBRARY 40 | ) 41 | 42 | include(FindPackageHandleStandardArgs) 43 | find_package_handle_standard_args(Atlas DEFAULT_MSG ${LOOKED_FOR}) 44 | 45 | if(ATLAS_FOUND) 46 | set(Atlas_INCLUDE_DIR ${Atlas_CBLAS_INCLUDE_DIR} ${Atlas_CLAPACK_INCLUDE_DIR}) 47 | set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY}) 48 | mark_as_advanced(${LOOKED_FOR}) 49 | 50 | message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY}") 51 | endif(ATLAS_FOUND) 52 | 53 | 
--------------------------------------------------------------------------------
/cmake/Modules/FindOpenBLAS.cmake:
--------------------------------------------------------------------------------


SET(Open_BLAS_INCLUDE_SEARCH_PATHS
  /usr/include
  /usr/include/openblas
  /usr/include/openblas-base
  /usr/local/include
  /usr/local/include/openblas
  /usr/local/include/openblas-base
  /opt/OpenBLAS/include
  $ENV{OpenBLAS_HOME}
  $ENV{OpenBLAS_HOME}/include
)

SET(Open_BLAS_LIB_SEARCH_PATHS
  /lib/
  /lib/openblas-base
  /lib64/
  /usr/lib
  /usr/lib/openblas-base
  /usr/lib64
  /usr/local/lib
  /usr/local/lib64
  /opt/OpenBLAS/lib
  $ENV{OpenBLAS}
  $ENV{OpenBLAS}/lib
  $ENV{OpenBLAS_HOME}
  $ENV{OpenBLAS_HOME}/lib
)

FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})

SET(OpenBLAS_FOUND ON)

# Check include files
IF(NOT OpenBLAS_INCLUDE_DIR)
  SET(OpenBLAS_FOUND OFF)
  MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
ENDIF()

# Check libraries
IF(NOT OpenBLAS_LIB)
  SET(OpenBLAS_FOUND OFF)
  MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
ENDIF()

IF (OpenBLAS_FOUND)
  IF (NOT OpenBLAS_FIND_QUIETLY)
    MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}")
    MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}")
  ENDIF (NOT OpenBLAS_FIND_QUIETLY)
ELSE (OpenBLAS_FOUND)
  IF (OpenBLAS_FIND_REQUIRED)
    MESSAGE(FATAL_ERROR "Could not find OpenBLAS")
  ENDIF (OpenBLAS_FIND_REQUIRED)
ENDIF (OpenBLAS_FOUND)

MARK_AS_ADVANCED(
  OpenBLAS_INCLUDE_DIR
  OpenBLAS_LIB
  OpenBLAS
)

--------------------------------------------------------------------------------
/cmake/Targets.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Defines the global DAUConvNet_LINK flag. This flag is required to prevent the linker from
# excluding some objects which are not addressed directly but are registered via static constructors
macro(dau_conv_set_link)
  if(BUILD_SHARED_LIBS)
    set(DAUConvNet_LINK dau-conv)
  else()
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      set(DAUConvNet_LINK -Wl,-force_load dau-conv)
    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      set(DAUConvNet_LINK -Wl,--whole-archive dau-conv -Wl,--no-whole-archive)
    endif()
  endif()
endmacro()
################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
#   dau_conv_source_group(<group> GLOB[_RECURSE] <globbing expression>)
function(dau_conv_source_group group)
  cmake_parse_arguments(DAU_CONV_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(DAU_CONV_SOURCE_GROUP_GLOB)
    file(GLOB srcs1 ${DAU_CONV_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${srcs1})
  endif()

  if(DAU_CONV_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${DAU_CONV_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${srcs2})
  endif()
endfunction()

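# e.g. (a sketch): dau_conv_source_group("Source" GLOB "${PROJECT_SOURCE_DIR}/src/dau_conv/*.cpp")
# places the matched files under a "Source" folder in IDE project views.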
################################################################################################
# Collecting sources from globbing and appending to output list variable
# Usage:
#   dau_conv_collect_sources(<variable> GLOB[_RECURSE] <globbing expression>)
function(dau_conv_collect_sources variable)
  cmake_parse_arguments(DAU_CONV_COLLECT_SOURCES "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(DAU_CONV_COLLECT_SOURCES_GLOB)
    file(GLOB srcs1 ${DAU_CONV_COLLECT_SOURCES_GLOB})
    list(APPEND ${variable} ${srcs1})
  endif()

  if(DAU_CONV_COLLECT_SOURCES_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${DAU_CONV_COLLECT_SOURCES_GLOB_RECURSE})
    list(APPEND ${variable} ${srcs2})
  endif()
  set(${variable} ${${variable}} PARENT_SCOPE)
endfunction()

################################################################################################
# Short command getting dau_conv_impl sources (assuming standard DAUConvNet code tree)
# Usage:
#   dau_conv_pickup_sources(<root>)
function(dau_conv_pickup_sources root)
  # put all files in source groups (visible as subfolders in many IDEs)
  dau_conv_source_group("Include" GLOB "${root}/include/dau_conv/*.h*")
  dau_conv_source_group("Include\\Util" GLOB "${root}/include/dau_conv/util/*.h*")
  dau_conv_source_group("Include" GLOB "${PROJECT_BINARY_DIR}/dau_conv_config.h*")
  dau_conv_source_group("Source" GLOB "${root}/src/dau_conv/*.cpp")
  dau_conv_source_group("Source\\Util" GLOB "${root}/src/dau_conv/util/*.cpp")
  dau_conv_source_group("Source\\Layers" GLOB "${root}/src/dau_conv/layers/*.cpp")
  dau_conv_source_group("Source\\Cuda" GLOB "${root}/src/dau_conv/layers/*.cu")
  dau_conv_source_group("Source\\Cuda" GLOB "${root}/src/dau_conv/util/*.cu")

  # collect files
  file(GLOB_RECURSE hdrs ${root}/include/dau_conv/*.h*)
  file(GLOB_RECURSE srcs ${root}/src/dau_conv/*.cpp)

  # adding headers to make them visible in some IDEs (Qt, VS, Xcode)
  list(APPEND srcs ${hdrs} ${PROJECT_BINARY_DIR}/dau_conv_config.h)

  # collect cuda files
  file(GLOB_RECURSE cuda ${root}/src/dau_conv/*.cu)

  # convert to absolute paths
  dau_conv_convert_absolute_paths(srcs)
  dau_conv_convert_absolute_paths(cuda)

  # propagate to parent scope
  set(srcs ${srcs} PARENT_SCOPE)
  set(cuda ${cuda} PARENT_SCOPE)
endfunction()

################################################################################################
# Short command for setting default target properties
# Usage:
#   dau_conv_default_properties(<target>)
function(dau_conv_default_properties target)
  set_target_properties(${target} PROPERTIES
    DEBUG_POSTFIX ${DAUConvNet_DEBUG_POSTFIX}
    ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
    LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
    RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
  # make sure we build all external dependencies first
  if (DEFINED external_project_dependencies)
    add_dependencies(${target} ${external_project_dependencies})
  endif()
endfunction()

################################################################################################
# Short command for setting runtime directory for build target
# Usage:
#   dau_conv_set_runtime_directory(<target> <dir>)
function(dau_conv_set_runtime_directory target dir)
  set_target_properties(${target} PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${dir}")
endfunction()

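# A sketch of the intended call pattern (the exact call site may differ):
#   dau_conv_pickup_sources(${PROJECT_SOURCE_DIR})
#   # afterwards ${srcs} and ${cuda} hold absolute paths to the .cpp and .cu sources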
################################################################################################
# Short command for setting solution folder property for target
# Usage:
#   dau_conv_set_solution_folder(<target> <folder>)
function(dau_conv_set_solution_folder target folder)
  if(USE_PROJECT_FOLDERS)
    set_target_properties(${target} PROPERTIES FOLDER "${folder}")
  endif()
endfunction()

################################################################################################
# Reads lines from input file, prepends source directory to each line and writes to output file
# Usage:
#   dau_conv_configure_testdatafile(<filename>)
function(dau_conv_configure_testdatafile file)
  file(STRINGS ${file} __lines)
  set(result "")
  foreach(line ${__lines})
    set(result "${result}${PROJECT_SOURCE_DIR}/${line}\n")
  endforeach()
  file(WRITE ${file}.gen.cmake ${result})
endfunction()

################################################################################################
# Filter out all files that are not included in selected list
# Usage:
#   dau_conv_leave_only_selected_tests(<file_list_variable> <comma-separated list of tests>)
function(dau_conv_leave_only_selected_tests file_list)
  if(NOT ARGN)
    return() # blank list means leave all
  endif()
  string(REPLACE "," ";" __selected ${ARGN})
  list(APPEND __selected dau_conv_main)

  set(result "")
  foreach(f ${${file_list}})
    get_filename_component(name ${f} NAME_WE)
    string(REGEX REPLACE "^test_" "" name ${name})
    list(FIND __selected ${name} __index)
    if(NOT __index EQUAL -1)
      list(APPEND result ${f})
    endif()
  endforeach()
  set(${file_list} ${result} PARENT_SCOPE)
endfunction()

--------------------------------------------------------------------------------
/cmake/Templates/DAUConvNetConfig.cmake.in:
--------------------------------------------------------------------------------
# Config file for the DAU-ConvNet package.
#
# Note:
#   DAU-ConvNet and this config file depend on OpenCV,
#   so put `find_package(OpenCV)` before searching for DAU-ConvNet
#   via `find_package(DAUConvNet)`. All other lib/include
#   dependencies are hard coded in this file
#
# After successful configuration the following variables
# will be defined:
#
#   DAUConvNet_LIBRARIES  - IMPORTED targets to link against
#                           (There is no DAUConvNet_INCLUDE_DIRS and DAUConvNet_DEFINITIONS
#                           because they are specified in the IMPORTED target interface.)
#
#   DAUConvNet_HAVE_CUDA  - signals about CUDA support
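#
# Example downstream usage (a sketch; the target name "my_app" is hypothetical):
#
#   find_package(OpenCV)                # if DAU-ConvNet was built against OpenCV
#   find_package(DAUConvNet REQUIRED)
#   target_link_libraries(my_app ${DAUConvNet_LIBRARIES})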


# OpenCV dependency (optional)

if(@USE_OPENCV@)
  if(NOT OpenCV_FOUND)
    set(DAUConvNet_OpenCV_CONFIG_PATH "@OpenCV_CONFIG_PATH@")
    if(DAUConvNet_OpenCV_CONFIG_PATH)
      get_filename_component(DAUConvNet_OpenCV_CONFIG_PATH ${DAUConvNet_OpenCV_CONFIG_PATH} ABSOLUTE)

      if(EXISTS ${DAUConvNet_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core)
        message(STATUS "DAUConvNet: using OpenCV config from ${DAUConvNet_OpenCV_CONFIG_PATH}")
        include(${DAUConvNet_OpenCV_CONFIG_PATH}/OpenCVConfig.cmake)
      endif()

    else()
      find_package(OpenCV REQUIRED)
    endif()
    unset(DAUConvNet_OpenCV_CONFIG_PATH)
  endif()
endif()

# Compute paths
get_filename_component(DAUConvNet_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)

# Our library dependencies
if(NOT TARGET DAUConvNet AND NOT DAUConvNet_BINARY_DIR)
  include("${DAUConvNet_CMAKE_DIR}/DAUConvNetTargets.cmake")
endif()

# List of IMPORTED libs created by DAUConvNetTargets.cmake
# These targets already specify all needed definitions and include paths
set(DAUConvNet_LIBRARIES dau-conv)

# Cuda support variables
set(DAUConvNet_CPU_ONLY @CPU_ONLY@)
set(DAUConvNet_HAVE_CUDA @HAVE_CUDA@)
--------------------------------------------------------------------------------
/cmake/Templates/dau_conv_config.h.in:
--------------------------------------------------------------------------------
/* Sources directory */
#define SOURCE_FOLDER "${PROJECT_SOURCE_DIR}"

/* Binaries directory */
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"

/* Test device */
#define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
--------------------------------------------------------------------------------
/cmake/Utils.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Command alias for debugging messages
# Usage:
#   dmsg(<message>)
function(dmsg)
  message(STATUS ${ARGN})
endfunction()

################################################################################################
# Removes duplicates from list(s)
# Usage:
#   dau_conv_list_unique(<list_variable> [<list_variable>] [...])
macro(dau_conv_list_unique)
  foreach(__lst ${ARGN})
    if(${__lst})
      list(REMOVE_DUPLICATES ${__lst})
    endif()
  endforeach()
endmacro()

################################################################################################
# Clears variables from list
# Usage:
#   dau_conv_clear_vars(<variables_list>)
macro(dau_conv_clear_vars)
  foreach(_var ${ARGN})
    unset(${_var})
  endforeach()
endmacro()


################################################################################################
# Converts all paths in list to absolute
# Usage:
#   dau_conv_convert_absolute_paths(<list_variable>)
function(dau_conv_convert_absolute_paths variable)
  set(__list "")
  foreach(__s ${${variable}})
    get_filename_component(__abspath ${__s} ABSOLUTE)
    list(APPEND __list ${__abspath})
  endforeach()
  set(${variable} ${__list} PARENT_SCOPE)
endfunction()

########################################################################################################
# An option that the user can select. Can accept a condition to control when the option is available to the user.
# Usage:
#   dau_conv_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
function(dau_conv_option variable description value)
  set(__value ${value})
  set(__condition "")
  set(__varname "__value")
  foreach(arg ${ARGN})
    if(arg STREQUAL "IF" OR arg STREQUAL "if")
      set(__varname "__condition")
    else()
      list(APPEND ${__varname} ${arg})
    endif()
  endforeach()
  unset(__varname)
  if("${__condition}" STREQUAL "")
    set(__condition 2 GREATER 1)
  endif()

  if(${__condition})
    if("${__value}" MATCHES ";")
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    elseif(DEFINED ${__value})
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    else()
      option(${variable} "${description}" ${__value})
    endif()
  else()
    unset(${variable} CACHE)
  endif()
endfunction()
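
# e.g. (a sketch with hypothetical names):
#   dau_conv_option(USE_FOO "Enable foo support" ON IF HAVE_BAR)
# defines the USE_FOO option only while the HAVE_BAR condition holds; otherwise
# the cached value is dropped.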
";" version_list ${version_string}) 153 | 154 | list(GET version_list 0 major) 155 | list(GET version_list 1 minor) 156 | list(GET version_list 2 patch) 157 | 158 | # Convert the version components to integers 159 | math(EXPR major_int "${major}*1000") 160 | math(EXPR minor_int "${minor}*10") 161 | set(patch_int ${patch}) 162 | 163 | # Combine the version components into a single integer 164 | math(EXPR ret "${major_int}+${minor_int}+${patch_int}" ) 165 | set(${version_int} ${ret} PARENT_SCOPE) 166 | endfunction() -------------------------------------------------------------------------------- /include/dau_conv/dau_conv_impl/dau_conv_backward.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DAU_CONV_UTIL_DAU_BACKWARD_H_ 2 | #define DAU_CONV_UTIL_DAU_BACKWARD_H_ 3 | 4 | 5 | #include 6 | 7 | #include "dau_conv/util/common.hpp" 8 | 9 | namespace DAUConvNet { 10 | #ifndef CPU_ONLY // GPU 11 | 12 | #define MAX(x,y) (x > y ? x : y) 13 | 14 | template 15 | class DAUConvBackward { 16 | // TODO: 17 | // - make interpolation weights in 16 bit float (they are computed with 32 bit error so cannot use 16 bit float arithmetics) 18 | // - make input data in 16 bit float but retain error in 32 bit float and perform computation in 16 bit (this will reduce memory bandwidth required) 19 | // --> tried but not worked: 20 | // float 16 bit does half transfer time, but adds additionl conversions from fp16 to fp32 which brings total time back to the same !! 21 | // --> would be possible with new Nvidia VOLTA arch which should have fp16 dot product with aggregation to fp32 !!! 22 | // 23 | // - make data and computation with 16 bit float (only viable version but effect on performance is yet unknown) 24 | public: 25 | // fixed params during construction 26 | const int img_width_in, img_height_in; 27 | const int img_width, img_height; 28 | const int I, S, F, G, IN_K; 29 | int OUT_K; // this is const but is calculated in constructor 30 | 31 | private: 32 | // this parameters are used as template params for DAUConvBackwardCUDA 33 | int patch_size_w, patch_size_h, num_images; 34 | bool use_smaller_warp_and_group_k, use_interpolation, single_subfeature; 35 | bool last_k_optional; 36 | 37 | 38 | public: 39 | DAUConvBackward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const int K, const bool last_k_optional, const bool use_interpolation); 40 | 41 | void get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, 42 | size_t* prepared_filtered_images_size, size_t* prepared_error_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size); 43 | 44 | void backward_pass(const Dtype* filtered_images, const Dtype* error_images, 45 | const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y, 46 | const Dtype* filter_weights, 47 | const int kernel_w, const int kernel_h, const Dtype actual_max_offset, 48 | const bool offsets_already_centered, 49 | Dtype* output, 50 | Dtype* prepared_filtered_images, 51 | Dtype* prepared_error_images, 52 | Dtype* prepared_filter_weights, 53 | int* prepared_filter_offsets, 54 | const bool ignore_edge_gradients, 55 | cudaStream_t streamId = 0); 56 | 57 | class CUDAParams { 58 | public: 59 | // fixed params during construction 60 | const int img_width_in, img_height_in; 61 | const int img_width, img_height; 62 | const int I, S, F, G, K, IN_K; 63 | 64 | // 
--------------------------------------------------------------------------------
/include/dau_conv/dau_conv_impl/dau_conv_backward.hpp:
--------------------------------------------------------------------------------
#ifndef DAU_CONV_UTIL_DAU_BACKWARD_H_
#define DAU_CONV_UTIL_DAU_BACKWARD_H_


#include <cuda_runtime.h>

#include "dau_conv/util/common.hpp"

namespace DAUConvNet {
#ifndef CPU_ONLY  // GPU

#define MAX(x,y) (x > y ? x : y)

template <typename Dtype>
class DAUConvBackward {
    // TODO:
    //  - make interpolation weights in 16 bit float (they are computed with 32 bit error so cannot use 16 bit float arithmetic)
    //  - make input data in 16 bit float but retain error in 32 bit float and perform computation in 16 bit (this will reduce the memory bandwidth required)
    //    --> tried but did not work:
    //        16 bit float halves the transfer time, but adds additional conversions from fp16 to fp32 which bring the total time back to the same !!
    //        --> would be possible with the new NVIDIA VOLTA arch, which should have an fp16 dot product with aggregation to fp32 !!!
    //
    //  - make data and computation with 16 bit float (the only viable version, but the effect on performance is yet unknown)
public:
    // fixed params during construction
    const int img_width_in, img_height_in;
    const int img_width, img_height;
    const int I, S, F, G, IN_K;
    int OUT_K; // this is const but is calculated in constructor

private:
    // these parameters are used as template params for DAUConvBackwardCUDA
    int patch_size_w, patch_size_h, num_images;
    bool use_smaller_warp_and_group_k, use_interpolation, single_subfeature;
    bool last_k_optional;


public:
    DAUConvBackward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const int K, const bool last_k_optional, const bool use_interpolation);

    void get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered,
                              size_t* prepared_filtered_images_size, size_t* prepared_error_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size);

    void backward_pass(const Dtype* filtered_images, const Dtype* error_images,
                       const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y,
                       const Dtype* filter_weights,
                       const int kernel_w, const int kernel_h, const Dtype actual_max_offset,
                       const bool offsets_already_centered,
                       Dtype* output,
                       Dtype* prepared_filtered_images,
                       Dtype* prepared_error_images,
                       Dtype* prepared_filter_weights,
                       int* prepared_filter_offsets,
                       const bool ignore_edge_gradients,
                       cudaStream_t streamId = 0);

    class CUDAParams {
    public:
        // fixed params during construction
        const int img_width_in, img_height_in;
        const int img_width, img_height;
        const int I, S, F, G, K, IN_K;

        // parameters to set up before the call

        // params for get_allocation_sizes call
        size_t* alloc_img, *alloc_err, *alloc_w, *alloc_off;

        // params for run_kernel call
        Dtype const* filtered_images, *error_images, *filter_offsets_float_x, *filter_offsets_float_y, *filter_weights;
        Dtype* output, *prepared_filtered_images, *prepared_error_images, *prepared_filter_weights;
        int* prepared_filter_offsets;
        int kernel_w, kernel_h;
        bool ignore_edge_gradients;
        bool offsets_already_centered;
        cudaStream_t streamId;

        float actual_max_offset;

        CUDAParams(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const int K, const int IN_K, const bool offsets_already_centered) :
                img_width_in(img_width_in), img_height_in(img_height_in), img_width(img_width), img_height(img_height), I(I), S(S), F(F), G(G), K(K), IN_K(IN_K), offsets_already_centered(offsets_already_centered) {

        }
        void set_params_for_allocation_call(size_t* alloc_img, size_t* alloc_err, size_t* alloc_w, size_t* alloc_off);

        void set_params_for_kernel_call(const Dtype* filtered_images, const Dtype* error_images,
                                        const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y,
                                        const Dtype* filter_weights, const int kernel_w, const int kernel_h, const Dtype actual_max_offset,
                                        Dtype* output,
                                        Dtype* prepared_filtered_images,
                                        Dtype* prepared_error_images,
                                        Dtype* prepared_filter_weights,
                                        int* prepared_filter_offsets,
                                        const bool ignore_edge_gradients,
                                        cudaStream_t streamId);
    };
private:

    void call_cuda_kernel(CUDAParams& params);

    static int select_optimal_block_size_bw(int img_size, int min_power, int max_power);

};



// we make explicit functions for different combinations of patch sizes;
// each function is implemented in a separate .cu file to allow for parallel compilation
// (there are 288 combinations altogether, so this way we can reduce compile time by a factor of 8 if there are enough CPU cores)
void DAUConv_backward_multi_subfeatures_patch_1x1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
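
// A usage sketch of the class above (hypothetical sizes and pointers; the device
// buffers must be allocated with the sizes reported by get_allocation_sizes):
//
//   DAUConvBackward<float> bwd(w_in, h_in, w, h, I, S, F, G, K,
//                              /*last_k_optional=*/false, /*use_interpolation=*/true);
//   size_t sz_img, sz_err, sz_w, sz_off;
//   bwd.get_allocation_sizes(kernel_w, kernel_h, /*offsets_already_centered=*/true,
//                            &sz_img, &sz_err, &sz_w, &sz_off);
//   // ... cudaMalloc buffers of those sizes, then:
//   bwd.backward_pass(x, dy, mu_x, mu_y, weights, kernel_w, kernel_h, max_offset,
//                     /*offsets_already_centered=*/true, grad_out,
//                     buf_img, buf_err, buf_w, buf_off,
//                     /*ignore_edge_gradients=*/false);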

#endif // !CPU_ONLY

} // namespace DAUConvNet

#endif // DAU_CONV_UTIL_DAU_BACKWARD_H_
--------------------------------------------------------------------------------
/include/dau_conv/dau_conv_impl/dau_conv_forward.hpp:
--------------------------------------------------------------------------------
#ifndef DAU_CONV_UTIL_DAU_FORWARD_H_
#define DAU_CONV_UTIL_DAU_FORWARD_H_

#include <cuda_runtime.h>

#include "dau_conv/util/common.hpp"

namespace DAUConvNet {

#ifndef CPU_ONLY  // GPU

template <typename Dtype>
class DAUConvForward {
    // fixed params during construction
    const int img_width_in, img_height_in;
    const int img_width, img_height;
    const int I, S, F, G;

    // these parameters are used as template params for DAUConvForwardCUDA
    int patch_size_w, patch_size_h, max_offset, num_images, warp_pixel_size_x, warp_pixel_size_y;
    bool single_feature, single_subfeature, use_interpolation;

public:
    enum PARAM_FORMAT { SGF, FGS }; // default should be SGF

    DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation);

    void get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered,
                              size_t* prepared_filtered_images_size,
                              size_t* prepared_filter_weights_size,
                              size_t* prepared_filter_offsets_size);

    void forward_pass(const Dtype* filtered_images,
                      const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y,
                      const Dtype* filter_weights, const PARAM_FORMAT param_format,
                      const int kernel_width, const int kernel_height, const Dtype actual_max_offset,
                      const bool offsets_already_centered, Dtype* output,
                      Dtype* prepared_filtered_images,
                      Dtype* prepared_filter_weights,
                      int* prepared_filter_offsets,
                      Dtype* prepared_filter_offsets_and_weights, cudaStream_t streamId = NULL);

    class CUDAParams {
    public:
        // fixed params during construction
        const int img_width_in, img_height_in;
        const int img_width, img_height;
        const int I, S, F, G;

        // parameters to set up before the call

        // params for get_allocation_sizes call
        size_t *alloc_img, *alloc_w, *alloc_off;

        // params for run_kernel call
        Dtype const *filtered_images, *filter_offsets_float_x, *filter_offsets_float_y, *filter_weights;
        Dtype *output, *prepared_filtered_images, *prepared_filter_weights, *prepared_filter_offsets_and_weights;
        int *prepared_filter_offsets;
        int kernel_w, kernel_h;
        PARAM_FORMAT param_format;
        bool offsets_already_centered;
        cudaStream_t streamId;

        float actual_max_offset;

    public:
        CUDAParams(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool offsets_already_centered) :
                img_width_in(img_width_in), img_height_in(img_height_in), img_width(img_width), img_height(img_height), I(I), S(S), F(F), G(G), offsets_already_centered(offsets_already_centered) {
        }

        void set_params_for_allocation_call(size_t *alloc_img, size_t *alloc_w, size_t *alloc_off);

        void set_params_for_kernel_call(const Dtype *filtered_images,
                                        const Dtype *filter_offsets_float_x, const Dtype *filter_offsets_float_y,
                                        const Dtype *filter_weights,
                                        const PARAM_FORMAT param_format, const int kernel_w, const int kernel_h,
                                        const Dtype actual_max_offset, Dtype *output,
                                        Dtype *prepared_filtered_images,
                                        Dtype *prepared_filter_weights,
                                        int *prepared_filter_offsets,
                                        Dtype *prepared_filter_offsets_and_weights,
                                        cudaStream_t streamId);

    };

private:
    void call_cuda_kernel(CUDAParams& params);

};
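
// A usage sketch (hypothetical sizes and pointers; buffers are sized via
// get_allocation_sizes as in the backward case):
//
//   DAUConvForward<float> fwd(w_in, h_in, w, h, I, S, F, G, /*use_interpolation=*/true);
//   size_t sz_img, sz_w, sz_off;
//   fwd.get_allocation_sizes(kernel_w, kernel_h, /*offsets_already_centered=*/true,
//                            &sz_img, &sz_w, &sz_off);
//   fwd.forward_pass(x, mu_x, mu_y, weights, DAUConvForward<float>::SGF,
//                    kernel_w, kernel_h, max_offset, /*offsets_already_centered=*/true,
//                    y, buf_img, buf_w, buf_off, buf_off_and_w);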
93 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 94 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 95 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 96 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 97 | 98 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 99 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 100 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 101 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 102 | 103 | void DAUConv_forward_float_off_16_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 104 | void DAUConv_forward_float_off_16_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 105 | void DAUConv_forward_float_off_32_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 106 | 107 | 108 | 109 | #endif // !CPU_ONLY 110 | 111 | } // namespace DAUConvNet 112 | 113 | #endif // DAU_CONV_UTIL_DAU_FORWARD_H_ 114 | -------------------------------------------------------------------------------- /include/dau_conv/util/common.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by domen on 3/23/18. 
3 | // 4 | 5 | #ifndef DAUCONVNET_COMMON_H 6 | #define DAUCONVNET_COMMON_H 7 | 8 | 9 | #include 10 | #include 11 | #include // NOLINT(readability/streams) 12 | #include // NOLINT(readability/streams) 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include // pair 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include // cuda driver types 29 | 30 | #ifndef NDEBUG 31 | # define M_Assert(Expr, Msg) \ 32 | __M_Assert(#Expr, Expr, __FILE__, __LINE__, Msg) 33 | 34 | #else 35 | # define M_Assert(Expr, Msg) 36 | #endif 37 | 38 | void __M_Assert(const char* expr_str, bool expr, const char* file, int line, const char* msg); 39 | 40 | 41 | #ifndef DAU_CHECK 42 | #define DAU_CHECK(Expr,Msg ) \ 43 | if ((Expr) == false) { throw DAUConvNet::DAUException(string_format("ASSERT ERROR: %s\n", Msg)); } 44 | 45 | #endif 46 | 47 | // 48 | // CUDA macros 49 | // 50 | 51 | // CUDA: various checks for different function calls. 52 | #ifndef CUDA_CHECK 53 | #define CUDA_CHECK(condition) \ 54 | /* Code block avoids redefinition of cudaError_t error */ \ 55 | do { \ 56 | cudaError_t error = condition; \ 57 | DAU_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \ 58 | } while (0) 59 | #endif 60 | 61 | #ifndef CUBLAS_CHECK 62 | #define CUBLAS_CHECK(condition) \ 63 | do { \ 64 | cublasStatus_t status = condition; \ 65 | DAU_CHECK(status == CUBLAS_STATUS_SUCCESS, DAUConvNet::cublasGetErrorString(status)); \ 66 | } while (0) 67 | #endif 68 | 69 | // CUDA: grid stride looping 70 | #ifndef CUDA_KERNEL_LOOP 71 | #define CUDA_KERNEL_LOOP(i, n) \ 72 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 73 | i < (n); \ 74 | i += blockDim.x * gridDim.x) 75 | #endif 76 | 77 | // CUDA: check for error after kernel execution and exit loudly if there is one. 78 | #ifndef CUDA_POST_KERNEL_CHECK 79 | #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) 80 | #endif 81 | 82 | 83 | namespace DAUConvNet { 84 | 85 | // Common functions and classes from std that dau_conv_impl often uses. 86 | using std::vector; 87 | 88 | // CUDA: library error reporting. 89 | const char* cublasGetErrorString(cublasStatus_t error); 90 | 91 | // CUDA: use 512 threads per block 92 | const int CUDA_NUM_THREADS = 512; 93 | 94 | // CUDA: number of blocks for threads. 95 | inline int CUDA_GET_BLOCKS(const int N) { 96 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 97 | } 98 | 99 | class DAUException : public std::runtime_error { 100 | public: 101 | DAUException(const std::string& what_arg ) : std::runtime_error(what_arg) { 102 | } 103 | }; 104 | 105 | } // namespace DAUConvNet 106 | 107 | template 108 | std::string string_format( const std::string& format, Args ... args ) 109 | { 110 | size_t size = std::snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0' 111 | std::unique_ptr buf( new char[ size ] ); 112 | std::snprintf( buf.get(), size, format.c_str(), args ... ); 113 | return std::string( buf.get(), size - 1 ); // We don't want the '\0' inside 114 | } 115 | 116 | 117 | 118 | 119 | 120 | #endif //DAUCONVNET_COMMON_H 121 | -------------------------------------------------------------------------------- /include/dau_conv/util/convolve.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************* 2 | * Copyright (c) 2014, ArrayFire 3 | * All rights reserved. 4 | * 5 | * This file is distributed under 3-clause BSD license. 
6 | * The complete license agreement can be obtained at: 7 | * http://arrayfire.com/licenses/BSD-3-Clause 8 | ********************************************************/ 9 | 10 | #include "dau_conv/util/common.hpp" 11 | 12 | namespace DAUConvNet 13 | { 14 | typedef enum { 15 | AF_BATCH_UNSUPPORTED = -1, /* invalid inputs */ 16 | AF_BATCH_NONE, /* one signal, one filter */ 17 | AF_BATCH_LHS, /* many signal, one filter */ 18 | AF_BATCH_RHS, /* one signal, many filter */ 19 | AF_BATCH_SAME, /* signal and filter have same batch size */ 20 | AF_BATCH_DIFF, /* signal and filter have different batch size */ 21 | } AF_BATCH_KIND; 22 | 23 | struct conv2_data_desc { 24 | conv2_data_desc() {} 25 | conv2_data_desc(int n, int c, int h, int w, int s_n, int s_c, int s_h, int s_w) { 26 | dims[0] = n; dims[1] = c; dims[2] = h; dims[3] = w; 27 | strides[0] = s_n; strides[1] = s_c; strides[2] = s_h; strides[3] = s_w; 28 | } 29 | int dims[4]; 30 | int strides[4]; 31 | }; 32 | 33 | template 34 | void caffe_gpu_convolve2(Dtype* out, const conv2_data_desc& out_desc, 35 | const Dtype* signal, const conv2_data_desc& signal_desc, 36 | const Dtype* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId = 0); 37 | 38 | namespace kernel 39 | { 40 | 41 | 42 | 43 | template 44 | void convolve_nd(Dtype* out, const conv2_data_desc& out_desc, 45 | const Dtype* signal, const conv2_data_desc& signal_desc, 46 | const Dtype* filter, const conv2_data_desc& filter_desc, 47 | AF_BATCH_KIND kind, cudaStream_t streamId = 0); 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /include/dau_conv/util/im2col.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DAU_CONV_UTIL_IM2COL_HPP 2 | #define DAU_CONV_UTIL_IM2COL_HPP 3 | 4 | namespace DAUConvNet { 5 | 6 | template 7 | void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes, 8 | const int* im_shape, const int* col_shape, 9 | const int* kernel_shape, const int* pad, const int* stride, 10 | const int* dilation, Dtype* data_col); 11 | 12 | template 13 | void im2col_cpu(const Dtype* data_im, const int channels, 14 | const int height, const int width, const int kernel_h, const int kernel_w, 15 | const int pad_h, const int pad_w, const int stride_h, 16 | const int stride_w, const int dilation_h, const int dilation_w, 17 | Dtype* data_col); 18 | 19 | template 20 | void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes, 21 | const int* im_shape, const int* col_shape, 22 | const int* kernel_shape, const int* pad, const int* stride, 23 | const int* dilation, Dtype* data_im); 24 | 25 | template 26 | void col2im_cpu(const Dtype* data_col, const int channels, 27 | const int height, const int width, const int kernel_h, const int kernel_w, 28 | const int pad_h, const int pad_w, const int stride_h, 29 | const int stride_w, const int dilation_h, const int dilation_w, 30 | Dtype* data_im); 31 | 32 | } // namespace DAUConvNet 33 | 34 | #endif // DAU_CONV_UTIL_IM2COL_HPP 35 | -------------------------------------------------------------------------------- /include/dau_conv/util/math_functions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ 2 | #define CAFFE_UTIL_MATH_FUNCTIONS_H_ 3 | 4 | #include 5 | #include // for std::fabs and std::signbit 6 | #include 7 | 8 | #include "dau_conv/util/common.hpp" 9 | #include "dau_conv/util/mkl_alternate.hpp" 10 | 11 | 12 | namespace DAUConvNet { 13 | 
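// For orientation, the CPU gemm wrapper declared just below follows the usual row-major
// Caffe/BLAS convention: y = alpha * op(A) * op(B) + beta * y, where op(A) is M x K,
// op(B) is K x N and the output is M x N. A hedged usage sketch (names are illustrative):
//
//   float A[2 * 4];            // 2 x 4 input matrix
//   float B[4 * 3];            // 4 x 3 input matrix
//   float C[2 * 3] = {0};      // 2 x 3 output matrix
//   caffe_cpu_gemm<float>(CblasNoTrans, CblasNoTrans, /*M=*/2, /*N=*/3, /*K=*/4,
//                         1.0f, A, B, 0.0f, C);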
14 | // Caffe gemm provides a simpler interface to the gemm functions, with the 15 | // limitation that the data has to be contiguous in memory. 16 | template 17 | void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, 18 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 19 | const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, 20 | Dtype* y); 21 | 22 | template 23 | void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, 24 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 25 | Dtype* y); 26 | 27 | template 28 | void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); 29 | 30 | template 31 | void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, 32 | Dtype* Y); 33 | 34 | template 35 | void caffe_set(const int N, const Dtype alpha, Dtype *X); 36 | 37 | inline void caffe_memset(const size_t N, const int alpha, void* X) { 38 | memset(X, alpha, N); // NOLINT(dau_conv_impl/alt_fn) 39 | } 40 | 41 | template 42 | void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); 43 | 44 | template 45 | void caffe_scal(const int N, const Dtype alpha, Dtype *X); 46 | 47 | template 48 | void caffe_sqr(const int N, const Dtype* a, Dtype* y); 49 | 50 | template 51 | void caffe_sqrt(const int N, const Dtype* a, Dtype* y); 52 | 53 | template 54 | void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); 55 | 56 | template 57 | void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); 58 | 59 | template 60 | void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); 61 | 62 | template 63 | void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); 64 | 65 | template 66 | void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); 67 | 68 | template 69 | void caffe_exp(const int n, const Dtype* a, Dtype* y); 70 | 71 | template 72 | void caffe_log(const int n, const Dtype* a, Dtype* y); 73 | 74 | template 75 | void caffe_abs(const int n, const Dtype* a, Dtype* y); 76 | 77 | template 78 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); 79 | 80 | template 81 | Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, 82 | const Dtype* y, const int incy); 83 | 84 | // Returns the sum of the absolute values of the elements of vector x 85 | template 86 | Dtype caffe_cpu_asum(const int n, const Dtype* x); 87 | 88 | 89 | // the branchless, type-safe version from 90 | // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c 91 | template 92 | inline int8_t caffe_sign(Dtype val) { 93 | return (Dtype(0) < val) - (val < Dtype(0)); 94 | } 95 | 96 | // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC 97 | // in include/dau_conv_impl/util/mkl_alternate.hpp authored by @Rowland Depp. 98 | // Please refer to commit 7e8ef25c7 of the boost-eigen branch. 99 | // Git cherry picking that commit caused a conflict hard to resolve and 100 | // copying that file in convenient for code reviewing. 101 | // So they have to be pasted here temporarily. 
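// As a concrete reference, DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i]))
// from below expands to (roughly):
//
//   template <typename Dtype>
//   void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
//     M_Assert(n > 0, ""); M_Assert(x, ""); M_Assert(y, "");
//     for (int i = 0; i < n; ++i) { y[i] = caffe_sign(x[i]); }
//   }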
102 | #define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ 103 | template \ 104 | void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ 105 | M_Assert(n > 0,""); M_Assert(x,""); M_Assert(y,""); \ 106 | for (int i = 0; i < n; ++i) { \ 107 | operation; \ 108 | } \ 109 | } 110 | 111 | // output is 1 for the positives, 0 for zero, and -1 for the negatives 112 | DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) 113 | 114 | // This returns a nonzero value if the input has its sign bit set. 115 | // The name sngbit is meant to avoid conflicts with std::signbit in the macro. 116 | // The extra parens are needed because CUDA < 6.5 defines signbit as a macro, 117 | // and we don't want that to expand here when CUDA headers are also included. 118 | DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ 119 | y[i] = static_cast((std::signbit)(x[i]))) 120 | 121 | DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])) 122 | 123 | template 124 | void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); 125 | 126 | #ifndef CPU_ONLY // GPU 127 | 128 | // Decaf gpu gemm provides an interface that is almost the same as the cpu 129 | // gemm function - following the c convention and calling the fortran-order 130 | // gpu code under the hood. 131 | template 132 | void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, 133 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 134 | const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, 135 | Dtype* C, cublasHandle_t cublas_handle); 136 | 137 | template 138 | void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, 139 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 140 | Dtype* y, cublasHandle_t cublas_handle); 141 | 142 | template 143 | void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, 144 | Dtype* Y, cublasHandle_t cublas_handle); 145 | 146 | template 147 | void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, 148 | const Dtype beta, Dtype* Y, cublasHandle_t cublas_handle); 149 | 150 | void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); 151 | 152 | template 153 | void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); 154 | 155 | inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { 156 | CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(dau_conv_impl/alt_fn) 157 | } 158 | 159 | template 160 | void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X, cudaStream_t streamId = 0); 161 | 162 | template 163 | void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X, cublasHandle_t cublas_handle); 164 | 165 | template 166 | void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cublasHandle_t cublas_handle, cudaStream_t str); 167 | 168 | template 169 | void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y, cudaStream_t streamId = 0); 170 | 171 | template 172 | void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out, cublasHandle_t cublas_handle); 173 | 174 | template 175 | void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y, cublasHandle_t cublas_handle); 176 | 177 | 178 | void caffe_gpu_memcpy_async(const size_t N, const void* X, void* Y, cudaStream_t streamId = 0); 179 | 180 | template 181 | void caffe_gpu_set_async(const int N, const Dtype alpha, Dtype *X, cudaStream_t streamId = 0); 182 | 183 | template 184 | void caffe_gpu_sum(const int N, const Dtype* x, Dtype* y, const int num_segments, 185 | int* offsets_gpu, bool with_add = false, 
cudaStream_t streamId = NULL); 186 | 187 | template <typename Dtype> 188 | void caffe_gpu_clip_lower(const int N, const Dtype lower_bound, const Dtype* x, Dtype* y, 189 | cudaStream_t streamId = 0); 190 | 191 | template <typename Dtype> 192 | void caffe_gpu_clip_upper(const int N, const Dtype upper_bound, const Dtype* x, Dtype* y, 193 | cudaStream_t streamId = 0); 194 | 195 | template <typename Dtype> 196 | void caffe_gpu_clip_eps(const int N, const Dtype eps_bound, const Dtype* x, Dtype* y, 197 | cudaStream_t streamId = 0); 198 | 199 | template <typename Dtype> 200 | void caffe_gpu_clip_nan(const int N, const Dtype* x, Dtype* y, cudaStream_t streamId = 0); 201 | 202 | template <typename Dtype> 203 | void caffe_gpu_pad2d(const int I, const int H, const int W, int pad_size, const Dtype* X, Dtype* Y, 204 | cudaStream_t streamId = 0); 205 | 206 | template <typename Dtype> 207 | void caffe_gpu_amax(const int I, const Dtype* X, Dtype* Y, 208 | cublasHandle_t cublas_handle); 209 | 210 | #endif // !CPU_ONLY 211 | 212 | } // namespace DAUConvNet 213 | 214 | #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ 215 | -------------------------------------------------------------------------------- /include/dau_conv/util/mkl_alternate.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_UTIL_MKL_ALTERNATE_H_ 2 | #define CAFFE_UTIL_MKL_ALTERNATE_H_ 3 | 4 | #ifdef USE_MKL 5 | 6 | #include <mkl.h> 7 | 8 | #else // if not using MKL, provide the needed vsl/cblas functions ourselves 9 | 10 | #ifdef USE_ACCELERATE 11 | #include <Accelerate/Accelerate.h> 12 | #else 13 | extern "C" { 14 | #include <cblas.h> 15 | } 16 | #endif // USE_ACCELERATE 17 | 18 | #include <math.h> 19 | 20 | // Functions that caffe uses but are not present if MKL is not linked. 21 | 22 | // A simple way to define the vsl unary functions. The operation should 23 | // be in the form e.g. y[i] = sqrt(a[i]) 24 | #define DEFINE_VSL_UNARY_FUNC(name, operation) \ 25 | template <typename Dtype> \ 26 | void v##name(const int n, const Dtype* a, Dtype* y) { \ 27 | M_Assert(n > 0,""); M_Assert(a,""); M_Assert(y,""); \ 28 | for (int i = 0; i < n; ++i) { operation; } \ 29 | } \ 30 | inline void vs##name( \ 31 | const int n, const float* a, float* y) { \ 32 | v##name<float>(n, a, y); \ 33 | } \ 34 | inline void vd##name( \ 35 | const int n, const double* a, double* y) { \ 36 | v##name<double>(n, a, y); \ 37 | } 38 | 39 | DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) 40 | DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i])) 41 | DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) 42 | DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) 43 | DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) 44 | 45 | // A simple way to define the vsl unary functions with singular parameter b. 46 | // The operation should be in the form e.g. y[i] = pow(a[i], b) 47 | #define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \ 48 | template <typename Dtype> \ 49 | void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \ 50 | M_Assert(n > 0,""); M_Assert(a,""); M_Assert(y,""); \ 51 | for (int i = 0; i < n; ++i) { operation; } \ 52 | } \ 53 | inline void vs##name( \ 54 | const int n, const float* a, const float b, float* y) { \ 55 | v##name<float>(n, a, b, y); \ 56 | } \ 57 | inline void vd##name( \ 58 | const int n, const double* a, const double b, double* y) { \ 59 | v##name<double>(n, a, b, y); \ 60 | } 61 | 62 | DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)) 63 | 64 | // A simple way to define the vsl binary functions. The operation should 65 | // be in the form e.g. 
y[i] = a[i] + b[i] 66 | #define DEFINE_VSL_BINARY_FUNC(name, operation) \ 67 | template \ 68 | void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \ 69 | M_Assert(n > 0,""); M_Assert(a,""); M_Assert(b,""); M_Assert(y,""); \ 70 | for (int i = 0; i < n; ++i) { operation; } \ 71 | } \ 72 | inline void vs##name( \ 73 | const int n, const float* a, const float* b, float* y) { \ 74 | v##name(n, a, b, y); \ 75 | } \ 76 | inline void vd##name( \ 77 | const int n, const double* a, const double* b, double* y) { \ 78 | v##name(n, a, b, y); \ 79 | } 80 | 81 | DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]) 82 | DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]) 83 | DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]) 84 | DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]) 85 | 86 | // In addition, MKL comes with an additional function axpby that is not present 87 | // in standard blas. We will simply use a two-step (inefficient, of course) way 88 | // to mimic that. 89 | inline void cblas_saxpby(const int N, const float alpha, const float* X, 90 | const int incX, const float beta, float* Y, 91 | const int incY) { 92 | cblas_sscal(N, beta, Y, incY); 93 | cblas_saxpy(N, alpha, X, incX, Y, incY); 94 | } 95 | inline void cblas_daxpby(const int N, const double alpha, const double* X, 96 | const int incX, const double beta, double* Y, 97 | const int incY) { 98 | cblas_dscal(N, beta, Y, incY); 99 | cblas_daxpy(N, alpha, X, incX, Y, incY); 100 | } 101 | 102 | #endif // USE_MKL 103 | #endif // CAFFE_UTIL_MKL_ALTERNATE_H_ 104 | -------------------------------------------------------------------------------- /plugins/tensorflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | if (NOT BUILD_TENSORFLOW_PLUGIN) 4 | MESSAGE(STATUS "BUILD_TENSORFLOW_PLUGIN not set.") 5 | return() 6 | endif() 7 | 8 | #find_package (Python COMPONENTS Interpreter Development) 9 | find_package( PythonInterp REQUIRED ) 10 | MESSAGE(STATUS "Python exec ${PYTHON_EXECUTABLE}") 11 | 12 | #SET VARIABLE AS OTHER PYTHON EXECUTABLE IF PythonInterp FINDS A DIFFERENT EXECUTABLE 13 | #set(PYTHON_EXECUTABLE python3) 14 | 15 | #get TF VERSION 16 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.__version__)" OUTPUT_VARIABLE TF_VERSION) 17 | convert_version_string_to_int("${TF_VERSION}" TF_VERSION_INT) 18 | message(STATUS "TF_VERSION_INT: ${TF_VERSION_INT}") 19 | 20 | if( "${TF_VERSION}" VERSION_GREATER "2.0.0" OR "${TF_VERSION}" VERSION_EQUAL "2.0.0") 21 | MESSAGE(STATUS "TF VER MORE THAN 2.0.0: ${TF_VERSION}") 22 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 23 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_CFLAGS) 24 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LFLAGS) 25 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so.2')" OUTPUT_VARIABLE TF_LIB) 26 | MESSAGE(STATUS "${TF_LIB_DIR}") 27 | 28 | elseif( "${TF_VERSION}" VERSION_GREATER "1.14.0" OR "${TF_VERSION}" VERSION_EQUAL "1.14.0") 29 | MESSAGE(STATUS "TF VER 
BETWEEN 1.14 AND 1.15: ${TF_VERSION}") 30 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 31 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_CFLAGS) 32 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LFLAGS) 33 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so.1')" OUTPUT_VARIABLE TF_LIB) 34 | MESSAGE(STATUS "${TF_LIB_DIR}") 35 | 36 | elseif( ${TF_VERSION} VERSION_LESS "1.5.0") 37 | MESSAGE(STATUS "TF VER LOWER THAN 1.5: ${TF_VERSION}") 38 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 39 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so')" OUTPUT_VARIABLE TF_LIB) 40 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib())" OUTPUT_VARIABLE TF_LIB_DIR) 41 | set(TF_CFLAGS "-I ${Tensorflow_INCLUDE_DIRS} -D_GLIBCXX_USE_CXX11_ABI=0") 42 | set(TF_LFLAGS "-L ${TF_LIB_DIR} -ltensorflow_framework") 43 | 44 | else() 45 | MESSAGE(STATUS "TF VER: ${TF_VERSION}") 46 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 47 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_CFLAGS) 48 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LFLAGS) 49 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so')" OUTPUT_VARIABLE TF_LIB) 50 | endif() 51 | 52 | 53 | # No need to explicitly add -std=c++11 since the main CMakeLists.txt already does that 54 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 55 | 56 | if(( "${TF_VERSION}" VERSION_GREATER "2.7.0" OR "${TF_VERSION}" VERSION_EQUAL "2.7.0") AND NOT COMPILER_SUPPORTS_CXX14) 57 | message(FATAL_ERROR "TF v2.7.0 or higher requires C++14 support. Please use a different C++ compiler.") 58 | elseif( "${TF_VERSION}" VERSION_GREATER "2.10.0" OR "${TF_VERSION}" VERSION_EQUAL "2.10.0") 59 | if(COMPILER_SUPPORTS_CXX17) 60 | message(STATUS "Enabling C++17 support for TF v2.10.0 or higher.") 61 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") 62 | else() 63 | message(FATAL_ERROR "TF v2.10.0 or higher requires C++17 support. 
Please use a different C++ compiler.") 64 | endif() 65 | endif() 66 | 67 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D GOOGLE_CUDA=1") 68 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D TENSORFLOW_VERSION=${TF_VERSION_INT}") 69 | 70 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TF_CFLAGS}") 71 | set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${TF_LFLAGS}") 72 | 73 | #TEMP INSERT FROM BASE CMAKE 74 | 75 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) 76 | 77 | 78 | message(STATUS "flags: ${CMAKE_CXX_FLAGS}") 79 | message(STATUS "flags: ${CMAKE_SHARED_LINKER_FLAGS}") 80 | message(STATUS "tf link flags: ${TF_LFLAGS}") 81 | message(STATUS "tf compiler flags: ${TF_CFLAGS}") 82 | 83 | 84 | 85 | message(STATUS "LINKER LIBS: ${DAUConvNet_LINKER_LIBS}") 86 | message(STATUS "INCLUDE DIRS: ${DAUConvNet_INCLUDE_DIRS}") 87 | message(STATUS "INCLUDE DIR: ${DAUConvNet_INCLUDE_DIR}") 88 | message(STATUS "TENSORFLOW DIRS: ${Tensorflow_INCLUDE_DIRS}") 89 | message(STATUS "TENSORFLOW LIB: ${TF_LIB}") 90 | 91 | 92 | # build the gradient operation which is used in base_op_grad.py 93 | # to register it 94 | #LAYER ADD LIBRARY 95 | #LINK_DIRECTORIES(${TF_LIB}) 96 | include_directories(${Tensorflow_INCLUDE_DIRS}) 97 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) 98 | 99 | # we need to manually add a dependency on the .cu.o objects 100 | # by adding a dummy output (the real outputs will be created by the DAU-ConvNet target) 101 | add_custom_command(OUTPUT ${DAUConvNet_CU_OBJS} 102 | COMMAND echo 103 | DEPENDS ${DAUConvNet_OBJ_TARGET}) 104 | 105 | add_library(dau_conv_tensorflow SHARED src/dau_conv_layer_tensorflow.cpp src/dau_conv_layer_tensorflow.hpp ${DAUConvNet_OBJS}) 106 | 107 | # we also need to ensure that DAU-ConvNet is compiled first 108 | add_dependencies(dau_conv_tensorflow ${DAUConvNet_OBJ_TARGET}) 109 | 110 | target_include_directories(dau_conv_tensorflow PUBLIC ${DAUConvNet_INCLUDE_DIR}) 111 | target_include_directories(dau_conv_tensorflow PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 112 | target_include_directories(dau_conv_tensorflow PUBLIC ${Tensorflow_INCLUDE_DIRS}) 113 | target_include_directories(dau_conv_tensorflow PUBLIC ${DAUConvNet_INCLUDE_DIRS}) 114 | 115 | message(STATUS ${DAUConvNet_LINKER_LIBS}) 116 | target_link_libraries(dau_conv_tensorflow PUBLIC ${TF_LIB} ) 117 | target_link_libraries(dau_conv_tensorflow ${DAUConvNet_LINKER_LIBS}) 118 | 119 | 120 | # build the actual operation which can be used directly 121 | add_library(dau_conv_grad_op SHARED src/dau_conv_grad_op.cpp) 122 | target_link_libraries(dau_conv_grad_op PUBLIC dau_conv_tensorflow) 123 | target_link_libraries(dau_conv_grad_op PUBLIC ${DAUConvNet_LINKER_LIBS}) 124 | target_include_directories(dau_conv_grad_op PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 125 | target_include_directories(dau_conv_grad_op PUBLIC "/usr/local/") 126 | target_include_directories(dau_conv_grad_op ${DAUConvNet_INCLUDE_DIRS} PUBLIC ${DAUConvNet_INCLUDE_DIR}) 127 | 128 | add_library(dau_conv_op SHARED src/dau_conv_op.cpp) 129 | 130 | target_link_libraries(dau_conv_op dau_conv_tensorflow) 131 | target_include_directories(dau_conv_op PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 132 | target_include_directories(dau_conv_op PUBLIC "/usr/local/") 133 | target_include_directories(dau_conv_op ${DAUConvNet_INCLUDE_DIRS} PUBLIC ${DAUConvNet_INCLUDE_DIR}) 134 | 135 | 136 | include(GNUInstallDirs) 137 | 138 | set(CMAKE_INSTALL_PREFIX "/usr/local/lib/") 139 | 140 | message(STATUS ${CMAKE_INSTALL_FULL_LIBDIR}) 141 | message(STATUS ${CMAKE_INSTALL_FULL_BINDIR}) 142 | 
143 | set(DAU_CONV_MODULE_NAME dau_conv) # must be set before it is used in the *_PATH variables below 144 | set(DAU_CONV_OP_PATH "${CMAKE_CURRENT_BINARY_DIR}/${DAU_CONV_MODULE_NAME}/${CMAKE_SHARED_LIBRARY_PREFIX}dau_conv_op${CMAKE_SHARED_LIBRARY_SUFFIX}") 145 | set(DAU_CONV_GRAD_OP_PATH "${CMAKE_CURRENT_BINARY_DIR}/${DAU_CONV_MODULE_NAME}/${CMAKE_SHARED_LIBRARY_PREFIX}dau_conv_grad_op${CMAKE_SHARED_LIBRARY_SUFFIX}") 146 | set(DAU_CONV_TENSORFLOW_PATH "${CMAKE_CURRENT_BINARY_DIR}/${DAU_CONV_MODULE_NAME}/${CMAKE_SHARED_LIBRARY_PREFIX}dau_conv_tensorflow${CMAKE_SHARED_LIBRARY_SUFFIX}") 147 | 148 | message(STATUS ${DAU_CONV_OP_PATH}) 149 | message(STATUS ${DAU_CONV_GRAD_OP_PATH}) 150 | message(STATUS ${DAU_CONV_TENSORFLOW_PATH}) 151 | 152 | 153 | set(SETUP_PY_IN "${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in") 154 | set(SETUP_PY "${CMAKE_CURRENT_BINARY_DIR}/setup.py") 155 | set(DEPS "${CMAKE_CURRENT_SOURCE_DIR}/${DAU_CONV_MODULE_NAME}/__init__.py") 156 | set(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp") 157 | set(WHEELHOUSE "${CMAKE_CURRENT_BINARY_DIR}/wheelhouse") 158 | 159 | message(STATUS ${WHEELHOUSE}) 160 | 161 | set(MANIFEST_IN "${CMAKE_CURRENT_SOURCE_DIR}/MANIFEST.in.in") 162 | set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/MANIFEST.in") 163 | 164 | # for TF v2.1 or higher there is no more tensorflow-gpu package 165 | if( "${TF_VERSION}" VERSION_GREATER "2.1.0" OR "${TF_VERSION}" VERSION_EQUAL "2.1.0") 166 | set(TENSORFLOW_PIP_PACKAGE_NAME "tensorflow==${TF_VERSION}") 167 | else() 168 | set(TENSORFLOW_PIP_PACKAGE_NAME "tensorflow-gpu==${TF_VERSION}") 169 | endif() 170 | 171 | 172 | configure_file(${MANIFEST_IN} ${MANIFEST}) 173 | configure_file(${SETUP_PY_IN} ${SETUP_PY}) 174 | 175 | add_custom_command(OUTPUT ${WHEELHOUSE} 176 | COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/${DAU_CONV_MODULE_NAME} . 177 | COMMAND cp *.so ${DAU_CONV_MODULE_NAME} 178 | COMMAND ${PYTHON_EXECUTABLE} -m pip wheel . -w ${WHEELHOUSE} --no-deps 179 | DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${DAU_CONV_MODULE_NAME}" dau_conv_tensorflow dau_conv_op dau_conv_grad_op) 180 | 181 | add_custom_target(target ALL DEPENDS ${WHEELHOUSE} dau_conv_tensorflow dau_conv_op dau_conv_grad_op) 182 | 183 | install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install --find-links=${WHEELHOUSE} --force-reinstall --no-deps ${DAU_CONV_MODULE_NAME}==${PACKAGE_VERSION}.${TF_VERSION})") 184 | -------------------------------------------------------------------------------- /plugins/tensorflow/MANIFEST.in.in: -------------------------------------------------------------------------------- 1 | include dau_conv/*.so 2 | include dau_conv/tmp 3 | -------------------------------------------------------------------------------- /plugins/tensorflow/build-ci/build-whl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the dau-conv package for various TensorFlow and Python versions 4 | # 5 | # Below is defined a list of all tensorflow builds (TF_BUILDS) and python builds (PYTHON_BUILDS) 6 | # for which the DAU-ConvNet package will be built. 7 | # 8 | # This script performs: 9 | # 1. For all combinations of TensorFlow and Python versions, perform a build using a prepared docker file 10 | # 11 | # 2. After all images are built, it performs the following tests: 12 | # - integrity check by running "import dau_conv" within the container 13 | # - quick unit test by running "python -m dau_conv.test DAUConvTest.test_DAUConv" 14 | # 15 | # 3. Wheel packages (.whl) are stored in the same location as this script (see the example invocation below). 
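# Example invocation (hypothetical values; the flags correspond to the argument parsing below):
#   ./build-whl.sh --dau-version=1.0 --docker-basename=dau-convnet \
#                  --docker-hub-repo=my-dockerhub-user --unit-test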
16 | 17 | # Define a function to stop all parent processes and exit the script 18 | function stop_script { 19 | echo "Stopping all parent processes and exiting script..." 20 | pkill -P $$ 21 | exit 1 22 | } 23 | 24 | # Catch the SIGINT signal (CTRL+C) and execute the stop_script function 25 | trap stop_script SIGINT 26 | 27 | 28 | 29 | #DOCKER_EXEC=nvidia-docker 30 | DOCKER_EXEC=docker 31 | DOCKER_GPUS=--gpus=all 32 | 33 | DAU_VERSION=1.0 34 | DOCKER_IMG_NAME=dau-convnet 35 | UNITTEST_DOCKER=0 36 | DOCKER_HUB_REPO="" 37 | 38 | # list of Python versions, TensorFlow versions and the corresponding 39 | # nvidia/cuda image versions, where each value is separated by a semicolon (;) 40 | 41 | BUILD_CFG=("py3.8;TF2.12.0;nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04" \ 42 | "py3.8;TF2.11.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 43 | "py3.8;TF2.10.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 44 | "py3.8;TF2.9.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 45 | "py3.8;TF2.8.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 46 | "py3.8;TF2.7.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 47 | "py3.8;TF2.6.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 48 | "py3.8;TF2.5.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 49 | "py3.8;TF2.4.0;nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04" \ 50 | "py3.8;TF2.3.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 51 | "py3.8;TF2.2.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 52 | "py3.7;TF2.2.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 53 | "py3.7;TF2.1.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 54 | "py3.7;TF2.0.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 55 | "py3.7;TF1.15.5;nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04" \ 56 | "py3.7;TF1.14.0;nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04") 57 | 58 | for i in "$@" 59 | do 60 | case $i in 61 | --dau-version=*) 62 | DAU_VERSION="${i#*=}" 63 | shift # past argument 64 | ;; 65 | --docker-basename=*) 66 | DOCKER_IMG_NAME="${i#*=}" 67 | shift # past argument 68 | ;; 69 | --python-builds=*) 70 | PYTHON_BUILDS="" 71 | IFS=',' read -r -a PYTHON_BUILDS <<< "${i#*=}" 72 | shift # past argument 73 | ;; 74 | --tf-builds=*) 75 | TF_BUILDS="" 76 | IFS=',' read -r -a TF_BUILDS <<< "${i#*=}" 77 | shift # past argument 78 | ;; 79 | --unit-test) 80 | UNITTEST_DOCKER=1 81 | shift # past argument 82 | ;; 83 | --docker-hub-repo=*) 84 | DOCKER_HUB_REPO="${i#*=}" 85 | shift # past argument 86 | ;; 87 | 88 | *) 89 | # unknown option 90 | ;; 91 | esac 92 | done 93 | 94 | echo "Settings:" 95 | echo " DAU_VERSION=${DAU_VERSION}" 96 | echo " DOCKER_IMG_NAME=${DOCKER_IMG_NAME}" 97 | echo " BUILD_CFG=${BUILD_CFG[*]}" 98 | echo " UNITTEST_DOCKER=${UNITTEST_DOCKER}" 99 | 100 | echo "Building docker images for:" 101 | for BUILD_CFG_STR in "${BUILD_CFG[@]}" 102 | do 103 | IFS=";" read -r -a SINGLE_BUILD_CFG <<< "${BUILD_CFG_STR}" 104 | PY_VER=${SINGLE_BUILD_CFG[0]:2} 105 | TF_VER=${SINGLE_BUILD_CFG[1]:2} 106 | TF_BASE_IMAGE=${SINGLE_BUILD_CFG[2]} 107 | DOCKER_IMG_TAG=${DAU_VERSION}-py${PY_VER}-tf${TF_VER} 108 | 109 | ############################################################################## 110 | echo -n " ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} ... 
" 111 | 112 | BUILD_LOG="build_dau_${DOCKER_IMG_TAG}.log" 113 | PY_VER_MAJOR=${PY_VER%.*} 114 | if [ ${PY_VER_MAJOR} -eq 2 ]; then 115 | PY_VER_MAJOR="" 116 | fi 117 | 118 | DAU_CMAKE_FLAGS="-DPACKAGE_VERSION=${DAU_VERSION}" 119 | 120 | DOCKERFILE_VERSION="" 121 | if [[ $TF_BASE_IMAGE == *"ubuntu18.04"* ]]; then 122 | DOCKERFILE_VERSION=".ubuntu18.04" 123 | fi 124 | 125 | ${DOCKER_EXEC} build -t ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} \ 126 | --build-arg BASE_CUDA_VERSION=${TF_BASE_IMAGE} \ 127 | --build-arg TF_VER=${TF_VER} \ 128 | --build-arg PY_VER=${PY_VER} \ 129 | --build-arg PY_VER_MAJOR="${PY_VER_MAJOR}" \ 130 | --build-arg DAU_CMAKE_FLAGS=${DAU_CMAKE_FLAGS} -f docker/Dockerfile${DOCKERFILE_VERSION} docker/ >& ${BUILD_LOG} 131 | STATUS=$? 132 | if [ ${STATUS} -ne 0 ]; then 133 | echo "ERROR: check ${BUILD_LOG} for logs." 134 | else 135 | echo "OK" 136 | fi 137 | ${DOCKER_EXEC} builder prune -f --keep-storage 5GB >> ${BUILD_LOG} 2>&1 138 | done 139 | 140 | # Run each docker for unit-test and extract whl file 141 | for TF_VER_BUILD_STR in "${TF_BUILDS[@]}" 142 | do 143 | IFS=";" read -r -a TF_VER_BUILD <<< "${TF_VER_BUILD_STR}" 144 | TF_VER=${TF_VER_BUILD[0]} 145 | TF_BASE_IMAGE=${TF_VER_BUILD[1]} 146 | for PY_VER in "${PYTHON_BUILDS[@]}" 147 | do 148 | PY_VER_MAJOR=${PY_VER%.*} 149 | PY_VER_STR=${PY_VER//.} 150 | PYTHON_EXEC=/usr/bin/python${PY_VER} 151 | 152 | DOCKER_IMG_TAG=${DAU_VERSION}-py${PY_VER}-tf${TF_VER} 153 | CONTAINER_NAME="integration-testing-dau-convnet-${DOCKER_IMG_TAG}" 154 | 155 | echo "Testing ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG}:" 156 | 157 | echo -n " Verifying dau_conv package integrity ... " 158 | ${DOCKER_EXEC} run -i --rm --name ${CONTAINER_NAME} ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} /usr/bin/python${PY_VER} /opt/verify_dau_import.py 159 | STATUS=$? 160 | 161 | if [ ${STATUS} -ne 0 ]; then 162 | echo "ERROR: cannot run 'import dau_conv'" 163 | else 164 | echo "OK" 165 | 166 | if [ ${UNITTEST_DOCKER} -ne 0 ]; then 167 | UNITTEST_LOG="test_dau_${DOCKER_IMG_TAG}.log" 168 | echo -n " Running UnitTest ... " 169 | ${DOCKER_EXEC} run $DOCKER_GPUS -i --rm --name ${CONTAINER_NAME} -e DEBIAN_FRONTEND=noninteractive ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} /bin/bash /opt/test_dau.sh ${PYTHON_EXEC} &> ${UNITTEST_LOG} 170 | STATUS=$? 171 | 172 | if [ ${STATUS} -ne 0 ]; then 173 | echo "ERROR: check ${UNITTEST_LOG} for logs." 174 | else 175 | echo "OK" 176 | fi 177 | fi 178 | 179 | 180 | echo -n " Copying .whl package to build-ci ... " 181 | WHL_STR="py${PY_VER_MAJOR}-none-any" 182 | if [ ${PY_VER_MAJOR} -eq 2 ]; then 183 | WHL_REPLACEMENT_STR="cp${PY_VER_STR}-cp${PY_VER_STR}mu-manylinux1_x86_64" 184 | else 185 | WHL_REPLACEMENT_STR="cp${PY_VER_STR}-cp${PY_VER_STR}m-manylinux1_x86_64" 186 | fi 187 | 188 | WHL_TMP_DIR=/tmp/whl-${DOCKER_IMG_TAG} 189 | if [ ! -d "$WHL_TMP_DIR" ]; then 190 | mkdir $WHL_TMP_DIR 191 | fi 192 | ${DOCKER_EXEC} rm -f dummy-${DOCKER_IMG_NAME} &> /dev/null 193 | ${DOCKER_EXEC} create --name dummy-${DOCKER_IMG_NAME} ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} /bin/bash 194 | ${DOCKER_EXEC} cp dummy-${DOCKER_IMG_NAME}:/opt ${WHL_TMP_DIR}/. 195 | ${DOCKER_EXEC} rm -f dummy-${DOCKER_IMG_NAME} 196 | 197 | WHL_TMP_DIR=$WHL_TMP_DIR/opt 198 | 199 | for file in $WHL_TMP_DIR/*.whl; do 200 | mv "$file" "${file/$WHL_STR/$WHL_REPLACEMENT_STR}" 201 | done 202 | mv -f $WHL_TMP_DIR/*.whl `dirname "$0"`/. 203 | rm -rf $WHL_TMP_DIR 204 | echo "done" 205 | 206 | if [ ! -z "${DOCKER_HUB_REPO}" ]; then 207 | echo -n " Tagging and pushing docker to DockerHub ... 
" 208 | 209 | DOCKERPUSH_LOG="docker_push_dau_${DOCKER_IMG_TAG}.log" 210 | 211 | ${DOCKER_EXEC} tag ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} ${DOCKER_HUB_REPO}/${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} >& /dev/null 212 | ${DOCKER_EXEC} push ${DOCKER_HUB_REPO}/${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} &> ${DOCKERPUSH_LOG} 213 | STATUS=$? 214 | 215 | if [ ${STATUS} -ne 0 ]; then 216 | echo "ERROR: check ${DOCKERPUSH_LOG} for logs." 217 | else 218 | echo "OK" 219 | fi 220 | fi 221 | fi 222 | done 223 | done 224 | 225 | -------------------------------------------------------------------------------- /plugins/tensorflow/dau_conv/__init__.py: -------------------------------------------------------------------------------- 1 | from .dau_conv import * 2 | -------------------------------------------------------------------------------- /plugins/tensorflow/dau_conv/_dau_conv_grad_op.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import tensorflow as tf 5 | from tensorflow.python.framework import ops 6 | 7 | # preload libdau_conv_tensorflow.so manuall since it is most likely not on the LD_LIBRARY_PATH 8 | from ctypes import cdll 9 | cdll.LoadLibrary(os.path.join(os.path.dirname(os.path.realpath(__file__)),'libdau_conv_tensorflow.so')) 10 | 11 | dau_conv_grad_module = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)),'libdau_conv_grad_op.so')) 12 | 13 | 14 | @ops.RegisterGradient("DAUConv") 15 | def _dau_conv_op_grad_cc(op, grad): 16 | # Op is the Op object - get all the inputs 17 | # Grad is the gradient with respect to the first input 18 | number_units_x = op.get_attr("number_units_x") 19 | number_units_y = op.get_attr("number_units_y") 20 | number_units_ignore = op.get_attr("number_units_ignore") 21 | num_output = op.get_attr("num_output") 22 | kernel_size = op.get_attr("kernel_size") 23 | pad = op.get_attr("pad") 24 | stride = op.get_attr("stride") 25 | unit_normalization = op.get_attr("unit_normalization") 26 | square_unit_normalization = op.get_attr("square_unit_normalization") 27 | mean_iteration_step = op.get_attr("mean_iteration_step") 28 | sigma_iteration_step = op.get_attr("sigma_iteration_step") 29 | component_border_bound = op.get_attr("component_border_bound") 30 | sigma_lower_bound = op.get_attr("sigma_lower_bound") 31 | merge_iteration_step = op.get_attr("merge_iteration_step") 32 | merge_threshold = op.get_attr("merge_threshold") 33 | unit_testing = op.get_attr("unit_testing") 34 | mu_learning_rate_factor = op.get_attr("mu_learning_rate_factor") 35 | single_dim_kernel = op.get_attr("single_dim_kernel") 36 | forbid_positive_dim1 = op.get_attr("forbid_positive_dim1") 37 | use_interpolation = op.get_attr("use_interpolation") 38 | 39 | 40 | return dau_conv_grad_module.dau_conv_grad(grad, op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], op.inputs[4], 41 | number_units_x=number_units_x, 42 | number_units_y=number_units_y, 43 | number_units_ignore=number_units_ignore, 44 | num_output=num_output, 45 | kernel_size=kernel_size, 46 | pad=pad, 47 | stride=stride, 48 | unit_normalization=unit_normalization, 49 | square_unit_normalization=square_unit_normalization, 50 | mean_iteration_step=mean_iteration_step, 51 | sigma_iteration_step=sigma_iteration_step, 52 | component_border_bound=component_border_bound, 53 | sigma_lower_bound=sigma_lower_bound, 54 | merge_iteration_step=merge_iteration_step, 55 | merge_threshold=merge_threshold, 56 | mu_learning_rate_factor=mu_learning_rate_factor, 57 | 
single_dim_kernel=single_dim_kernel, 58 | forbid_positive_dim1=forbid_positive_dim1, 59 | use_interpolation=use_interpolation, 60 | unit_testing=unit_testing) 61 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_CUDA_VERSION=nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 2 | FROM $BASE_CUDA_VERSION 3 | 4 | LABEL maintainer "domen.tabernik@fri.uni-lj.si" 5 | 6 | ARG DAU_CMAKE_FLAGS="" 7 | 8 | # TF/PY version argument must be after FROM statement 9 | ARG TF_VER=1.13.1 10 | ARG PY_VER=3.5 11 | ARG PY_VER_MAJOR=3 12 | 13 | ENV PYTHON "python$PY_VER" 14 | ENV PYTHON_MAJOR "python$PY_VER_MAJOR" 15 | 16 | ENV LD_LIBRARY_PATH "/usr/local/nvidia/lib:/usr/local/nvidia/lib64" 17 | ENV DAU_CONVNET_HOME /opt/dau-convnet 18 | 19 | WORKDIR $DAU_CONVNET_HOME 20 | 21 | RUN echo "Using TensorFlow==$TF_VER" 22 | RUN echo "Using python binary path=$PYTHON" 23 | 24 | # Install general packages for building 25 | RUN apt-get update && \ 26 | apt-get install -y software-properties-common \ 27 | sudo \ 28 | build-essential \ 29 | cmake \ 30 | build-essential \ 31 | curl \ 32 | git \ 33 | libcurl3-dev \ 34 | libfreetype6-dev \ 35 | libpng12-dev \ 36 | libzmq3-dev \ 37 | pkg-config \ 38 | rsync \ 39 | software-properties-common \ 40 | unzip \ 41 | zip \ 42 | zlib1g-dev \ 43 | libopenblas-dev 44 | 45 | # Install specific python and tensorflow versions 46 | RUN apt-get install -y $PYTHON \ 47 | $PYTHON-dev \ 48 | $PYTHON_MAJOR-pip && \ 49 | apt-get clean 50 | 51 | RUN $PYTHON -m pip --no-cache-dir install numpy pathlib 52 | RUN $PYTHON -m pip install tensorflow==$TF_VER 53 | 54 | # NOTE: since docker build does not provide nvidia drivers we cannot run "import tensorflow" 55 | # using tensorflow-gpu so we only use CPU tensorflow during build-time and then install 56 | # tensorflow-gpu after DAU-ConvNet is compiled 57 | 58 | # Download and build DAU-ConvNet plugin 59 | RUN git clone https://github.com/skokec/DAU-ConvNet . && \ 60 | git submodule update --init --recursive 61 | 62 | RUN mkdir build && cd build && \ 63 | cmake -DBLAS=Open -DBUILD_TENSORFLOW_PLUGIN=on -DPYTHON_EXECUTABLE="/usr/bin/$PYTHON" $DAU_CMAKE_FLAGS .. 
&& \ 64 | make -j install 65 | 66 | # We need to install back GPU support for tensorflow 67 | RUN $PYTHON -m pip install tensorflow-gpu==$TF_VER 68 | 69 | 70 | # Install two scripts that will verify integrity of build with tests 71 | COPY verify_dau_import.py /opt/verify_dau_import.py 72 | COPY test_dau.sh /opt/test_dau.sh 73 | 74 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/Dockerfile.ubuntu18.04: -------------------------------------------------------------------------------- 1 | ARG BASE_CUDA_VERSION=nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 2 | FROM $BASE_CUDA_VERSION as dau-base 3 | 4 | LABEL maintainer "domen.tabernik@fri.uni-lj.si" 5 | 6 | ARG DAU_CMAKE_FLAGS="" 7 | 8 | ENV LD_LIBRARY_PATH "/usr/local/nvidia/lib:/usr/local/nvidia/lib64" 9 | ENV DAU_CONVNET_HOME /opt/dau-convnet 10 | 11 | # Install general packages for building 12 | RUN apt-get update && \ 13 | apt-get install -y software-properties-common \ 14 | sudo \ 15 | build-essential \ 16 | wget \ 17 | cmake \ 18 | build-essential \ 19 | curl \ 20 | git \ 21 | libcurl3-dev \ 22 | libfreetype6-dev \ 23 | libpng-dev \ 24 | libzmq3-dev \ 25 | pkg-config \ 26 | rsync \ 27 | software-properties-common \ 28 | unzip \ 29 | zip \ 30 | zlib1g-dev \ 31 | libopenblas-dev && \ 32 | apt-get clean && \ 33 | rm -rf /var/lib/apt/lists/* 34 | 35 | ###################################################################### 36 | # TF/PY version argument must be after FROM statement 37 | ARG PY_VER=3.7 38 | ARG PY_VER_MAJOR=3 39 | 40 | ENV PYTHON "python$PY_VER" 41 | ENV PYTHON_MAJOR "python$PY_VER_MAJOR" 42 | 43 | # Install specific python and tensorflow versions 44 | RUN apt-get update && \ 45 | apt-get install -y $PYTHON \ 46 | $PYTHON-dev \ 47 | $PYTHON_MAJOR-pip && \ 48 | apt-get clean && \ 49 | rm -rf /var/lib/apt/lists/* 50 | 51 | ###################################################################### 52 | 53 | RUN $PYTHON -m pip --no-cache-dir install setuptools==57.5.0 && \ 54 | $PYTHON -m pip --no-cache-dir install cython numpy==1.19.5 pathlib protobuf==3.20 55 | RUN $PYTHON -m pip --no-cache-dir install pip --upgrade 56 | 57 | ###################################################################### 58 | 59 | FROM dau-base as dau-build 60 | 61 | WORKDIR /tmp 62 | 63 | # install latest CMAKE 64 | RUN wget -q https://cmake.org/files/v3.21/cmake-3.21.3-linux-x86_64.tar.gz -O - | tar -xz -C /opt && mv /opt/cmake-3.21.3-linux-x86_64 /opt/cmake-3.21.3 65 | ENV PATH /opt/cmake-3.21.3/bin:$PATH 66 | 67 | ###################################################################### 68 | 69 | ARG TF_VER=1.15.5 70 | RUN $PYTHON -m pip --no-cache-dir install tensorflow==$TF_VER 71 | 72 | ###################################################################### 73 | 74 | # NOTE: since docker build does not provide nvidia drivers we cannot run "import tensorflow" 75 | # using tensorflow-gpu so we only use CPU tensorflow during build-time and then install 76 | # tensorflow-gpu after DAU-ConvNet is compiled 77 | 78 | WORKDIR $DAU_CONVNET_HOME 79 | 80 | # Download and build DAU-ConvNet plugin 81 | RUN git clone --depth=1 --branch=v1.0-TF2 https://github.com/skokec/DAU-ConvNet . && \ 82 | git submodule update --init --recursive 83 | 84 | RUN mkdir build && cd build && \ 85 | cmake -DBLAS=Open -DBUILD_TENSORFLOW_PLUGIN=on -DPYTHON_EXECUTABLE="/usr/bin/$PYTHON" $DAU_CMAKE_FLAGS .. 
&& \ 86 | make -j install 87 | 88 | FROM dau-base as dau-convnet 89 | 90 | # Copy DAU-ConvNet whl from build stage 91 | COPY --from=dau-build ${DAU_CONVNET_HOME}/build/plugins/tensorflow/wheelhouse/*.whl /opt/. 92 | 93 | # install DAU-ConvNet whl which will also install tensorflow-gpu 94 | RUN $PYTHON -m pip install --no-cache-dir /opt/*.whl 95 | 96 | # Install two scripts that will verify integrity of build with tests 97 | COPY verify_dau_import.py /opt/verify_dau_import.py 98 | COPY test_dau.sh /opt/test_dau.sh 99 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/test_dau.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DEBIAN_FRONTEND=noninteractive 4 | PYTHON_EXEC=$1 5 | 6 | apt update && apt install -y python-tk 7 | ${PYTHON_EXEC} -m pip install --no-cache-dir scipy matplotlib==2.2 8 | 9 | ${PYTHON_EXEC} -m dau_conv.test DAUConvTest.test_DAUConv 10 | 11 | STATUS=$? 12 | exit $STATUS 13 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/verify_dau_import.py: -------------------------------------------------------------------------------- 1 | import sys 2 | try: 3 | import dau_conv 4 | except: 5 | sys.exit(1) 6 | sys.exit(0) 7 | -------------------------------------------------------------------------------- /plugins/tensorflow/scripts/start_main_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BUILD_DIR=~/Documents/5.letnik/gauss_conv/new_impl/VitjanZ/DAU-ConvNet/build/ 4 | if [ ! -d $BUILD_DIR ]; then 5 | mkdir $BUILD_DIR 6 | fi 7 | 8 | cd $BUILD_DIR 9 | cmake .. 10 | make 11 | cp ./plugins/tensorflow/*.so ../plugins/tensorflow/bin/ 12 | -------------------------------------------------------------------------------- /plugins/tensorflow/setup.py.in: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='dau_conv', 4 | version='${PACKAGE_VERSION}.${TF_VERSION}', 5 | description='DAU-ConvNet (Displaced Aggregation Units) package for TensorFlow', 6 | url="https://github.com/skokec/DAU-ConvNet", 7 | author="Domen Tabernik", 8 | author_email="domen.tabernik@fri.uni-lj.si", 9 | include_package_data=True, 10 | install_requires=["${TENSORFLOW_PIP_PACKAGE_NAME}"], 11 | packages=['${DAU_CONV_MODULE_NAME}', '${DAU_CONV_MODULE_NAME}.test']) 12 | 13 | -------------------------------------------------------------------------------- /plugins/tensorflow/src/dau_conv_layer_tensorflow.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_DAU_CONV_LAYER_HPP_ 2 | #define CAFFE_DAU_CONV_LAYER_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "tensorflow/core/framework/op_kernel.h" 9 | #include "tensorflow/core/framework/tensor_shape.h" 10 | #include "tensorflow/core/platform/logging.h" 11 | #include "tensorflow/core/framework/shape_inference.h" 12 | 13 | #include "dau_conv/base_dau_conv_layer.hpp" 14 | 15 | 16 | // we will be using base classes from DAUConvNet 17 | using DAUConvNet::DAUConvSettings; 18 | 19 | using DAUConvNet::BaseDAUConvLayer; 20 | using DAUConvNet::BaseDAUComponentInitializer; 21 | 22 | using DAUConvNet::BaseDAUKernelCompute; 23 | using DAUConvNet::BaseDAUKernelOutput; 24 | using DAUConvNet::BaseDAUKernelParams; 25 | using DAUConvNet::DAUException; 26 | 27 | using namespace std; 28 | 
using namespace tensorflow; 29 | 30 | 31 | #define TENSOR_DATA_PTR(t, TYPE) (t == NULL ? NULL : reinterpret_cast((t)->template flat().data())) 32 | #define TENSOR_DATA_PTR_CONST(t, TYPE) (t == NULL ? NULL : reinterpret_cast((t)->template flat().data())) 33 | 34 | //////////////////////////////////////////////////////////////////////////////// 35 | // Tensorflow implementation of buffers used in DAUKernel* 36 | 37 | template 38 | class DAUKernelParamsTF : public BaseDAUKernelParams { 39 | public: 40 | explicit DAUKernelParamsTF(OpKernelContext* context) 41 | : context_(context){} 42 | 43 | virtual ~DAUKernelParamsTF(); 44 | 45 | void reshape(int num_in_channels, int num_out_channels, int num_gauss); 46 | 47 | void initialize_params(Tensor w, Tensor mu1, Tensor mu2, Tensor sigma); 48 | 49 | Tensor* weight_= NULL; 50 | Tensor* mu1_= NULL; 51 | Tensor* mu2_ = NULL; 52 | Tensor* sigma_= NULL; 53 | 54 | private: 55 | OpKernelContext* context_ = NULL; 56 | 57 | }; 58 | 59 | 60 | template 61 | class DAUKernelOutputTF : public BaseDAUKernelOutput { 62 | public: 63 | explicit DAUKernelOutputTF(OpKernelContext* context) 64 | : context_(context){} 65 | 66 | virtual ~DAUKernelOutputTF(); 67 | 68 | virtual void reshape(int num_in_channels, int num_out_channels, int num_gauss, int kernel_h, int kernel_w); 69 | 70 | // main filter weights 71 | Tensor* weight_ = NULL; 72 | 73 | // derivative weights for back-propagation and all four parameters 74 | Tensor* d_error_ = NULL; 75 | Tensor* d_params_ = NULL; // four params == [w,mu1,mu2,sigma] 76 | 77 | private: 78 | OpKernelContext* context_ = NULL; 79 | 80 | }; 81 | 82 | template 83 | class DAUKernelComputeTF : public BaseDAUKernelCompute { 84 | public: 85 | explicit DAUKernelComputeTF(OpKernelContext* context) 86 | : context_(context){} 87 | 88 | virtual ~DAUKernelComputeTF(); 89 | 90 | virtual void reshape(int num_in_channels, int num_out_channels, int num_gauss, 91 | int kernel_h, int kernel_w); 92 | 93 | 94 | protected: 95 | void create_precompute_index(const int index_size, const int kernel_size); 96 | 97 | // intermediate buffers when computing derivative kernels in precompute_guassian_weights_gpu 98 | // temporary buffers for pre-computed sigma^2, sigma^3 and 1/2*sigma^2 99 | vector param_buffers_; 100 | vector kernels_buffers_; 101 | 102 | // pre-computed indexes for caffe_gpu_sum in get_kernels 103 | Tensor* tmp_precomp_index_ = NULL; 104 | 105 | private: 106 | OpKernelContext* context_ = NULL; 107 | 108 | }; 109 | 110 | 111 | //////////////////////////////////////////////////////////////////////////////// 112 | // GPU version of Tensorflow buffers used in DAUKernel* 113 | 114 | template 115 | class DAUKernelParamsTFGPU : public DAUKernelParamsTF { 116 | public: 117 | explicit DAUKernelParamsTFGPU(OpKernelContext* context) 118 | : DAUKernelParamsTF(context), context_(context){} 119 | 120 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 121 | virtual Dtype* mu1() { return TENSOR_DATA_PTR(this->mu1_, Dtype); } 122 | virtual Dtype* mu2() { return TENSOR_DATA_PTR(this->mu2_, Dtype); } 123 | virtual Dtype* sigma() { return TENSOR_DATA_PTR(this->sigma_, Dtype); } 124 | 125 | private: 126 | OpKernelContext* context_; 127 | 128 | }; 129 | 130 | template 131 | class DAUKernelOutputTFGPU : public DAUKernelOutputTF { 132 | public: 133 | explicit DAUKernelOutputTFGPU(OpKernelContext* context) 134 | : DAUKernelOutputTF(context), context_(context){} 135 | 136 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 137 
| virtual Dtype* d_error() { return TENSOR_DATA_PTR(this->d_error_, Dtype); } 138 | virtual Dtype* d_params() { return TENSOR_DATA_PTR(this->d_params_, Dtype); } 139 | 140 | private: 141 | OpKernelContext* context_; 142 | 143 | }; 144 | 145 | template <typename Dtype> 146 | class DAUKernelComputeTFGPU : public DAUKernelComputeTF<Dtype> { 147 | public: 148 | 149 | explicit DAUKernelComputeTFGPU(OpKernelContext* context) 150 | : DAUKernelComputeTF<Dtype>(context), context_(context){} 151 | 152 | 153 | virtual Dtype* param_temp(typename BaseDAUKernelCompute<Dtype>::Param_IDX index) { return TENSOR_DATA_PTR(this->param_buffers_[index], Dtype); } 154 | virtual Dtype* kernels_temp(typename BaseDAUKernelCompute<Dtype>::Kernel_IDX index) { return TENSOR_DATA_PTR(this->kernels_buffers_[index], Dtype); } 155 | virtual int* precomp_index() { return TENSOR_DATA_PTR(this->tmp_precomp_index_, int); } 156 | 157 | private: 158 | OpKernelContext* context_; 159 | 160 | }; 161 | 162 | // 163 | template <typename Dtype> 164 | class DAUKernelParamsTFCPU : public DAUKernelParamsTF<Dtype> { 165 | public: 166 | 167 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 168 | virtual Dtype* mu1() { return TENSOR_DATA_PTR(this->mu1_, Dtype); } 169 | virtual Dtype* mu2() { return TENSOR_DATA_PTR(this->mu2_, Dtype); } 170 | virtual Dtype* sigma() { return TENSOR_DATA_PTR(this->sigma_, Dtype); } 171 | }; 172 | 173 | template <typename Dtype> 174 | class DAUKernelOutputTFCPU : public DAUKernelOutputTF<Dtype> { 175 | public: 176 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 177 | virtual Dtype* d_error() { return TENSOR_DATA_PTR(this->d_error_, Dtype); } 178 | virtual Dtype* d_params() { return TENSOR_DATA_PTR(this->d_params_, Dtype); } 179 | }; 180 | 181 | template <typename Dtype> 182 | class DAUKernelComputeTFCPU : public DAUKernelComputeTF<Dtype> { 183 | public: 184 | 185 | virtual Dtype* param_temp(typename BaseDAUKernelCompute<Dtype>::Param_IDX index) { return TENSOR_DATA_PTR(this->param_buffers_[index], Dtype); } 186 | virtual Dtype* kernels_temp(typename BaseDAUKernelCompute<Dtype>::Kernel_IDX index) { return TENSOR_DATA_PTR(this->kernels_buffers_[index], Dtype); } 187 | virtual int* precomp_index() { return TENSOR_DATA_PTR(this->tmp_precomp_index_, int); } 188 | 189 | }; 190 | 191 | //////////////////////////////////////////////////////////////////////////////// 192 | // Tensorflow version of DAUComponentInitializer: 193 | // - variable initialization happens directly in Python so we do not need it here 194 | // - this implements empty initialization (no operation) 195 | 196 | template <typename Dtype> 197 | class NullDAUComponentInitializerTensorflow : public BaseDAUComponentInitializer<Dtype> { 198 | public: 199 | 200 | explicit NullDAUComponentInitializerTensorflow(){ 201 | } 202 | 203 | void InitializeParameters(const DAUConvSettings& settings, Dtype* w, Dtype* mu1, Dtype* mu2, Dtype* sigma, bool is_gpu_ptr, 204 | int num_units_per_x, int num_units_per_y, int num_units_ignore, 205 | int conv_in_channels, int conv_out_channels, int kernel_h, int kernel_w) const {}; 206 | }; 207 | 208 | //////////////////////////////////////////////////////////////////////////////// 209 | // Tensorflow GPU version of DAUConvolution layer (BaseDAUConvLayer) 210 |
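// A minimal sketch of how the kernel-buffer classes above are meant to be
// driven (the call order is assumed from the interface; it is not spelled out
// in this header):
//
//   DAUKernelParamsTFGPU<float> params(ctx);
//   params.reshape(S, F, G);                       // per-DAU w/mu1/mu2/sigma tensors
//   DAUKernelOutputTFGPU<float> output(ctx);
//   output.reshape(S, F, G, kernel_h, kernel_w);   // rendered kernels + derivatives
//   DAUKernelComputeTFGPU<float> compute(ctx);
//   compute.reshape(S, F, G, kernel_h, kernel_w);  // temporaries + tmp_precomp_index_
//
// after which the base-class computation can rasterize the Gaussian kernels
// into `output` using the temporary buffers owned by `compute`.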
211 | template <typename Dtype> 212 | class DAUConvLayerTensorflowGPU : public BaseDAUConvLayer<Dtype> { 213 | public: 214 | 215 | explicit DAUConvLayerTensorflowGPU(cublasHandle_t cublas_handle, OpKernelContext* context, bool ignore_edge_gradients = false) 216 | : BaseDAUConvLayer<Dtype>(cublas_handle, ignore_edge_gradients, true, false), context_(context), own_workspace_data(0), do_on_gpu_(true), cublasHandle(cublas_handle) { 217 | 218 | } 219 | 220 | virtual ~DAUConvLayerTensorflowGPU(); 221 | 222 | virtual void LayerSetUp(const DAUConvSettings& settings, 223 | const BaseDAUComponentInitializer<Dtype>& param_initializer, 224 | BaseDAUKernelCompute<Dtype>* kernel_compute, 225 | BaseDAUKernelParams<Dtype>* kernel_param, 226 | BaseDAUKernelOutput<Dtype>* kernel_output, 227 | const vector<int>& bottom_shape, int num_dau_units_ignore, bool in_train = true); 228 | 229 | //Dtype* w, Dtype* mu1, Dtype* mu2, Dtype* sigma, bool is_gpu_ptr, 230 | // int num_units_per_x, int num_units_per_y, int num_units_ignore, 231 | // int conv_in_channels, int conv_out_channels, int kernel_h, int kernel_w 232 | virtual void InitializeFromInput(DAUConvSettings& settings, Tensor* w, Tensor* mu1, Tensor* mu2, Tensor* sigma); 233 | virtual void InitializeGrad(DAUConvSettings& settings, Tensor* w_grad, Tensor* mu1_grad, Tensor* mu2_grad, Tensor* sigma_grad); 234 | virtual vector<int> Reshape(const vector<int>& bottom_shape, const vector<int>& top); 235 | 236 | // make compute_output_shape() public 237 | virtual void compute_output_shape() { return BaseDAUConvLayer<Dtype>::compute_output_shape(); } 238 | 239 | void set_processing_on_gpu(bool do_on_gpu) { do_on_gpu_ = do_on_gpu; } 240 | 241 | void set_max_kernel_size(int kernel_w, int kernel_h) { 242 | this->max_kernel_w_ = kernel_w; 243 | this->max_kernel_h_ = kernel_h; 244 | } 245 | 246 | // parameters to learn 247 | const Tensor* param_buffer_w_ = NULL; 248 | const Tensor* param_buffer_mu1_ = NULL; 249 | const Tensor* param_buffer_mu2_ = NULL; 250 | const Tensor* param_buffer_sigma_ = NULL; 251 | const Tensor* param_buffer_bias_ = NULL; 252 | Tensor* param_buffer_w_grad = NULL; 253 | Tensor* param_buffer_mu1_grad = NULL; 254 | Tensor* param_buffer_mu2_grad = NULL; 255 | Tensor* param_buffer_sigma_grad = NULL; 256 | Tensor* param_buffer_bias_grad = NULL; 257 | 258 | OpKernelContext* context_ = NULL; 259 | cublasHandle_t cublasHandle; 260 | 261 | protected: 262 | virtual bool is_data_on_gpu() { return do_on_gpu_; } 263 | 264 | virtual void reshape_params(const vector<int>& shape); 265 | 266 | virtual bool update_prefiltering_kernels(cudaStream_t stream); 267 | 268 | virtual Dtype* param_w() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_w_, Dtype); } 269 | virtual Dtype* param_mu1() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_mu1_, Dtype); } 270 | virtual Dtype* param_mu2() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_mu2_, Dtype); } 271 | virtual Dtype* param_sigma() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_sigma_, Dtype); } 272 | virtual Dtype* param_bias() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_bias_, Dtype); } 273 | 274 | 275 | virtual Dtype* param_w_grad() { return TENSOR_DATA_PTR(param_buffer_w_grad, Dtype); } 276 | virtual Dtype* param_mu1_grad() { return TENSOR_DATA_PTR(param_buffer_mu1_grad, Dtype); } 277 | virtual Dtype* param_mu2_grad() { return TENSOR_DATA_PTR(param_buffer_mu2_grad, Dtype); } 278 | virtual Dtype* param_sigma_grad(){ return TENSOR_DATA_PTR(param_buffer_sigma_grad, Dtype); } 279 | virtual Dtype* param_bias_grad() { return TENSOR_DATA_PTR(param_buffer_bias_grad, Dtype); } 280 | 281 | // remaining intermediate/temporary buffers 282 | virtual Dtype* temp_bwd_gradients() { return TENSOR_DATA_PTR(bwd_gradients_, Dtype); } 283 | virtual Dtype* temp_interm_buffer() { return TENSOR_DATA_PTR(interm_buffer_, Dtype); } 284 | virtual Dtype* temp_param_buffer() { return TENSOR_DATA_PTR(tmp_param_buffer_, Dtype); } 285 | virtual Dtype* temp_col_buffer() { return
TENSOR_DATA_PTR(col_buffer_, Dtype); } 286 | virtual Dtype* temp_bias_multiplier() { return TENSOR_DATA_PTR(bias_multiplier_, Dtype); } 287 | 288 | virtual void* allocate_workspace_mem(size_t bytes); 289 | virtual void deallocate_workspace_mem(); 290 | 291 | // accumulated gradients 292 | Tensor* bwd_gradients_ = NULL; 293 | // additional buffers 294 | Tensor* interm_buffer_ = NULL; // GPU only 295 | Tensor* tmp_param_buffer_ = NULL; // GPU and CPU 296 | 297 | Tensor* col_buffer_ = NULL; // CPU only 298 | Tensor* bias_multiplier_ = NULL; // GPU and CPU 299 | 300 | // workspace memory that we have allocated 301 | void* own_workspace_data = NULL; 302 | // tensor that holds the workspace memory 303 | Tensor* own_workspace_tensor = NULL; 304 | 305 | bool do_on_gpu_; 306 | }; 307 | 308 | /** 309 | * We use this exception in the OP_REQUIRES_OK_THROW_EX macro to mark that the exception has already 310 | * been reported to the tensorflow context using context->CtxFailureWithWarning(...) 311 | */ 312 | class DAUExceptionTF : public std::exception { 313 | public: 314 | DAUExceptionTF() : std::exception() {} 315 | 316 | virtual const char* what() const noexcept { 317 | return "TENSORFLOW reported status error"; 318 | } 319 | }; 320 | 321 | // OP_REQUIRES_OK uses return, which is problematic for compilation in non-void functions 322 | #define OP_REQUIRES_OK_BREAK(CTX, ...) \ 323 | do { \ 324 | ::tensorflow::Status _s(__VA_ARGS__); \ 325 | if (!TF_PREDICT_TRUE(_s.ok())) { \ 326 | (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ 327 | break; \ 328 | } \ 329 | } while (0) 330 | 331 | #define OP_REQUIRES_OK_THROW_EX(CTX, ...) \ 332 | do { \ 333 | ::tensorflow::Status _s(__VA_ARGS__); \ 334 | if (!TF_PREDICT_TRUE(_s.ok())) { \ 335 | (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ 336 | throw new DAUExceptionTF(); \ 337 | } \ 338 | } while (0) 339 | 340 | #endif // CAFFE_DAU_CONV_LAYER_HPP_ 341 | -------------------------------------------------------------------------------- /src/dau_conv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # --[ DAU-ConvNet library 3 | 4 | # creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists 5 | dau_conv_pickup_sources(${PROJECT_SOURCE_DIR}) 6 | 7 | set(cuda_objs "") 8 | 9 | # make cuda objects 10 | dau_conv_cuda_compile(cuda_objs ${cuda}) 11 | add_custom_target(dauc-conv-cu DEPENDS ${cuda_objs} SOURCES ${cuda}) 12 | 13 | # make objects from src 14 | add_library(dau-conv-obj OBJECT ${srcs}) 15 | dau_conv_default_properties(dau-conv-obj) 16 | target_include_directories(dau-conv-obj ${DAUConvNet_INCLUDE_DIRS} 17 | PUBLIC 18 | $ 19 | $) 20 | target_compile_definitions(dau-conv-obj ${DAUConvNet_DEFINITIONS}) 21 | add_dependencies(dau-conv-obj dauc-conv-cu) 22 | if(DAUConvNet_COMPILE_OPTIONS) 23 | target_compile_options(dau-conv-obj ${DAUConvNet_COMPILE_OPTIONS}) 24 | endif() 25 | 26 | list(APPEND DAUConvNet_OBJS ${cuda_objs} ${cuda} $) 27 | 28 | # save the list of .o objects (both src and cuda) so that the parent project can directly embed them into its .so 29 | set(DAUConvNet_OBJS ${DAUConvNet_OBJS} PARENT_SCOPE) 30 | set(DAUConvNet_CU_OBJS ${cuda_objs} PARENT_SCOPE) 31 | set(DAUConvNet_CU_SRC ${cuda} PARENT_SCOPE) 32 | set(DAUConvNet_OBJ_TARGET "dau-conv-obj" PARENT_SCOPE) 33 | set(DAUConvNet_LINKER_LIBS ${DAUConvNet_LINKER_LIBS} PARENT_SCOPE) 34 | 35 | # create shared object 36 | add_library(dau-conv ${DAUConvNet_OBJS}) 37 | target_link_libraries(dau-conv ${DAUConvNet_LINKER_LIBS}) 38 | 39 | 40 | 41 | add_executable(main ../main.cpp) 42 |
target_link_libraries(main dau-conv) 43 | target_include_directories(main ${DAUConvNet_INCLUDE_DIRS} PUBLIC ${DAUConvNet_INCLUDE_DIR}) 44 | dau_conv_default_properties(main) 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_1x1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void 
DAUConv_backward_multi_subfeatures_patch_1x1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | if (SMALLER_WARP_AND_GROUP_K) { 11 | RUN_KERNEL_R4(DAUConvBackwardCUDA, 2, 2, MAX_OFFSET, 4, 4, 2, 2, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 12 | } else { 13 | RUN_KERNEL_R4(DAUConvBackwardCUDA, 2, 2, MAX_OFFSET, 3, 1, 2, 2, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 14 | } 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, 
USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool 
SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dau_conv/dau_conv_impl/dau_conv_forward.hpp" 5 | 6 | #include "dau_conv/util/common.hpp" 7 | 8 | namespace DAUConvNet { 9 | 10 | #define MAX(x,y) (x > y ? 
x : y) 11 | 12 | int select_optimal_block_size(int img_size, int min_power, int max_power) { 13 | float best_unutilized_percent = 1.0f; 14 | int best_block_size = 0; 15 | for (int i = min_power; i <= max_power; ++i) { 16 | int block_size = pow(2,i); 17 | 18 | float utilization_factor = (img_size / (float)block_size); 19 | float unutilized_percent = (ceil(utilization_factor) - utilization_factor); 20 | if (unutilized_percent <= best_unutilized_percent) { 21 | best_unutilized_percent = unutilized_percent; 22 | best_block_size = block_size; 23 | } 24 | } 25 | return best_block_size; 26 | } 27 | 28 | template <typename Dtype> 29 | DAUConvForward<Dtype>::DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation) : 30 | img_width_in(img_width_in), img_height_in(img_height_in), img_width(img_width), img_height(img_height), I(I), S(S), F(F), G(G), use_interpolation(use_interpolation) { 31 | 32 | // calls either DAUConvForwardCUDA->run_kernel() or DAUConvForwardCUDA->get_allocation_sizes() 33 | // if prepared_filtered_images_size, prepared_filter_weights_size OR prepared_filter_offsets_size are not NULL 34 | 35 | // decide which patch size to use to minimize wasted memory/processing 36 | if (img_width == 1 && img_height == 1) { 37 | patch_size_w = 1; 38 | patch_size_h = 1; 39 | } else { 40 | patch_size_w = img_width <= 8 ? 8 : 41 | (img_width <= 16 ? 16 : select_optimal_block_size(img_width, 5, 6)); // allowed patch sizes = 2^[5,6] i.e., [32,64] 42 | patch_size_h = img_height <= 8 ? 8 : 43 | (img_height <= 16 ? 16 : select_optimal_block_size(img_height, 5, 6)); // allowed patch sizes = 2^[5,6] i.e., [32,64] 44 | } 45 | 46 | // decide whether to use: 47 | // - 32 pixels per warp 48 | // - if 32x8 pixels and 1 image per block (full utilization) 49 | // - 16 pixels per warp 50 | // - if 16x8 pixels and 2 images per block (full utilization) 51 | // - if 16x8 pixels and 1 image per block (half utilization) 52 | // - 8 pixels per warp 53 | // - if 8x8 pixels and 4 images per block (full utilization) 54 | // - if 8x8 pixels and 2 images per block (half utilization) 55 | // - if 8x8 pixels and 1 image per block (1/4 utilization) 56 | // - 1 pixel per warp 57 | // - if 1x1 pixels and 16 images per block (half utilization) (32 images use too much shared memory so we cannot have full utilization) 58 | 59 | int boundry_img_width = img_width - floor(img_width/patch_size_w) * patch_size_w; 60 | 61 | 62 | // use warp size 1x1 if patch size is only 1x1, otherwise use [16,32]x8 (if patch_size_w==8 then use 8x8 but do not prefer it) 63 | warp_pixel_size_x = patch_size_w == 1 ? 1 : 64 | (patch_size_w <= 8 ? 8 : std::min(patch_size_w, select_optimal_block_size(boundry_img_width, 4, 5))); // allowed warp pixel sizes = 2^[3,4,5] i.e., [8,16,32] 65 | warp_pixel_size_y = patch_size_h == 1 ? 1 : 8; 66 | 67 | int new_img_parts_width = (int)ceil((float)img_width / patch_size_w); 68 | int new_img_parts_height = (int)ceil((float)img_height / patch_size_h); 69 | 70 | num_images = I * new_img_parts_width * new_img_parts_height; 71 | 72 | // we compute multiple features per thread, but how many depends on interpolation 73 | int batch_features = 8 * (use_interpolation ? 2 : 4); 74 | 75 | single_feature = (F % batch_features != 0); 76 | single_subfeature = (S % 2 != 0); 77 | }
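// Worked example of select_optimal_block_size(img_size, 5, 6): for img_size = 40
// the candidates are 32 (40/32 = 1.25, so ceil(1.25) - 1.25 = 0.75 of a block is
// wasted) and 64 (40/64 = 0.625, wasting only 0.375), so 64 is selected; for
// img_size = 96, block size 32 tiles the image exactly (96/32 = 3.0, zero waste)
// and wins.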
78 | 79 | template <typename Dtype> 80 | void DAUConvForward<Dtype>::CUDAParams::set_params_for_allocation_call(size_t *alloc_img, size_t *alloc_w, size_t *alloc_off) { 81 | this->alloc_img = alloc_img; 82 | this->alloc_w = alloc_w; 83 | this->alloc_off = alloc_off; 84 | } 85 | 86 | template <typename Dtype> 87 | void DAUConvForward<Dtype>::CUDAParams::set_params_for_kernel_call(const Dtype *filtered_images, 88 | const Dtype *filter_offsets_float_x, const Dtype *filter_offsets_float_y, 89 | const Dtype *filter_weights, const PARAM_FORMAT param_format, const int kernel_w, const int kernel_h, 90 | const Dtype actual_max_offset, Dtype *output, 91 | Dtype *prepared_filtered_images, 92 | Dtype *prepared_filter_weights, 93 | int *prepared_filter_offsets, 94 | Dtype *prepared_filter_offsets_and_weights, 95 | cudaStream_t streamId) { 96 | this->filtered_images = filtered_images; 97 | this->filter_offsets_float_x = filter_offsets_float_x; 98 | this->filter_offsets_float_y = filter_offsets_float_y; 99 | this->filter_weights = filter_weights; 100 | this->kernel_w = kernel_w; 101 | this->kernel_h = kernel_h; 102 | this->actual_max_offset = actual_max_offset; 103 | this->param_format = param_format; 104 | this->output = output; 105 | this->prepared_filtered_images = prepared_filtered_images; 106 | this->prepared_filter_weights = prepared_filter_weights; 107 | this->prepared_filter_offsets = prepared_filter_offsets; 108 | this->prepared_filter_offsets_and_weights = prepared_filter_offsets_and_weights; 109 | this->streamId = streamId; 110 | } 111 | template <typename Dtype> 112 | void DAUConvForward<Dtype>::get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, 113 | size_t* prepared_filtered_images_size, 114 | size_t* prepared_filter_weights_size, 115 | size_t* prepared_filter_offsets_size) { 116 | 117 | CUDAParams params(img_width_in, img_height_in, img_width, img_height, I, S, F, G, offsets_already_centered); 118 | 119 | params.set_params_for_allocation_call(prepared_filtered_images_size, prepared_filter_weights_size, prepared_filter_offsets_size); 120 | 121 | params.set_params_for_kernel_call(NULL, NULL, NULL, NULL, PARAM_FORMAT::SGF, kernel_width, kernel_height, (MAX(kernel_width, kernel_height)-1)/2, NULL, 122 | NULL, NULL, NULL, NULL, 0); 123 | 124 | call_cuda_kernel(params); 125 | } 126 | 127 | 128 | template <typename Dtype> 129 | void DAUConvForward<Dtype>::forward_pass(const Dtype* filtered_images, 130 | const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y, 131 | const Dtype* filter_weights, const PARAM_FORMAT param_format, 132 | const int kernel_width, const int kernel_height, const Dtype actual_max_offset, 133 | const bool offsets_already_centered, 134 | Dtype* output, 135 | Dtype* prepared_filtered_images, 136 | Dtype* prepared_filter_weights, 137 | int* prepared_filter_offsets, 138 | Dtype* prepared_filter_offsets_and_weights, cudaStream_t streamId) { 139 | // Optimize the max possible offset that is needed, since larger offsets require loading more memory and are less efficient 140 | 141 | // For offsets larger than 8 px we need to: 142 | // * for offsets <= 16px: use OUT_K = 3 143 | // * for offsets <= 32px: use OUT_K = 1 and run several times for each K 144 | // 145 | // WARNING: this must be synced with RUN_KERNEL_R2 in dau_conv_backward_core.hpp 146 | 147 | float max_offset = 32; 148 | 149 | if (actual_max_offset <= 4) 150 | max_offset = 4; 151 | else if (actual_max_offset <= 8) 152 | max_offset = 8; 153 | else if (actual_max_offset <= 16) { 154 | max_offset = 16; 155 | }
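// Worked example: actual_max_offset = 11.3 falls into the <= 16 bucket, so
// max_offset = 16 and the DAU_CHECKs below then require kernel_width and
// kernel_height of at least 2*16+1 = 33.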
else if (actual_max_offset <= 32) { 156 | max_offset = 32; 157 | } else { 158 | throw DAUException(string_format("ERROR: actual offsets are larger than what CUDA memory allows (set up max_kernel_size and unit_border_bound correctly to avoid this)!!")); 159 | } 160 | 161 | // To ensure we have enough memory we require max_offset not to exceed kernel_width or kernel_height 162 | // since kernel_width and kernel_height are used in get_allocation_sizes() 163 | DAU_CHECK(kernel_width >= max_offset*2+1, "Maximum offset value exceeds the boundaries defined by kernel_width."); 164 | DAU_CHECK(kernel_height >= max_offset*2+1, "Maximum offset value exceeds the boundaries defined by kernel_height."); 165 | 166 | CUDAParams params(img_width_in, img_height_in, img_width, img_height, I, S, F, G, offsets_already_centered); 167 | 168 | params.set_params_for_allocation_call(NULL, NULL, NULL); 169 | params.set_params_for_kernel_call(filtered_images, filter_offsets_float_x, filter_offsets_float_y, filter_weights, param_format, kernel_width, kernel_height, max_offset, output, 170 | prepared_filtered_images, prepared_filter_weights, prepared_filter_offsets, prepared_filter_offsets_and_weights, 171 | streamId); 172 | 173 | call_cuda_kernel(params); 174 | } 175 | 176 | template <> 177 | void DAUConvForward<float>::call_cuda_kernel(CUDAParams& params) { 178 | 179 | int max_offset = ceil(params.actual_max_offset); 180 | //int max_offset = MAX(params.kernel_w, params.kernel_h)/2; 181 | 182 | if (max_offset <= 4) { 183 | if (single_feature == false && single_subfeature == false) { 184 | // version where single_feature is false and single_subfeature false 185 | DAUConv_forward_float_off_4_single_feat_0_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 186 | warp_pixel_size_x, warp_pixel_size_y, num_images, 187 | use_interpolation, params); 188 | 189 | } else if (single_feature == false && single_subfeature == true) { 190 | // version where single_feature is false and single_subfeature true 191 | DAUConv_forward_float_off_4_single_feat_0_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 192 | warp_pixel_size_x, warp_pixel_size_y, num_images, 193 | use_interpolation, params); 194 | 195 | } else if (single_feature == true && single_subfeature == false) { 196 | // version where single_feature is true and single_subfeature false 197 | DAUConv_forward_float_off_4_single_feat_1_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 198 | warp_pixel_size_x, warp_pixel_size_y, num_images, 199 | use_interpolation, params); 200 | 201 | } else { 202 | // version where single_feature is true and single_subfeature true 203 | DAUConv_forward_float_off_4_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 204 | warp_pixel_size_x, warp_pixel_size_y, num_images, 205 | use_interpolation, params); 206 | } 207 | } else if (max_offset <= 8) { 208 | if (single_feature == false && single_subfeature == false) { 209 | // version where single_feature is false and single_subfeature false 210 | DAUConv_forward_float_off_8_single_feat_0_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 211 | warp_pixel_size_x, warp_pixel_size_y, num_images, 212 | use_interpolation, params); 213 | 214 | } else if (single_feature == false && single_subfeature == true) { 215 | // version where single_feature is false and single_subfeature true 216 | DAUConv_forward_float_off_8_single_feat_0_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 217 | warp_pixel_size_x, warp_pixel_size_y, num_images, 218 | use_interpolation, params); 219 | 220 |
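// The DAUConv_forward_float_off_*_single_feat_*_single_subfeat_* variants
// called in this chain differ only in their compile-time flags; the dispatch
// is equivalent to this sketch (hypothetical helper, names not from this file):
//
//   template <bool SINGLE_FEAT, bool SINGLE_SUBFEAT> void run(CUDAParams& p);
//
//   if (single_feature)  single_subfeature ? run<true, true>(p)  : run<true, false>(p);
//   else                 single_subfeature ? run<false, true>(p) : run<false, false>(p);
//
// Fixing both flags at compile time lets the CUDA compiler drop the unused
// per-feature/per-subfeature loop variants from each instantiation.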
} else if (single_feature == true && single_subfeature == false) { 221 | // version where single_feature is true and single_subfeature false 222 | DAUConv_forward_float_off_8_single_feat_1_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 223 | warp_pixel_size_x, warp_pixel_size_y, num_images, 224 | use_interpolation, params); 225 | 226 | } else { 227 | // version where single_feature is true and single_subfeature true 228 | DAUConv_forward_float_off_8_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 229 | warp_pixel_size_x, warp_pixel_size_y, num_images, 230 | use_interpolation, params); 231 | } 232 | } else if (max_offset <= 16) { 233 | 234 | if (single_feature == false) 235 | DAUConv_forward_float_off_16_single_feat_0_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 236 | warp_pixel_size_x, warp_pixel_size_y, num_images, 237 | use_interpolation, params); 238 | else 239 | DAUConv_forward_float_off_16_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 240 | warp_pixel_size_x, warp_pixel_size_y, num_images, 241 | use_interpolation, params); 242 | 243 | } else if (max_offset <= 32) { 244 | DAUConv_forward_float_off_32_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 245 | warp_pixel_size_x, warp_pixel_size_y, num_images, 246 | use_interpolation, params); 247 | 248 | 249 | } else { 250 | throw DAUException(string_format("Unsupported filter size: %d. Supported only max up to 9x9 and 17x17 at the moment", max_offset)); 251 | } 252 | 253 | 254 | 255 | // CALL RUN_KERNEL_R4 macro that will call the run_kernel() function on the supplied class, where the first 4 parameters are replaced with compile-time known variables 256 | // replacing variables with compile-time known values allows the CUDA compiler to generate kernels in advance with pre-defined sizes 257 | /* 258 | RUN_KERNEL_R7(DAUConvForwardCUDA, patch_size_w, patch_size_h, max_offset, warp_pixel_size_x, num_images, use_interpolation, single_feature, single_subfeature, 259 | img_width, img_height, I, S, F, G, 260 | filtered_images, filter_offsets_float_x, filter_offsets_float_y, filter_weights, kernel_width, kernel_height, PARAM_FORMAT, output, 261 | prepared_filtered_images, prepared_filter_weights, prepared_filter_offsets, prepared_filter_offsets_and_weights, 262 | streamId); 263 | */ 264 | } 265 | 266 | template <> 267 | void DAUConvForward<double>::call_cuda_kernel(CUDAParams& params) { 268 | throw DAUException("Not implemented for double"); 269 | } 270 | 271 | template DAUConvForward<float>::DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation); 272 | template DAUConvForward<double>::DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation); 273 | 274 | template void DAUConvForward<float>::get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, size_t* prepared_filtered_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size); 275 | template void DAUConvForward<double>::get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, size_t* prepared_filtered_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size); 276 |
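// Typical call sequence for DAUConvForward<float>, assembled as a sketch from
// the declarations above (buffer allocation is the caller's responsibility,
// and the exact scoping of the PARAM_FORMAT enum may differ):
//
//   DAUConvForward<float> conv(w_in, h_in, w, h, I, S, F, G, /*use_interpolation=*/true);
//   size_t img_bytes, weights_bytes, offsets_bytes;
//   conv.get_allocation_sizes(kernel_w, kernel_h, /*offsets_already_centered=*/true,
//                             &img_bytes, &weights_bytes, &offsets_bytes);
//   // ... allocate the three scratch buffers (and the output) on the device ...
//   conv.forward_pass(filtered_images, offsets_x, offsets_y, weights,
//                     PARAM_FORMAT::SGF, kernel_w, kernel_h, actual_max_offset,
//                     /*offsets_already_centered=*/true, output,
//                     scratch_images, scratch_weights, scratch_offsets,
//                     scratch_offsets_and_weights, stream);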
277 | template void DAUConvForward<float>::forward_pass(const float* filtered_images, const float* filter_offsets_float_x, const float* filter_offsets_float_y, const float* filter_weights, const PARAM_FORMAT param_format, const int kernel_width, const int kernel_height, const float actual_max_offset, const bool offsets_already_centered, float* output, float* prepared_filtered_images, float* prepared_filter_weights, int* prepared_filter_offsets, float* prepared_filter_offsets_and_weights, cudaStream_t streamId); 278 | template void DAUConvForward<double>::forward_pass(const double* filtered_images, const double* filter_offsets_float_x, const double* filter_offsets_float_y, const double* filter_weights, const PARAM_FORMAT param_format, const int kernel_width, const int kernel_height, const double actual_max_offset, const bool offsets_already_centered, double* output, double* prepared_filtered_images, double* prepared_filter_weights, int* prepared_filter_offsets, double* prepared_filter_offsets_and_weights, cudaStream_t streamId); 279 | 280 | } // namespace DAUConvNet 281 | 282 | 283 | -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off16_s0_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_16_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET_, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward<float>::CUDAParams &PARAMS){ 9 | #define SINGLE_FEATURE false 10 | #define SINGLE_SUBFEATURE true 11 | #define MAX_OFFSET 16 12 | 13 | if (IMG_PATCH_SIZE_W == 1 && IMG_PATCH_SIZE_H == 1 && WARP_PIXELS_X == 1 && WARP_PIXELS_Y == 1) { 14 | if (BLOCK_IMAGES % 2 == 0) { 15 | RUN_KERNEL_R1(DAUConvForwardCUDA, 2, 1, MAX_OFFSET, 2, 1, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS); 16 | } else { 17 | RUN_KERNEL_R1(DAUConvForwardCUDA, 4, 1, MAX_OFFSET, 4, 1, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 18 | /*printf("Unsupported BATCH SIZE for 1x1 pixels: Supported only a multiple of 16 (at MAX_OFFSET<=4), 8 (at MAX_OFFSET<=8) or 4 images at the moment\n"); */ 19 | /*throw std::exception();*/ 20 | } 21 | } else if (IMG_PATCH_SIZE_W == 8 && WARP_PIXELS_X == 8) { 22 | /* We have 8px WARP_PIXELS_X sizes only for smaller patch sizes - but check just in case (fixing IMG_PATCH_SIZE_W avoids unneeded computation as well) */ 23 | if (BLOCK_IMAGES % 2 == 0) { 24 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 25 | } else { 26 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 27 | } 28 | } else if (WARP_PIXELS_X == 16) { 29 | if (BLOCK_IMAGES % 2 == 0) { 30 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 31 | } else { 32 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 33 | } 34 | } else if (WARP_PIXELS_X == 32) { 35 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 32, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 36 | } else { 37 | printf("Unsupported WARP_PIXELS_X: %d.
Supported only 16 or 32 at the moment (or 1 when WARP_PIXELS_Y==1 as well) \n", WARP_PIXELS_X); 38 | throw std::exception(); 39 | } 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off16_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_16_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET_, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | #define SINGLE_FEATURE true 10 | #define SINGLE_SUBFEATURE true 11 | #define MAX_OFFSET 16 12 | 13 | if (IMG_PATCH_SIZE_W == 1 && IMG_PATCH_SIZE_H == 1 && WARP_PIXELS_X == 1 && WARP_PIXELS_Y == 1) { 14 | if (BLOCK_IMAGES % 2 == 0) { 15 | RUN_KERNEL_R1(DAUConvForwardCUDA, 2, 1, MAX_OFFSET, 2, 1, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS); 16 | } else { 17 | RUN_KERNEL_R1(DAUConvForwardCUDA, 4, 1, MAX_OFFSET, 4, 1, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 18 | /*printf("Unsupported BATCH SIZE for 1x1 pixels: Supported only a multiple of 16 (at MAX_OFFSET<=4), 8 (at MAX_OFFSET<=8) or 4 images at the moment\n"); */ 19 | /*throw std::exception();*/ 20 | } 21 | } else if (IMG_PATCH_SIZE_W == 8 && WARP_PIXELS_X == 8) { 22 | /* We have 8px WARP_PIXELS_X sizes only for smaller patch sizes - but check just in case (fixing IMG_PATCH_SIZE_W avoids unneeded computation as well) */ 23 | if (BLOCK_IMAGES % 2 == 0) { 24 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 25 | } else { 26 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 27 | } 28 | } else if (WARP_PIXELS_X == 16) { 29 | if (BLOCK_IMAGES % 2 == 0) { 30 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 31 | } else { 32 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 33 | } 34 | } else if (WARP_PIXELS_X == 32) { 35 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 32, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 36 | } else { 37 | printf("Unsupported WARP_PIXELS_X: %d. 
Supported only 16 or 32 at the moment (or 1 when WARP_PIXELS_Y==1 as well) \n", WARP_PIXELS_X); 38 | throw std::exception(); 39 | } 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off32_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_32_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | #define SINGLE_FEATURE true 10 | #define SINGLE_SUBFEATURE true 11 | #define MAX_OFFSET 32 12 | 13 | if (IMG_PATCH_SIZE_W == 1 && IMG_PATCH_SIZE_H == 1 && WARP_PIXELS_X == 1 && WARP_PIXELS_Y == 1) { 14 | RUN_KERNEL_R1(DAUConvForwardCUDA, 4, 1, MAX_OFFSET, 4, 1, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 15 | } else if (IMG_PATCH_SIZE_W == 8 && WARP_PIXELS_X == 8) { 16 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 17 | } else if (WARP_PIXELS_X == 16) { 18 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 19 | } else if (WARP_PIXELS_X == 32) { 20 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 21 | } else { 22 | printf("Unsupported WARP_PIXELS_X: %d. Supported only 16 or 32 at the moment (or 1 when WARP_PIXELS_Y==1 as well) \n", WARP_PIXELS_X); 23 | throw std::exception(); 24 | } 25 | //RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 32, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, false, PARAMS); 26 | } 27 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s0_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, false, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s0_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, true, PARAMS); 11 | } 12 | 
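// Each DAUConv_forward_float_off_* (and DAUConv_backward_*) entry point lives
// in its own .cu translation unit so that nvcc can compile the heavy template
// instantiations in parallel and object sizes stay manageable. The
// RUN_KERNEL_R<n> macros used above turn runtime arguments into compile-time
// template arguments; a minimal sketch of the pattern (hypothetical expansion,
// for illustration only -- the real macros live in dau_conv_forward_core.hpp):
//
//   if      (IMG_PATCH_SIZE_W == 8)  run_kernel< 8, ...>(PARAMS);
//   else if (IMG_PATCH_SIZE_W == 16) run_kernel<16, ...>(PARAMS);
//   else if (IMG_PATCH_SIZE_W == 32) run_kernel<32, ...>(PARAMS);
//
// with each R<n> level resolving one more runtime value into a constant.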
13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s1_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, false, PARAMS); 11 | } 12 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, true, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s0_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, false, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s0_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, true, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s1_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int 
BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, false, PARAMS); 11 | } 12 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, true, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/util/common.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by domen on 3/23/18. 3 | // 4 | 5 | #include "dau_conv/util/common.hpp" 6 | 7 | void __M_Assert(const char* expr_str, bool expr, const char* file, int line, const char* msg) { 8 | if (!expr) { 9 | std::cerr << "Assert failed:\t" << msg << "\n" 10 | << "Expected:\t" << expr_str << "\n" 11 | << "Source:\t\t" << file << ", line " << line << "\n"; 12 | abort(); 13 | } 14 | } 15 | 16 | namespace DAUConvNet { 17 | const char *cublasGetErrorString(cublasStatus_t error) { 18 | switch (error) { 19 | case CUBLAS_STATUS_SUCCESS: 20 | return "CUBLAS_STATUS_SUCCESS"; 21 | case CUBLAS_STATUS_NOT_INITIALIZED: 22 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 23 | case CUBLAS_STATUS_ALLOC_FAILED: 24 | return "CUBLAS_STATUS_ALLOC_FAILED"; 25 | case CUBLAS_STATUS_INVALID_VALUE: 26 | return "CUBLAS_STATUS_INVALID_VALUE"; 27 | case CUBLAS_STATUS_ARCH_MISMATCH: 28 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 29 | case CUBLAS_STATUS_MAPPING_ERROR: 30 | return "CUBLAS_STATUS_MAPPING_ERROR"; 31 | case CUBLAS_STATUS_EXECUTION_FAILED: 32 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 33 | case CUBLAS_STATUS_INTERNAL_ERROR: 34 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 35 | #if CUDA_VERSION >= 6000 36 | case CUBLAS_STATUS_NOT_SUPPORTED: 37 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 38 | #endif 39 | #if CUDA_VERSION >= 6050 40 | case CUBLAS_STATUS_LICENSE_ERROR: 41 | return "CUBLAS_STATUS_LICENSE_ERROR"; 42 | #endif 43 | } 44 | return "Unknown cublas status"; 45 | } 46 | } -------------------------------------------------------------------------------- /src/dau_conv/util/convolve.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************* 2 | * Copyright (c) 2014, ArrayFire 3 | * All rights reserved. 4 | * 5 | * This file is distributed under 3-clause BSD license. 
6 | * The complete license agreement can be obtained at: 7 | * http://arrayfire.com/licenses/BSD-3-Clause 8 | ********************************************************/ 9 | 10 | #include "dau_conv/util/convolve.hpp" 11 | 12 | #include 13 | 14 | namespace DAUConvNet 15 | { 16 | 17 | 18 | template <typename Dtype, int baseDim, bool expand> 19 | void convolve(Dtype* out, const conv2_data_desc& out_desc, 20 | const Dtype* signal, const conv2_data_desc& signal_desc, 21 | const Dtype* filter, const conv2_data_desc& filter_desc, 22 | cudaStream_t streamId ) { 23 | 24 | AF_BATCH_KIND kind; 25 | size_t sn = sizeof(signal_desc.dims) / sizeof(int); 26 | size_t fn = sizeof(filter_desc.dims) / sizeof(int); 27 | 28 | bool sn_stop = false, fn_stop = false; 29 | for (int i = 0; i < 4; ++i) { 30 | if (signal_desc.dims[i] <= 1 && !sn_stop) 31 | sn--; 32 | else 33 | sn_stop = true; 34 | 35 | if (filter_desc.dims[i] <= 1 && !fn_stop) 36 | fn--; 37 | else 38 | fn_stop = true; 39 | } 40 | 41 | if (sn == baseDim && fn == baseDim) 42 | kind = AF_BATCH_NONE; 43 | else if (sn == baseDim && (fn > baseDim && fn <= 4)) 44 | kind = AF_BATCH_RHS; 45 | else if ((sn > baseDim && sn <= 4) && fn == baseDim) 46 | kind = AF_BATCH_LHS; 47 | else if ((sn > baseDim && sn <= 4) && (fn > baseDim && fn <= 4)) { 48 | bool doesDimensionsMatch = true; 49 | bool isInterleaved = true; 50 | for (int i = 3-baseDim; i >= 0; i--) { 51 | doesDimensionsMatch &= (signal_desc.dims[i] == filter_desc.dims[i]); 52 | isInterleaved &= (signal_desc.dims[i] == 1 || filter_desc.dims[i] == 1 || signal_desc.dims[i] == filter_desc.dims[i]); 53 | } 54 | if (doesDimensionsMatch) kind = AF_BATCH_SAME; 55 | else kind = (isInterleaved ? AF_BATCH_DIFF : AF_BATCH_UNSUPPORTED); 56 | } else 57 | kind = AF_BATCH_UNSUPPORTED; 58 | 59 | assert(kind != AF_BATCH_UNSUPPORTED && !(kind == AF_BATCH_DIFF && fn == 4)); 60 | 61 | 62 | 63 | conv2_data_desc out_new_desc; 64 | if (expand) { 65 | for(size_t d=0; d<4; ++d) { 66 | if (kind==AF_BATCH_NONE || kind==AF_BATCH_RHS) { 67 | out_new_desc.dims[d] = signal_desc.dims[d]+filter_desc.dims[d]-1; 68 | } else { 69 | out_new_desc.dims[d] = (d>=baseDim ? signal_desc.dims[d]+filter_desc.dims[d]-1 : signal_desc.dims[d]); 70 | } 71 | } 72 | } else { 73 | out_new_desc = signal_desc; 74 | if (kind==AF_BATCH_RHS) { 75 | for (size_t i=0; i<4- baseDim; ++i) { 76 | out_new_desc.dims[i] = filter_desc.dims[i]; 77 | } 78 | } else if (kind == AF_BATCH_DIFF) { 79 | for (size_t i=0; i<4- baseDim; ++i) { 80 | out_new_desc.dims[i] = signal_desc.dims[i] != 1 ? signal_desc.dims[i] : filter_desc.dims[i]; 81 | } 82 | } 83 | } 84 | 85 | // ensure the output buffer already has the correct size (no reshape is performed here) 86 | bool reshape = false; 87 | for (size_t i=0; i<4; ++i) { 88 | reshape = reshape || out_new_desc.dims[i] != out_desc.dims[i]; 89 | 90 | } 91 | if (reshape) { 92 | // out shape not consistent !!
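// Example of the check above: with expand==true and kind==AF_BATCH_NONE the
// expected dims are signal_desc.dims[d] + filter_desc.dims[d] - 1 in every
// dimension (a "full" convolution), while with expand==false they match the
// signal's dims (same-size convolution); reaching this branch means the
// caller allocated `out` with a different shape.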
93 | printf("Invalid output shape size, expetced shape size: %d,%d,%d,%d.\n", out_new_desc.dims[0], out_new_desc.dims[1], out_new_desc.dims[2], out_new_desc.dims[3]); 94 | throw std::exception(); 95 | } 96 | 97 | 98 | kernel::convolve_nd(out, out_desc, 99 | signal, signal_desc, 100 | filter, filter_desc, kind, streamId); 101 | } 102 | 103 | 104 | template 105 | void caffe_gpu_convolve2(Dtype* out, const conv2_data_desc& out_desc, 106 | const Dtype* signal, const conv2_data_desc& signal_desc, 107 | const Dtype* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId ) { 108 | return convolve(out, out_desc, 109 | signal, signal_desc, 110 | filter, filter_desc, streamId); 111 | } 112 | 113 | template void caffe_gpu_convolve2(float * out, const conv2_data_desc& out_desc, 114 | const float* signal, const conv2_data_desc& signal_desc, 115 | const float* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId ); 116 | 117 | template<> 118 | void caffe_gpu_convolve2(double* out, const conv2_data_desc& out_desc, 119 | const double* signal, const conv2_data_desc& signal_desc, 120 | const double* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId ) { 121 | printf("Disabled compiling of caffe_gpu_convolve2 for double to speed-up compile."); 122 | throw std::exception(); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/dau_conv/util/convolve.cu: -------------------------------------------------------------------------------- 1 | /******************************************************* 2 | * Copyright (c) 2014, ArrayFire 3 | * All rights reserved. 4 | * 5 | * This file is distributed under 3-clause BSD license. 6 | * The complete license agreement can be obtained at: 7 | * http://arrayfire.com/licenses/BSD-3-Clause 8 | ********************************************************/ 9 | 10 | #include "dau_conv/util/convolve.hpp" 11 | 12 | #include "dau_conv/util/common.hpp" 13 | 14 | #include 15 | 16 | #include 17 | 18 | namespace DAUConvNet 19 | { 20 | 21 | namespace kernel 22 | { 23 | 24 | #define divup(a, b) (((a)+(b)-1)/(b)) 25 | 26 | // we do not use CUDA_NUM_THREADS as 256 is more optimal for this function 27 | static const int THREADS = 256; 28 | 29 | static const int THREADS_X = 16; 30 | static const int THREADS_Y = 16; 31 | 32 | //static const int CUBE_X = 8; 33 | //static const int CUBE_Y = 8; 34 | //static const int CUBE_Z = 4; 35 | 36 | // below shared MAX_*_LEN's are calculated based on 37 | // a maximum shared memory configuration of 48KB per block 38 | // considering complex types as well 39 | static const int MAX_CONV1_FILTER_LEN = 129; 40 | static const int MAX_CONV2_FILTER_LEN = 17; 41 | 42 | 43 | // we shall declare the maximum size required of above all three cases 44 | // and re-use the same constant memory locations for every case 45 | __constant__ char cFilter[2*(2*(MAX_CONV1_FILTER_LEN-1)+CUDA_NUM_THREADS)*sizeof(double)]; 46 | 47 | 48 | template 49 | __global__ 50 | void convolve2(T* out, const conv2_data_desc out_desc, 51 | const T* signal, const conv2_data_desc signal_desc, 52 | int nBBS0, int nBBS1, int o2, int o3, int s2, int s3) 53 | { 54 | const size_t C_SIZE = (THREADS_X+2*(fLen0-1))* (THREADS_Y+2*(fLen1-1)); 55 | __shared__ T shrdMem[C_SIZE]; 56 | 57 | const int radius0 = fLen0-1; 58 | const int radius1 = fLen1-1; 59 | const int padding0 = 2*radius0; 60 | const int padding1 = 2*radius1; 61 | const int shrdLen0 = THREADS_X + padding0; 62 | const int shrdLen1 = THREADS_Y + padding1; 63 | 
47 |
48 | template <typename T, int fLen0, int fLen1, int fLen2, bool expand>
49 | __global__
50 | void convolve2(T* out, const conv2_data_desc out_desc,
51 |                const T* signal, const conv2_data_desc signal_desc,
52 |                int nBBS0, int nBBS1, int o2, int o3, int s2, int s3)
53 | {
54 |     const size_t C_SIZE = (THREADS_X+2*(fLen0-1))* (THREADS_Y+2*(fLen1-1));
55 |     __shared__ T shrdMem[C_SIZE];
56 |
57 |     const int radius0  = fLen0-1;
58 |     const int radius1  = fLen1-1;
59 |     const int padding0 = 2*radius0;
60 |     const int padding1 = 2*radius1;
61 |     const int shrdLen0 = THREADS_X + padding0;
62 |     const int shrdLen1 = THREADS_Y + padding1;
63 |
64 |     unsigned b0 = blockIdx.x / nBBS0;
65 |     unsigned b1 = (blockIdx.y + blockIdx.z * gridDim.y) / nBBS1;
66 |     T *dst = (T *)out+ (b0 * out_desc.strides[3-2] + /* activated with batched input signal */
67 |                         o2 * out_desc.strides[3-2] + /* activated with batched input filter */
68 |                         b1 * out_desc.strides[3-3] + /* activated with batched input signal */
69 |                         o3 * out_desc.strides[3-3]); /* activated with batched input filter */
70 |
71 |     const T *src = (const T *)signal + (b0 * signal_desc.strides[3-2] + /* activated with batched input signal */
72 |                                         s2 * signal_desc.strides[3-2] + /* activated with batched input filter */
73 |                                         b1 * signal_desc.strides[3-3] + /* activated with batched input signal */
74 |                                         s3 * signal_desc.strides[3-3]); /* activated with batched input filter */
75 |
76 |     const T *impulse = (const T *)cFilter;
77 |
78 |     int lx = threadIdx.x;
79 |     int ly = threadIdx.y;
80 |     int gx = THREADS_X * (blockIdx.x-b0*nBBS0) + lx;
81 |     int gy = THREADS_Y * ((blockIdx.y + blockIdx.z * gridDim.y) -b1*nBBS1) + ly;
82 |
83 |     if(b1 >= out_desc.dims[3-3])
84 |         return;
85 |
86 |     int s0 = signal_desc.strides[3-0];
87 |     int s1 = signal_desc.strides[3-1];
88 |     int d0 = signal_desc.dims[3-0];
89 |     int d1 = signal_desc.dims[3-1];
90 |     // the loops below are traditional loops; they iterate more than once
91 |     // only when the filter length exceeds the launch size
92 | #pragma unroll
93 |     for (int b=ly, gy2=gy; b<shrdLen1; b+=THREADS_Y, gy2+=THREADS_Y) {
94 |         int j = gy2 - radius1;
95 |         bool is_j = j>=0 && j<d1;
96 | #pragma unroll
97 |         for (int a=lx, gx2=gx; a<shrdLen0; a+=THREADS_X, gx2+=THREADS_X) {
98 |             int i = gx2 - radius0;
99 |             bool is_i = i>=0 && i<d0;
100 |             shrdMem[b*shrdLen0 + a] = (is_i && is_j ? src[j*s1 + i*s0] : T(0));
101 |         }
102 |     }
103 |     __syncthreads();
104 |
105 |     if (gx < out_desc.dims[3-0] && gy < out_desc.dims[3-1]) {
106 |         int ci = lx + radius0 + (expand ? 0 : fLen0>>1);
107 |         int cj = ly + radius1 + (expand ? 0 : fLen1>>1);
108 |
109 |         T accum[fLen2];
110 |         for (int fk = 0; fk < fLen2; ++fk) accum[fk] = T(0);
111 | #pragma unroll
112 |         for(int fj=0; fj<fLen1; ++fj) {
113 | #pragma unroll
114 |             for(int fi=0; fi<fLen0; ++fi) {
115 |                 T s_val = shrdMem[(cj-fj)*shrdLen0 + (ci-fi)];
116 | #pragma unroll
117 |                 for (int fk = 0; fk < fLen2; ++fk) {
118 |                     T f_val = impulse[fk*fLen1*fLen0 + fj*fLen0 + fi];
119 |                     accum[fk] = accum[fk] + s_val * f_val;
120 |                 }
121 |             }
122 |         }
123 | #pragma unroll
124 |         for (int fk = 0; fk < fLen2; ++fk) {
125 |             dst[fk*out_desc.strides[3-2] + gy*out_desc.strides[3-1] + gx] = (T)accum[fk];
126 |         }
127 |     }
128 | }
129 |
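// Editor's sizing check (illustration, not part of the original file): for a
// 16x16 thread block and a 5x5 filter the tile above is
// (16 + 2*(5-1)) * (16 + 2*(5-1)) = 24*24 = 576 floats = 2304 bytes, and even
// the largest supported case, 17x17, needs only 48*48 = 2304 floats = 9216
// bytes, well inside the 48KB budget that MAX_CONV2_FILTER_LEN is chosen for.
static_assert((16 + 2*(17-1)) * (16 + 2*(17-1)) * sizeof(float) <= 48*1024,
              "largest conv2 shared-memory tile must fit in 48KB");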
152 | template <typename T>
153 | void prepareKernelArgs(conv_kparam_t &params, const int* oDims, const int* fDims, int baseDim)
154 | {
155 |     int batchDims[4] = {1, 1, 1, 1};
156 |     for(int i=0; i<4-baseDim; ++i) {
157 |         batchDims[i] = (params.launchMoreBlocks ? 1 : oDims[i]);
158 |     }
159 |
160 |     const int maxBlocksY = 64*1024-1; //cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
161 |     if (baseDim==1) {
162 |         // unsupported
163 |     } else if (baseDim==2) {
164 |         params.mThreads = dim3(THREADS_X, THREADS_Y);
165 |         params.mBlk_x = divup(oDims[3-0], params.mThreads.x);
166 |         params.mBlk_y = divup(oDims[3-1], params.mThreads.y);
167 |         params.mBlocks = dim3(params.mBlk_x * batchDims[3-2], params.mBlk_y * batchDims[3-3]);
168 |         params.mBlocks.z = divup(params.mBlocks.y, maxBlocksY);
169 |         params.mBlocks.y = divup(params.mBlocks.y, params.mBlocks.z);
170 |     } else if (baseDim==3) {
171 |         // unsupported
172 |     }
173 | }
174 |
175 |
176 | template <typename Dtype, bool expand, int f0, int f1, int f2>
177 | void conv2Helper(const conv_kparam_t &p,
178 |                  Dtype* out, const conv2_data_desc& out_desc,
179 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
180 |                  cudaStream_t streamId)
181 | {
182 |     convolve2<Dtype, f0, f1, f2, expand><<<p.mBlocks, p.mThreads, 0, streamId>>>(out, out_desc, sig, sig_desc,
183 |                                          p.mBlk_x, p.mBlk_y, p.o[1], p.o[2], p.s[1], p.s[2]);
184 |
185 |     CUDA_POST_KERNEL_CHECK;
186 | }
187 |
188 | template <typename Dtype, bool expand, int f0, int f1>
189 | void conv2Helper(const conv_kparam_t &p,
190 |                  Dtype* out, const conv2_data_desc& out_desc,
191 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
192 |                  int f2, cudaStream_t streamId)
193 | {
194 |     switch(f2) {
195 |         case 1: conv2Helper<Dtype, expand, f0, f1, 1>(p, out, out_desc, sig, sig_desc, streamId); break;
196 |         case 3: conv2Helper<Dtype, expand, f0, f1, 3>(p, out, out_desc, sig, sig_desc, streamId); break;
197 |         case 4: conv2Helper<Dtype, expand, f0, f1, 4>(p, out, out_desc, sig, sig_desc, streamId); break;
198 |         default: printf("Unsupported batched-filter third dimension. Supported only [1 x K x K], [3 x K x K] and [4 x K x K].\n"); throw std::exception();
199 |     }
200 | }
201 |
202 | template <typename Dtype, bool expand, int f0>
203 | void conv2Helper(const conv_kparam_t &p,
204 |                  Dtype* out, const conv2_data_desc& out_desc,
205 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
206 |                  int f1, int f2, cudaStream_t streamId)
207 | {
208 |     switch(f1) {
209 |         case 1: conv2Helper<Dtype, expand, f0, 1>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
210 |         case 2: conv2Helper<Dtype, expand, f0, 2>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
211 |         case 3: conv2Helper<Dtype, expand, f0, 3>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
212 |         case 4: conv2Helper<Dtype, expand, f0, 4>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
213 |         case 5: conv2Helper<Dtype, expand, f0, 5>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
214 |         default: printf("Unsupported filter size in caffe_gpu_convolve2. Supported only up to 5x5 for mismatched width/height sizes.\n"); throw std::exception();
215 |     }
216 | }
217 |
218 | template <typename Dtype, bool expand>
219 | void conv2Helper(const conv_kparam_t &p,
220 |                  Dtype* out, const conv2_data_desc& out_desc,
221 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
222 |                  int f0, int f1, int f2, cudaStream_t streamId)
223 | {
224 |     switch(f0) {
225 |         case 1: conv2Helper<Dtype, expand, 1>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
226 |         case 2: conv2Helper<Dtype, expand, 2>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
227 |         case 3: conv2Helper<Dtype, expand, 3>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
228 |         case 4: conv2Helper<Dtype, expand, 4>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
229 |         case 5: conv2Helper<Dtype, expand, 5>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
230 |         default: {
231 |             if (f0==f1) {
232 |                 switch(f1) {
233 |                     case 6:  conv2Helper<Dtype, expand, 6, 6>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
234 |                     case 7:  conv2Helper<Dtype, expand, 7, 7>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
235 |                     case 8:  conv2Helper<Dtype, expand, 8, 8>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
236 |                     case 9:  conv2Helper<Dtype, expand, 9, 9>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
237 |                     case 10: conv2Helper<Dtype, expand, 10, 10>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
238 |                     case 11: conv2Helper<Dtype, expand, 11, 11>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
239 |                     case 12: conv2Helper<Dtype, expand, 12, 12>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
240 |                     case 13: conv2Helper<Dtype, expand, 13, 13>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
241 |                     case 14: conv2Helper<Dtype, expand, 14, 14>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
242 |                     case 15: conv2Helper<Dtype, expand, 15, 15>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
243 |                     case 16: conv2Helper<Dtype, expand, 16, 16>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
244 |                     case 17: conv2Helper<Dtype, expand, 17, 17>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
245 |                     default: printf("Unsupported filter size in caffe_gpu_convolve2. Supported up to 17x17.\n"); throw std::exception();
246 |                 }
247 |             } else {
248 |                 printf("Unsupported filter size in caffe_gpu_convolve2. Supported only up to 5x5 for mismatched width/height sizes.\n"); throw std::exception();
249 |             }
250 |         } break;
251 |     }
252 | }
253 |
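// Editor's note (illustration): the switch ladders exist because fLen0/fLen1/
// fLen2 are template parameters of convolve2; the kernel sizes its __shared__
// tile and unrolls its loops with them, and both need compile-time constants.
// A runtime filter shape is therefore funneled into a fixed instantiation,
// e.g. a batch of three 5x5 filters resolves to:
//
//   conv2Helper<float, false, 5, 5, 3>(p, out, out_desc, sig, sig_desc, stream);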
254 | template <typename Dtype, bool expand>
255 | void convolve_2d(conv_kparam_t &p,
256 |                  Dtype* out, const conv2_data_desc& out_desc,
257 |                  const Dtype* signal, const conv2_data_desc& signal_desc,
258 |                  const Dtype* filt, const conv2_data_desc& filt_desc, cudaStream_t streamId)
259 | {
260 |     prepareKernelArgs<Dtype>(p, signal_desc.dims, filt_desc.dims, 2);
261 |
262 |     int filterLen = filt_desc.dims[3-0] * filt_desc.dims[3-1];
263 |
264 |     for (int b3=0; b3<filt_desc.dims[3-3]; ++b3) {
265 |         int f3Off = b3 * filt_desc.strides[3-3];
266 |
267 |         // when the whole third filter dimension fits one of the batched
268 |         // kernel instantiations, upload all slices and convolve them per launch
269 |         if (filt_desc.dims[3-2] == 1 || filt_desc.dims[3-2] == 3 || filt_desc.dims[3-2] == 4) {
270 |             CUDA_CHECK(cudaMemcpyToSymbolAsync(cFilter, filt + f3Off,
271 |                                                filterLen * filt_desc.dims[3-2] * sizeof(Dtype),
272 |                                                0, cudaMemcpyDeviceToDevice, streamId));
273 |
274 |             p.o[1] = 0;
275 |             p.o[2] = (p.outHasNoOffset ? 0 : b3);
276 |             p.s[1] = 0;
277 |             p.s[2] = (p.inHasNoOffset ? b3 : 0);
278 |
279 |             conv2Helper<Dtype, expand>(p, out, out_desc, signal, signal_desc, filt_desc.dims[3-0], filt_desc.dims[3-1], filt_desc.dims[3-2], streamId);
280 |
281 |
282 |         } else {
283 |             for (int b2=0; b2<filt_desc.dims[3-2]; ++b2) {
284 |                 int f2Off = b2 * filt_desc.strides[3-2];
285 |
286 |                 CUDA_CHECK(cudaMemcpyToSymbolAsync(cFilter, filt + (f2Off + f3Off),
287 |                                                    filterLen * sizeof(Dtype),
288 |                                                    0, cudaMemcpyDeviceToDevice, streamId));
289 |
290 |                 p.o[1] = (p.outHasNoOffset ? 0 : b2);
291 |                 p.o[2] = (p.outHasNoOffset ? 0 : b3);
292 |                 p.s[1] = (p.inHasNoOffset ? b2 : 0);
293 |                 p.s[2] = (p.inHasNoOffset ? b3 : 0);
294 |
295 |                 conv2Helper<Dtype, expand>(p, out, out_desc, signal, signal_desc, filt_desc.dims[3-0], filt_desc.dims[3-1], 1, streamId);
299 |             }
300 |         }
301 |
302 |
303 |     }
304 | }
305 |
306 |
307 | template <typename Dtype, int baseDim, bool expand>
308 | void convolve_nd(Dtype* out, const conv2_data_desc& out_desc,
309 |                  const Dtype* signal, const conv2_data_desc& signal_desc,
310 |                  const Dtype* filt, const conv2_data_desc& filt_desc,
311 |                  AF_BATCH_KIND kind, cudaStream_t streamId)
312 | {
313 |     bool callKernel = true;
314 |
315 |
316 |     int MCFL2 = kernel::MAX_CONV2_FILTER_LEN;
317 |     switch(baseDim) {
318 |         case 2: if ((filt_desc.dims[3]*filt_desc.dims[2]) > (MCFL2 * MCFL2)) callKernel = false; break;
319 |     }
320 |
321 |     if (!callKernel) {
322 |         printf("Unsupported filter dimensions. Only 2-dimensional filters (with the third dimension as batch) are supported.\n"); throw std::exception();
323 |     }
324 |
325 |     conv_kparam_t param;
326 |     for (int i=0; i<3; ++i) {
327 |         param.o[i] = 0;
328 |         param.s[i] = 0;
329 |     }
330 |     param.launchMoreBlocks = kind==AF_BATCH_SAME || kind==AF_BATCH_RHS;
331 |     param.outHasNoOffset = kind==AF_BATCH_LHS || kind==AF_BATCH_NONE;
332 |     param.inHasNoOffset = kind!=AF_BATCH_SAME;
333 |
334 |     switch(baseDim) {
335 |         case 2: convolve_2d<Dtype, expand>(param, out, out_desc, signal, signal_desc, filt, filt_desc, streamId); break;
336 |     }
337 |
338 | }
339 |
340 | #define INSTANTIATE(T) \
341 |     template void convolve_nd<T, 2, true>(T* out, const conv2_data_desc& out_desc,\
342 |         const T* signal, const conv2_data_desc& signal_desc,\
343 |         const T* filt, const conv2_data_desc& filt_desc, AF_BATCH_KIND kind, cudaStream_t streamId); \
344 |     template void convolve_nd<T, 2, false>(T* out, const conv2_data_desc& out_desc, \
345 |         const T* signal, const conv2_data_desc& signal_desc, \
346 |         const T* filt, const conv2_data_desc& filt_desc, AF_BATCH_KIND kind, cudaStream_t streamId);\
347 |
348 |
349 | INSTANTIATE(float)
350 |
351 | }
352 |
353 | }
354 |
--------------------------------------------------------------------------------
/src/dau_conv/util/im2col.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 |
3 | #include "dau_conv/util/im2col.hpp"
4 | #include "dau_conv/util/math_functions.hpp"
5 |
6 | using namespace std;
7 |
8 | namespace DAUConvNet {
9 |
10 | // This function casts int to unsigned so that a single comparison checks
11 | // whether the value of parameter a is both greater than or equal to zero and
12 | // lower than the value of parameter b. Parameter b is signed but always
13 | // positive, so its value is always below 0x800..., whereas casting a negative
14 | // a converts it to a value above 0x800....
15 | // The cast therefore allows one condition to be used instead of two.
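// Editor's worked example of the trick above (illustration): for a = -1 the
// cast wraps to the top of the unsigned range, so a single unsigned
// comparison rejects negative indices on a 32-bit int platform:
//   is_a_ge_zero_and_a_lt_b(-1, 5) == (4294967295u < 5u) == false
//   is_a_ge_zero_and_a_lt_b( 3, 5) == (3u < 5u)          == true
static_assert(static_cast<unsigned>(-1) > static_cast<unsigned>(5),
              "casting a negative int to unsigned wraps above any positive b");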
16 | inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
17 |   return static_cast<unsigned>(a) < static_cast<unsigned>(b);
18 | }
19 |
20 | template <typename Dtype>
21 | void im2col_cpu(const Dtype* data_im, const int channels,
22 |     const int height, const int width, const int kernel_h, const int kernel_w,
23 |     const int pad_h, const int pad_w,
24 |     const int stride_h, const int stride_w,
25 |     const int dilation_h, const int dilation_w,
26 |     Dtype* data_col) {
27 |   const int output_h = (height + 2 * pad_h -
28 |     (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
29 |   const int output_w = (width + 2 * pad_w -
30 |     (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
31 |   const int channel_size = height * width;
32 |   for (int channel = channels; channel--; data_im += channel_size) {
33 |     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
34 |       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
35 |         int input_row = -pad_h + kernel_row * dilation_h;
36 |         for (int output_rows = output_h; output_rows; output_rows--) {
37 |           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
38 |             for (int output_cols = output_w; output_cols; output_cols--) {
39 |               *(data_col++) = 0;
40 |             }
41 |           } else {
42 |             int input_col = -pad_w + kernel_col * dilation_w;
43 |             for (int output_col = output_w; output_col; output_col--) {
44 |               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
45 |                 *(data_col++) = data_im[input_row * width + input_col];
46 |               } else {
47 |                 *(data_col++) = 0;
48 |               }
49 |               input_col += stride_w;
50 |             }
51 |           }
52 |           input_row += stride_h;
53 |         }
54 |       }
55 |     }
56 |   }
57 | }
58 |
59 | // Explicit instantiation
60 | template void im2col_cpu<float>(const float* data_im, const int channels,
61 |     const int height, const int width, const int kernel_h, const int kernel_w,
62 |     const int pad_h, const int pad_w, const int stride_h,
63 |     const int stride_w, const int dilation_h, const int dilation_w,
64 |     float* data_col);
65 | template void im2col_cpu<double>(const double* data_im, const int channels,
66 |     const int height, const int width, const int kernel_h, const int kernel_w,
67 |     const int pad_h, const int pad_w, const int stride_h,
68 |     const int stride_w, const int dilation_h, const int dilation_w,
69 |     double* data_col);
70 |
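// Editor's usage sketch (not part of the original file): unroll one 4x4
// channel with a 3x3 kernel, stride 1, pad 1, dilation 1. The output spatial
// size is (4 + 2*1 - (1*(3-1)+1))/1 + 1 = 4, so data_col must hold
// channels*kernel_h*kernel_w x output_h*output_w = 9 x 16 values.
static void im2col_usage_sketch(const float* data_im /* one 4x4 channel */) {
  float data_col[9 * 16];
  im2col_cpu<float>(data_im, /*channels=*/1, /*height=*/4, /*width=*/4,
                    /*kernel_h=*/3, /*kernel_w=*/3, /*pad_h=*/1, /*pad_w=*/1,
                    /*stride_h=*/1, /*stride_w=*/1,
                    /*dilation_h=*/1, /*dilation_w=*/1, data_col);
  (void)data_col;
}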
71 | template <typename Dtype>
72 | inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
73 |     const int num_spatial_axes, const int* im_shape, const int* col_shape,
74 |     const int* kernel_shape, const int* pad, const int* stride,
75 |     const int* dilation, Dtype* data_output) {
76 |   if (!im2col) {
77 |     int im_size = im_shape[0];
78 |     for (int i = 0; i < num_spatial_axes; ++i) {
79 |       im_size *= im_shape[1 + i];
80 |     }
81 |     caffe_set(im_size, Dtype(0), data_output);
82 |   }
83 |   int kernel_size = 1;
84 |   for (int i = 0; i < num_spatial_axes; ++i) {
85 |     kernel_size *= kernel_shape[i];
86 |   }
87 |   const int channels_col = col_shape[0];
88 |   vector<int> d_offset(num_spatial_axes, 0);
89 |   vector<int> d_iter(num_spatial_axes, 0);
90 |   for (int c_col = 0; c_col < channels_col; ++c_col) {
91 |     // Loop over spatial axes in reverse order to compute a per-axis offset.
92 |     int offset = c_col;
93 |     for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
94 |       if (d_i < num_spatial_axes - 1) {
95 |         offset /= kernel_shape[d_i + 1];
96 |       }
97 |       d_offset[d_i] = offset % kernel_shape[d_i];
98 |     }
99 |     for (bool incremented = true; incremented; ) {
100 |       // Loop over spatial axes in forward order to compute the indices in the
101 |       // image and column, and whether the index lies in the padding.
102 |       int index_col = c_col;
103 |       int index_im = c_col / kernel_size;
104 |       bool is_padding = false;
105 |       for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
106 |         const int d = d_iter[d_i];
107 |         const int d_im = d * stride[d_i] - pad[d_i] +
108 |             d_offset[d_i] * dilation[d_i];
109 |         is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
110 |         index_col *= col_shape[d_i + 1];
111 |         index_col += d;
112 |         index_im *= im_shape[d_i + 1];
113 |         index_im += d_im;
114 |       }
115 |       if (im2col) {
116 |         if (is_padding) {
117 |           data_output[index_col] = 0;
118 |         } else {
119 |           data_output[index_col] = data_input[index_im];
120 |         }
121 |       } else if (!is_padding) {  // col2im
122 |         data_output[index_im] += data_input[index_col];
123 |       }
124 |       // Loop over spatial axes in reverse order to choose an index,
125 |       // like counting.
126 |       incremented = false;
127 |       for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
128 |         const int d_max = col_shape[d_i + 1];
129 |         M_Assert(d_iter[d_i] < d_max, "");
130 |         if (d_iter[d_i] == d_max - 1) {
131 |           d_iter[d_i] = 0;
132 |         } else {  // d_iter[d_i] < d_max - 1
133 |           ++d_iter[d_i];
134 |           incremented = true;
135 |           break;
136 |         }
137 |       }
138 |     }  // while(incremented) {
139 |   }  // for (int c = 0; c < channels_col; ++c) {
140 | }
141 |
142 | template <typename Dtype>
143 | void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
144 |     const int* im_shape, const int* col_shape,
145 |     const int* kernel_shape, const int* pad, const int* stride,
146 |     const int* dilation, Dtype* data_col) {
147 |   const bool kIm2Col = true;
148 |   im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape,
149 |                      kernel_shape, pad, stride, dilation, data_col);
150 | }
151 |
152 | // Explicit instantiation
153 | template void im2col_nd_cpu<float>(const float* data_im,
154 |     const int num_spatial_axes,
155 |     const int* im_shape, const int* col_shape,
156 |     const int* kernel_shape, const int* pad, const int* stride,
157 |     const int* dilation, float* data_col);
158 | template void im2col_nd_cpu<double>(const double* data_im,
159 |     const int num_spatial_axes,
160 |     const int* im_shape, const int* col_shape,
161 |     const int* kernel_shape, const int* pad, const int* stride,
162 |     const int* dilation, double* data_col);
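// Editor's worked example of the per-axis offset decomposition above
// (illustration): for a 2-D kernel_shape = {3, 3}, column channel c_col = 7
// decodes, via the reverse-order loop, to kernel offsets (row 2, col 1) of
// source image channel 0:
//   d_offset[1] = 7 % 3 = 1;   d_offset[0] = (7 / 3) % 3 = 2;
//   c_im        = 7 / (3*3) = 0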
163 |
164 | template <typename Dtype>
165 | void col2im_cpu(const Dtype* data_col, const int channels,
166 |     const int height, const int width, const int kernel_h, const int kernel_w,
167 |     const int pad_h, const int pad_w,
168 |     const int stride_h, const int stride_w,
169 |     const int dilation_h, const int dilation_w,
170 |     Dtype* data_im) {
171 |   caffe_set(height * width * channels, Dtype(0), data_im);
172 |   const int output_h = (height + 2 * pad_h -
173 |     (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
174 |   const int output_w = (width + 2 * pad_w -
175 |     (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
176 |   const int channel_size = height * width;
177 |   for (int channel = channels; channel--; data_im += channel_size) {
178 |     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
179 |       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
180 |         int input_row = -pad_h + kernel_row * dilation_h;
181 |         for (int output_rows = output_h; output_rows; output_rows--) {
182 |           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
183 |             data_col += output_w;
184 |           } else {
185 |             int input_col = -pad_w + kernel_col * dilation_w;
186 |             for (int output_col = output_w; output_col; output_col--) {
187 |               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
188 |                 data_im[input_row * width + input_col] += *data_col;
189 |               }
190 |               data_col++;
191 |               input_col += stride_w;
192 |             }
193 |           }
194 |           input_row += stride_h;
195 |         }
196 |       }
197 |     }
198 |   }
199 | }
200 |
201 | // Explicit instantiation
202 | template void col2im_cpu<float>(const float* data_col, const int channels,
203 |     const int height, const int width, const int kernel_h, const int kernel_w,
204 |     const int pad_h, const int pad_w, const int stride_h,
205 |     const int stride_w, const int dilation_h, const int dilation_w,
206 |     float* data_im);
207 | template void col2im_cpu<double>(const double* data_col, const int channels,
208 |     const int height, const int width, const int kernel_h, const int kernel_w,
209 |     const int pad_h, const int pad_w, const int stride_h,
210 |     const int stride_w, const int dilation_h, const int dilation_w,
211 |     double* data_im);
212 |
213 | template <typename Dtype>
214 | void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
215 |     const int* im_shape, const int* col_shape,
216 |     const int* kernel_shape, const int* pad, const int* stride,
217 |     const int* dilation, Dtype* data_im) {
218 |   const bool kIm2Col = false;
219 |   im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape,
220 |                      kernel_shape, pad, stride, dilation, data_im);
221 | }
222 |
223 | // Explicit instantiation
224 | template void col2im_nd_cpu<float>(const float* data_col,
225 |     const int num_spatial_axes,
226 |     const int* im_shape, const int* col_shape,
227 |     const int* kernel_shape, const int* pad, const int* stride,
228 |     const int* dilation, float* data_im);
229 | template void col2im_nd_cpu<double>(const double* data_col,
230 |     const int num_spatial_axes,
231 |     const int* im_shape, const int* col_shape,
232 |     const int* kernel_shape, const int* pad, const int* stride,
233 |     const int* dilation, double* data_im);
234 |
235 |
236 | } // namespace DAUConvNet
237 |
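Note that col2im_cpu accumulates with += because a pixel appears in every window that covered it; it is the adjoint of im2col_cpu rather than its inverse, which is exactly what a backward pass needs. A small round-trip sketch (editor's illustration; variable names hypothetical):

float ones[16], cols[9 * 16], back[16];
for (int i = 0; i < 16; ++i) ones[i] = 1.0f;
// unroll an all-ones 4x4 image (3x3 kernel, stride 1, pad 1, dilation 1) ...
DAUConvNet::im2col_cpu<float>(ones, 1, 4, 4, 3, 3, 1, 1, 1, 1, 1, 1, cols);
// ... and scatter it back: each pixel receives one contribution per window
DAUConvNet::col2im_cpu<float>(cols, 1, 4, 4, 3, 3, 1, 1, 1, 1, 1, 1, back);
// back[5] == 9.0f (interior pixel, 9 covering windows); back[0] == 4.0f (corner)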
--------------------------------------------------------------------------------
/src/dau_conv/util/math_functions.cpp:
--------------------------------------------------------------------------------
1 | #include <cstring>
2 |
3 | #include "dau_conv/util/common.hpp"
4 | #include "dau_conv/util/math_functions.hpp"
5 |
6 | namespace DAUConvNet {
7 |
8 |
9 | template<>
10 | void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
11 |     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
12 |     const float alpha, const float* A, const float* B, const float beta,
13 |     float* C) {
14 |   int lda = (TransA == CblasNoTrans) ? K : M;
15 |   int ldb = (TransB == CblasNoTrans) ? N : K;
16 |   cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
17 |               ldb, beta, C, N);
18 | }
19 |
20 | template<>
21 | void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
22 |     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
23 |     const double alpha, const double* A, const double* B, const double beta,
24 |     double* C) {
25 |   int lda = (TransA == CblasNoTrans) ? K : M;
26 |   int ldb = (TransB == CblasNoTrans) ? N : K;
27 |   cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
28 |               ldb, beta, C, N);
29 | }
30 |
31 | template <>
32 | void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
33 |     const int N, const float alpha, const float* A, const float* x,
34 |     const float beta, float* y) {
35 |   cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
36 | }
37 |
38 | template <>
39 | void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
40 |     const int N, const double alpha, const double* A, const double* x,
41 |     const double beta, double* y) {
42 |   cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
43 | }
44 |
45 | template <>
46 | void caffe_axpy<float>(const int N, const float alpha, const float* X,
47 |     float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); }
48 |
49 | template <>
50 | void caffe_axpy<double>(const int N, const double alpha, const double* X,
51 |     double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
52 |
53 | template <typename Dtype>
54 | void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
55 |   if (alpha == 0) {
56 |     memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(dau_conv_impl/alt_fn)
57 |     return;
58 |   }
59 |   for (int i = 0; i < N; ++i) {
60 |     Y[i] = alpha;
61 |   }
62 | }
63 |
64 | template void caffe_set<int>(const int N, const int alpha, int* Y);
65 | template void caffe_set<float>(const int N, const float alpha, float* Y);
66 | template void caffe_set<double>(const int N, const double alpha, double* Y);
67 |
68 | template <>
69 | void caffe_add_scalar<float>(const int N, const float alpha, float* Y) {
70 |   for (int i = 0; i < N; ++i) {
71 |     Y[i] += alpha;
72 |   }
73 | }
74 |
75 | template <>
76 | void caffe_add_scalar<double>(const int N, const double alpha, double* Y) {
77 |   for (int i = 0; i < N; ++i) {
78 |     Y[i] += alpha;
79 |   }
80 | }
81 | template <>
82 | void caffe_scal<float>(const int N, const float alpha, float *X) {
83 |   cblas_sscal(N, alpha, X, 1);
84 | }
85 |
86 | template <>
87 | void caffe_scal<double>(const int N, const double alpha, double *X) {
88 |   cblas_dscal(N, alpha, X, 1);
89 | }
90 |
91 | template <>
92 | void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
93 |     const float beta, float* Y) {
94 |   cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
95 | }
96 |
97 | template <>
98 | void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
99 |     const double beta, double* Y) {
100 |   cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
101 | }
102 |
103 | template <>
104 | void caffe_add<float>(const int n, const float* a, const float* b,
105 |     float* y) {
106 |   vsAdd(n, a, b, y);
107 | }
108 |
109 | template <>
110 | void caffe_add<double>(const int n, const double* a, const double* b,
111 |     double* y) {
112 |   vdAdd(n, a, b, y);
113 | }
114 |
115 | template <>
116 | void caffe_sub<float>(const int n, const float* a, const float* b,
117 |     float* y) {
118 |   vsSub(n, a, b, y);
119 | }
120 |
121 | template <>
122 | void caffe_sub<double>(const int n, const double* a, const double* b,
123 |     double* y) {
124 |   vdSub(n, a, b, y);
125 | }
126 |
127 | template <>
128 | void caffe_mul<float>(const int n, const float* a, const float* b,
129 |     float* y) {
130 |   vsMul(n, a, b, y);
131 | }
132 |
133 | template <>
134 | void caffe_mul<double>(const int n, const double* a, const double* b,
135 |     double* y) {
136 |   vdMul(n, a, b, y);
137 | }
138 |
139 | template <>
140 | void caffe_div<float>(const int n, const float* a, const float* b,
141 |     float* y) {
142 |   vsDiv(n, a, b, y);
143 | }
144 |
145 | template <>
146 | void caffe_div<double>(const int n, const double* a, const double* b,
147 |     double* y) {
148 |   vdDiv(n, a, b, y);
149 | }
150 |
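// Editor's usage sketch (not part of the original file): the GEMM wrapper
// above uses row-major BLAS semantics, C = alpha*op(A)*op(B) + beta*C with
// C of size M x N and K the shared inner dimension. The vs*/vd* element-wise
// calls are MKL VML routines, with fallbacks in util/mkl_alternate.hpp.
static void gemm_usage_sketch() {
  const float A[6] = {1, 2, 3, 4, 5, 6};  // 2x3: [[1,2,3],[4,5,6]]
  const float B[6] = {1, 0, 0, 1, 1, 1};  // 3x2: [[1,0],[0,1],[1,1]]
  float C[4];                             // 2x2 result: {4, 5, 10, 11}
  caffe_cpu_gemm<float>(CblasNoTrans, CblasNoTrans,
                        /*M=*/2, /*N=*/2, /*K=*/3, 1.f, A, B, 0.f, C);
}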
151 | template <>
152 | void caffe_powx<float>(const int n, const float* a, const float b,
153 |     float* y) {
154 |   vsPowx(n, a, b, y);
155 | }
156 |
157 | template <>
158 | void caffe_powx<double>(const int n, const double* a, const double b,
159 |     double* y) {
160 |   vdPowx(n, a, b, y);
161 | }
162 |
163 | template <>
164 | void caffe_sqr<float>(const int n, const float* a, float* y) {
165 |   vsSqr(n, a, y);
166 | }
167 |
168 | template <>
169 | void caffe_sqr<double>(const int n, const double* a, double* y) {
170 |   vdSqr(n, a, y);
171 | }
172 |
173 | template <>
174 | void caffe_sqrt<float>(const int n, const float* a, float* y) {
175 |   vsSqrt(n, a, y);
176 | }
177 |
178 | template <>
179 | void caffe_sqrt<double>(const int n, const double* a, double* y) {
180 |   vdSqrt(n, a, y);
181 | }
182 |
183 | template <>
184 | void caffe_exp<float>(const int n, const float* a, float* y) {
185 |   vsExp(n, a, y);
186 | }
187 |
188 | template <>
189 | void caffe_exp<double>(const int n, const double* a, double* y) {
190 |   vdExp(n, a, y);
191 | }
192 |
193 | template <>
194 | void caffe_log<float>(const int n, const float* a, float* y) {
195 |   vsLn(n, a, y);
196 | }
197 |
198 | template <>
199 | void caffe_log<double>(const int n, const double* a, double* y) {
200 |   vdLn(n, a, y);
201 | }
202 |
203 | template <>
204 | void caffe_abs<float>(const int n, const float* a, float* y) {
205 |   vsAbs(n, a, y);
206 | }
207 |
208 | template <>
209 | void caffe_abs<double>(const int n, const double* a, double* y) {
210 |   vdAbs(n, a, y);
211 | }
212 |
213 | template <>
214 | float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
215 |     const float* y, const int incy) {
216 |   return cblas_sdot(n, x, incx, y, incy);
217 | }
218 |
219 | template <>
220 | double caffe_cpu_strided_dot<double>(const int n, const double* x,
221 |     const int incx, const double* y, const int incy) {
222 |   return cblas_ddot(n, x, incx, y, incy);
223 | }
224 |
225 | template <typename Dtype>
226 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) {
227 |   return caffe_cpu_strided_dot(n, x, 1, y, 1);
228 | }
229 |
230 | template
231 | float caffe_cpu_dot<float>(const int n, const float* x, const float* y);
232 |
233 | template
234 | double caffe_cpu_dot<double>(const int n, const double* x, const double* y);
235 |
236 | template <>
237 | float caffe_cpu_asum<float>(const int n, const float* x) {
238 |   return cblas_sasum(n, x, 1);
239 | }
240 |
241 | template <>
242 | double caffe_cpu_asum<double>(const int n, const double* x) {
243 |   return cblas_dasum(n, x, 1);
244 | }
245 |
246 | template <>
247 | void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
248 |     float* y) {
249 |   cblas_scopy(n, x, 1, y, 1);
250 |   cblas_sscal(n, alpha, y, 1);
251 | }
252 |
253 | template <>
254 | void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
255 |     double* y) {
256 |   cblas_dcopy(n, x, 1, y, 1);
257 |   cblas_dscal(n, alpha, y, 1);
258 | }
259 | } // namespace DAUConvNet
260 |
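A quick semantics check for the copy-then-scale and dot wrappers above (editor's illustration, not from the sources); note that caffe_cpu_scale leaves its input untouched:

float x[3] = {1.f, 2.f, 3.f};
float y[3];
DAUConvNet::caffe_cpu_scale<float>(3, 2.f, x, y);      // y = {2, 4, 6}, x unchanged
float d = DAUConvNet::caffe_cpu_dot<float>(3, x, y);   // 1*2 + 2*4 + 3*6 = 28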
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by domen on 3/21/18.
3 | //
4 |
5 | #include <vector>
6 | #include "dau_conv/base_dau_conv_layer.hpp"
7 |
8 |
9 | int main(int argc, char** argv) {
10 |
11 |     /*
12 |     DAUConvSettings layer_param;
13 |
14 |     DAUConvLayerCaffeGPU<float> layer;
15 |     const int N = 128;
16 |     const int S = 32;
17 |     const int F = 64;
18 |     const int H = 64;
19 |     const int W = 64;
20 |
21 |     Blob<float> input(N,S,H,W);
22 |     Blob<float> output(N,F,H,W);
23 |
24 |     vector<Blob<float>*> top;
25 |     vector<bool> param_propagate_down;
26 |
27 |     DAUKernelComputeGPU<float>* dau_kernel_compute = new DAUKernelComputeGPU<float>();
28 |     DAUKernelParamsGPU<float>* dau_kernel_params = new DAUKernelParamsGPU<float>();
29 |     DAUKernelOutputGPU<float>* dau_kernel_output = new DAUKernelOutputGPU<float>();
30 |
31 |
32 |     layer.LayerSetUp(layer_param,
33 |                      dau_kernel_compute, dau_kernel_params, dau_kernel_output,
34 |                      param_propagate_down, input.shape());
35 |
36 |     layer.Reshape(input.shape(), output.shape());
37 |
38 |     layer.Forward_gpu(input.gpu_data(), input.shape(), output.mutable_gpu_data(), output.shape());
39 |
40 |     layer.Backward_gpu(output.gpu_data(), output.gpu_diff(), output.shape(), true,
41 |                        input.gpu_data(), input.mutable_gpu_diff(), input.shape());
42 |     */
43 | }
--------------------------------------------------------------------------------