├── .gitmodules
├── CMakeLists.txt
├── README.md
├── cmake
│   ├── ConfigGen.cmake
│   ├── Cuda.cmake
│   ├── Dependencies.cmake
│   ├── Modules
│   │   ├── FindAtlas.cmake
│   │   └── FindOpenBLAS.cmake
│   ├── Targets.cmake
│   ├── Templates
│   │   ├── DAUConvNetConfig.cmake.in
│   │   └── dau_conv_config.h.in
│   └── Utils.cmake
├── include
│   └── dau_conv
│       ├── base_dau_conv_layer.hpp
│       ├── dau_conv_impl
│       │   ├── dau_conv_backward.hpp
│       │   ├── dau_conv_backward_core.hpp
│       │   ├── dau_conv_forward.hpp
│       │   └── dau_conv_forward_core.hpp
│       └── util
│           ├── common.hpp
│           ├── convolve.hpp
│           ├── im2col.hpp
│           ├── math_functions.hpp
│           └── mkl_alternate.hpp
├── plugins
│   └── tensorflow
│       ├── CMakeLists.txt
│       ├── MANIFEST.in.in
│       ├── build-ci
│       │   └── build-whl.sh
│       ├── dau_conv
│       │   ├── __init__.py
│       │   ├── _dau_conv_grad_op.py
│       │   ├── dau_conv.py
│       │   └── test
│       │       └── __main__.py
│       ├── docker
│       │   ├── Dockerfile
│       │   ├── Dockerfile.ubuntu18.04
│       │   ├── test_dau.sh
│       │   └── verify_dau_import.py
│       ├── scripts
│       │   └── start_main_build.sh
│       ├── setup.py.in
│       └── src
│           ├── dau_conv_grad_op.cpp
│           ├── dau_conv_layer_tensorflow.cpp
│           ├── dau_conv_layer_tensorflow.hpp
│           └── dau_conv_op.cpp
└── src
    ├── dau_conv
    │   ├── CMakeLists.txt
    │   ├── base_dau_conv_layer.cpp
    │   ├── base_dau_conv_layer.cu
    │   ├── dau_conv_impl
    │   │   ├── dau_conv_backward.cpp
    │   │   ├── dau_conv_backward_patch_16x16.cu
    │   │   ├── dau_conv_backward_patch_16x32.cu
    │   │   ├── dau_conv_backward_patch_16x64.cu
    │   │   ├── dau_conv_backward_patch_16x8.cu
    │   │   ├── dau_conv_backward_patch_1x1.cu
    │   │   ├── dau_conv_backward_patch_32x16.cu
    │   │   ├── dau_conv_backward_patch_32x32.cu
    │   │   ├── dau_conv_backward_patch_32x64.cu
    │   │   ├── dau_conv_backward_patch_32x8.cu
    │   │   ├── dau_conv_backward_patch_64x16.cu
    │   │   ├── dau_conv_backward_patch_64x32.cu
    │   │   ├── dau_conv_backward_patch_64x64.cu
    │   │   ├── dau_conv_backward_patch_64x8.cu
    │   │   ├── dau_conv_backward_patch_8x16.cu
    │   │   ├── dau_conv_backward_patch_8x32.cu
    │   │   ├── dau_conv_backward_patch_8x64.cu
    │   │   ├── dau_conv_backward_patch_8x8.cu
    │   │   ├── dau_conv_forward.cpp
    │   │   ├── dau_conv_forward_off16_s0_f1.cu
    │   │   ├── dau_conv_forward_off16_s1_f1.cu
    │   │   ├── dau_conv_forward_off32_s1_f1.cu
    │   │   ├── dau_conv_forward_off4_s0_f0.cu
    │   │   ├── dau_conv_forward_off4_s0_f1.cu
    │   │   ├── dau_conv_forward_off4_s1_f0.cu
    │   │   ├── dau_conv_forward_off4_s1_f1.cu
    │   │   ├── dau_conv_forward_off8_s0_f0.cu
    │   │   ├── dau_conv_forward_off8_s0_f1.cu
    │   │   ├── dau_conv_forward_off8_s1_f0.cu
    │   │   └── dau_conv_forward_off8_s1_f1.cu
    │   └── util
    │       ├── common.cpp
    │       ├── convolve.cpp
    │       ├── convolve.cu
    │       ├── im2col.cpp
    │       ├── math_functions.cpp
    │       └── math_functions.cu
    └── main.cpp

--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
[submodule "include/cub"]
	path = include/cub
	url = https://github.com/NVlabs/cub.git
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
# DAU-ConvNet project exposes the following variables:
#  - DAUConvNet_OBJ_TARGET: name of object target (that can be used as dependency)
#  - DAUConvNet_OBJS: pre-compiled .o objects (.cpp and .cu files)
#  - DAUConvNet_CU_OBJS: pre-compiled .cu.o objects (only resulting CUDA objects !!)
#  - DAUConvNet_INCLUDE_DIRS: include dirs of dependencies for DAU-ConvNet
#  - DAUConvNet_LINKER_LIBS: linker libs of dependencies for DAU-ConvNet
#  - DAUConvNet_INCLUDE_DIR: include dir for DAU-ConvNet (i.e. 3rdparty/DAU-ConvNet/include)
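#
# A minimal consumption sketch (assuming this repository is vendored under
# 3rdparty/DAU-ConvNet; the target name "my_net" is hypothetical):
#
#   add_subdirectory(3rdparty/DAU-ConvNet)
#   add_library(my_net SHARED ${MY_SRCS} ${DAUConvNet_OBJS})
#   add_dependencies(my_net ${DAUConvNet_OBJ_TARGET})
#   include_directories(${DAUConvNet_INCLUDE_DIR})
#   target_link_libraries(my_net ${DAUConvNet_LINKER_LIBS})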

cmake_minimum_required(VERSION 2.8.8)
if(POLICY CMP0046)
  cmake_policy(SET CMP0046 NEW)
endif()
if(POLICY CMP0054)
  cmake_policy(SET CMP0054 NEW)
endif()

# ---[ DAU-ConvNet project
project(DAUConvNet C CXX)

set(PACKAGE_VERSION "1.0" CACHE STRING "DAU-ConvNet version number")

# ---[ Using cmake scripts and modules
list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules)

include(cmake/Utils.cmake)
include(cmake/Targets.cmake)
include(cmake/ConfigGen.cmake)

# ---[ Using C++11 or newer

include(CheckCXXCompilerFlag)
CHECK_CXX_COMPILER_FLAG("-std=c++17" COMPILER_SUPPORTS_CXX17)
CHECK_CXX_COMPILER_FLAG("-std=c++14" COMPILER_SUPPORTS_CXX14)
CHECK_CXX_COMPILER_FLAG("-std=c++11" COMPILER_SUPPORTS_CXX11)
CHECK_CXX_COMPILER_FLAG("-std=c++0x" COMPILER_SUPPORTS_CXX0X)

if(COMPILER_SUPPORTS_CXX14)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
elseif(COMPILER_SUPPORTS_CXX11)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif(COMPILER_SUPPORTS_CXX0X)
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x")
else()
  message(FATAL_ERROR "The compiler ${CMAKE_CXX_COMPILER} has no C++11 support. Please use a different C++ compiler.")
endif()

# ---[ Configuration types
set(CMAKE_CONFIGURATION_TYPES "Debug;Release" CACHE STRING "Possible configurations" FORCE)
mark_as_advanced(CMAKE_CONFIGURATION_TYPES)

if(DEFINED CMAKE_BUILD_TYPE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CMAKE_CONFIGURATION_TYPES})
endif()

# --[ If user doesn't specify build type then assume release
if("${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(CMAKE_BUILD_TYPE Release)
endif()

if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
  set(CMAKE_COMPILER_IS_CLANGXX TRUE)
endif()

# ---[ Solution folders
dau_conv_option(USE_PROJECT_FOLDERS "IDE Solution folders" (MSVC_IDE OR CMAKE_GENERATOR MATCHES Xcode) )

if(USE_PROJECT_FOLDERS)
  set_property(GLOBAL PROPERTY USE_FOLDERS ON)
  set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMakeTargets")
endif()

# ---[ RPATH settings
set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Use link paths for shared library rpath")
set(CMAKE_MACOSX_RPATH TRUE)

list(FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES ${CMAKE_INSTALL_PREFIX}/lib __is_system_dir)
if(${__is_system_dir} STREQUAL -1)
  set(CMAKE_INSTALL_RPATH ${CMAKE_INSTALL_PREFIX}/lib)
endif()

# ---[ Set debug postfix
set(DAUConvNet_DEBUG_POSTFIX "-d")

set(DAUConvNet_POSTFIX "")
if(CMAKE_BUILD_TYPE MATCHES "Debug")
  set(DAUConvNet_POSTFIX ${DAUConvNet_DEBUG_POSTFIX})
endif()


# ---[ Options
dau_conv_option(BUILD_TENSORFLOW_PLUGIN "Builds TensorFlow plugin" OFF)
dau_conv_option(BUILD_SHARED_LIBS "Build shared libraries" ON)
dau_conv_option(USE_DUMMY_CUDA_IMPL "For debugging purpose; do not compile CUDA kernels (fast compile time)" OFF)
dau_conv_option(ALLOW_INTERPOLATION_OFF "Build support for disabling interpolation in DAUs" OFF)
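#
# These options are selected at configure time; for example (a hypothetical
# out-of-source invocation):
#
#   mkdir build && cd build
#   cmake -DBUILD_TENSORFLOW_PLUGIN=ON -DBUILD_SHARED_LIBS=ON ..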
-stdlib=libstdc++") 106 | message("-- Warning: forcing libstdc++ (controlled by USE_libstdcpp option in cmake)") 107 | endif() 108 | 109 | if(${USE_DUMMY_CUDA_IMPL}) 110 | list(APPEND DAUConvNet_DEFINITIONS "-DDAU_USE_DUMMY_CUDA_IMPL") 111 | endif() 112 | 113 | if(${ALLOW_INTERPOLATION_OFF}) 114 | list(APPEND DAUConvNet_DEFINITIONS "-DDAU_ALLOW_INTERPOLATION_OFF") 115 | endif() 116 | 117 | # ---[ Warnings 118 | dau_conv_warnings_disable(CMAKE_CXX_FLAGS -Wno-sign-compare -Wno-uninitialized) 119 | 120 | # ---[ Config generation 121 | configure_file(cmake/Templates/dau_conv_config.h.in "${PROJECT_BINARY_DIR}/dau_conv_config.h") 122 | 123 | # ---[ Includes 124 | set(DAUConvNet_INCLUDE_DIR ${PROJECT_SOURCE_DIR}/include) 125 | set(DAUConvNet_SRC_DIR ${PROJECT_SOURCE_DIR}/src) 126 | include_directories(${PROJECT_BINARY_DIR}) 127 | 128 | # ---[ Includes & defines for CUDA 129 | 130 | # cuda_compile() does not have per-call dependencies or include pathes 131 | # (cuda_compile() has per-call flags, but we set them here too for clarity) 132 | # 133 | # list(REMOVE_ITEM ...) invocations remove PRIVATE and PUBLIC keywords from collected definitions and include pathes 134 | if(HAVE_CUDA) 135 | # Add includes to CUB only for CUDA lower than 11.8 which does not provide CUB 136 | if("${CUDA_VERSION}" VERSION_LESS "11.8") 137 | message(STATUS "CUDA_VERSION (${CUDA_VERSION}) is less than 11.8, adding CUB to include pathes") 138 | list(APPEND DAUConvNet_INCLUDE_DIRS ${PROJECT_SOURCE_DIR}/include/cub) 139 | endif() 140 | 141 | # pass include pathes to cuda_include_directories() 142 | set(DAUConvNet_ALL_INCLUDE_DIRS ${DAUConvNet_INCLUDE_DIRS}) 143 | list(REMOVE_ITEM DAUConvNet_ALL_INCLUDE_DIRS PRIVATE PUBLIC) 144 | cuda_include_directories(${DAUConvNet_INCLUDE_DIR} ${DAUConvNet_SRC_DIR} ${DAUConvNet_ALL_INCLUDE_DIRS}) 145 | 146 | # add definitions to nvcc flags directly 147 | set(DAUConvNet_ALL_DEFINITIONS ${DAUConvNet_DEFINITIONS}) 148 | list(REMOVE_ITEM DAUConvNet_ALL_DEFINITIONS PRIVATE PUBLIC) 149 | list(APPEND CUDA_NVCC_FLAGS ${DAUConvNet_ALL_DEFINITIONS}) 150 | else() 151 | message( FATAL_ERROR "MISSING CUDA: DAU-ConvNet implementation requires CUDA") 152 | endif() 153 | 154 | # ---[ Subdirectories 155 | add_subdirectory(src/dau_conv) 156 | add_subdirectory(plugins/tensorflow) 157 | 158 | # ---[ Export configs generation 159 | dau_conv_generate_export_configs() 160 | 161 | get_directory_property(has_parent PARENT_DIRECTORY) 162 | 163 | if (has_parent) 164 | set(DAUConvNet_OBJ_TARGET ${DAUConvNet_OBJ_TARGET} PARENT_SCOPE) 165 | set(DAUConvNet_OBJS ${DAUConvNet_OBJS} PARENT_SCOPE) 166 | set(DAUConvNet_CU_OBJS ${DAUConvNet_CU_OBJS} PARENT_SCOPE) 167 | set(DAUConvNet_INCLUDE_DIRS ${DAUConvNet_INCLUDE_DIRS} PARENT_SCOPE) 168 | set(DAUConvNet_INCLUDE_DIR ${DAUConvNet_INCLUDE_DIR} PARENT_SCOPE) 169 | set(DAUConvNet_LINKER_LIBS ${DAUConvNet_LIBS} PARENT_SCOPE) 170 | endif() 171 | -------------------------------------------------------------------------------- /cmake/ConfigGen.cmake: -------------------------------------------------------------------------------- 1 | 2 | 3 | ################################################################################################ 4 | # Function for generation DAU-ConvNet build- and install- tree export config files 5 | # Usage: 6 | # dau_conv_generate_export_configs() 7 | function(dau_conv_generate_export_configs) 8 | set(install_cmake_suffix "share/DAUConvNet") 9 | 10 | if(NOT HAVE_CUDA) 11 | set(HAVE_CUDA FALSE) 12 | endif() 13 | 14 | # ---[ Configure build-tree 

  configure_file("cmake/Templates/DAUConvNetConfig.cmake.in" "${PROJECT_BINARY_DIR}/DAUConvNetConfig.cmake" @ONLY)

  # Add targets to the build-tree export set
  export(TARGETS dau-conv FILE "${PROJECT_BINARY_DIR}/DAUConvNetTargets.cmake")
  export(PACKAGE DAUConvNet)

  # ---[ Configure install-tree DAUConvNetConfig.cmake file ]---

  configure_file("cmake/Templates/DAUConvNetConfig.cmake.in" "${PROJECT_BINARY_DIR}/cmake/DAUConvNetConfig.cmake" @ONLY)

  # Install the DAUConvNetConfig.cmake and export set to use with install-tree
  install(FILES "${PROJECT_BINARY_DIR}/cmake/DAUConvNetConfig.cmake" DESTINATION ${install_cmake_suffix})

endfunction()


--------------------------------------------------------------------------------
/cmake/Cuda.cmake:
--------------------------------------------------------------------------------
################################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
#   dau_conv_cuda_detect_installed_gpus(out_variable)
function(dau_conv_cuda_detect_installed_gpus out_variable)
  if(NOT CUDA_gpu_detect_output)
    set(__cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)

    file(WRITE ${__cufile} ""
      "#include <cstdio>\n"
      "int main()\n"
      "{\n"
      "  int count = 0;\n"
      "  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
      "  if (count == 0) return -1;\n"
      "  for (int device = 0; device < count; ++device)\n"
      "  {\n"
      "    cudaDeviceProp prop;\n"
      "    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
      "      std::printf(\"%d.%d \", prop.major, prop.minor);\n"
      "  }\n"
      "  return 0;\n"
      "}\n")

    execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "--run" "${__cufile}"
                    WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
                    RESULT_VARIABLE __nvcc_res OUTPUT_VARIABLE __nvcc_out
                    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

    if(__nvcc_res EQUAL 0)
      string(REPLACE "2.1" "2.1(2.0)" __nvcc_out "${__nvcc_out}")
      set(CUDA_gpu_detect_output ${__nvcc_out} CACHE INTERNAL "Returned GPU architectures from cuda_detect_gpus tool" FORCE)
    endif()
  endif()

  if(NOT CUDA_gpu_detect_output)
    message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
    set(${out_variable} ${cuda_known_gpu_archs} PARENT_SCOPE)
  else()
    set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
  endif()
endfunction()


################################################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
#   dau_conv_cuda_select_nvcc_arch_flags(out_variable)
function(dau_conv_cuda_select_nvcc_arch_flags out_variable)
  # List of arch names
  set(__archs_names "Kepler" "Maxwell" "Pascal" "Volta" "Turing" "Ampere" "Lovelace" "Hopper" "All" "Manual")
  set(__archs_name_default "All")
  if(NOT CMAKE_CROSSCOMPILING)
    list(APPEND __archs_names "Auto")
    set(__archs_name_default "Auto")
  endif()

  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in the CMake GUI)
  set(CUDA_ARCH_NAME ${__archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
  set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${__archs_names} )
  mark_as_advanced(CUDA_ARCH_NAME)

  # verify CUDA_ARCH_NAME value
  if(NOT ";${__archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
    string(REPLACE ";" ", " __archs_names "${__archs_names}")
    message(FATAL_ERROR "Only ${__archs_names} architecture names are supported.")
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(CUDA_ARCH_BIN ${cuda_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
    set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
    mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
  else()
    unset(CUDA_ARCH_BIN CACHE)
    unset(CUDA_ARCH_PTX CACHE)
  endif()

  if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
    set(__cuda_arch_bin "30 35")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
    set(__cuda_arch_bin "50")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
    set(__cuda_arch_bin "60 61")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
    set(__cuda_arch_bin "70")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
    set(__cuda_arch_bin "75")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
    if ("${CUDA_VERSION}" VERSION_GREATER "11.5" OR "${CUDA_VERSION}" VERSION_EQUAL "11.5")
      set(__cuda_arch_bin "80 86 87")
    else()
      set(__cuda_arch_bin "80 86")
    endif()
  elseif(${CUDA_ARCH_NAME} STREQUAL "Lovelace")
    set(__cuda_arch_bin "89")
  elseif(${CUDA_ARCH_NAME} STREQUAL "Hopper")
    set(__cuda_arch_bin "90")
  elseif(${CUDA_ARCH_NAME} STREQUAL "All")
    set(__cuda_arch_bin ${cuda_known_gpu_archs})
  elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
    dau_conv_cuda_detect_installed_gpus(__cuda_arch_bin)
  else()  # (${CUDA_ARCH_NAME} STREQUAL "Manual")
    set(__cuda_arch_bin ${CUDA_ARCH_BIN})
  endif()
"" __cuda_arch_ptx "${CUDA_ARCH_PTX}") 108 | string(REGEX MATCHALL "[0-9()]+" __cuda_arch_bin "${__cuda_arch_bin}") 109 | string(REGEX MATCHALL "[0-9]+" __cuda_arch_ptx "${__cuda_arch_ptx}") 110 | dau_conv_list_unique(__cuda_arch_bin __cuda_arch_ptx) 111 | 112 | set(__nvcc_flags "") 113 | set(__nvcc_archs_readable "") 114 | 115 | # Tell NVCC to add binaries for the specified GPUs 116 | foreach(__arch ${__cuda_arch_bin}) 117 | if(__arch MATCHES "([0-9]+)\\(([0-9]+)\\)") 118 | # User explicitly specified PTX for the concrete BIN 119 | list(APPEND __nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1}) 120 | list(APPEND __nvcc_archs_readable sm_${CMAKE_MATCH_1}) 121 | else() 122 | # User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN 123 | list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=sm_${__arch}) 124 | list(APPEND __nvcc_archs_readable sm_${__arch}) 125 | endif() 126 | endforeach() 127 | 128 | # Tell NVCC to add PTX intermediate code for the specified architectures 129 | foreach(__arch ${__cuda_arch_ptx}) 130 | list(APPEND __nvcc_flags -gencode arch=compute_${__arch},code=compute_${__arch}) 131 | list(APPEND __nvcc_archs_readable compute_${__arch}) 132 | endforeach() 133 | 134 | string(REPLACE ";" " " __nvcc_archs_readable "${__nvcc_archs_readable}") 135 | set(${out_variable} ${__nvcc_flags} PARENT_SCOPE) 136 | set(${out_variable}_readable ${__nvcc_archs_readable} PARENT_SCOPE) 137 | endfunction() 138 | 139 | ################################################################################################ 140 | # Short command for cuda compilation 141 | # Usage: 142 | # cuda_compile( ) 143 | macro(dau_conv_cuda_compile objlist_variable) 144 | foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) 145 | set(${var}_backup_in_cuda_compile_ "${${var}}") 146 | 147 | # we remove /EHa as it generates warnings under windows 148 | string(REPLACE "/EHa" "" ${var} "${${var}}") 149 | 150 | endforeach() 151 | 152 | if(UNIX OR APPLE) 153 | # we supprress "declared_but_not_referenced" warning since pops up way too frequently 154 | list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC -Xcudafe --diag_suppress=declared_but_not_referenced) 155 | endif() 156 | 157 | if(APPLE) 158 | list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function) 159 | endif() 160 | 161 | cuda_compile(cuda_objcs ${ARGN}) 162 | 163 | foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG) 164 | set(${var} "${${var}_backup_in_cuda_compile_}") 165 | unset(${var}_backup_in_cuda_compile_) 166 | endforeach() 167 | 168 | set(${objlist_variable} ${cuda_objcs}) 169 | endmacro() 170 | 171 | ################################################################################################ 172 | ### Non macro section 173 | ################################################################################################ 174 | 175 | find_package(CUDA 5.5 QUIET) 176 | find_cuda_helper_libs(curand) # cmake 2.8.7 compartibility which doesn't search for curand 177 | 178 | # Known NVIDIA GPU achitectures DAUConvNet can be compiled for. 

################################################################################################
# Short command for cuda compilation
# Usage:
#   dau_conv_cuda_compile(<objlist_variable> <cuda_files>)
macro(dau_conv_cuda_compile objlist_variable)
  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var}_backup_in_cuda_compile_ "${${var}}")

    # we remove /EHa as it generates warnings under windows
    string(REPLACE "/EHa" "" ${var} "${${var}}")

  endforeach()

  if(UNIX OR APPLE)
    # we suppress the "declared_but_not_referenced" warning since it pops up way too frequently
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -fPIC -Xcudafe --diag_suppress=declared_but_not_referenced)
  endif()

  if(APPLE)
    list(APPEND CUDA_NVCC_FLAGS -Xcompiler -Wno-unused-function)
  endif()

  cuda_compile(cuda_objcs ${ARGN})

  foreach(var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_RELEASE CMAKE_CXX_FLAGS_DEBUG)
    set(${var} "${${var}_backup_in_cuda_compile_}")
    unset(${var}_backup_in_cuda_compile_)
  endforeach()

  set(${objlist_variable} ${cuda_objcs})
endmacro()

################################################################################################
### Non macro section
################################################################################################

find_package(CUDA 5.5 QUIET)
find_cuda_helper_libs(curand)  # cmake 2.8.7 compatibility, which doesn't search for curand

# Known NVIDIA GPU architectures DAUConvNet can be compiled for.
# This list will be used for the CUDA_ARCH_NAME = All option
if ("${CUDA_VERSION}" VERSION_GREATER "12.0" OR "${CUDA_VERSION}" VERSION_EQUAL "12.0")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86 87 89 90")
elseif ("${CUDA_VERSION}" VERSION_GREATER "11.8" OR "${CUDA_VERSION}" VERSION_EQUAL "11.8")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86 87 89 90")
elseif ("${CUDA_VERSION}" VERSION_GREATER "11.5" OR "${CUDA_VERSION}" VERSION_EQUAL "11.5")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86 87")
elseif ("${CUDA_VERSION}" VERSION_GREATER "11.0" OR "${CUDA_VERSION}" VERSION_EQUAL "11.0")
  set(cuda_known_gpu_archs "50 60 61 70 75 80 86")
elseif ("${CUDA_VERSION}" VERSION_GREATER "10.0" OR "${CUDA_VERSION}" VERSION_EQUAL "10.0")
  set(cuda_known_gpu_archs "50 60 61 70 75")
elseif ("${CUDA_VERSION}" VERSION_GREATER "9.0" OR "${CUDA_VERSION}" VERSION_EQUAL "9.0")
  set(cuda_known_gpu_archs "50 60 61 70")
elseif ("${CUDA_VERSION}" VERSION_GREATER "8.0" OR "${CUDA_VERSION}" VERSION_EQUAL "8.0")
  set(cuda_known_gpu_archs "50 60 61")
else()
  set(cuda_known_gpu_archs "50")
endif()


if(NOT CUDA_FOUND)
  return()
endif()

set(HAVE_CUDA TRUE)
message(STATUS "CUDA detected: " ${CUDA_VERSION})
list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${CUDA_INCLUDE_DIRS})
list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${CUDA_CUDART_LIBRARY}
                                          ${CUDA_curand_LIBRARY} ${CUDA_CUBLAS_LIBRARIES})

# setting nvcc arch flags
dau_conv_cuda_select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")

# Boost 1.55 workaround, see https://svn.boost.org/trac/boost/ticket/9392 or
# https://github.com/ComputationalRadiationPhysics/picongpu/blob/master/src/picongpu/CMakeLists.txt
if(Boost_VERSION EQUAL 105500)
  message(STATUS "Cuda + Boost 1.55: Applying noinline work around")
  # avoid warning for CMake >= 2.8.12
  set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} \"-DBOOST_NOINLINE=__attribute__((noinline))\" ")
endif()

# disable some nvcc diagnostics that appear in boost, glog, gflags, opencv, etc.
foreach(diag cc_clobber_ignored integer_sign_change useless_using_declaration set_but_not_used)
  list(APPEND CUDA_NVCC_FLAGS -Xcudafe --diag_suppress=${diag})
endforeach()

# setting default testing device
if(NOT CUDA_TEST_DEVICE)
  set(CUDA_TEST_DEVICE -1)
endif()

mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)

# Handle clang/libc++ issue
if(APPLE)
  dau_conv_detect_darwin_version(OSX_VERSION)

  # OSX 10.9 and higher uses clang/libc++ by default which is incompatible with old CUDA toolkits
  if(OSX_VERSION VERSION_GREATER 10.8)
    # enabled by default if and only if CUDA version is less than 7.0
    dau_conv_option(USE_libstdcpp "Use libstdc++ instead of libc++" (CUDA_VERSION VERSION_LESS 7.0))
  endif()
endif()
--------------------------------------------------------------------------------
/cmake/Dependencies.cmake:
--------------------------------------------------------------------------------
# These lists are later turned into target properties on the main dau_conv_impl library target
set(DAUConvNet_LINKER_LIBS "")
set(DAUConvNet_INCLUDE_DIRS "")
set(DAUConvNet_DEFINITIONS "")
set(DAUConvNet_COMPILE_OPTIONS "")

# we get a strange error when DAUConvNet_DEFINITIONS is empty, so we just fill it with a gibberish definition to make the compiler happy :)
list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DDUMMYXYMMUD)

# ---[ CUDA
include(cmake/Cuda.cmake)
if(NOT HAVE_CUDA)
  if(CPU_ONLY)
    message(STATUS "-- CUDA is disabled. Building without it...")
  else()
    message(WARNING "-- CUDA is not detected by cmake. Building without it...")
  endif()

  list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DCPU_ONLY)
endif()

# ---[ BLAS
if(NOT APPLE)
  set(BLAS "Atlas" CACHE STRING "Selected BLAS library")
  set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL")

  if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas")
    find_package(Atlas REQUIRED)
    list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${Atlas_INCLUDE_DIR})
    list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${Atlas_LIBRARIES})
  elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open")
    find_package(OpenBLAS REQUIRED)
    list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${OpenBLAS_INCLUDE_DIR})
    list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${OpenBLAS_LIB})
  elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl")
    find_package(MKL REQUIRED)
    list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${MKL_INCLUDE_DIR})
    list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${MKL_LIBRARIES})
    list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_MKL)
  endif()
elseif(APPLE)
  find_package(vecLib REQUIRED)
  list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${vecLib_INCLUDE_DIR})
  list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${vecLib_LINKER_LIBS})

  if(VECLIB_FOUND)
    if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*")
      list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_ACCELERATE)
    endif()
  endif()
endif()
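
# The BLAS backend is selected at configure time via the BLAS cache variable,
# e.g. (hypothetical invocations):
#   cmake -DBLAS=Atlas ..   # default on non-Apple platforms
#   cmake -DBLAS=Open ..    # OpenBLAS
#   cmake -DBLAS=MKL ..     # Intel MKL (also defines USE_MKL)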
Building without it...") 17 | endif() 18 | 19 | list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DCPU_ONLY) 20 | endif() 21 | 22 | # ---[ BLAS 23 | if(NOT APPLE) 24 | set(BLAS "Atlas" CACHE STRING "Selected BLAS library") 25 | set_property(CACHE BLAS PROPERTY STRINGS "Atlas;Open;MKL") 26 | 27 | if(BLAS STREQUAL "Atlas" OR BLAS STREQUAL "atlas") 28 | find_package(Atlas REQUIRED) 29 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${Atlas_INCLUDE_DIR}) 30 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${Atlas_LIBRARIES}) 31 | elseif(BLAS STREQUAL "Open" OR BLAS STREQUAL "open") 32 | find_package(OpenBLAS REQUIRED) 33 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${OpenBLAS_INCLUDE_DIR}) 34 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${OpenBLAS_LIB}) 35 | elseif(BLAS STREQUAL "MKL" OR BLAS STREQUAL "mkl") 36 | find_package(MKL REQUIRED) 37 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${MKL_INCLUDE_DIR}) 38 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${MKL_LIBRARIES}) 39 | list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_MKL) 40 | endif() 41 | elseif(APPLE) 42 | find_package(vecLib REQUIRED) 43 | list(APPEND DAUConvNet_INCLUDE_DIRS PUBLIC ${vecLib_INCLUDE_DIR}) 44 | list(APPEND DAUConvNet_LINKER_LIBS PUBLIC ${vecLib_LINKER_LIBS}) 45 | 46 | if(VECLIB_FOUND) 47 | if(NOT vecLib_INCLUDE_DIR MATCHES "^/System/Library/Frameworks/vecLib.framework.*") 48 | list(APPEND DAUConvNet_DEFINITIONS PUBLIC -DUSE_ACCELERATE) 49 | endif() 50 | endif() 51 | endif() 52 | -------------------------------------------------------------------------------- /cmake/Modules/FindAtlas.cmake: -------------------------------------------------------------------------------- 1 | # Find the Atlas (and Lapack) libraries 2 | # 3 | # The following variables are optionally searched for defaults 4 | # Atlas_ROOT_DIR: Base directory where all Atlas components are found 5 | # 6 | # The following are set after configuration is done: 7 | # Atlas_FOUND 8 | # Atlas_INCLUDE_DIRS 9 | # Atlas_LIBRARIES 10 | # Atlas_LIBRARYRARY_DIRS 11 | 12 | set(Atlas_INCLUDE_SEARCH_PATHS 13 | /usr/include/atlas 14 | /usr/include/atlas-base 15 | $ENV{Atlas_ROOT_DIR} 16 | $ENV{Atlas_ROOT_DIR}/include 17 | ) 18 | 19 | set(Atlas_LIB_SEARCH_PATHS 20 | /usr/lib/atlas 21 | /usr/lib/atlas-base 22 | $ENV{Atlas_ROOT_DIR} 23 | $ENV{Atlas_ROOT_DIR}/lib 24 | ) 25 | 26 | find_path(Atlas_CBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS}) 27 | find_path(Atlas_CLAPACK_INCLUDE_DIR NAMES clapack.h PATHS ${Atlas_INCLUDE_SEARCH_PATHS}) 28 | 29 | find_library(Atlas_CBLAS_LIBRARY NAMES ptcblas_r ptcblas cblas_r cblas PATHS ${Atlas_LIB_SEARCH_PATHS}) 30 | find_library(Atlas_BLAS_LIBRARY NAMES atlas_r atlas PATHS ${Atlas_LIB_SEARCH_PATHS}) 31 | find_library(Atlas_LAPACK_LIBRARY NAMES lapack alapack_r alapack lapack_atlas atllapack PATHS ${Atlas_LIB_SEARCH_PATHS}) 32 | 33 | set(LOOKED_FOR 34 | Atlas_CBLAS_INCLUDE_DIR 35 | Atlas_CLAPACK_INCLUDE_DIR 36 | 37 | Atlas_CBLAS_LIBRARY 38 | Atlas_BLAS_LIBRARY 39 | Atlas_LAPACK_LIBRARY 40 | ) 41 | 42 | include(FindPackageHandleStandardArgs) 43 | find_package_handle_standard_args(Atlas DEFAULT_MSG ${LOOKED_FOR}) 44 | 45 | if(ATLAS_FOUND) 46 | set(Atlas_INCLUDE_DIR ${Atlas_CBLAS_INCLUDE_DIR} ${Atlas_CLAPACK_INCLUDE_DIR}) 47 | set(Atlas_LIBRARIES ${Atlas_LAPACK_LIBRARY} ${Atlas_CBLAS_LIBRARY} ${Atlas_BLAS_LIBRARY}) 48 | mark_as_advanced(${LOOKED_FOR}) 49 | 50 | message(STATUS "Found Atlas (include: ${Atlas_CBLAS_INCLUDE_DIR} library: ${Atlas_BLAS_LIBRARY} lapack: ${Atlas_LAPACK_LIBRARY}") 51 | endif(ATLAS_FOUND) 52 | 53 | 
--------------------------------------------------------------------------------
/cmake/Modules/FindOpenBLAS.cmake:
--------------------------------------------------------------------------------


SET(Open_BLAS_INCLUDE_SEARCH_PATHS
  /usr/include
  /usr/include/openblas
  /usr/include/openblas-base
  /usr/local/include
  /usr/local/include/openblas
  /usr/local/include/openblas-base
  /opt/OpenBLAS/include
  $ENV{OpenBLAS_HOME}
  $ENV{OpenBLAS_HOME}/include
)

SET(Open_BLAS_LIB_SEARCH_PATHS
  /lib/
  /lib/openblas-base
  /lib64/
  /usr/lib
  /usr/lib/openblas-base
  /usr/lib64
  /usr/local/lib
  /usr/local/lib64
  /opt/OpenBLAS/lib
  $ENV{OpenBLAS}
  $ENV{OpenBLAS}/lib
  $ENV{OpenBLAS_HOME}
  $ENV{OpenBLAS_HOME}/lib
)

FIND_PATH(OpenBLAS_INCLUDE_DIR NAMES cblas.h PATHS ${Open_BLAS_INCLUDE_SEARCH_PATHS})
FIND_LIBRARY(OpenBLAS_LIB NAMES openblas PATHS ${Open_BLAS_LIB_SEARCH_PATHS})

SET(OpenBLAS_FOUND ON)

# Check include files
IF(NOT OpenBLAS_INCLUDE_DIR)
  SET(OpenBLAS_FOUND OFF)
  MESSAGE(STATUS "Could not find OpenBLAS include. Turning OpenBLAS_FOUND off")
ENDIF()

# Check libraries
IF(NOT OpenBLAS_LIB)
  SET(OpenBLAS_FOUND OFF)
  MESSAGE(STATUS "Could not find OpenBLAS lib. Turning OpenBLAS_FOUND off")
ENDIF()

IF (OpenBLAS_FOUND)
  IF (NOT OpenBLAS_FIND_QUIETLY)
    MESSAGE(STATUS "Found OpenBLAS libraries: ${OpenBLAS_LIB}")
    MESSAGE(STATUS "Found OpenBLAS include: ${OpenBLAS_INCLUDE_DIR}")
  ENDIF (NOT OpenBLAS_FIND_QUIETLY)
ELSE (OpenBLAS_FOUND)
  IF (OpenBLAS_FIND_REQUIRED)
    MESSAGE(FATAL_ERROR "Could not find OpenBLAS")
  ENDIF (OpenBLAS_FIND_REQUIRED)
ENDIF (OpenBLAS_FOUND)

MARK_AS_ADVANCED(
  OpenBLAS_INCLUDE_DIR
  OpenBLAS_LIB
  OpenBLAS
)

--------------------------------------------------------------------------------
/cmake/Targets.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Defines the global DAUConvNet_LINK flag. This flag is required to prevent the linker from
# excluding some objects which are not addressed directly but are registered via static constructors
macro(dau_conv_set_link)
  if(BUILD_SHARED_LIBS)
    set(DAUConvNet_LINK dau-conv)
  else()
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
      set(DAUConvNet_LINK -Wl,-force_load dau-conv)
    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
      set(DAUConvNet_LINK -Wl,--whole-archive dau-conv -Wl,--no-whole-archive)
    endif()
  endif()
endmacro()
################################################################################################
# Convenient command to setup source group for IDEs that support this feature (VS, XCode)
# Usage:
#   dau_conv_source_group(<group> GLOB[_RECURSE] <globbing expression>)
function(dau_conv_source_group group)
  cmake_parse_arguments(DAU_CONV_SOURCE_GROUP "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(DAU_CONV_SOURCE_GROUP_GLOB)
    file(GLOB srcs1 ${DAU_CONV_SOURCE_GROUP_GLOB})
    source_group(${group} FILES ${srcs1})
  endif()

  if(DAU_CONV_SOURCE_GROUP_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${DAU_CONV_SOURCE_GROUP_GLOB_RECURSE})
    source_group(${group} FILES ${srcs2})
  endif()
endfunction()

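# e.g. (a sketch): dau_conv_source_group("Source" GLOB "${PROJECT_SOURCE_DIR}/src/dau_conv/*.cpp")
# places the matched files under a "Source" folder in IDE project views.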
################################################################################################
# Collecting sources from globbing and appending to output list variable
# Usage:
#   dau_conv_collect_sources(<variable> GLOB[_RECURSE] <globbing expression>)
function(dau_conv_collect_sources variable)
  cmake_parse_arguments(DAU_CONV_COLLECT_SOURCES "" "" "GLOB;GLOB_RECURSE" ${ARGN})
  if(DAU_CONV_COLLECT_SOURCES_GLOB)
    file(GLOB srcs1 ${DAU_CONV_COLLECT_SOURCES_GLOB})
    list(APPEND ${variable} ${srcs1})
  endif()

  if(DAU_CONV_COLLECT_SOURCES_GLOB_RECURSE)
    file(GLOB_RECURSE srcs2 ${DAU_CONV_COLLECT_SOURCES_GLOB_RECURSE})
    list(APPEND ${variable} ${srcs2})
  endif()
  set(${variable} ${${variable}} PARENT_SCOPE)
endfunction()

################################################################################################
# Short command getting dau_conv_impl sources (assuming standard DAUConvNet code tree)
# Usage:
#   dau_conv_pickup_sources(<root>)
function(dau_conv_pickup_sources root)
  # put all files in source groups (visible as subfolders in many IDEs)
  dau_conv_source_group("Include" GLOB "${root}/include/dau_conv/*.h*")
  dau_conv_source_group("Include\\Util" GLOB "${root}/include/dau_conv/util/*.h*")
  dau_conv_source_group("Include" GLOB "${PROJECT_BINARY_DIR}/dau_conv_config.h*")
  dau_conv_source_group("Source" GLOB "${root}/src/dau_conv/*.cpp")
  dau_conv_source_group("Source\\Util" GLOB "${root}/src/dau_conv/util/*.cpp")
  dau_conv_source_group("Source\\Layers" GLOB "${root}/src/dau_conv/layers/*.cpp")
  dau_conv_source_group("Source\\Cuda" GLOB "${root}/src/dau_conv/layers/*.cu")
  dau_conv_source_group("Source\\Cuda" GLOB "${root}/src/dau_conv/util/*.cu")

  # collect files
  file(GLOB_RECURSE hdrs ${root}/include/dau_conv/*.h*)
  file(GLOB_RECURSE srcs ${root}/src/dau_conv/*.cpp)

  # adding headers to make them visible in some IDEs (Qt, VS, Xcode)
  list(APPEND srcs ${hdrs} ${PROJECT_BINARY_DIR}/dau_conv_config.h)

  # collect cuda files
  file(GLOB_RECURSE cuda ${root}/src/dau_conv/*.cu)

  # convert to absolute paths
  dau_conv_convert_absolute_paths(srcs)
  dau_conv_convert_absolute_paths(cuda)

  # propagate to parent scope
  set(srcs ${srcs} PARENT_SCOPE)
  set(cuda ${cuda} PARENT_SCOPE)
endfunction()

################################################################################################
# Short command for setting default target properties
# Usage:
#   dau_conv_default_properties(<target>)
function(dau_conv_default_properties target)
  set_target_properties(${target} PROPERTIES
    DEBUG_POSTFIX ${DAUConvNet_DEBUG_POSTFIX}
    ARCHIVE_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
    LIBRARY_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/lib"
    RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/bin")
  # make sure we build all external dependencies first
  if (DEFINED external_project_dependencies)
    add_dependencies(${target} ${external_project_dependencies})
  endif()
endfunction()

################################################################################################
# Short command for setting runtime directory for build target
# Usage:
#   dau_conv_set_runtime_directory(<target> <dir>)
function(dau_conv_set_runtime_directory target dir)
  set_target_properties(${target} PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY "${dir}")
endfunction()

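# A sketch of the intended call pattern (the exact call site may differ):
#   dau_conv_pickup_sources(${PROJECT_SOURCE_DIR})
#   # afterwards ${srcs} and ${cuda} hold absolute paths to the .cpp and .cu sources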
################################################################################################
# Short command for setting solution folder property for target
# Usage:
#   dau_conv_set_solution_folder(<target> <folder>)
function(dau_conv_set_solution_folder target folder)
  if(USE_PROJECT_FOLDERS)
    set_target_properties(${target} PROPERTIES FOLDER "${folder}")
  endif()
endfunction()

################################################################################################
# Reads lines from input file, prepends source directory to each line and writes to output file
# Usage:
#   dau_conv_configure_testdatafile(<filename>)
function(dau_conv_configure_testdatafile file)
  file(STRINGS ${file} __lines)
  set(result "")
  foreach(line ${__lines})
    set(result "${result}${PROJECT_SOURCE_DIR}/${line}\n")
  endforeach()
  file(WRITE ${file}.gen.cmake ${result})
endfunction()

################################################################################################
# Filter out all files that are not included in selected list
# Usage:
#   dau_conv_leave_only_selected_tests(<file_list_variable> <comma-separated list of tests>)
function(dau_conv_leave_only_selected_tests file_list)
  if(NOT ARGN)
    return() # blank list means leave all
  endif()
  string(REPLACE "," ";" __selected ${ARGN})
  list(APPEND __selected dau_conv_main)

  set(result "")
  foreach(f ${${file_list}})
    get_filename_component(name ${f} NAME_WE)
    string(REGEX REPLACE "^test_" "" name ${name})
    list(FIND __selected ${name} __index)
    if(NOT __index EQUAL -1)
      list(APPEND result ${f})
    endif()
  endforeach()
  set(${file_list} ${result} PARENT_SCOPE)
endfunction()

--------------------------------------------------------------------------------
/cmake/Templates/DAUConvNetConfig.cmake.in:
--------------------------------------------------------------------------------
# Config file for the DAU-ConvNet package.
#
# Note:
#   DAU-ConvNet and this config file depend on OpenCV,
#   so put `find_package(OpenCV)` before searching for DAU-ConvNet
#   via `find_package(DAUConvNet)`. All other lib/include
#   dependencies are hard coded in this file
#
# After successful configuration the following variables
# will be defined:
#
#   DAUConvNet_LIBRARIES  - IMPORTED targets to link against
#                           (There is no DAUConvNet_INCLUDE_DIRS and DAUConvNet_DEFINITIONS
#                           because they are specified in the IMPORTED target interface.)
#
#   DAUConvNet_HAVE_CUDA  - signals about CUDA support
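#
# Example downstream usage (a sketch; the target name "my_app" is hypothetical):
#
#   find_package(OpenCV)                # if DAU-ConvNet was built against OpenCV
#   find_package(DAUConvNet REQUIRED)
#   target_link_libraries(my_app ${DAUConvNet_LIBRARIES})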


# OpenCV dependency (optional)

if(@USE_OPENCV@)
  if(NOT OpenCV_FOUND)
    set(DAUConvNet_OpenCV_CONFIG_PATH "@OpenCV_CONFIG_PATH@")
    if(DAUConvNet_OpenCV_CONFIG_PATH)
      get_filename_component(DAUConvNet_OpenCV_CONFIG_PATH ${DAUConvNet_OpenCV_CONFIG_PATH} ABSOLUTE)

      if(EXISTS ${DAUConvNet_OpenCV_CONFIG_PATH} AND NOT TARGET opencv_core)
        message(STATUS "DAUConvNet: using OpenCV config from ${DAUConvNet_OpenCV_CONFIG_PATH}")
        include(${DAUConvNet_OpenCV_CONFIG_PATH}/OpenCVConfig.cmake)
      endif()

    else()
      find_package(OpenCV REQUIRED)
    endif()
    unset(DAUConvNet_OpenCV_CONFIG_PATH)
  endif()
endif()

# Compute paths
get_filename_component(DAUConvNet_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)

# Our library dependencies
if(NOT TARGET DAUConvNet AND NOT DAUConvNet_BINARY_DIR)
  include("${DAUConvNet_CMAKE_DIR}/DAUConvNetTargets.cmake")
endif()

# List of IMPORTED libs created by DAUConvNetTargets.cmake
# These targets already specify all needed definitions and include paths
set(DAUConvNet_LIBRARIES dau-conv)

# Cuda support variables
set(DAUConvNet_CPU_ONLY @CPU_ONLY@)
set(DAUConvNet_HAVE_CUDA @HAVE_CUDA@)
--------------------------------------------------------------------------------
/cmake/Templates/dau_conv_config.h.in:
--------------------------------------------------------------------------------
/* Sources directory */
#define SOURCE_FOLDER "${PROJECT_SOURCE_DIR}"

/* Binaries directory */
#define BINARY_FOLDER "${PROJECT_BINARY_DIR}"

/* Test device */
#define CUDA_TEST_DEVICE ${CUDA_TEST_DEVICE}
--------------------------------------------------------------------------------
/cmake/Utils.cmake:
--------------------------------------------------------------------------------
################################################################################################
# Command alias for debugging messages
# Usage:
#   dmsg(<message>)
function(dmsg)
  message(STATUS ${ARGN})
endfunction()

################################################################################################
# Removes duplicates from list(s)
# Usage:
#   dau_conv_list_unique(<list_variable> [<list_variable>] [...])
macro(dau_conv_list_unique)
  foreach(__lst ${ARGN})
    if(${__lst})
      list(REMOVE_DUPLICATES ${__lst})
    endif()
  endforeach()
endmacro()

################################################################################################
# Clears variables from list
# Usage:
#   dau_conv_clear_vars(<variables_list>)
macro(dau_conv_clear_vars)
  foreach(_var ${ARGN})
    unset(${_var})
  endforeach()
endmacro()


################################################################################################
# Converts all paths in list to absolute
# Usage:
#   dau_conv_convert_absolute_paths(<list_variable>)
function(dau_conv_convert_absolute_paths variable)
  set(__list "")
  foreach(__s ${${variable}})
    get_filename_component(__abspath ${__s} ABSOLUTE)
    list(APPEND __list ${__abspath})
  endforeach()
  set(${variable} ${__list} PARENT_SCOPE)
endfunction()

########################################################################################################
# An option that the user can select. Can accept a condition to control when the option is available to the user.
# Usage:
#   dau_conv_option(<option_variable> "doc string" <initial value or boolean expression> [IF <condition>])
function(dau_conv_option variable description value)
  set(__value ${value})
  set(__condition "")
  set(__varname "__value")
  foreach(arg ${ARGN})
    if(arg STREQUAL "IF" OR arg STREQUAL "if")
      set(__varname "__condition")
    else()
      list(APPEND ${__varname} ${arg})
    endif()
  endforeach()
  unset(__varname)
  if("${__condition}" STREQUAL "")
    set(__condition 2 GREATER 1)
  endif()

  if(${__condition})
    if("${__value}" MATCHES ";")
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    elseif(DEFINED ${__value})
      if(${__value})
        option(${variable} "${description}" ON)
      else()
        option(${variable} "${description}" OFF)
      endif()
    else()
      option(${variable} "${description}" ${__value})
    endif()
  else()
    unset(${variable} CACHE)
  endif()
endfunction()
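
# e.g. (a sketch with hypothetical names):
#   dau_conv_option(USE_FOO "Enable foo support" ON IF HAVE_BAR)
# defines the USE_FOO option only while the HAVE_BAR condition holds; otherwise
# the cached value is dropped.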
";" version_list ${version_string}) 153 | 154 | list(GET version_list 0 major) 155 | list(GET version_list 1 minor) 156 | list(GET version_list 2 patch) 157 | 158 | # Convert the version components to integers 159 | math(EXPR major_int "${major}*1000") 160 | math(EXPR minor_int "${minor}*10") 161 | set(patch_int ${patch}) 162 | 163 | # Combine the version components into a single integer 164 | math(EXPR ret "${major_int}+${minor_int}+${patch_int}" ) 165 | set(${version_int} ${ret} PARENT_SCOPE) 166 | endfunction() -------------------------------------------------------------------------------- /include/dau_conv/dau_conv_impl/dau_conv_backward.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DAU_CONV_UTIL_DAU_BACKWARD_H_ 2 | #define DAU_CONV_UTIL_DAU_BACKWARD_H_ 3 | 4 | 5 | #include 6 | 7 | #include "dau_conv/util/common.hpp" 8 | 9 | namespace DAUConvNet { 10 | #ifndef CPU_ONLY // GPU 11 | 12 | #define MAX(x,y) (x > y ? x : y) 13 | 14 | template 15 | class DAUConvBackward { 16 | // TODO: 17 | // - make interpolation weights in 16 bit float (they are computed with 32 bit error so cannot use 16 bit float arithmetics) 18 | // - make input data in 16 bit float but retain error in 32 bit float and perform computation in 16 bit (this will reduce memory bandwidth required) 19 | // --> tried but not worked: 20 | // float 16 bit does half transfer time, but adds additionl conversions from fp16 to fp32 which brings total time back to the same !! 21 | // --> would be possible with new Nvidia VOLTA arch which should have fp16 dot product with aggregation to fp32 !!! 22 | // 23 | // - make data and computation with 16 bit float (only viable version but effect on performance is yet unknown) 24 | public: 25 | // fixed params during construction 26 | const int img_width_in, img_height_in; 27 | const int img_width, img_height; 28 | const int I, S, F, G, IN_K; 29 | int OUT_K; // this is const but is calculated in constructor 30 | 31 | private: 32 | // this parameters are used as template params for DAUConvBackwardCUDA 33 | int patch_size_w, patch_size_h, num_images; 34 | bool use_smaller_warp_and_group_k, use_interpolation, single_subfeature; 35 | bool last_k_optional; 36 | 37 | 38 | public: 39 | DAUConvBackward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const int K, const bool last_k_optional, const bool use_interpolation); 40 | 41 | void get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, 42 | size_t* prepared_filtered_images_size, size_t* prepared_error_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size); 43 | 44 | void backward_pass(const Dtype* filtered_images, const Dtype* error_images, 45 | const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y, 46 | const Dtype* filter_weights, 47 | const int kernel_w, const int kernel_h, const Dtype actual_max_offset, 48 | const bool offsets_already_centered, 49 | Dtype* output, 50 | Dtype* prepared_filtered_images, 51 | Dtype* prepared_error_images, 52 | Dtype* prepared_filter_weights, 53 | int* prepared_filter_offsets, 54 | const bool ignore_edge_gradients, 55 | cudaStream_t streamId = 0); 56 | 57 | class CUDAParams { 58 | public: 59 | // fixed params during construction 60 | const int img_width_in, img_height_in; 61 | const int img_width, img_height; 62 | const int I, S, F, G, K, IN_K; 63 | 64 | // 
--------------------------------------------------------------------------------
/include/dau_conv/dau_conv_impl/dau_conv_backward.hpp:
--------------------------------------------------------------------------------
#ifndef DAU_CONV_UTIL_DAU_BACKWARD_H_
#define DAU_CONV_UTIL_DAU_BACKWARD_H_


#include <cuda_runtime.h>

#include "dau_conv/util/common.hpp"

namespace DAUConvNet {
#ifndef CPU_ONLY  // GPU

#define MAX(x,y) (x > y ? x : y)

template <typename Dtype>
class DAUConvBackward {
    // TODO:
    //  - make interpolation weights in 16 bit float (they are computed with 32 bit error so cannot use 16 bit float arithmetic)
    //  - make input data in 16 bit float but retain error in 32 bit float and perform computation in 16 bit (this will reduce the memory bandwidth required)
    //    --> tried but did not work:
    //        16 bit float halves the transfer time, but adds additional conversions from fp16 to fp32 which bring the total time back to the same !!
    //        --> would be possible with the new NVIDIA VOLTA arch, which should have an fp16 dot product with aggregation to fp32 !!!
    //
    //  - make data and computation with 16 bit float (the only viable version, but the effect on performance is yet unknown)
public:
    // fixed params during construction
    const int img_width_in, img_height_in;
    const int img_width, img_height;
    const int I, S, F, G, IN_K;
    int OUT_K; // this is const but is calculated in constructor

private:
    // these parameters are used as template params for DAUConvBackwardCUDA
    int patch_size_w, patch_size_h, num_images;
    bool use_smaller_warp_and_group_k, use_interpolation, single_subfeature;
    bool last_k_optional;


public:
    DAUConvBackward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const int K, const bool last_k_optional, const bool use_interpolation);

    void get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered,
                              size_t* prepared_filtered_images_size, size_t* prepared_error_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size);

    void backward_pass(const Dtype* filtered_images, const Dtype* error_images,
                       const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y,
                       const Dtype* filter_weights,
                       const int kernel_w, const int kernel_h, const Dtype actual_max_offset,
                       const bool offsets_already_centered,
                       Dtype* output,
                       Dtype* prepared_filtered_images,
                       Dtype* prepared_error_images,
                       Dtype* prepared_filter_weights,
                       int* prepared_filter_offsets,
                       const bool ignore_edge_gradients,
                       cudaStream_t streamId = 0);

    class CUDAParams {
    public:
        // fixed params during construction
        const int img_width_in, img_height_in;
        const int img_width, img_height;
        const int I, S, F, G, K, IN_K;

        // parameters to set up before the call

        // params for get_allocation_sizes call
        size_t* alloc_img, *alloc_err, *alloc_w, *alloc_off;

        // params for run_kernel call
        Dtype const* filtered_images, *error_images, *filter_offsets_float_x, *filter_offsets_float_y, *filter_weights;
        Dtype* output, *prepared_filtered_images, *prepared_error_images, *prepared_filter_weights;
        int* prepared_filter_offsets;
        int kernel_w, kernel_h;
        bool ignore_edge_gradients;
        bool offsets_already_centered;
        cudaStream_t streamId;

        float actual_max_offset;

        CUDAParams(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const int K, const int IN_K, const bool offsets_already_centered) :
                img_width_in(img_width_in), img_height_in(img_height_in), img_width(img_width), img_height(img_height), I(I), S(S), F(F), G(G), K(K), IN_K(IN_K), offsets_already_centered(offsets_already_centered) {

        }
        void set_params_for_allocation_call(size_t* alloc_img, size_t* alloc_err, size_t* alloc_w, size_t* alloc_off);

        void set_params_for_kernel_call(const Dtype* filtered_images, const Dtype* error_images,
                                        const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y,
                                        const Dtype* filter_weights, const int kernel_w, const int kernel_h, const Dtype actual_max_offset,
                                        Dtype* output,
                                        Dtype* prepared_filtered_images,
                                        Dtype* prepared_error_images,
                                        Dtype* prepared_filter_weights,
                                        int* prepared_filter_offsets,
                                        const bool ignore_edge_gradients,
                                        cudaStream_t streamId);
    };
private:

    void call_cuda_kernel(CUDAParams& params);

    static int select_optimal_block_size_bw(int img_size, int min_power, int max_power);

};



// we make explicit functions for different combinations of patch sizes;
// each function is implemented in a separate .cu file to allow for parallel compilation
// (there are 288 combinations altogether, so this way we can reduce compile time by a factor of 8 if there are enough CPU cores)
void DAUConv_backward_multi_subfeatures_patch_1x1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_8x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_16x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_32x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
void DAUConv_backward_multi_subfeatures_patch_64x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, DAUConvBackward<float>::CUDAParams &PARAMS);
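
// A usage sketch of the class above (hypothetical sizes and pointers; the device
// buffers must be allocated with the sizes reported by get_allocation_sizes):
//
//   DAUConvBackward<float> bwd(w_in, h_in, w, h, I, S, F, G, K,
//                              /*last_k_optional=*/false, /*use_interpolation=*/true);
//   size_t sz_img, sz_err, sz_w, sz_off;
//   bwd.get_allocation_sizes(kernel_w, kernel_h, /*offsets_already_centered=*/true,
//                            &sz_img, &sz_err, &sz_w, &sz_off);
//   // ... cudaMalloc buffers of those sizes, then:
//   bwd.backward_pass(x, dy, mu_x, mu_y, weights, kernel_w, kernel_h, max_offset,
//                     /*offsets_already_centered=*/true, grad_out,
//                     buf_img, buf_err, buf_w, buf_off,
//                     /*ignore_edge_gradients=*/false);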

#endif // !CPU_ONLY

} // namespace DAUConvNet

#endif // DAU_CONV_UTIL_DAU_BACKWARD_H_
--------------------------------------------------------------------------------
/include/dau_conv/dau_conv_impl/dau_conv_forward.hpp:
--------------------------------------------------------------------------------
#ifndef DAU_CONV_UTIL_DAU_FORWARD_H_
#define DAU_CONV_UTIL_DAU_FORWARD_H_

#include <cuda_runtime.h>

#include "dau_conv/util/common.hpp"

namespace DAUConvNet {

#ifndef CPU_ONLY  // GPU

template <typename Dtype>
class DAUConvForward {
    // fixed params during construction
    const int img_width_in, img_height_in;
    const int img_width, img_height;
    const int I, S, F, G;

    // these parameters are used as template params for DAUConvForwardCUDA
    int patch_size_w, patch_size_h, max_offset, num_images, warp_pixel_size_x, warp_pixel_size_y;
    bool single_feature, single_subfeature, use_interpolation;

public:
    enum PARAM_FORMAT { SGF, FGS }; // default should be SGF

    DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation);

    void get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered,
                              size_t* prepared_filtered_images_size,
                              size_t* prepared_filter_weights_size,
                              size_t* prepared_filter_offsets_size);

    void forward_pass(const Dtype* filtered_images,
                      const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y,
                      const Dtype* filter_weights, const PARAM_FORMAT param_format,
                      const int kernel_width, const int kernel_height, const Dtype actual_max_offset,
                      const bool offsets_already_centered, Dtype* output,
                      Dtype* prepared_filtered_images,
                      Dtype* prepared_filter_weights,
                      int* prepared_filter_offsets,
                      Dtype* prepared_filter_offsets_and_weights, cudaStream_t streamId = NULL);

    class CUDAParams {
    public:
        // fixed params during construction
        const int img_width_in, img_height_in;
        const int img_width, img_height;
        const int I, S, F, G;

        // parameters to set up before the call

        // params for get_allocation_sizes call
        size_t *alloc_img, *alloc_w, *alloc_off;

        // params for run_kernel call
        Dtype const *filtered_images, *filter_offsets_float_x, *filter_offsets_float_y, *filter_weights;
        Dtype *output, *prepared_filtered_images, *prepared_filter_weights, *prepared_filter_offsets_and_weights;
        int *prepared_filter_offsets;
        int kernel_w, kernel_h;
        PARAM_FORMAT param_format;
        bool offsets_already_centered;
        cudaStream_t streamId;

        float actual_max_offset;

    public:
        CUDAParams(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool offsets_already_centered) :
                img_width_in(img_width_in), img_height_in(img_height_in), img_width(img_width), img_height(img_height), I(I), S(S), F(F), G(G), offsets_already_centered(offsets_already_centered) {
        }

        void set_params_for_allocation_call(size_t *alloc_img, size_t *alloc_w, size_t *alloc_off);

        void set_params_for_kernel_call(const Dtype *filtered_images,
                                        const Dtype *filter_offsets_float_x, const Dtype *filter_offsets_float_y,
                                        const Dtype *filter_weights,
                                        const PARAM_FORMAT param_format, const int kernel_w, const int kernel_h,
                                        const Dtype actual_max_offset, Dtype *output,
                                        Dtype *prepared_filtered_images,
                                        Dtype *prepared_filter_weights,
                                        int *prepared_filter_offsets,
                                        Dtype *prepared_filter_offsets_and_weights,
                                        cudaStream_t streamId);

    };

private:
    void call_cuda_kernel(CUDAParams& params);

};
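
// A usage sketch (hypothetical sizes and pointers; buffers are sized via
// get_allocation_sizes as in the backward case):
//
//   DAUConvForward<float> fwd(w_in, h_in, w, h, I, S, F, G, /*use_interpolation=*/true);
//   size_t sz_img, sz_w, sz_off;
//   fwd.get_allocation_sizes(kernel_w, kernel_h, /*offsets_already_centered=*/true,
//                            &sz_img, &sz_w, &sz_off);
//   fwd.forward_pass(x, mu_x, mu_y, weights, DAUConvForward<float>::SGF,
//                    kernel_w, kernel_h, max_offset, /*offsets_already_centered=*/true,
//                    y, buf_img, buf_w, buf_off, buf_off_and_w);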
93 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 94 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 95 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 96 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 97 | 98 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 99 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 100 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 101 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 102 | 103 | void DAUConv_forward_float_off_16_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 104 | void DAUConv_forward_float_off_16_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 105 | void DAUConv_forward_float_off_32_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, int BLOCK_IMAGES, int USE_INTERPOLATION, DAUConvForward<float>::CUDAParams &PARAMS); 106 | 107 | 108 | 109 | #endif // !CPU_ONLY 110 | 111 | } // namespace DAUConvNet 112 | 113 | #endif // DAU_CONV_UTIL_DAU_FORWARD_H_ 114 | -------------------------------------------------------------------------------- /include/dau_conv/util/common.hpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by domen on 3/23/18. 
3 | // 4 | 5 | #ifndef DAUCONVNET_COMMON_H 6 | #define DAUCONVNET_COMMON_H 7 | 8 | 9 | #include 10 | #include 11 | #include // NOLINT(readability/streams) 12 | #include // NOLINT(readability/streams) 13 | #include 14 | #include 15 | #include 16 | #include 17 | #include // pair 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include 25 | #include 26 | #include 27 | #include 28 | #include // cuda driver types 29 | 30 | #ifndef NDEBUG 31 | # define M_Assert(Expr, Msg) \ 32 | __M_Assert(#Expr, Expr, __FILE__, __LINE__, Msg) 33 | 34 | #else 35 | # define M_Assert(Expr, Msg) 36 | #endif 37 | 38 | void __M_Assert(const char* expr_str, bool expr, const char* file, int line, const char* msg); 39 | 40 | 41 | #ifndef DAU_CHECK 42 | #define DAU_CHECK(Expr,Msg ) \ 43 | if ((Expr) == false) { throw DAUConvNet::DAUException(string_format("ASSERT ERROR: %s\n", Msg)); } 44 | 45 | #endif 46 | 47 | // 48 | // CUDA macros 49 | // 50 | 51 | // CUDA: various checks for different function calls. 52 | #ifndef CUDA_CHECK 53 | #define CUDA_CHECK(condition) \ 54 | /* Code block avoids redefinition of cudaError_t error */ \ 55 | do { \ 56 | cudaError_t error = condition; \ 57 | DAU_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \ 58 | } while (0) 59 | #endif 60 | 61 | #ifndef CUBLAS_CHECK 62 | #define CUBLAS_CHECK(condition) \ 63 | do { \ 64 | cublasStatus_t status = condition; \ 65 | DAU_CHECK(status == CUBLAS_STATUS_SUCCESS, DAUConvNet::cublasGetErrorString(status)); \ 66 | } while (0) 67 | #endif 68 | 69 | // CUDA: grid stride looping 70 | #ifndef CUDA_KERNEL_LOOP 71 | #define CUDA_KERNEL_LOOP(i, n) \ 72 | for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ 73 | i < (n); \ 74 | i += blockDim.x * gridDim.x) 75 | #endif 76 | 77 | // CUDA: check for error after kernel execution and exit loudly if there is one. 78 | #ifndef CUDA_POST_KERNEL_CHECK 79 | #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) 80 | #endif 81 | 82 | 83 | namespace DAUConvNet { 84 | 85 | // Common functions and classes from std that dau_conv_impl often uses. 86 | using std::vector; 87 | 88 | // CUDA: library error reporting. 89 | const char* cublasGetErrorString(cublasStatus_t error); 90 | 91 | // CUDA: use 512 threads per block 92 | const int CUDA_NUM_THREADS = 512; 93 | 94 | // CUDA: number of blocks for threads. 95 | inline int CUDA_GET_BLOCKS(const int N) { 96 | return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; 97 | } 98 | 99 | class DAUException : public std::runtime_error { 100 | public: 101 | DAUException(const std::string& what_arg ) : std::runtime_error(what_arg) { 102 | } 103 | }; 104 | 105 | } // namespace DAUConvNet 106 | 107 | template 108 | std::string string_format( const std::string& format, Args ... args ) 109 | { 110 | size_t size = std::snprintf( nullptr, 0, format.c_str(), args ... ) + 1; // Extra space for '\0' 111 | std::unique_ptr buf( new char[ size ] ); 112 | std::snprintf( buf.get(), size, format.c_str(), args ... ); 113 | return std::string( buf.get(), size - 1 ); // We don't want the '\0' inside 114 | } 115 | 116 | 117 | 118 | 119 | 120 | #endif //DAUCONVNET_COMMON_H 121 | -------------------------------------------------------------------------------- /include/dau_conv/util/convolve.hpp: -------------------------------------------------------------------------------- 1 | /******************************************************* 2 | * Copyright (c) 2014, ArrayFire 3 | * All rights reserved. 4 | * 5 | * This file is distributed under 3-clause BSD license. 
6 | * The complete license agreement can be obtained at: 7 | * http://arrayfire.com/licenses/BSD-3-Clause 8 | ********************************************************/ 9 | 10 | #include "dau_conv/util/common.hpp" 11 | 12 | namespace DAUConvNet 13 | { 14 | typedef enum { 15 | AF_BATCH_UNSUPPORTED = -1, /* invalid inputs */ 16 | AF_BATCH_NONE, /* one signal, one filter */ 17 | AF_BATCH_LHS, /* many signal, one filter */ 18 | AF_BATCH_RHS, /* one signal, many filter */ 19 | AF_BATCH_SAME, /* signal and filter have same batch size */ 20 | AF_BATCH_DIFF, /* signal and filter have different batch size */ 21 | } AF_BATCH_KIND; 22 | 23 | struct conv2_data_desc { 24 | conv2_data_desc() {} 25 | conv2_data_desc(int n, int c, int h, int w, int s_n, int s_c, int s_h, int s_w) { 26 | dims[0] = n; dims[1] = c; dims[2] = h; dims[3] = w; 27 | strides[0] = s_n; strides[1] = s_c; strides[2] = s_h; strides[3] = s_w; 28 | } 29 | int dims[4]; 30 | int strides[4]; 31 | }; 32 | 33 | template 34 | void caffe_gpu_convolve2(Dtype* out, const conv2_data_desc& out_desc, 35 | const Dtype* signal, const conv2_data_desc& signal_desc, 36 | const Dtype* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId = 0); 37 | 38 | namespace kernel 39 | { 40 | 41 | 42 | 43 | template 44 | void convolve_nd(Dtype* out, const conv2_data_desc& out_desc, 45 | const Dtype* signal, const conv2_data_desc& signal_desc, 46 | const Dtype* filter, const conv2_data_desc& filter_desc, 47 | AF_BATCH_KIND kind, cudaStream_t streamId = 0); 48 | 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /include/dau_conv/util/im2col.hpp: -------------------------------------------------------------------------------- 1 | #ifndef DAU_CONV_UTIL_IM2COL_HPP 2 | #define DAU_CONV_UTIL_IM2COL_HPP 3 | 4 | namespace DAUConvNet { 5 | 6 | template 7 | void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes, 8 | const int* im_shape, const int* col_shape, 9 | const int* kernel_shape, const int* pad, const int* stride, 10 | const int* dilation, Dtype* data_col); 11 | 12 | template 13 | void im2col_cpu(const Dtype* data_im, const int channels, 14 | const int height, const int width, const int kernel_h, const int kernel_w, 15 | const int pad_h, const int pad_w, const int stride_h, 16 | const int stride_w, const int dilation_h, const int dilation_w, 17 | Dtype* data_col); 18 | 19 | template 20 | void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes, 21 | const int* im_shape, const int* col_shape, 22 | const int* kernel_shape, const int* pad, const int* stride, 23 | const int* dilation, Dtype* data_im); 24 | 25 | template 26 | void col2im_cpu(const Dtype* data_col, const int channels, 27 | const int height, const int width, const int kernel_h, const int kernel_w, 28 | const int pad_h, const int pad_w, const int stride_h, 29 | const int stride_w, const int dilation_h, const int dilation_w, 30 | Dtype* data_im); 31 | 32 | } // namespace DAUConvNet 33 | 34 | #endif // DAU_CONV_UTIL_IM2COL_HPP 35 | -------------------------------------------------------------------------------- /include/dau_conv/util/math_functions.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_UTIL_MATH_FUNCTIONS_H_ 2 | #define CAFFE_UTIL_MATH_FUNCTIONS_H_ 3 | 4 | #include 5 | #include // for std::fabs and std::signbit 6 | #include 7 | 8 | #include "dau_conv/util/common.hpp" 9 | #include "dau_conv/util/mkl_alternate.hpp" 10 | 11 | 12 | namespace DAUConvNet { 13 | 
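// For orientation, the CPU gemm wrapper declared just below follows the usual row-major
// Caffe/BLAS convention: y = alpha * op(A) * op(B) + beta * y, where op(A) is M x K,
// op(B) is K x N and the output is M x N. A hedged usage sketch (names are illustrative):
//
//   float A[2 * 4];            // 2 x 4 input matrix
//   float B[4 * 3];            // 4 x 3 input matrix
//   float C[2 * 3] = {0};      // 2 x 3 output matrix
//   caffe_cpu_gemm<float>(CblasNoTrans, CblasNoTrans, /*M=*/2, /*N=*/3, /*K=*/4,
//                         1.0f, A, B, 0.0f, C);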
14 | // Caffe gemm provides a simpler interface to the gemm functions, with the 15 | // limitation that the data has to be contiguous in memory. 16 | template 17 | void caffe_cpu_gemm(const CBLAS_TRANSPOSE TransA, 18 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 19 | const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, 20 | Dtype* y); 21 | 22 | template 23 | void caffe_cpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, 24 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 25 | Dtype* y); 26 | 27 | template 28 | void caffe_axpy(const int N, const Dtype alpha, const Dtype* X, Dtype* Y); 29 | 30 | template 31 | void caffe_cpu_axpby(const int N, const Dtype alpha, const Dtype* X, const Dtype beta, 32 | Dtype* Y); 33 | 34 | template 35 | void caffe_set(const int N, const Dtype alpha, Dtype *X); 36 | 37 | inline void caffe_memset(const size_t N, const int alpha, void* X) { 38 | memset(X, alpha, N); // NOLINT(dau_conv_impl/alt_fn) 39 | } 40 | 41 | template 42 | void caffe_add_scalar(const int N, const Dtype alpha, Dtype *X); 43 | 44 | template 45 | void caffe_scal(const int N, const Dtype alpha, Dtype *X); 46 | 47 | template 48 | void caffe_sqr(const int N, const Dtype* a, Dtype* y); 49 | 50 | template 51 | void caffe_sqrt(const int N, const Dtype* a, Dtype* y); 52 | 53 | template 54 | void caffe_add(const int N, const Dtype* a, const Dtype* b, Dtype* y); 55 | 56 | template 57 | void caffe_sub(const int N, const Dtype* a, const Dtype* b, Dtype* y); 58 | 59 | template 60 | void caffe_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y); 61 | 62 | template 63 | void caffe_div(const int N, const Dtype* a, const Dtype* b, Dtype* y); 64 | 65 | template 66 | void caffe_powx(const int n, const Dtype* a, const Dtype b, Dtype* y); 67 | 68 | template 69 | void caffe_exp(const int n, const Dtype* a, Dtype* y); 70 | 71 | template 72 | void caffe_log(const int n, const Dtype* a, Dtype* y); 73 | 74 | template 75 | void caffe_abs(const int n, const Dtype* a, Dtype* y); 76 | 77 | template 78 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y); 79 | 80 | template 81 | Dtype caffe_cpu_strided_dot(const int n, const Dtype* x, const int incx, 82 | const Dtype* y, const int incy); 83 | 84 | // Returns the sum of the absolute values of the elements of vector x 85 | template 86 | Dtype caffe_cpu_asum(const int n, const Dtype* x); 87 | 88 | 89 | // the branchless, type-safe version from 90 | // http://stackoverflow.com/questions/1903954/is-there-a-standard-sign-function-signum-sgn-in-c-c 91 | template 92 | inline int8_t caffe_sign(Dtype val) { 93 | return (Dtype(0) < val) - (val < Dtype(0)); 94 | } 95 | 96 | // The following two macros are modifications of DEFINE_VSL_UNARY_FUNC 97 | // in include/dau_conv_impl/util/mkl_alternate.hpp authored by @Rowland Depp. 98 | // Please refer to commit 7e8ef25c7 of the boost-eigen branch. 99 | // Git cherry picking that commit caused a conflict hard to resolve and 100 | // copying that file in convenient for code reviewing. 101 | // So they have to be pasted here temporarily. 
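// As a concrete reference, DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i]))
// from below expands to (roughly):
//
//   template <typename Dtype>
//   void caffe_cpu_sign(const int n, const Dtype* x, Dtype* y) {
//     M_Assert(n > 0, ""); M_Assert(x, ""); M_Assert(y, "");
//     for (int i = 0; i < n; ++i) { y[i] = caffe_sign(x[i]); }
//   }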
102 | #define DEFINE_CAFFE_CPU_UNARY_FUNC(name, operation) \ 103 | template \ 104 | void caffe_cpu_##name(const int n, const Dtype* x, Dtype* y) { \ 105 | M_Assert(n > 0,""); M_Assert(x,""); M_Assert(y,""); \ 106 | for (int i = 0; i < n; ++i) { \ 107 | operation; \ 108 | } \ 109 | } 110 | 111 | // output is 1 for the positives, 0 for zero, and -1 for the negatives 112 | DEFINE_CAFFE_CPU_UNARY_FUNC(sign, y[i] = caffe_sign(x[i])) 113 | 114 | // This returns a nonzero value if the input has its sign bit set. 115 | // The name sngbit is meant to avoid conflicts with std::signbit in the macro. 116 | // The extra parens are needed because CUDA < 6.5 defines signbit as a macro, 117 | // and we don't want that to expand here when CUDA headers are also included. 118 | DEFINE_CAFFE_CPU_UNARY_FUNC(sgnbit, \ 119 | y[i] = static_cast((std::signbit)(x[i]))) 120 | 121 | DEFINE_CAFFE_CPU_UNARY_FUNC(fabs, y[i] = std::fabs(x[i])) 122 | 123 | template 124 | void caffe_cpu_scale(const int n, const Dtype alpha, const Dtype *x, Dtype* y); 125 | 126 | #ifndef CPU_ONLY // GPU 127 | 128 | // Decaf gpu gemm provides an interface that is almost the same as the cpu 129 | // gemm function - following the c convention and calling the fortran-order 130 | // gpu code under the hood. 131 | template 132 | void caffe_gpu_gemm(const CBLAS_TRANSPOSE TransA, 133 | const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, 134 | const Dtype alpha, const Dtype* A, const Dtype* B, const Dtype beta, 135 | Dtype* C, cublasHandle_t cublas_handle); 136 | 137 | template 138 | void caffe_gpu_gemv(const CBLAS_TRANSPOSE TransA, const int M, const int N, 139 | const Dtype alpha, const Dtype* A, const Dtype* x, const Dtype beta, 140 | Dtype* y, cublasHandle_t cublas_handle); 141 | 142 | template 143 | void caffe_gpu_axpy(const int N, const Dtype alpha, const Dtype* X, 144 | Dtype* Y, cublasHandle_t cublas_handle); 145 | 146 | template 147 | void caffe_gpu_axpby(const int N, const Dtype alpha, const Dtype* X, 148 | const Dtype beta, Dtype* Y, cublasHandle_t cublas_handle); 149 | 150 | void caffe_gpu_memcpy(const size_t N, const void *X, void *Y); 151 | 152 | template 153 | void caffe_gpu_set(const int N, const Dtype alpha, Dtype *X); 154 | 155 | inline void caffe_gpu_memset(const size_t N, const int alpha, void* X) { 156 | CUDA_CHECK(cudaMemset(X, alpha, N)); // NOLINT(dau_conv_impl/alt_fn) 157 | } 158 | 159 | template 160 | void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X, cudaStream_t streamId = 0); 161 | 162 | template 163 | void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X, cublasHandle_t cublas_handle); 164 | 165 | template 166 | void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cublasHandle_t cublas_handle, cudaStream_t str); 167 | 168 | template 169 | void caffe_gpu_mul(const int N, const Dtype* a, const Dtype* b, Dtype* y, cudaStream_t streamId = 0); 170 | 171 | template 172 | void caffe_gpu_dot(const int n, const Dtype* x, const Dtype* y, Dtype* out, cublasHandle_t cublas_handle); 173 | 174 | template 175 | void caffe_gpu_asum(const int n, const Dtype* x, Dtype* y, cublasHandle_t cublas_handle); 176 | 177 | 178 | void caffe_gpu_memcpy_async(const size_t N, const void* X, void* Y, cudaStream_t streamId = 0); 179 | 180 | template 181 | void caffe_gpu_set_async(const int N, const Dtype alpha, Dtype *X, cudaStream_t streamId = 0); 182 | 183 | template 184 | void caffe_gpu_sum(const int N, const Dtype* x, Dtype* y, const int num_segments, 185 | int* offsets_gpu, bool with_add = false, 
cudaStream_t streamId = NULL); 186 | 187 | template <typename Dtype> 188 | void caffe_gpu_clip_lower(const int N, const Dtype lower_bound, const Dtype* x, Dtype* y, 189 | cudaStream_t streamId = 0); 190 | 191 | template <typename Dtype> 192 | void caffe_gpu_clip_upper(const int N, const Dtype upper_bound, const Dtype* x, Dtype* y, 193 | cudaStream_t streamId = 0); 194 | 195 | template <typename Dtype> 196 | void caffe_gpu_clip_eps(const int N, const Dtype eps_bound, const Dtype* x, Dtype* y, 197 | cudaStream_t streamId = 0); 198 | 199 | template <typename Dtype> 200 | void caffe_gpu_clip_nan(const int N, const Dtype* x, Dtype* y, cudaStream_t streamId = 0); 201 | 202 | template <typename Dtype> 203 | void caffe_gpu_pad2d(const int I, const int H, const int W, int pad_size, const Dtype* X, Dtype* Y, 204 | cudaStream_t streamId = 0); 205 | 206 | template <typename Dtype> 207 | void caffe_gpu_amax(const int I, const Dtype* X, Dtype* Y, 208 | cublasHandle_t cublas_handle); 209 | 210 | #endif // !CPU_ONLY 211 | 212 | } // namespace DAUConvNet 213 | 214 | #endif // CAFFE_UTIL_MATH_FUNCTIONS_H_ 215 | -------------------------------------------------------------------------------- /include/dau_conv/util/mkl_alternate.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_UTIL_MKL_ALTERNATE_H_ 2 | #define CAFFE_UTIL_MKL_ALTERNATE_H_ 3 | 4 | #ifdef USE_MKL 5 | 6 | #include <mkl.h> 7 | 8 | #else // if not using MKL, provide the needed vsl/cblas functions ourselves 9 | 10 | #ifdef USE_ACCELERATE 11 | #include <Accelerate/Accelerate.h> 12 | #else 13 | extern "C" { 14 | #include <cblas.h> 15 | } 16 | #endif // USE_ACCELERATE 17 | 18 | #include <math.h> 19 | 20 | // Functions that caffe uses but are not present if MKL is not linked. 21 | 22 | // A simple way to define the vsl unary functions. The operation should 23 | // be in the form e.g. y[i] = sqrt(a[i]) 24 | #define DEFINE_VSL_UNARY_FUNC(name, operation) \ 25 | template <typename Dtype> \ 26 | void v##name(const int n, const Dtype* a, Dtype* y) { \ 27 | M_Assert(n > 0,""); M_Assert(a,""); M_Assert(y,""); \ 28 | for (int i = 0; i < n; ++i) { operation; } \ 29 | } \ 30 | inline void vs##name( \ 31 | const int n, const float* a, float* y) { \ 32 | v##name<float>(n, a, y); \ 33 | } \ 34 | inline void vd##name( \ 35 | const int n, const double* a, double* y) { \ 36 | v##name<double>(n, a, y); \ 37 | } 38 | 39 | DEFINE_VSL_UNARY_FUNC(Sqr, y[i] = a[i] * a[i]) 40 | DEFINE_VSL_UNARY_FUNC(Sqrt, y[i] = sqrt(a[i])) 41 | DEFINE_VSL_UNARY_FUNC(Exp, y[i] = exp(a[i])) 42 | DEFINE_VSL_UNARY_FUNC(Ln, y[i] = log(a[i])) 43 | DEFINE_VSL_UNARY_FUNC(Abs, y[i] = fabs(a[i])) 44 | 45 | // A simple way to define the vsl unary functions with singular parameter b. 46 | // The operation should be in the form e.g. y[i] = pow(a[i], b) 47 | #define DEFINE_VSL_UNARY_FUNC_WITH_PARAM(name, operation) \ 48 | template <typename Dtype> \ 49 | void v##name(const int n, const Dtype* a, const Dtype b, Dtype* y) { \ 50 | M_Assert(n > 0,""); M_Assert(a,""); M_Assert(y,""); \ 51 | for (int i = 0; i < n; ++i) { operation; } \ 52 | } \ 53 | inline void vs##name( \ 54 | const int n, const float* a, const float b, float* y) { \ 55 | v##name<float>(n, a, b, y); \ 56 | } \ 57 | inline void vd##name( \ 58 | const int n, const double* a, const double b, double* y) { \ 59 | v##name<double>(n, a, b, y); \ 60 | } 61 | 62 | DEFINE_VSL_UNARY_FUNC_WITH_PARAM(Powx, y[i] = pow(a[i], b)) 63 | 64 | // A simple way to define the vsl binary functions. The operation should 65 | // be in the form e.g. 
y[i] = a[i] + b[i] 66 | #define DEFINE_VSL_BINARY_FUNC(name, operation) \ 67 | template \ 68 | void v##name(const int n, const Dtype* a, const Dtype* b, Dtype* y) { \ 69 | M_Assert(n > 0,""); M_Assert(a,""); M_Assert(b,""); M_Assert(y,""); \ 70 | for (int i = 0; i < n; ++i) { operation; } \ 71 | } \ 72 | inline void vs##name( \ 73 | const int n, const float* a, const float* b, float* y) { \ 74 | v##name(n, a, b, y); \ 75 | } \ 76 | inline void vd##name( \ 77 | const int n, const double* a, const double* b, double* y) { \ 78 | v##name(n, a, b, y); \ 79 | } 80 | 81 | DEFINE_VSL_BINARY_FUNC(Add, y[i] = a[i] + b[i]) 82 | DEFINE_VSL_BINARY_FUNC(Sub, y[i] = a[i] - b[i]) 83 | DEFINE_VSL_BINARY_FUNC(Mul, y[i] = a[i] * b[i]) 84 | DEFINE_VSL_BINARY_FUNC(Div, y[i] = a[i] / b[i]) 85 | 86 | // In addition, MKL comes with an additional function axpby that is not present 87 | // in standard blas. We will simply use a two-step (inefficient, of course) way 88 | // to mimic that. 89 | inline void cblas_saxpby(const int N, const float alpha, const float* X, 90 | const int incX, const float beta, float* Y, 91 | const int incY) { 92 | cblas_sscal(N, beta, Y, incY); 93 | cblas_saxpy(N, alpha, X, incX, Y, incY); 94 | } 95 | inline void cblas_daxpby(const int N, const double alpha, const double* X, 96 | const int incX, const double beta, double* Y, 97 | const int incY) { 98 | cblas_dscal(N, beta, Y, incY); 99 | cblas_daxpy(N, alpha, X, incX, Y, incY); 100 | } 101 | 102 | #endif // USE_MKL 103 | #endif // CAFFE_UTIL_MKL_ALTERNATE_H_ 104 | -------------------------------------------------------------------------------- /plugins/tensorflow/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required(VERSION 2.8) 2 | 3 | if (NOT BUILD_TENSORFLOW_PLUGIN) 4 | MESSAGE(STATUS "BUILD_TENSORFLOW_PLUGIN not set.") 5 | return() 6 | endif() 7 | 8 | #find_package (Python COMPONENTS Interpreter Development) 9 | find_package( PythonInterp REQUIRED ) 10 | MESSAGE(STATUS "Python exec ${PYTHON_EXECUTABLE}") 11 | 12 | #SET VARIABLE AS OTHER PYTHON EXECUTABLE IF PythonInterp FINDS A DIFFERENT EXECUTABLE 13 | #set(PYTHON_EXECUTABLE python3) 14 | 15 | #get TF VERSION 16 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.__version__)" OUTPUT_VARIABLE TF_VERSION) 17 | convert_version_string_to_int("${TF_VERSION}" TF_VERSION_INT) 18 | message(STATUS "TF_VERSION_INT: ${TF_VERSION_INT}") 19 | 20 | if( "${TF_VERSION}" VERSION_GREATER "2.0.0" OR "${TF_VERSION}" VERSION_EQUAL "2.0.0") 21 | MESSAGE(STATUS "TF VER MORE THAN 2.0.0: ${TF_VERSION}") 22 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 23 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_CFLAGS) 24 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LFLAGS) 25 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so.2')" OUTPUT_VARIABLE TF_LIB) 26 | MESSAGE(STATUS "${TF_LIB_DIR}") 27 | 28 | elseif( "${TF_VERSION}" VERSION_GREATER "1.14.0" OR "${TF_VERSION}" VERSION_EQUAL "1.14.0") 29 | MESSAGE(STATUS "TF VER 
BETWEEN 1.14 AND 1.15: ${TF_VERSION}") 30 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 31 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_CFLAGS) 32 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LFLAGS) 33 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so.1')" OUTPUT_VARIABLE TF_LIB) 34 | MESSAGE(STATUS "${TF_LIB_DIR}") 35 | 36 | elseif( ${TF_VERSION} VERSION_LESS "1.5.0") 37 | MESSAGE(STATUS "TF VER LOWER THAN 1.5: ${TF_VERSION}") 38 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 39 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so')" OUTPUT_VARIABLE TF_LIB) 40 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib())" OUTPUT_VARIABLE TF_LIB_DIR) 41 | set(TF_CFLAGS "-I ${Tensorflow_INCLUDE_DIRS} -D_GLIBCXX_USE_CXX11_ABI=0") 42 | set(TF_LFLAGS "-L ${TF_LIB_DIR} -ltensorflow_framework") 43 | 44 | else() 45 | MESSAGE(STATUS "TF VER: ${TF_VERSION}") 46 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow; import sys; sys.stdout.write(tensorflow.sysconfig.get_include())" OUTPUT_VARIABLE Tensorflow_INCLUDE_DIRS) 47 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_compile_flags()))" OUTPUT_VARIABLE TF_CFLAGS) 48 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(' '.join(tf.sysconfig.get_link_flags()))" OUTPUT_VARIABLE TF_LFLAGS) 49 | execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import tensorflow as tf; import sys; sys.stdout.write(tf.sysconfig.get_lib()+'/libtensorflow_framework.so')" OUTPUT_VARIABLE TF_LIB) 50 | endif() 51 | 52 | 53 | # No need to explicitly add -std=c++11 since the main CMakeLists.txt already does that 54 | #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") 55 | 56 | if(( "${TF_VERSION}" VERSION_GREATER "2.7.0" OR "${TF_VERSION}" VERSION_EQUAL "2.7.0") AND NOT COMPILER_SUPPORTS_CXX14) 57 | message(FATAL_ERROR "TF v2.7.0 or higher requires C++14 support. Please use a different C++ compiler.") 58 | elseif( "${TF_VERSION}" VERSION_GREATER "2.10.0" OR "${TF_VERSION}" VERSION_EQUAL "2.10.0") 59 | if(COMPILER_SUPPORTS_CXX17) 60 | message(STATUS "Enabling C++17 support for TF v2.10.0 or higher.") 61 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17") 62 | else() 63 | message(FATAL_ERROR "TF v2.10.0 or higher requires C++17 support. 
Please use a different C++ compiler.") 64 | endif() 65 | endif() 66 | 67 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D GOOGLE_CUDA=1") 68 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D TENSORFLOW_VERSION=${TF_VERSION_INT}") 69 | 70 | set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${TF_CFLAGS}") 71 | set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${TF_LFLAGS}") 72 | 73 | #TEMP INSERT FROM BASE CMAKE 74 | 75 | list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) 76 | 77 | 78 | message(STATUS "flags: ${CMAKE_CXX_FLAGS}") 79 | message(STATUS "flags: ${CMAKE_SHARED_LINKER_FLAGS}") 80 | message(STATUS "tf link flags: ${TF_LFLAGS}") 81 | message(STATUS "tf compiler flags: ${TF_CFLAGS}") 82 | 83 | 84 | 85 | message(STATUS "LINKER LIBS: ${DAUConvNet_LINKER_LIBS}") 86 | message(STATUS "INCLUDE DIRS: ${DAUConvNet_INCLUDE_DIRS}") 87 | message(STATUS "INCLUDE DIR: ${DAUConvNet_INCLUDE_DIR}") 88 | message(STATUS "TENSORFLOW DIRS: ${Tensorflow_INCLUDE_DIRS}") 89 | message(STATUS "TENSORFLOW LIB: ${TF_LIB}") 90 | 91 | 92 | # build the gradient operation which is used in base_op_grad.py 93 | # to register it 94 | #LAYER ADD LIBRARY 95 | #LINK_DIRECTORIES(${TF_LIB}) 96 | include_directories(${Tensorflow_INCLUDE_DIRS}) 97 | include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src) 98 | 99 | # we need to manually add a dependency on the .cu.o objects 100 | # by adding a dummy output (the real outputs will be created by the DAU-ConvNet target) 101 | add_custom_command(OUTPUT ${DAUConvNet_CU_OBJS} 102 | COMMAND echo 103 | DEPENDS ${DAUConvNet_OBJ_TARGET}) 104 | 105 | add_library(dau_conv_tensorflow SHARED src/dau_conv_layer_tensorflow.cpp src/dau_conv_layer_tensorflow.hpp ${DAUConvNet_OBJS}) 106 | 107 | # we also need to ensure that DAU-ConvNet is compiled first 108 | add_dependencies(dau_conv_tensorflow ${DAUConvNet_OBJ_TARGET}) 109 | 110 | target_include_directories(dau_conv_tensorflow PUBLIC ${DAUConvNet_INCLUDE_DIR}) 111 | target_include_directories(dau_conv_tensorflow PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}) 112 | target_include_directories(dau_conv_tensorflow PUBLIC ${Tensorflow_INCLUDE_DIRS}) 113 | target_include_directories(dau_conv_tensorflow PUBLIC ${DAUConvNet_INCLUDE_DIRS}) 114 | 115 | message(STATUS ${DAUConvNet_LINKER_LIBS}) 116 | target_link_libraries(dau_conv_tensorflow PUBLIC ${TF_LIB} ) 117 | target_link_libraries(dau_conv_tensorflow ${DAUConvNet_LINKER_LIBS}) 118 | 119 | 120 | # build the actual operation which can be used directly 121 | add_library(dau_conv_grad_op SHARED src/dau_conv_grad_op.cpp) 122 | target_link_libraries(dau_conv_grad_op PUBLIC dau_conv_tensorflow) 123 | target_link_libraries(dau_conv_grad_op PUBLIC ${DAUConvNet_LINKER_LIBS}) 124 | target_include_directories(dau_conv_grad_op PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 125 | target_include_directories(dau_conv_grad_op PUBLIC "/usr/local/") 126 | target_include_directories(dau_conv_grad_op ${DAUConvNet_INCLUDE_DIRS} PUBLIC ${DAUConvNet_INCLUDE_DIR}) 127 | 128 | add_library(dau_conv_op SHARED src/dau_conv_op.cpp) 129 | 130 | target_link_libraries(dau_conv_op dau_conv_tensorflow) 131 | target_include_directories(dau_conv_op PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) 132 | target_include_directories(dau_conv_op PUBLIC "/usr/local/") 133 | target_include_directories(dau_conv_op ${DAUConvNet_INCLUDE_DIRS} PUBLIC ${DAUConvNet_INCLUDE_DIR}) 134 | 135 | 136 | include(GNUInstallDirs) 137 | 138 | set(CMAKE_INSTALL_PREFIX "/usr/local/lib/") 139 | 140 | message(STATUS ${CMAKE_INSTALL_FULL_LIBDIR}) 141 | message(STATUS ${CMAKE_INSTALL_FULL_BINDIR}) 142 | 
143 | set(DAU_CONV_MODULE_NAME dau_conv) # must be set before it is used in the *_PATH variables below 144 | set(DAU_CONV_OP_PATH "${CMAKE_CURRENT_BINARY_DIR}/${DAU_CONV_MODULE_NAME}/${CMAKE_SHARED_LIBRARY_PREFIX}dau_conv_op${CMAKE_SHARED_LIBRARY_SUFFIX}") 145 | set(DAU_CONV_GRAD_OP_PATH "${CMAKE_CURRENT_BINARY_DIR}/${DAU_CONV_MODULE_NAME}/${CMAKE_SHARED_LIBRARY_PREFIX}dau_conv_grad_op${CMAKE_SHARED_LIBRARY_SUFFIX}") 146 | set(DAU_CONV_TENSORFLOW_PATH "${CMAKE_CURRENT_BINARY_DIR}/${DAU_CONV_MODULE_NAME}/${CMAKE_SHARED_LIBRARY_PREFIX}dau_conv_tensorflow${CMAKE_SHARED_LIBRARY_SUFFIX}") 147 | 148 | message(STATUS ${DAU_CONV_OP_PATH}) 149 | message(STATUS ${DAU_CONV_GRAD_OP_PATH}) 150 | message(STATUS ${DAU_CONV_TENSORFLOW_PATH}) 151 | 152 | 153 | set(SETUP_PY_IN "${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in") 154 | set(SETUP_PY "${CMAKE_CURRENT_BINARY_DIR}/setup.py") 155 | set(DEPS "${CMAKE_CURRENT_SOURCE_DIR}/${DAU_CONV_MODULE_NAME}/__init__.py") 156 | set(OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/build/timestamp") 157 | set(WHEELHOUSE "${CMAKE_CURRENT_BINARY_DIR}/wheelhouse") 158 | 159 | message(STATUS ${WHEELHOUSE}) 160 | 161 | set(MANIFEST_IN "${CMAKE_CURRENT_SOURCE_DIR}/MANIFEST.in.in") 162 | set(MANIFEST "${CMAKE_CURRENT_BINARY_DIR}/MANIFEST.in") 163 | 164 | # for TF v2.1 or higher there is no more tensorflow-gpu package 165 | if( "${TF_VERSION}" VERSION_GREATER "2.1.0" OR "${TF_VERSION}" VERSION_EQUAL "2.1.0") 166 | set(TENSORFLOW_PIP_PACKAGE_NAME "tensorflow==${TF_VERSION}") 167 | else() 168 | set(TENSORFLOW_PIP_PACKAGE_NAME "tensorflow-gpu==${TF_VERSION}") 169 | endif() 170 | 171 | 172 | configure_file(${MANIFEST_IN} ${MANIFEST}) 173 | configure_file(${SETUP_PY_IN} ${SETUP_PY}) 174 | 175 | add_custom_command(OUTPUT ${WHEELHOUSE} 176 | COMMAND cp -r ${CMAKE_CURRENT_SOURCE_DIR}/${DAU_CONV_MODULE_NAME} . 177 | COMMAND cp *.so ${DAU_CONV_MODULE_NAME} 178 | COMMAND ${PYTHON_EXECUTABLE} -m pip wheel . -w ${WHEELHOUSE} --no-deps 179 | DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/${DAU_CONV_MODULE_NAME}" dau_conv_tensorflow dau_conv_op dau_conv_grad_op) 180 | 181 | add_custom_target(target ALL DEPENDS ${WHEELHOUSE} dau_conv_tensorflow dau_conv_op dau_conv_grad_op) 182 | 183 | install(CODE "execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pip install --find-links=${WHEELHOUSE} --force-reinstall --no-deps ${DAU_CONV_MODULE_NAME}==${PACKAGE_VERSION}.${TF_VERSION})") 184 | -------------------------------------------------------------------------------- /plugins/tensorflow/MANIFEST.in.in: -------------------------------------------------------------------------------- 1 | include dau_conv/*.so 2 | include dau_conv/tmp 3 | -------------------------------------------------------------------------------- /plugins/tensorflow/build-ci/build-whl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This script builds the dau-conv package for various TensorFlow and Python versions 4 | # 5 | # Below is defined a list of all tensorflow builds (TF_BUILDS) and python builds (PYTHON_BUILDS) 6 | # for which the DAU-ConvNet package will be built. 7 | # 8 | # This script performs: 9 | # 1. For all combinations of TensorFlow and Python versions, perform a build using a prepared docker file 10 | # 11 | # 2. After all images are built, it performs the following tests: 12 | # - integrity check by running "import dau_conv" within the container 13 | # - quick unit test by running "python -m dau_conv.test DAUConvTest.test_DAUConv" 14 | # 15 | # 3. Wheel packages (.whl) are stored in the same location as this script (see the example invocation below). 
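# Example invocation (hypothetical values; the flags correspond to the argument parsing below):
#   ./build-whl.sh --dau-version=1.0 --docker-basename=dau-convnet \
#                  --docker-hub-repo=my-dockerhub-user --unit-test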
16 | 17 | # Define a function to stop all parent processes and exit the script 18 | function stop_script { 19 | echo "Stopping all parent processes and exiting script..." 20 | pkill -P $$ 21 | exit 1 22 | } 23 | 24 | # Catch the SIGINT signal (CTRL+C) and execute the stop_script function 25 | trap stop_script SIGINT 26 | 27 | 28 | 29 | #DOCKER_EXEC=nvidia-docker 30 | DOCKER_EXEC=docker 31 | DOCKER_GPUS=--gpus=all 32 | 33 | DAU_VERSION=1.0 34 | DOCKER_IMG_NAME=dau-convnet 35 | UNITTEST_DOCKER=0 36 | DOCKER_HUB_REPO="" 37 | 38 | # list of Python versions, TensorFlow versions and the corresponding 39 | # nvidia/cuda image versions, where each value is separated by a semicolon (;) 40 | 41 | BUILD_CFG=("py3.8;TF2.12.0;nvidia/cuda:11.8.0-cudnn8-devel-ubuntu18.04" \ 42 | "py3.8;TF2.11.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 43 | "py3.8;TF2.10.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 44 | "py3.8;TF2.9.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 45 | "py3.8;TF2.8.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 46 | "py3.8;TF2.7.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 47 | "py3.8;TF2.6.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 48 | "py3.8;TF2.5.0;nvidia/cuda:11.2.0-cudnn8-devel-ubuntu18.04" \ 49 | "py3.8;TF2.4.0;nvidia/cuda:11.0.3-cudnn8-devel-ubuntu18.04" \ 50 | "py3.8;TF2.3.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 51 | "py3.8;TF2.2.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 52 | "py3.7;TF2.2.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 53 | "py3.7;TF2.1.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 54 | "py3.7;TF2.0.0;nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04" \ 55 | "py3.7;TF1.15.5;nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04" \ 56 | "py3.7;TF1.14.0;nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04") 57 | 58 | for i in "$@" 59 | do 60 | case $i in 61 | --dau-version=*) 62 | DAU_VERSION="${i#*=}" 63 | shift # past argument 64 | ;; 65 | --docker-basename=*) 66 | DOCKER_IMG_NAME="${i#*=}" 67 | shift # past argument 68 | ;; 69 | --python-builds=*) 70 | PYTHON_BUILDS="" 71 | IFS=',' read -r -a PYTHON_BUILDS <<< "${i#*=}" 72 | shift # past argument 73 | ;; 74 | --tf-builds=*) 75 | TF_BUILDS="" 76 | IFS=',' read -r -a TF_BUILDS <<< "${i#*=}" 77 | shift # past argument 78 | ;; 79 | --unit-test) 80 | UNITTEST_DOCKER=1 81 | shift # past argument 82 | ;; 83 | --docker-hub-repo=*) 84 | DOCKER_HUB_REPO="${i#*=}" 85 | shift # past argument 86 | ;; 87 | 88 | *) 89 | # unknown option 90 | ;; 91 | esac 92 | done 93 | 94 | echo "Settings:" 95 | echo " DAU_VERSION=${DAU_VERSION}" 96 | echo " DOCKER_IMG_NAME=${DOCKER_IMG_NAME}" 97 | echo " BUILD_CFG=${BUILD_CFG[*]}" 98 | echo " UNITTEST_DOCKER=${UNITTEST_DOCKER}" 99 | 100 | echo "Building docker images for:" 101 | for BUILD_CFG_STR in "${BUILD_CFG[@]}" 102 | do 103 | IFS=";" read -r -a SINGLE_BUILD_CFG <<< "${BUILD_CFG_STR}" 104 | PY_VER=${SINGLE_BUILD_CFG[0]:2} 105 | TF_VER=${SINGLE_BUILD_CFG[1]:2} 106 | TF_BASE_IMAGE=${SINGLE_BUILD_CFG[2]} 107 | DOCKER_IMG_TAG=${DAU_VERSION}-py${PY_VER}-tf${TF_VER} 108 | 109 | ############################################################################## 110 | echo -n " ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} ... 
" 111 | 112 | BUILD_LOG="build_dau_${DOCKER_IMG_TAG}.log" 113 | PY_VER_MAJOR=${PY_VER%.*} 114 | if [ ${PY_VER_MAJOR} -eq 2 ]; then 115 | PY_VER_MAJOR="" 116 | fi 117 | 118 | DAU_CMAKE_FLAGS="-DPACKAGE_VERSION=${DAU_VERSION}" 119 | 120 | DOCKERFILE_VERSION="" 121 | if [[ $TF_BASE_IMAGE == *"ubuntu18.04"* ]]; then 122 | DOCKERFILE_VERSION=".ubuntu18.04" 123 | fi 124 | 125 | ${DOCKER_EXEC} build -t ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} \ 126 | --build-arg BASE_CUDA_VERSION=${TF_BASE_IMAGE} \ 127 | --build-arg TF_VER=${TF_VER} \ 128 | --build-arg PY_VER=${PY_VER} \ 129 | --build-arg PY_VER_MAJOR="${PY_VER_MAJOR}" \ 130 | --build-arg DAU_CMAKE_FLAGS=${DAU_CMAKE_FLAGS} -f docker/Dockerfile${DOCKERFILE_VERSION} docker/ >& ${BUILD_LOG} 131 | STATUS=$? 132 | if [ ${STATUS} -ne 0 ]; then 133 | echo "ERROR: check ${BUILD_LOG} for logs." 134 | else 135 | echo "OK" 136 | fi 137 | ${DOCKER_EXEC} builder prune -f --keep-storage 5GB >> ${BUILD_LOG} 2>&1 138 | done 139 | 140 | # Run each docker for unit-test and extract whl file 141 | for TF_VER_BUILD_STR in "${TF_BUILDS[@]}" 142 | do 143 | IFS=";" read -r -a TF_VER_BUILD <<< "${TF_VER_BUILD_STR}" 144 | TF_VER=${TF_VER_BUILD[0]} 145 | TF_BASE_IMAGE=${TF_VER_BUILD[1]} 146 | for PY_VER in "${PYTHON_BUILDS[@]}" 147 | do 148 | PY_VER_MAJOR=${PY_VER%.*} 149 | PY_VER_STR=${PY_VER//.} 150 | PYTHON_EXEC=/usr/bin/python${PY_VER} 151 | 152 | DOCKER_IMG_TAG=${DAU_VERSION}-py${PY_VER}-tf${TF_VER} 153 | CONTAINER_NAME="integration-testing-dau-convnet-${DOCKER_IMG_TAG}" 154 | 155 | echo "Testing ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG}:" 156 | 157 | echo -n " Verifying dau_conv package integrity ... " 158 | ${DOCKER_EXEC} run -i --rm --name ${CONTAINER_NAME} ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} /usr/bin/python${PY_VER} /opt/verify_dau_import.py 159 | STATUS=$? 160 | 161 | if [ ${STATUS} -ne 0 ]; then 162 | echo "ERROR: cannot run 'import dau_conv'" 163 | else 164 | echo "OK" 165 | 166 | if [ ${UNITTEST_DOCKER} -ne 0 ]; then 167 | UNITTEST_LOG="test_dau_${DOCKER_IMG_TAG}.log" 168 | echo -n " Running UnitTest ... " 169 | ${DOCKER_EXEC} run $DOCKER_GPUS -i --rm --name ${CONTAINER_NAME} -e DEBIAN_FRONTEND=noninteractive ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} /bin/bash /opt/test_dau.sh ${PYTHON_EXEC} &> ${UNITTEST_LOG} 170 | STATUS=$? 171 | 172 | if [ ${STATUS} -ne 0 ]; then 173 | echo "ERROR: check ${UNITTEST_LOG} for logs." 174 | else 175 | echo "OK" 176 | fi 177 | fi 178 | 179 | 180 | echo -n " Copying .whl package to build-ci ... " 181 | WHL_STR="py${PY_VER_MAJOR}-none-any" 182 | if [ ${PY_VER_MAJOR} -eq 2 ]; then 183 | WHL_REPLACEMENT_STR="cp${PY_VER_STR}-cp${PY_VER_STR}mu-manylinux1_x86_64" 184 | else 185 | WHL_REPLACEMENT_STR="cp${PY_VER_STR}-cp${PY_VER_STR}m-manylinux1_x86_64" 186 | fi 187 | 188 | WHL_TMP_DIR=/tmp/whl-${DOCKER_IMG_TAG} 189 | if [ ! -d "$WHL_TMP_DIR" ]; then 190 | mkdir $WHL_TMP_DIR 191 | fi 192 | ${DOCKER_EXEC} rm -f dummy-${DOCKER_IMG_NAME} &> /dev/null 193 | ${DOCKER_EXEC} create --name dummy-${DOCKER_IMG_NAME} ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} /bin/bash 194 | ${DOCKER_EXEC} cp dummy-${DOCKER_IMG_NAME}:/opt ${WHL_TMP_DIR}/. 195 | ${DOCKER_EXEC} rm -f dummy-${DOCKER_IMG_NAME} 196 | 197 | WHL_TMP_DIR=$WHL_TMP_DIR/opt 198 | 199 | for file in $WHL_TMP_DIR/*.whl; do 200 | mv "$file" "${file/$WHL_STR/$WHL_REPLACEMENT_STR}" 201 | done 202 | mv -f $WHL_TMP_DIR/*.whl `dirname "$0"`/. 203 | rm -rf $WHL_TMP_DIR 204 | echo "done" 205 | 206 | if [ ! -z "${DOCKER_HUB_REPO}" ]; then 207 | echo -n " Tagging and pushing docker to DockerHub ... 
" 208 | 209 | DOCKERPUSH_LOG="docker_push_dau_${DOCKER_IMG_TAG}.log" 210 | 211 | ${DOCKER_EXEC} tag ${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} ${DOCKER_HUB_REPO}/${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} >& /dev/null 212 | ${DOCKER_EXEC} push ${DOCKER_HUB_REPO}/${DOCKER_IMG_NAME}:${DOCKER_IMG_TAG} &> ${DOCKERPUSH_LOG} 213 | STATUS=$? 214 | 215 | if [ ${STATUS} -ne 0 ]; then 216 | echo "ERROR: check ${DOCKERPUSH_LOG} for logs." 217 | else 218 | echo "OK" 219 | fi 220 | fi 221 | fi 222 | done 223 | done 224 | 225 | -------------------------------------------------------------------------------- /plugins/tensorflow/dau_conv/__init__.py: -------------------------------------------------------------------------------- 1 | from .dau_conv import * 2 | -------------------------------------------------------------------------------- /plugins/tensorflow/dau_conv/_dau_conv_grad_op.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | import os 4 | import tensorflow as tf 5 | from tensorflow.python.framework import ops 6 | 7 | # preload libdau_conv_tensorflow.so manuall since it is most likely not on the LD_LIBRARY_PATH 8 | from ctypes import cdll 9 | cdll.LoadLibrary(os.path.join(os.path.dirname(os.path.realpath(__file__)),'libdau_conv_tensorflow.so')) 10 | 11 | dau_conv_grad_module = tf.load_op_library(os.path.join(os.path.dirname(os.path.realpath(__file__)),'libdau_conv_grad_op.so')) 12 | 13 | 14 | @ops.RegisterGradient("DAUConv") 15 | def _dau_conv_op_grad_cc(op, grad): 16 | # Op is the Op object - get all the inputs 17 | # Grad is the gradient with respect to the first input 18 | number_units_x = op.get_attr("number_units_x") 19 | number_units_y = op.get_attr("number_units_y") 20 | number_units_ignore = op.get_attr("number_units_ignore") 21 | num_output = op.get_attr("num_output") 22 | kernel_size = op.get_attr("kernel_size") 23 | pad = op.get_attr("pad") 24 | stride = op.get_attr("stride") 25 | unit_normalization = op.get_attr("unit_normalization") 26 | square_unit_normalization = op.get_attr("square_unit_normalization") 27 | mean_iteration_step = op.get_attr("mean_iteration_step") 28 | sigma_iteration_step = op.get_attr("sigma_iteration_step") 29 | component_border_bound = op.get_attr("component_border_bound") 30 | sigma_lower_bound = op.get_attr("sigma_lower_bound") 31 | merge_iteration_step = op.get_attr("merge_iteration_step") 32 | merge_threshold = op.get_attr("merge_threshold") 33 | unit_testing = op.get_attr("unit_testing") 34 | mu_learning_rate_factor = op.get_attr("mu_learning_rate_factor") 35 | single_dim_kernel = op.get_attr("single_dim_kernel") 36 | forbid_positive_dim1 = op.get_attr("forbid_positive_dim1") 37 | use_interpolation = op.get_attr("use_interpolation") 38 | 39 | 40 | return dau_conv_grad_module.dau_conv_grad(grad, op.inputs[0], op.inputs[1], op.inputs[2], op.inputs[3], op.inputs[4], 41 | number_units_x=number_units_x, 42 | number_units_y=number_units_y, 43 | number_units_ignore=number_units_ignore, 44 | num_output=num_output, 45 | kernel_size=kernel_size, 46 | pad=pad, 47 | stride=stride, 48 | unit_normalization=unit_normalization, 49 | square_unit_normalization=square_unit_normalization, 50 | mean_iteration_step=mean_iteration_step, 51 | sigma_iteration_step=sigma_iteration_step, 52 | component_border_bound=component_border_bound, 53 | sigma_lower_bound=sigma_lower_bound, 54 | merge_iteration_step=merge_iteration_step, 55 | merge_threshold=merge_threshold, 56 | mu_learning_rate_factor=mu_learning_rate_factor, 57 | 
single_dim_kernel=single_dim_kernel, 58 | forbid_positive_dim1=forbid_positive_dim1, 59 | use_interpolation=use_interpolation, 60 | unit_testing=unit_testing) 61 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/Dockerfile: -------------------------------------------------------------------------------- 1 | ARG BASE_CUDA_VERSION=nvidia/cuda:10.0-cudnn7-devel-ubuntu16.04 2 | FROM $BASE_CUDA_VERSION 3 | 4 | LABEL maintainer "domen.tabernik@fri.uni-lj.si" 5 | 6 | ARG DAU_CMAKE_FLAGS="" 7 | 8 | # TF/PY version argument must be after FROM statement 9 | ARG TF_VER=1.13.1 10 | ARG PY_VER=3.5 11 | ARG PY_VER_MAJOR=3 12 | 13 | ENV PYTHON "python$PY_VER" 14 | ENV PYTHON_MAJOR "python$PY_VER_MAJOR" 15 | 16 | ENV LD_LIBRARY_PATH "/usr/local/nvidia/lib:/usr/local/nvidia/lib64" 17 | ENV DAU_CONVNET_HOME /opt/dau-convnet 18 | 19 | WORKDIR $DAU_CONVNET_HOME 20 | 21 | RUN echo "Using TensorFlow==$TF_VER" 22 | RUN echo "Using python binary path=$PYTHON" 23 | 24 | # Install general packages for building 25 | RUN apt-get update && \ 26 | apt-get install -y software-properties-common \ 27 | sudo \ 28 | build-essential \ 29 | cmake \ 30 | build-essential \ 31 | curl \ 32 | git \ 33 | libcurl3-dev \ 34 | libfreetype6-dev \ 35 | libpng12-dev \ 36 | libzmq3-dev \ 37 | pkg-config \ 38 | rsync \ 39 | software-properties-common \ 40 | unzip \ 41 | zip \ 42 | zlib1g-dev \ 43 | libopenblas-dev 44 | 45 | # Install specific python and tensorflow versions 46 | RUN apt-get install -y $PYTHON \ 47 | $PYTHON-dev \ 48 | $PYTHON_MAJOR-pip && \ 49 | apt-get clean 50 | 51 | RUN $PYTHON -m pip --no-cache-dir install numpy pathlib 52 | RUN $PYTHON -m pip install tensorflow==$TF_VER 53 | 54 | # NOTE: since docker build does not provide nvidia drivers we cannot run "import tensorflow" 55 | # using tensorflow-gpu so we only use CPU tensorflow during build-time and then install 56 | # tensorflow-gpu after DAU-ConvNet is compiled 57 | 58 | # Download and build DAU-ConvNet plugin 59 | RUN git clone https://github.com/skokec/DAU-ConvNet . && \ 60 | git submodule update --init --recursive 61 | 62 | RUN mkdir build && cd build && \ 63 | cmake -DBLAS=Open -DBUILD_TENSORFLOW_PLUGIN=on -DPYTHON_EXECUTABLE="/usr/bin/$PYTHON" $DAU_CMAKE_FLAGS .. 
&& \ 64 | make -j install 65 | 66 | # We need to install back GPU support for tensorflow 67 | RUN $PYTHON -m pip install tensorflow-gpu==$TF_VER 68 | 69 | 70 | # Install two scripts that will verify integrity of build with tests 71 | COPY verify_dau_import.py /opt/verify_dau_import.py 72 | COPY test_dau.sh /opt/test_dau.sh 73 | 74 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/Dockerfile.ubuntu18.04: -------------------------------------------------------------------------------- 1 | ARG BASE_CUDA_VERSION=nvidia/cuda:10.0-cudnn7-devel-ubuntu18.04 2 | FROM $BASE_CUDA_VERSION as dau-base 3 | 4 | LABEL maintainer "domen.tabernik@fri.uni-lj.si" 5 | 6 | ARG DAU_CMAKE_FLAGS="" 7 | 8 | ENV LD_LIBRARY_PATH "/usr/local/nvidia/lib:/usr/local/nvidia/lib64" 9 | ENV DAU_CONVNET_HOME /opt/dau-convnet 10 | 11 | # Install general packages for building 12 | RUN apt-get update && \ 13 | apt-get install -y software-properties-common \ 14 | sudo \ 15 | build-essential \ 16 | wget \ 17 | cmake \ 18 | build-essential \ 19 | curl \ 20 | git \ 21 | libcurl3-dev \ 22 | libfreetype6-dev \ 23 | libpng-dev \ 24 | libzmq3-dev \ 25 | pkg-config \ 26 | rsync \ 27 | software-properties-common \ 28 | unzip \ 29 | zip \ 30 | zlib1g-dev \ 31 | libopenblas-dev && \ 32 | apt-get clean && \ 33 | rm -rf /var/lib/apt/lists/* 34 | 35 | ###################################################################### 36 | # TF/PY version argument must be after FROM statement 37 | ARG PY_VER=3.7 38 | ARG PY_VER_MAJOR=3 39 | 40 | ENV PYTHON "python$PY_VER" 41 | ENV PYTHON_MAJOR "python$PY_VER_MAJOR" 42 | 43 | # Install specific python and tensorflow versions 44 | RUN apt-get update && \ 45 | apt-get install -y $PYTHON \ 46 | $PYTHON-dev \ 47 | $PYTHON_MAJOR-pip && \ 48 | apt-get clean && \ 49 | rm -rf /var/lib/apt/lists/* 50 | 51 | ###################################################################### 52 | 53 | RUN $PYTHON -m pip --no-cache-dir install setuptools==57.5.0 && \ 54 | $PYTHON -m pip --no-cache-dir install cython numpy==1.19.5 pathlib protobuf==3.20 55 | RUN $PYTHON -m pip --no-cache-dir install pip --upgrade 56 | 57 | ###################################################################### 58 | 59 | FROM dau-base as dau-build 60 | 61 | WORKDIR /tmp 62 | 63 | # install latest CMAKE 64 | RUN wget -q https://cmake.org/files/v3.21/cmake-3.21.3-linux-x86_64.tar.gz -O - | tar -xz -C /opt && mv /opt/cmake-3.21.3-linux-x86_64 /opt/cmake-3.21.3 65 | ENV PATH /opt/cmake-3.21.3/bin:$PATH 66 | 67 | ###################################################################### 68 | 69 | ARG TF_VER=1.15.5 70 | RUN $PYTHON -m pip --no-cache-dir install tensorflow==$TF_VER 71 | 72 | ###################################################################### 73 | 74 | # NOTE: since docker build does not provide nvidia drivers we cannot run "import tensorflow" 75 | # using tensorflow-gpu so we only use CPU tensorflow during build-time and then install 76 | # tensorflow-gpu after DAU-ConvNet is compiled 77 | 78 | WORKDIR $DAU_CONVNET_HOME 79 | 80 | # Download and build DAU-ConvNet plugin 81 | RUN git clone --depth=1 --branch=v1.0-TF2 https://github.com/skokec/DAU-ConvNet . && \ 82 | git submodule update --init --recursive 83 | 84 | RUN mkdir build && cd build && \ 85 | cmake -DBLAS=Open -DBUILD_TENSORFLOW_PLUGIN=on -DPYTHON_EXECUTABLE="/usr/bin/$PYTHON" $DAU_CMAKE_FLAGS .. 
&& \ 86 | make -j install 87 | 88 | FROM dau-base as dau-convnet 89 | 90 | # Copy DAU-ConvNet whl from build stage 91 | COPY --from=dau-build ${DAU_CONVNET_HOME}/build/plugins/tensorflow/wheelhouse/*.whl /opt/. 92 | 93 | # install DAU-ConvNet whl which will also install tensorflow-gpu 94 | RUN $PYTHON -m pip install --no-cache-dir /opt/*.whl 95 | 96 | # Install two scripts that will verify integrity of build with tests 97 | COPY verify_dau_import.py /opt/verify_dau_import.py 98 | COPY test_dau.sh /opt/test_dau.sh 99 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/test_dau.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export DEBIAN_FRONTEND=noninteractive 4 | PYTHON_EXEC=$1 5 | 6 | apt update && apt install -y python-tk 7 | ${PYTHON_EXEC} -m pip install --no-cache-dir scipy matplotlib==2.2 8 | 9 | ${PYTHON_EXEC} -m dau_conv.test DAUConvTest.test_DAUConv 10 | 11 | STATUS=$? 12 | exit $STATUS 13 | -------------------------------------------------------------------------------- /plugins/tensorflow/docker/verify_dau_import.py: -------------------------------------------------------------------------------- 1 | import sys 2 | try: 3 | import dau_conv 4 | except: 5 | sys.exit(1) 6 | sys.exit(0) 7 | -------------------------------------------------------------------------------- /plugins/tensorflow/scripts/start_main_build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | BUILD_DIR=~/Documents/5.letnik/gauss_conv/new_impl/VitjanZ/DAU-ConvNet/build/ 4 | if [ ! -d $BUILD_DIR ]; then 5 | mkdir $BUILD_DIR 6 | fi 7 | 8 | cd $BUILD_DIR 9 | cmake .. 10 | make 11 | cp ./plugins/tensorflow/*.so ../plugins/tensorflow/bin/ 12 | -------------------------------------------------------------------------------- /plugins/tensorflow/setup.py.in: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | setup(name='dau_conv', 4 | version='${PACKAGE_VERSION}.${TF_VERSION}', 5 | description='DAU-ConvNet (Displaced Aggregation Units) package for TensorFlow', 6 | url="https://github.com/skokec/DAU-ConvNet", 7 | author="Domen Tabernik", 8 | author_email="domen.tabernik@fri.uni-lj.si", 9 | include_package_data=True, 10 | install_requires=["${TENSORFLOW_PIP_PACKAGE_NAME}"], 11 | packages=['${DAU_CONV_MODULE_NAME}', '${DAU_CONV_MODULE_NAME}.test']) 12 | 13 | -------------------------------------------------------------------------------- /plugins/tensorflow/src/dau_conv_layer_tensorflow.hpp: -------------------------------------------------------------------------------- 1 | #ifndef CAFFE_DAU_CONV_LAYER_HPP_ 2 | #define CAFFE_DAU_CONV_LAYER_HPP_ 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include "tensorflow/core/framework/op_kernel.h" 9 | #include "tensorflow/core/framework/tensor_shape.h" 10 | #include "tensorflow/core/platform/logging.h" 11 | #include "tensorflow/core/framework/shape_inference.h" 12 | 13 | #include "dau_conv/base_dau_conv_layer.hpp" 14 | 15 | 16 | // we will be using base classes from DAUConvNet 17 | using DAUConvNet::DAUConvSettings; 18 | 19 | using DAUConvNet::BaseDAUConvLayer; 20 | using DAUConvNet::BaseDAUComponentInitializer; 21 | 22 | using DAUConvNet::BaseDAUKernelCompute; 23 | using DAUConvNet::BaseDAUKernelOutput; 24 | using DAUConvNet::BaseDAUKernelParams; 25 | using DAUConvNet::DAUException; 26 | 27 | using namespace std; 28 | 
using namespace tensorflow; 29 | 30 | 31 | #define TENSOR_DATA_PTR(t, TYPE) (t == NULL ? NULL : reinterpret_cast((t)->template flat().data())) 32 | #define TENSOR_DATA_PTR_CONST(t, TYPE) (t == NULL ? NULL : reinterpret_cast((t)->template flat().data())) 33 | 34 | //////////////////////////////////////////////////////////////////////////////// 35 | // Tensorflow implementation of buffers used in DAUKernel* 36 | 37 | template 38 | class DAUKernelParamsTF : public BaseDAUKernelParams { 39 | public: 40 | explicit DAUKernelParamsTF(OpKernelContext* context) 41 | : context_(context){} 42 | 43 | virtual ~DAUKernelParamsTF(); 44 | 45 | void reshape(int num_in_channels, int num_out_channels, int num_gauss); 46 | 47 | void initialize_params(Tensor w, Tensor mu1, Tensor mu2, Tensor sigma); 48 | 49 | Tensor* weight_= NULL; 50 | Tensor* mu1_= NULL; 51 | Tensor* mu2_ = NULL; 52 | Tensor* sigma_= NULL; 53 | 54 | private: 55 | OpKernelContext* context_ = NULL; 56 | 57 | }; 58 | 59 | 60 | template 61 | class DAUKernelOutputTF : public BaseDAUKernelOutput { 62 | public: 63 | explicit DAUKernelOutputTF(OpKernelContext* context) 64 | : context_(context){} 65 | 66 | virtual ~DAUKernelOutputTF(); 67 | 68 | virtual void reshape(int num_in_channels, int num_out_channels, int num_gauss, int kernel_h, int kernel_w); 69 | 70 | // main filter weights 71 | Tensor* weight_ = NULL; 72 | 73 | // derivative weights for back-propagation and all four parameters 74 | Tensor* d_error_ = NULL; 75 | Tensor* d_params_ = NULL; // four params == [w,mu1,mu2,sigma] 76 | 77 | private: 78 | OpKernelContext* context_ = NULL; 79 | 80 | }; 81 | 82 | template 83 | class DAUKernelComputeTF : public BaseDAUKernelCompute { 84 | public: 85 | explicit DAUKernelComputeTF(OpKernelContext* context) 86 | : context_(context){} 87 | 88 | virtual ~DAUKernelComputeTF(); 89 | 90 | virtual void reshape(int num_in_channels, int num_out_channels, int num_gauss, 91 | int kernel_h, int kernel_w); 92 | 93 | 94 | protected: 95 | void create_precompute_index(const int index_size, const int kernel_size); 96 | 97 | // intermediate buffers when computing derivative kernels in precompute_guassian_weights_gpu 98 | // temporary buffers for pre-computed sigma^2, sigma^3 and 1/2*sigma^2 99 | vector param_buffers_; 100 | vector kernels_buffers_; 101 | 102 | // pre-computed indexes for caffe_gpu_sum in get_kernels 103 | Tensor* tmp_precomp_index_ = NULL; 104 | 105 | private: 106 | OpKernelContext* context_ = NULL; 107 | 108 | }; 109 | 110 | 111 | //////////////////////////////////////////////////////////////////////////////// 112 | // GPU version of Tensorflow buffers used in DAUKernel* 113 | 114 | template 115 | class DAUKernelParamsTFGPU : public DAUKernelParamsTF { 116 | public: 117 | explicit DAUKernelParamsTFGPU(OpKernelContext* context) 118 | : DAUKernelParamsTF(context), context_(context){} 119 | 120 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 121 | virtual Dtype* mu1() { return TENSOR_DATA_PTR(this->mu1_, Dtype); } 122 | virtual Dtype* mu2() { return TENSOR_DATA_PTR(this->mu2_, Dtype); } 123 | virtual Dtype* sigma() { return TENSOR_DATA_PTR(this->sigma_, Dtype); } 124 | 125 | private: 126 | OpKernelContext* context_; 127 | 128 | }; 129 | 130 | template 131 | class DAUKernelOutputTFGPU : public DAUKernelOutputTF { 132 | public: 133 | explicit DAUKernelOutputTFGPU(OpKernelContext* context) 134 | : DAUKernelOutputTF(context), context_(context){} 135 | 136 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 137 
| virtual Dtype* d_error() { return TENSOR_DATA_PTR(this->d_error_, Dtype); } 138 | virtual Dtype* d_params() { return TENSOR_DATA_PTR(this->d_params_, Dtype); } 139 | 140 | private: 141 | OpKernelContext* context_; 142 | 143 | }; 144 | 145 | template <typename Dtype> 146 | class DAUKernelComputeTFGPU : public DAUKernelComputeTF<Dtype> { 147 | public: 148 | 149 | explicit DAUKernelComputeTFGPU(OpKernelContext* context) 150 | : DAUKernelComputeTF<Dtype>(context), context_(context){} 151 | 152 | 153 | virtual Dtype* param_temp(typename BaseDAUKernelCompute<Dtype>::Param_IDX index) { return TENSOR_DATA_PTR(this->param_buffers_[index], Dtype); } 154 | virtual Dtype* kernels_temp(typename BaseDAUKernelCompute<Dtype>::Kernel_IDX index) { return TENSOR_DATA_PTR(this->kernels_buffers_[index], Dtype); } 155 | virtual int* precomp_index() { return TENSOR_DATA_PTR(this->tmp_precomp_index_, int); } 156 | 157 | private: 158 | OpKernelContext* context_; 159 | 160 | }; 161 | 162 | // 163 | template <typename Dtype> 164 | class DAUKernelParamsTFCPU : public DAUKernelParamsTF<Dtype> { 165 | public: 166 | 167 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 168 | virtual Dtype* mu1() { return TENSOR_DATA_PTR(this->mu1_, Dtype); } 169 | virtual Dtype* mu2() { return TENSOR_DATA_PTR(this->mu2_, Dtype); } 170 | virtual Dtype* sigma() { return TENSOR_DATA_PTR(this->sigma_, Dtype); } 171 | }; 172 | 173 | template <typename Dtype> 174 | class DAUKernelOutputTFCPU : public DAUKernelOutputTF<Dtype> { 175 | public: 176 | virtual Dtype* weight() { return TENSOR_DATA_PTR(this->weight_, Dtype); } 177 | virtual Dtype* d_error() { return TENSOR_DATA_PTR(this->d_error_, Dtype); } 178 | virtual Dtype* d_params() { return TENSOR_DATA_PTR(this->d_params_, Dtype); } 179 | }; 180 | 181 | template <typename Dtype> 182 | class DAUKernelComputeTFCPU : public DAUKernelComputeTF<Dtype> { 183 | public: 184 | 185 | virtual Dtype* param_temp(typename BaseDAUKernelCompute<Dtype>::Param_IDX index) { return TENSOR_DATA_PTR(this->param_buffers_[index], Dtype); } 186 | virtual Dtype* kernels_temp(typename BaseDAUKernelCompute<Dtype>::Kernel_IDX index) { return TENSOR_DATA_PTR(this->kernels_buffers_[index], Dtype); } 187 | virtual int* precomp_index() { return TENSOR_DATA_PTR(this->tmp_precomp_index_, int); } 188 | 189 | }; 190 | 191 | //////////////////////////////////////////////////////////////////////////////// 192 | // Tensorflow version of DAUComponentInitializer: 193 | // - variable initialization happens directly in Python so we do not need it here 194 | // - this implements empty initialization (no operation) 195 | 196 | template <typename Dtype> 197 | class NullDAUComponentInitializerTensorflow : public BaseDAUComponentInitializer<Dtype> { 198 | public: 199 | 200 | explicit NullDAUComponentInitializerTensorflow(){ 201 | } 202 | 203 | void InitializeParameters(const DAUConvSettings& settings, Dtype* w, Dtype* mu1, Dtype* mu2, Dtype* sigma, bool is_gpu_ptr, 204 | int num_units_per_x, int num_units_per_y, int num_units_ignore, 205 | int conv_in_channels, int conv_out_channels, int kernel_h, int kernel_w) const {}; 206 | }; 207 | 208 | //////////////////////////////////////////////////////////////////////////////// 209 | // Tensorflow GPU version of DAUConvolution layer (BaseDAUConvLayer) 210 |
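// A minimal sketch of how the kernel-buffer classes above are meant to be
// driven (the call order is assumed from the interface; it is not spelled out
// in this header):
//
//   DAUKernelParamsTFGPU<float> params(ctx);
//   params.reshape(S, F, G);                       // per-DAU w/mu1/mu2/sigma tensors
//   DAUKernelOutputTFGPU<float> output(ctx);
//   output.reshape(S, F, G, kernel_h, kernel_w);   // rendered kernels + derivatives
//   DAUKernelComputeTFGPU<float> compute(ctx);
//   compute.reshape(S, F, G, kernel_h, kernel_w);  // temporaries + tmp_precomp_index_
//
// after which the base-class computation can rasterize the Gaussian kernels
// into `output` using the temporary buffers owned by `compute`.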
211 | template <typename Dtype> 212 | class DAUConvLayerTensorflowGPU : public BaseDAUConvLayer<Dtype> { 213 | public: 214 | 215 | explicit DAUConvLayerTensorflowGPU(cublasHandle_t cublas_handle, OpKernelContext* context, bool ignore_edge_gradients = false) 216 | : BaseDAUConvLayer<Dtype>(cublas_handle, ignore_edge_gradients, true, false), context_(context), own_workspace_data(0), do_on_gpu_(true), cublasHandle(cublas_handle) { 217 | 218 | } 219 | 220 | virtual ~DAUConvLayerTensorflowGPU(); 221 | 222 | virtual void LayerSetUp(const DAUConvSettings& settings, 223 | const BaseDAUComponentInitializer<Dtype>& param_initializer, 224 | BaseDAUKernelCompute<Dtype>* kernel_compute, 225 | BaseDAUKernelParams<Dtype>* kernel_param, 226 | BaseDAUKernelOutput<Dtype>* kernel_output, 227 | const vector<int>& bottom_shape, int num_dau_units_ignore, bool in_train = true); 228 | 229 | //Dtype* w, Dtype* mu1, Dtype* mu2, Dtype* sigma, bool is_gpu_ptr, 230 | // int num_units_per_x, int num_units_per_y, int num_units_ignore, 231 | // int conv_in_channels, int conv_out_channels, int kernel_h, int kernel_w 232 | virtual void InitializeFromInput(DAUConvSettings& settings, Tensor* w, Tensor* mu1, Tensor* mu2, Tensor* sigma); 233 | virtual void InitializeGrad(DAUConvSettings& settings, Tensor* w_grad, Tensor* mu1_grad, Tensor* mu2_grad, Tensor* sigma_grad); 234 | virtual vector<int> Reshape(const vector<int>& bottom_shape, const vector<int>& top); 235 | 236 | // make compute_output_shape() public 237 | virtual void compute_output_shape() { return BaseDAUConvLayer<Dtype>::compute_output_shape(); } 238 | 239 | void set_processing_on_gpu(bool do_on_gpu) { do_on_gpu_ = do_on_gpu; } 240 | 241 | void set_max_kernel_size(int kernel_w, int kernel_h) { 242 | this->max_kernel_w_ = kernel_w; 243 | this->max_kernel_h_ = kernel_h; 244 | } 245 | 246 | // parameters to learn 247 | const Tensor* param_buffer_w_ = NULL; 248 | const Tensor* param_buffer_mu1_ = NULL; 249 | const Tensor* param_buffer_mu2_ = NULL; 250 | const Tensor* param_buffer_sigma_ = NULL; 251 | const Tensor* param_buffer_bias_ = NULL; 252 | Tensor* param_buffer_w_grad = NULL; 253 | Tensor* param_buffer_mu1_grad = NULL; 254 | Tensor* param_buffer_mu2_grad = NULL; 255 | Tensor* param_buffer_sigma_grad = NULL; 256 | Tensor* param_buffer_bias_grad = NULL; 257 | 258 | OpKernelContext* context_ = NULL; 259 | cublasHandle_t cublasHandle; 260 | 261 | protected: 262 | virtual bool is_data_on_gpu() { return do_on_gpu_; } 263 | 264 | virtual void reshape_params(const vector<int>& shape); 265 | 266 | virtual bool update_prefiltering_kernels(cudaStream_t stream); 267 | 268 | virtual Dtype* param_w() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_w_, Dtype); } 269 | virtual Dtype* param_mu1() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_mu1_, Dtype); } 270 | virtual Dtype* param_mu2() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_mu2_, Dtype); } 271 | virtual Dtype* param_sigma() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_sigma_, Dtype); } 272 | virtual Dtype* param_bias() { return (Dtype*)TENSOR_DATA_PTR_CONST(param_buffer_bias_, Dtype); } 273 | 274 | 275 | virtual Dtype* param_w_grad() { return TENSOR_DATA_PTR(param_buffer_w_grad, Dtype); } 276 | virtual Dtype* param_mu1_grad() { return TENSOR_DATA_PTR(param_buffer_mu1_grad, Dtype); } 277 | virtual Dtype* param_mu2_grad() { return TENSOR_DATA_PTR(param_buffer_mu2_grad, Dtype); } 278 | virtual Dtype* param_sigma_grad(){ return TENSOR_DATA_PTR(param_buffer_sigma_grad, Dtype); } 279 | virtual Dtype* param_bias_grad() { return TENSOR_DATA_PTR(param_buffer_bias_grad, Dtype); } 280 | 281 | // remaining intermediate/temporary buffers 282 | virtual Dtype* temp_bwd_gradients() { return TENSOR_DATA_PTR(bwd_gradients_, Dtype); } 283 | virtual Dtype* temp_interm_buffer() { return TENSOR_DATA_PTR(interm_buffer_, Dtype); } 284 | virtual Dtype* temp_param_buffer() { return TENSOR_DATA_PTR(tmp_param_buffer_, Dtype); } 285 | virtual Dtype* temp_col_buffer() { return
TENSOR_DATA_PTR(col_buffer_, Dtype); } 286 | virtual Dtype* temp_bias_multiplier() { return TENSOR_DATA_PTR(bias_multiplier_, Dtype); } 287 | 288 | virtual void* allocate_workspace_mem(size_t bytes); 289 | virtual void deallocate_workspace_mem(); 290 | 291 | // accumulated gradients 292 | Tensor* bwd_gradients_ = NULL; 293 | // additional buffers 294 | Tensor* interm_buffer_ = NULL; // GPU only 295 | Tensor* tmp_param_buffer_ = NULL; // GPU and CPU 296 | 297 | Tensor* col_buffer_ = NULL; // CPU only 298 | Tensor* bias_multiplier_ = NULL; // GPU and CPU 299 | 300 | // workspace memory that we have allocated 301 | void* own_workspace_data = NULL; 302 | // tensor that holds the workspace memory 303 | Tensor* own_workspace_tensor = NULL; 304 | 305 | bool do_on_gpu_; 306 | }; 307 | 308 | /** 309 | * We use this exception in the OP_REQUIRES_OK_THROW_EX macro to mark that the exception has already 310 | * been reported to the tensorflow context using context->CtxFailureWithWarning(...) 311 | */ 312 | class DAUExceptionTF : public std::exception { 313 | public: 314 | DAUExceptionTF() : std::exception() {} 315 | 316 | virtual const char* what() const noexcept { 317 | return "TENSORFLOW reported status error"; 318 | } 319 | }; 320 | 321 | // OP_REQUIRES_OK uses return, which is problematic for compilation in non-void functions 322 | #define OP_REQUIRES_OK_BREAK(CTX, ...) \ 323 | do { \ 324 | ::tensorflow::Status _s(__VA_ARGS__); \ 325 | if (!TF_PREDICT_TRUE(_s.ok())) { \ 326 | (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ 327 | break; \ 328 | } \ 329 | } while (0) 330 | 331 | #define OP_REQUIRES_OK_THROW_EX(CTX, ...) \ 332 | do { \ 333 | ::tensorflow::Status _s(__VA_ARGS__); \ 334 | if (!TF_PREDICT_TRUE(_s.ok())) { \ 335 | (CTX)->CtxFailureWithWarning(__FILE__, __LINE__, _s); \ 336 | throw new DAUExceptionTF(); \ 337 | } \ 338 | } while (0) 339 | 340 | #endif // CAFFE_DAU_CONV_LAYER_HPP_ 341 | -------------------------------------------------------------------------------- /src/dau_conv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | 2 | # --[ DAU-ConvNet library 3 | 4 | # creates 'test_srcs', 'srcs', 'test_cuda', 'cuda' lists 5 | dau_conv_pickup_sources(${PROJECT_SOURCE_DIR}) 6 | 7 | set(cuda_objs "") 8 | 9 | # make cuda objects 10 | dau_conv_cuda_compile(cuda_objs ${cuda}) 11 | add_custom_target(dauc-conv-cu DEPENDS ${cuda_objs} SOURCES ${cuda}) 12 | 13 | # make objects from src 14 | add_library(dau-conv-obj OBJECT ${srcs}) 15 | dau_conv_default_properties(dau-conv-obj) 16 | target_include_directories(dau-conv-obj ${DAUConvNet_INCLUDE_DIRS} 17 | PUBLIC 18 | $ 19 | $) 20 | target_compile_definitions(dau-conv-obj ${DAUConvNet_DEFINITIONS}) 21 | add_dependencies(dau-conv-obj dauc-conv-cu) 22 | if(DAUConvNet_COMPILE_OPTIONS) 23 | target_compile_options(dau-conv-obj ${DAUConvNet_COMPILE_OPTIONS}) 24 | endif() 25 | 26 | list(APPEND DAUConvNet_OBJS ${cuda_objs} ${cuda} $) 27 | 28 | # save the list of .o objects (both src and cuda) so that the parent project can directly embed them into its .so 29 | set(DAUConvNet_OBJS ${DAUConvNet_OBJS} PARENT_SCOPE) 30 | set(DAUConvNet_CU_OBJS ${cuda_objs} PARENT_SCOPE) 31 | set(DAUConvNet_CU_SRC ${cuda} PARENT_SCOPE) 32 | set(DAUConvNet_OBJ_TARGET "dau-conv-obj" PARENT_SCOPE) 33 | set(DAUConvNet_LINKER_LIBS ${DAUConvNet_LINKER_LIBS} PARENT_SCOPE) 34 | 35 | # create shared object 36 | add_library(dau-conv ${DAUConvNet_OBJS}) 37 | target_link_libraries(dau-conv ${DAUConvNet_LINKER_LIBS}) 38 | 39 | 40 | 41 | add_executable(main ../main.cpp) 42 |
target_link_libraries(main dau-conv) 43 | target_include_directories(main ${DAUConvNet_INCLUDE_DIRS} PUBLIC ${DAUConvNet_INCLUDE_DIR}) 44 | dau_conv_default_properties(main) 45 | 46 | 47 | -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_16x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_16x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 16, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_1x1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void 
DAUConv_backward_multi_subfeatures_patch_1x1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | if (SMALLER_WARP_AND_GROUP_K) { 11 | RUN_KERNEL_R4(DAUConvBackwardCUDA, 2, 2, MAX_OFFSET, 4, 4, 2, 2, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 12 | } else { 13 | RUN_KERNEL_R4(DAUConvBackwardCUDA, 2, 2, MAX_OFFSET, 3, 1, 2, 2, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 14 | } 15 | } 16 | 17 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_32x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_32x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 32, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, 
USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_64x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_64x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 64, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x16.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x16(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool 
SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 16, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x32.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x32(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 32, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x64.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x64(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 64, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_backward_patch_8x8.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_backward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_backward_multi_subfeatures_patch_8x8(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, int MAX_OFFSET, 6 | bool SMALLER_WARP_AND_GROUP_K, int BATCH_IMAGES, 7 | bool USE_INTERPOLATION, bool SINGLE_SUBFEATURE, 8 | DAUConvBackward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R5(DAUConvBackwardCUDA, 8, 8, MAX_OFFSET, SMALLER_WARP_AND_GROUP_K, BATCH_IMAGES, USE_INTERPOLATION, SINGLE_SUBFEATURE, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dau_conv/dau_conv_impl/dau_conv_forward.hpp" 5 | 6 | #include "dau_conv/util/common.hpp" 7 | 8 | namespace DAUConvNet { 9 | 10 | #define MAX(x,y) (x > y ? 
x : y) 11 | 12 | int select_optimal_block_size(int img_size, int min_power, int max_power) { 13 | float best_unutilized_percent = 1.0f; 14 | int best_block_size = 0; 15 | for (int i = min_power; i <= max_power; ++i) { 16 | int block_size = pow(2,i); 17 | 18 | float utilization_factor = (img_size / (float)block_size); 19 | float unutilized_percent = (ceil(utilization_factor) - utilization_factor); 20 | if (unutilized_percent <= best_unutilized_percent) { 21 | best_unutilized_percent = unutilized_percent; 22 | best_block_size = block_size; 23 | } 24 | } 25 | return best_block_size; 26 | } 27 | 28 | template <typename Dtype> 29 | DAUConvForward<Dtype>::DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation) : 30 | img_width_in(img_width_in), img_height_in(img_height_in), img_width(img_width), img_height(img_height), I(I), S(S), F(F), G(G), use_interpolation(use_interpolation) { 31 | 32 | // calls either DAUConvForwardCUDA->run_kernel() or DAUConvForwardCUDA->get_allocation_sizes() 33 | // if prepared_filtered_images_size, prepared_filter_weights_size OR prepared_filter_offsets_size are not NULL 34 | 35 | // decide which patch size to use to minimize wasted memory/processing 36 | if (img_width == 1 && img_height == 1) { 37 | patch_size_w = 1; 38 | patch_size_h = 1; 39 | } else { 40 | patch_size_w = img_width <= 8 ? 8 : 41 | (img_width <= 16 ? 16 : select_optimal_block_size(img_width, 5, 6)); // allowed patch sizes = 2^[5,6] i.e., [32,64] 42 | patch_size_h = img_height <= 8 ? 8 : 43 | (img_height <= 16 ? 16 : select_optimal_block_size(img_height, 5, 6)); // allowed patch sizes = 2^[5,6] i.e., [32,64] 44 | } 45 | 46 | // decide whether to use: 47 | // - 32 pixels per warp 48 | // - if 32x8 pixels and 1 image per block (full utilization) 49 | // - 16 pixels per warp 50 | // - if 16x8 pixels and 2 images per block (full utilization) 51 | // - if 16x8 pixels and 1 image per block (half utilization) 52 | // - 8 pixels per warp 53 | // - if 8x8 pixels and 4 images per block (full utilization) 54 | // - if 8x8 pixels and 2 images per block (half utilization) 55 | // - if 8x8 pixels and 1 image per block (1/4 utilization) 56 | // - 1 pixel per warp 57 | // - if 1x1 pixels and 16 images per block (half utilization) (32 images use too much shared memory so we cannot have full utilization) 58 | 59 | int boundry_img_width = img_width - floor(img_width/patch_size_w) * patch_size_w; 60 | 61 | 62 | // use warp size 1x1 if patch size is only 1x1, otherwise use [16,32]x8 (if patch_size_w==8 then use 8x8 but do not prefer it) 63 | warp_pixel_size_x = patch_size_w == 1 ? 1 : 64 | (patch_size_w <= 8 ? 8 : std::min(patch_size_w, select_optimal_block_size(boundry_img_width, 4, 5))); // allowed warp pixel sizes = 2^[3,4,5] i.e., [8,16,32] 65 | warp_pixel_size_y = patch_size_h == 1 ? 1 : 8; 66 | 67 | int new_img_parts_width = (int)ceil((float)img_width / patch_size_w); 68 | int new_img_parts_height = (int)ceil((float)img_height / patch_size_h); 69 | 70 | num_images = I * new_img_parts_width * new_img_parts_height; 71 | 72 | // we compute multiple features per thread, but how many depends on interpolation 73 | int batch_features = 8 * (use_interpolation ? 2 : 4); 74 | 75 | single_feature = (F % batch_features != 0); 76 | single_subfeature = (S % 2 != 0); 77 | }
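// Worked example of select_optimal_block_size(img_size, 5, 6): for img_size = 40
// the candidates are 32 (40/32 = 1.25, so ceil(1.25) - 1.25 = 0.75 of a block is
// wasted) and 64 (40/64 = 0.625, wasting only 0.375), so 64 is selected; for
// img_size = 96, block size 32 tiles the image exactly (96/32 = 3.0, zero waste)
// and wins.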
78 | 79 | template <typename Dtype> 80 | void DAUConvForward<Dtype>::CUDAParams::set_params_for_allocation_call(size_t *alloc_img, size_t *alloc_w, size_t *alloc_off) { 81 | this->alloc_img = alloc_img; 82 | this->alloc_w = alloc_w; 83 | this->alloc_off = alloc_off; 84 | } 85 | 86 | template <typename Dtype> 87 | void DAUConvForward<Dtype>::CUDAParams::set_params_for_kernel_call(const Dtype *filtered_images, 88 | const Dtype *filter_offsets_float_x, const Dtype *filter_offsets_float_y, 89 | const Dtype *filter_weights, const PARAM_FORMAT param_format, const int kernel_w, const int kernel_h, 90 | const Dtype actual_max_offset, Dtype *output, 91 | Dtype *prepared_filtered_images, 92 | Dtype *prepared_filter_weights, 93 | int *prepared_filter_offsets, 94 | Dtype *prepared_filter_offsets_and_weights, 95 | cudaStream_t streamId) { 96 | this->filtered_images = filtered_images; 97 | this->filter_offsets_float_x = filter_offsets_float_x; 98 | this->filter_offsets_float_y = filter_offsets_float_y; 99 | this->filter_weights = filter_weights; 100 | this->kernel_w = kernel_w; 101 | this->kernel_h = kernel_h; 102 | this->actual_max_offset = actual_max_offset; 103 | this->param_format = param_format; 104 | this->output = output; 105 | this->prepared_filtered_images = prepared_filtered_images; 106 | this->prepared_filter_weights = prepared_filter_weights; 107 | this->prepared_filter_offsets = prepared_filter_offsets; 108 | this->prepared_filter_offsets_and_weights = prepared_filter_offsets_and_weights; 109 | this->streamId = streamId; 110 | } 111 | template <typename Dtype> 112 | void DAUConvForward<Dtype>::get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, 113 | size_t* prepared_filtered_images_size, 114 | size_t* prepared_filter_weights_size, 115 | size_t* prepared_filter_offsets_size) { 116 | 117 | CUDAParams params(img_width_in, img_height_in, img_width, img_height, I, S, F, G, offsets_already_centered); 118 | 119 | params.set_params_for_allocation_call(prepared_filtered_images_size, prepared_filter_weights_size, prepared_filter_offsets_size); 120 | 121 | params.set_params_for_kernel_call(NULL, NULL, NULL, NULL, PARAM_FORMAT::SGF, kernel_width, kernel_height, (MAX(kernel_width, kernel_height)-1)/2, NULL, 122 | NULL, NULL, NULL, NULL, 0); 123 | 124 | call_cuda_kernel(params); 125 | } 126 | 127 | 128 | template <typename Dtype> 129 | void DAUConvForward<Dtype>::forward_pass(const Dtype* filtered_images, 130 | const Dtype* filter_offsets_float_x, const Dtype* filter_offsets_float_y, 131 | const Dtype* filter_weights, const PARAM_FORMAT param_format, 132 | const int kernel_width, const int kernel_height, const Dtype actual_max_offset, 133 | const bool offsets_already_centered, 134 | Dtype* output, 135 | Dtype* prepared_filtered_images, 136 | Dtype* prepared_filter_weights, 137 | int* prepared_filter_offsets, 138 | Dtype* prepared_filter_offsets_and_weights, cudaStream_t streamId) { 139 | // Optimize the max possible offset that is needed, since larger offsets require loading more memory and are less efficient 140 | 141 | // For offsets larger than 8 px we need to: 142 | // * for offsets <= 16px: use OUT_K = 3 143 | // * for offsets <= 32px: use OUT_K = 1 and run several times for each K 144 | // 145 | // WARNING: this must be synced with RUN_KERNEL_R2 in dau_conv_backward_core.hpp 146 | 147 | float max_offset = 32; 148 | 149 | if (actual_max_offset <= 4) 150 | max_offset = 4; 151 | else if (actual_max_offset <= 8) 152 | max_offset = 8; 153 | else if (actual_max_offset <= 16) { 154 | max_offset = 16; 155 | }
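// Worked example: actual_max_offset = 11.3 falls into the <= 16 bucket, so
// max_offset = 16 and the DAU_CHECKs below then require kernel_width and
// kernel_height of at least 2*16+1 = 33.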
else if (actual_max_offset <= 32) { 156 | max_offset = 32; 157 | } else { 158 | throw DAUException(string_format("ERROR: actual offsets are larger than what CUDA memory allows (set up max_kernel_size and unit_border_bound correctly to avoid this)!!")); 159 | } 160 | 161 | // To ensure we have enough memory we require max_offset not to exceed kernel_width or kernel_height 162 | // since kernel_width and kernel_height are used in get_allocation_sizes() 163 | DAU_CHECK(kernel_width >= max_offset*2+1, "Maximum offset value exceeds the boundaries defined by kernel_width."); 164 | DAU_CHECK(kernel_height >= max_offset*2+1, "Maximum offset value exceeds the boundaries defined by kernel_height."); 165 | 166 | CUDAParams params(img_width_in, img_height_in, img_width, img_height, I, S, F, G, offsets_already_centered); 167 | 168 | params.set_params_for_allocation_call(NULL, NULL, NULL); 169 | params.set_params_for_kernel_call(filtered_images, filter_offsets_float_x, filter_offsets_float_y, filter_weights, param_format, kernel_width, kernel_height, max_offset, output, 170 | prepared_filtered_images, prepared_filter_weights, prepared_filter_offsets, prepared_filter_offsets_and_weights, 171 | streamId); 172 | 173 | call_cuda_kernel(params); 174 | } 175 | 176 | template <> 177 | void DAUConvForward<float>::call_cuda_kernel(CUDAParams& params) { 178 | 179 | int max_offset = ceil(params.actual_max_offset); 180 | //int max_offset = MAX(params.kernel_w, params.kernel_h)/2; 181 | 182 | if (max_offset <= 4) { 183 | if (single_feature == false && single_subfeature == false) { 184 | // version where single_feature is false and single_subfeature false 185 | DAUConv_forward_float_off_4_single_feat_0_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 186 | warp_pixel_size_x, warp_pixel_size_y, num_images, 187 | use_interpolation, params); 188 | 189 | } else if (single_feature == false && single_subfeature == true) { 190 | // version where single_feature is false and single_subfeature true 191 | DAUConv_forward_float_off_4_single_feat_0_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 192 | warp_pixel_size_x, warp_pixel_size_y, num_images, 193 | use_interpolation, params); 194 | 195 | } else if (single_feature == true && single_subfeature == false) { 196 | // version where single_feature is true and single_subfeature false 197 | DAUConv_forward_float_off_4_single_feat_1_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 198 | warp_pixel_size_x, warp_pixel_size_y, num_images, 199 | use_interpolation, params); 200 | 201 | } else { 202 | // version where single_feature is true and single_subfeature true 203 | DAUConv_forward_float_off_4_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 204 | warp_pixel_size_x, warp_pixel_size_y, num_images, 205 | use_interpolation, params); 206 | } 207 | } else if (max_offset <= 8) { 208 | if (single_feature == false && single_subfeature == false) { 209 | // version where single_feature is false and single_subfeature false 210 | DAUConv_forward_float_off_8_single_feat_0_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 211 | warp_pixel_size_x, warp_pixel_size_y, num_images, 212 | use_interpolation, params); 213 | 214 | } else if (single_feature == false && single_subfeature == true) { 215 | // version where single_feature is false and single_subfeature true 216 | DAUConv_forward_float_off_8_single_feat_0_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 217 | warp_pixel_size_x, warp_pixel_size_y, num_images, 218 | use_interpolation, params); 219 | 220 |
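// The DAUConv_forward_float_off_*_single_feat_*_single_subfeat_* variants
// called in this chain differ only in their compile-time flags; the dispatch
// is equivalent to this sketch (hypothetical helper, names not from this file):
//
//   template <bool SINGLE_FEAT, bool SINGLE_SUBFEAT> void run(CUDAParams& p);
//
//   if (single_feature)  single_subfeature ? run<true, true>(p)  : run<true, false>(p);
//   else                 single_subfeature ? run<false, true>(p) : run<false, false>(p);
//
// Fixing both flags at compile time lets the CUDA compiler drop the unused
// per-feature/per-subfeature loop variants from each instantiation.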
} else if (single_feature == true && single_subfeature == false) { 221 | // version where single_feature is true and single_subfeature false 222 | DAUConv_forward_float_off_8_single_feat_1_single_subfeat_0(patch_size_w, patch_size_h, max_offset, 223 | warp_pixel_size_x, warp_pixel_size_y, num_images, 224 | use_interpolation, params); 225 | 226 | } else { 227 | // version where single_feature is true and single_subfeature true 228 | DAUConv_forward_float_off_8_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 229 | warp_pixel_size_x, warp_pixel_size_y, num_images, 230 | use_interpolation, params); 231 | } 232 | } else if (max_offset <= 16) { 233 | 234 | if (single_feature == false) 235 | DAUConv_forward_float_off_16_single_feat_0_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 236 | warp_pixel_size_x, warp_pixel_size_y, num_images, 237 | use_interpolation, params); 238 | else 239 | DAUConv_forward_float_off_16_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 240 | warp_pixel_size_x, warp_pixel_size_y, num_images, 241 | use_interpolation, params); 242 | 243 | } else if (max_offset <= 32) { 244 | DAUConv_forward_float_off_32_single_feat_1_single_subfeat_1(patch_size_w, patch_size_h, max_offset, 245 | warp_pixel_size_x, warp_pixel_size_y, num_images, 246 | use_interpolation, params); 247 | 248 | 249 | } else { 250 | throw DAUException(string_format("Unsupported filter size: %d. Supported only max up to 9x9 and 17x17 at the moment", max_offset)); 251 | } 252 | 253 | 254 | 255 | // CALL RUN_KERNEL_R4 macro that will call the run_kernel() function on the supplied class, where the first 4 parameters are replaced with compile-time known variables 256 | // replacing variables with compile-time known values allows the CUDA compiler to generate kernels in advance with pre-defined sizes 257 | /* 258 | RUN_KERNEL_R7(DAUConvForwardCUDA, patch_size_w, patch_size_h, max_offset, warp_pixel_size_x, num_images, use_interpolation, single_feature, single_subfeature, 259 | img_width, img_height, I, S, F, G, 260 | filtered_images, filter_offsets_float_x, filter_offsets_float_y, filter_weights, kernel_width, kernel_height, PARAM_FORMAT, output, 261 | prepared_filtered_images, prepared_filter_weights, prepared_filter_offsets, prepared_filter_offsets_and_weights, 262 | streamId); 263 | */ 264 | } 265 | 266 | template <> 267 | void DAUConvForward<double>::call_cuda_kernel(CUDAParams& params) { 268 | throw DAUException("Not implemented for double"); 269 | } 270 | 271 | template DAUConvForward<float>::DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation); 272 | template DAUConvForward<double>::DAUConvForward(const int img_width_in, const int img_height_in, const int img_width, const int img_height, const int I, const int S, const int F, const int G, const bool use_interpolation); 273 | 274 | template void DAUConvForward<float>::get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, size_t* prepared_filtered_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size); 275 | template void DAUConvForward<double>::get_allocation_sizes(const int kernel_width, const int kernel_height, const bool offsets_already_centered, size_t* prepared_filtered_images_size, size_t* prepared_filter_weights_size, size_t* prepared_filter_offsets_size); 276 |
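// Typical call sequence for DAUConvForward<float>, assembled as a sketch from
// the declarations above (buffer allocation is the caller's responsibility,
// and the exact scoping of the PARAM_FORMAT enum may differ):
//
//   DAUConvForward<float> conv(w_in, h_in, w, h, I, S, F, G, /*use_interpolation=*/true);
//   size_t img_bytes, weights_bytes, offsets_bytes;
//   conv.get_allocation_sizes(kernel_w, kernel_h, /*offsets_already_centered=*/true,
//                             &img_bytes, &weights_bytes, &offsets_bytes);
//   // ... allocate the three scratch buffers (and the output) on the device ...
//   conv.forward_pass(filtered_images, offsets_x, offsets_y, weights,
//                     PARAM_FORMAT::SGF, kernel_w, kernel_h, actual_max_offset,
//                     /*offsets_already_centered=*/true, output,
//                     scratch_images, scratch_weights, scratch_offsets,
//                     scratch_offsets_and_weights, stream);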
277 | template void DAUConvForward<float>::forward_pass(const float* filtered_images, const float* filter_offsets_float_x, const float* filter_offsets_float_y, const float* filter_weights, const PARAM_FORMAT param_format, const int kernel_width, const int kernel_height, const float actual_max_offset, const bool offsets_already_centered, float* output, float* prepared_filtered_images, float* prepared_filter_weights, int* prepared_filter_offsets, float* prepared_filter_offsets_and_weights, cudaStream_t streamId); 278 | template void DAUConvForward<double>::forward_pass(const double* filtered_images, const double* filter_offsets_float_x, const double* filter_offsets_float_y, const double* filter_weights, const PARAM_FORMAT param_format, const int kernel_width, const int kernel_height, const double actual_max_offset, const bool offsets_already_centered, double* output, double* prepared_filtered_images, double* prepared_filter_weights, int* prepared_filter_offsets, double* prepared_filter_offsets_and_weights, cudaStream_t streamId); 279 | 280 | } // namespace DAUConvNet 281 | 282 | 283 | -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off16_s0_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_16_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET_, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward<float>::CUDAParams &PARAMS){ 9 | #define SINGLE_FEATURE false 10 | #define SINGLE_SUBFEATURE true 11 | #define MAX_OFFSET 16 12 | 13 | if (IMG_PATCH_SIZE_W == 1 && IMG_PATCH_SIZE_H == 1 && WARP_PIXELS_X == 1 && WARP_PIXELS_Y == 1) { 14 | if (BLOCK_IMAGES % 2 == 0) { 15 | RUN_KERNEL_R1(DAUConvForwardCUDA, 2, 1, MAX_OFFSET, 2, 1, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS); 16 | } else { 17 | RUN_KERNEL_R1(DAUConvForwardCUDA, 4, 1, MAX_OFFSET, 4, 1, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 18 | /*printf("Unsupported BATCH SIZE for 1x1 pixels: Supported only a multiple of 16 (at MAX_OFFSET<=4), 8 (at MAX_OFFSET<=8) or 4 images at the moment\n"); */ 19 | /*throw std::exception();*/ 20 | } 21 | } else if (IMG_PATCH_SIZE_W == 8 && WARP_PIXELS_X == 8) { 22 | /* We have 8px WARP_PIXELS_X sizes only for smaller patch sizes - but check just in case (fixing IMG_PATCH_SIZE_W avoids unneeded computation as well) */ 23 | if (BLOCK_IMAGES % 2 == 0) { 24 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 25 | } else { 26 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 27 | } 28 | } else if (WARP_PIXELS_X == 16) { 29 | if (BLOCK_IMAGES % 2 == 0) { 30 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 31 | } else { 32 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 33 | } 34 | } else if (WARP_PIXELS_X == 32) { 35 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 32, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 36 | } else { 37 | printf("Unsupported WARP_PIXELS_X: %d.
Supported only 16 or 32 at the moment (or 1 when WARP_PIXELS_Y==1 as well) \n", WARP_PIXELS_X); 38 | throw std::exception(); 39 | } 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off16_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_16_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET_, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | #define SINGLE_FEATURE true 10 | #define SINGLE_SUBFEATURE true 11 | #define MAX_OFFSET 16 12 | 13 | if (IMG_PATCH_SIZE_W == 1 && IMG_PATCH_SIZE_H == 1 && WARP_PIXELS_X == 1 && WARP_PIXELS_Y == 1) { 14 | if (BLOCK_IMAGES % 2 == 0) { 15 | RUN_KERNEL_R1(DAUConvForwardCUDA, 2, 1, MAX_OFFSET, 2, 1, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS); 16 | } else { 17 | RUN_KERNEL_R1(DAUConvForwardCUDA, 4, 1, MAX_OFFSET, 4, 1, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 18 | /*printf("Unsupported BATCH SIZE for 1x1 pixels: Supported only a multiple of 16 (at MAX_OFFSET<=4), 8 (at MAX_OFFSET<=8) or 4 images at the moment\n"); */ 19 | /*throw std::exception();*/ 20 | } 21 | } else if (IMG_PATCH_SIZE_W == 8 && WARP_PIXELS_X == 8) { 22 | /* We have 8px WARP_PIXELS_X sizes only for smaller patch sizes - but check just in case (fixing IMG_PATCH_SIZE_W avoids unneeded computation as well) */ 23 | if (BLOCK_IMAGES % 2 == 0) { 24 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 25 | } else { 26 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 27 | } 28 | } else if (WARP_PIXELS_X == 16) { 29 | if (BLOCK_IMAGES % 2 == 0) { 30 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 2, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 31 | } else { 32 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 33 | } 34 | } else if (WARP_PIXELS_X == 32) { 35 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 32, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 36 | } else { 37 | printf("Unsupported WARP_PIXELS_X: %d. 
Supported only 16 or 32 at the moment (or 1 when WARP_PIXELS_Y==1 as well) \n", WARP_PIXELS_X); 38 | throw std::exception(); 39 | } 40 | } 41 | 42 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off32_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_32_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | #define SINGLE_FEATURE true 10 | #define SINGLE_SUBFEATURE true 11 | #define MAX_OFFSET 32 12 | 13 | if (IMG_PATCH_SIZE_W == 1 && IMG_PATCH_SIZE_H == 1 && WARP_PIXELS_X == 1 && WARP_PIXELS_Y == 1) { 14 | RUN_KERNEL_R1(DAUConvForwardCUDA, 4, 1, MAX_OFFSET, 4, 1, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 15 | } else if (IMG_PATCH_SIZE_W == 8 && WARP_PIXELS_X == 8) { 16 | RUN_KERNEL_R2(DAUConvForwardCUDA, 8, IMG_PATCH_SIZE_H, MAX_OFFSET, 8, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 17 | } else if (WARP_PIXELS_X == 16) { 18 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 19 | } else if (WARP_PIXELS_X == 32) { 20 | RUN_KERNEL_R3(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, MAX_OFFSET, 16, 8, 1, USE_INTERPOLATION, SINGLE_FEATURE, SINGLE_SUBFEATURE, PARAMS) 21 | } else { 22 | printf("Unsupported WARP_PIXELS_X: %d. Supported only 16 or 32 at the moment (or 1 when WARP_PIXELS_Y==1 as well) \n", WARP_PIXELS_X); 23 | throw std::exception(); 24 | } 25 | //RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 32, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, false, PARAMS); 26 | } 27 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s0_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, false, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s0_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, true, PARAMS); 11 | } 12 | 
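// Each DAUConv_forward_float_off_* (and DAUConv_backward_*) entry point lives
// in its own .cu translation unit so that nvcc can compile the heavy template
// instantiations in parallel and object sizes stay manageable. The
// RUN_KERNEL_R<n> macros used above turn runtime arguments into compile-time
// template arguments; a minimal sketch of the pattern (hypothetical expansion,
// for illustration only -- the real macros live in dau_conv_forward_core.hpp):
//
//   if      (IMG_PATCH_SIZE_W == 8)  run_kernel< 8, ...>(PARAMS);
//   else if (IMG_PATCH_SIZE_W == 16) run_kernel<16, ...>(PARAMS);
//   else if (IMG_PATCH_SIZE_W == 32) run_kernel<32, ...>(PARAMS);
//
// with each R<n> level resolving one more runtime value into a constant.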
13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s1_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, false, PARAMS); 11 | } 12 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off4_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_4_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 4, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, true, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s0_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, false, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s0_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_0_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, false, true, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s1_f0.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_0(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int 
BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, false, PARAMS); 11 | } 12 | } -------------------------------------------------------------------------------- /src/dau_conv/dau_conv_impl/dau_conv_forward_off8_s1_f1.cu: -------------------------------------------------------------------------------- 1 | #include "dau_conv/dau_conv_impl/dau_conv_forward_core.hpp" 2 | 3 | namespace DAUConvNet { 4 | 5 | void DAUConv_forward_float_off_8_single_feat_1_single_subfeat_1(int IMG_PATCH_SIZE_W, int IMG_PATCH_SIZE_H, 6 | int MAX_OFFSET, int WARP_PIXELS_X, int WARP_PIXELS_Y, 7 | int BLOCK_IMAGES, int USE_INTERPOLATION, 8 | DAUConvForward::CUDAParams &PARAMS){ 9 | 10 | RUN_KERNEL_R4(DAUConvForwardCUDA, IMG_PATCH_SIZE_W, IMG_PATCH_SIZE_H, 8, WARP_PIXELS_X, WARP_PIXELS_Y, BLOCK_IMAGES, USE_INTERPOLATION, true, true, PARAMS); 11 | } 12 | 13 | } -------------------------------------------------------------------------------- /src/dau_conv/util/common.cpp: -------------------------------------------------------------------------------- 1 | // 2 | // Created by domen on 3/23/18. 3 | // 4 | 5 | #include "dau_conv/util/common.hpp" 6 | 7 | void __M_Assert(const char* expr_str, bool expr, const char* file, int line, const char* msg) { 8 | if (!expr) { 9 | std::cerr << "Assert failed:\t" << msg << "\n" 10 | << "Expected:\t" << expr_str << "\n" 11 | << "Source:\t\t" << file << ", line " << line << "\n"; 12 | abort(); 13 | } 14 | } 15 | 16 | namespace DAUConvNet { 17 | const char *cublasGetErrorString(cublasStatus_t error) { 18 | switch (error) { 19 | case CUBLAS_STATUS_SUCCESS: 20 | return "CUBLAS_STATUS_SUCCESS"; 21 | case CUBLAS_STATUS_NOT_INITIALIZED: 22 | return "CUBLAS_STATUS_NOT_INITIALIZED"; 23 | case CUBLAS_STATUS_ALLOC_FAILED: 24 | return "CUBLAS_STATUS_ALLOC_FAILED"; 25 | case CUBLAS_STATUS_INVALID_VALUE: 26 | return "CUBLAS_STATUS_INVALID_VALUE"; 27 | case CUBLAS_STATUS_ARCH_MISMATCH: 28 | return "CUBLAS_STATUS_ARCH_MISMATCH"; 29 | case CUBLAS_STATUS_MAPPING_ERROR: 30 | return "CUBLAS_STATUS_MAPPING_ERROR"; 31 | case CUBLAS_STATUS_EXECUTION_FAILED: 32 | return "CUBLAS_STATUS_EXECUTION_FAILED"; 33 | case CUBLAS_STATUS_INTERNAL_ERROR: 34 | return "CUBLAS_STATUS_INTERNAL_ERROR"; 35 | #if CUDA_VERSION >= 6000 36 | case CUBLAS_STATUS_NOT_SUPPORTED: 37 | return "CUBLAS_STATUS_NOT_SUPPORTED"; 38 | #endif 39 | #if CUDA_VERSION >= 6050 40 | case CUBLAS_STATUS_LICENSE_ERROR: 41 | return "CUBLAS_STATUS_LICENSE_ERROR"; 42 | #endif 43 | } 44 | return "Unknown cublas status"; 45 | } 46 | } -------------------------------------------------------------------------------- /src/dau_conv/util/convolve.cpp: -------------------------------------------------------------------------------- 1 | /******************************************************* 2 | * Copyright (c) 2014, ArrayFire 3 | * All rights reserved. 4 | * 5 | * This file is distributed under 3-clause BSD license. 
6 | * The complete license agreement can be obtained at: 7 | * http://arrayfire.com/licenses/BSD-3-Clause 8 | ********************************************************/ 9 | 10 | #include "dau_conv/util/convolve.hpp" 11 | 12 | #include 13 | 14 | namespace DAUConvNet 15 | { 16 | 17 | 18 | template <typename Dtype, int baseDim, bool expand> 19 | void convolve(Dtype* out, const conv2_data_desc& out_desc, 20 | const Dtype* signal, const conv2_data_desc& signal_desc, 21 | const Dtype* filter, const conv2_data_desc& filter_desc, 22 | cudaStream_t streamId ) { 23 | 24 | AF_BATCH_KIND kind; 25 | size_t sn = sizeof(signal_desc.dims) / sizeof(int); 26 | size_t fn = sizeof(filter_desc.dims) / sizeof(int); 27 | 28 | bool sn_stop = false, fn_stop = false; 29 | for (int i = 0; i < 4; ++i) { 30 | if (signal_desc.dims[i] <= 1 && !sn_stop) 31 | sn--; 32 | else 33 | sn_stop = true; 34 | 35 | if (filter_desc.dims[i] <= 1 && !fn_stop) 36 | fn--; 37 | else 38 | fn_stop = true; 39 | } 40 | 41 | if (sn == baseDim && fn == baseDim) 42 | kind = AF_BATCH_NONE; 43 | else if (sn == baseDim && (fn > baseDim && fn <= 4)) 44 | kind = AF_BATCH_RHS; 45 | else if ((sn > baseDim && sn <= 4) && fn == baseDim) 46 | kind = AF_BATCH_LHS; 47 | else if ((sn > baseDim && sn <= 4) && (fn > baseDim && fn <= 4)) { 48 | bool doesDimensionsMatch = true; 49 | bool isInterleaved = true; 50 | for (int i = 3-baseDim; i >= 0; i--) { 51 | doesDimensionsMatch &= (signal_desc.dims[i] == filter_desc.dims[i]); 52 | isInterleaved &= (signal_desc.dims[i] == 1 || filter_desc.dims[i] == 1 || signal_desc.dims[i] == filter_desc.dims[i]); 53 | } 54 | if (doesDimensionsMatch) kind = AF_BATCH_SAME; 55 | else kind = (isInterleaved ? AF_BATCH_DIFF : AF_BATCH_UNSUPPORTED); 56 | } else 57 | kind = AF_BATCH_UNSUPPORTED; 58 | 59 | assert(kind != AF_BATCH_UNSUPPORTED && !(kind == AF_BATCH_DIFF && fn == 4)); 60 | 61 | 62 | 63 | conv2_data_desc out_new_desc; 64 | if (expand) { 65 | for(size_t d=0; d<4; ++d) { 66 | if (kind==AF_BATCH_NONE || kind==AF_BATCH_RHS) { 67 | out_new_desc.dims[d] = signal_desc.dims[d]+filter_desc.dims[d]-1; 68 | } else { 69 | out_new_desc.dims[d] = (d>=baseDim ? signal_desc.dims[d]+filter_desc.dims[d]-1 : signal_desc.dims[d]); 70 | } 71 | } 72 | } else { 73 | out_new_desc = signal_desc; 74 | if (kind==AF_BATCH_RHS) { 75 | for (size_t i=0; i<4- baseDim; ++i) { 76 | out_new_desc.dims[i] = filter_desc.dims[i]; 77 | } 78 | } else if (kind == AF_BATCH_DIFF) { 79 | for (size_t i=0; i<4- baseDim; ++i) { 80 | out_new_desc.dims[i] = signal_desc.dims[i] != 1 ? signal_desc.dims[i] : filter_desc.dims[i]; 81 | } 82 | } 83 | } 84 | 85 | // ensure the output buffer already has the correct size (no reshape is performed here) 86 | bool reshape = false; 87 | for (size_t i=0; i<4; ++i) { 88 | reshape = reshape || out_new_desc.dims[i] != out_desc.dims[i]; 89 | 90 | } 91 | if (reshape) { 92 | // out shape not consistent !!
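// Example of the check above: with expand==true and kind==AF_BATCH_NONE the
// expected dims are signal_desc.dims[d] + filter_desc.dims[d] - 1 in every
// dimension (a "full" convolution), while with expand==false they match the
// signal's dims (same-size convolution); reaching this branch means the
// caller allocated `out` with a different shape.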
93 | printf("Invalid output shape size, expetced shape size: %d,%d,%d,%d.\n", out_new_desc.dims[0], out_new_desc.dims[1], out_new_desc.dims[2], out_new_desc.dims[3]); 94 | throw std::exception(); 95 | } 96 | 97 | 98 | kernel::convolve_nd(out, out_desc, 99 | signal, signal_desc, 100 | filter, filter_desc, kind, streamId); 101 | } 102 | 103 | 104 | template 105 | void caffe_gpu_convolve2(Dtype* out, const conv2_data_desc& out_desc, 106 | const Dtype* signal, const conv2_data_desc& signal_desc, 107 | const Dtype* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId ) { 108 | return convolve(out, out_desc, 109 | signal, signal_desc, 110 | filter, filter_desc, streamId); 111 | } 112 | 113 | template void caffe_gpu_convolve2(float * out, const conv2_data_desc& out_desc, 114 | const float* signal, const conv2_data_desc& signal_desc, 115 | const float* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId ); 116 | 117 | template<> 118 | void caffe_gpu_convolve2(double* out, const conv2_data_desc& out_desc, 119 | const double* signal, const conv2_data_desc& signal_desc, 120 | const double* filter, const conv2_data_desc& filter_desc, cudaStream_t streamId ) { 121 | printf("Disabled compiling of caffe_gpu_convolve2 for double to speed-up compile."); 122 | throw std::exception(); 123 | } 124 | } 125 | -------------------------------------------------------------------------------- /src/dau_conv/util/convolve.cu: -------------------------------------------------------------------------------- 1 | /******************************************************* 2 | * Copyright (c) 2014, ArrayFire 3 | * All rights reserved. 4 | * 5 | * This file is distributed under 3-clause BSD license. 6 | * The complete license agreement can be obtained at: 7 | * http://arrayfire.com/licenses/BSD-3-Clause 8 | ********************************************************/ 9 | 10 | #include "dau_conv/util/convolve.hpp" 11 | 12 | #include "dau_conv/util/common.hpp" 13 | 14 | #include 15 | 16 | #include 17 | 18 | namespace DAUConvNet 19 | { 20 | 21 | namespace kernel 22 | { 23 | 24 | #define divup(a, b) (((a)+(b)-1)/(b)) 25 | 26 | // we do not use CUDA_NUM_THREADS as 256 is more optimal for this function 27 | static const int THREADS = 256; 28 | 29 | static const int THREADS_X = 16; 30 | static const int THREADS_Y = 16; 31 | 32 | //static const int CUBE_X = 8; 33 | //static const int CUBE_Y = 8; 34 | //static const int CUBE_Z = 4; 35 | 36 | // below shared MAX_*_LEN's are calculated based on 37 | // a maximum shared memory configuration of 48KB per block 38 | // considering complex types as well 39 | static const int MAX_CONV1_FILTER_LEN = 129; 40 | static const int MAX_CONV2_FILTER_LEN = 17; 41 | 42 | 43 | // we shall declare the maximum size required of above all three cases 44 | // and re-use the same constant memory locations for every case 45 | __constant__ char cFilter[2*(2*(MAX_CONV1_FILTER_LEN-1)+CUDA_NUM_THREADS)*sizeof(double)]; 46 | 47 | 48 | template 49 | __global__ 50 | void convolve2(T* out, const conv2_data_desc out_desc, 51 | const T* signal, const conv2_data_desc signal_desc, 52 | int nBBS0, int nBBS1, int o2, int o3, int s2, int s3) 53 | { 54 | const size_t C_SIZE = (THREADS_X+2*(fLen0-1))* (THREADS_Y+2*(fLen1-1)); 55 | __shared__ T shrdMem[C_SIZE]; 56 | 57 | const int radius0 = fLen0-1; 58 | const int radius1 = fLen1-1; 59 | const int padding0 = 2*radius0; 60 | const int padding1 = 2*radius1; 61 | const int shrdLen0 = THREADS_X + padding0; 62 | const int shrdLen1 = THREADS_Y + padding1; 63 | 
47 |
48 | template <typename T, int fLen0, int fLen1, int fLen2, bool expand>
49 | __global__
50 | void convolve2(T* out, const conv2_data_desc out_desc,
51 |                const T* signal, const conv2_data_desc signal_desc,
52 |                int nBBS0, int nBBS1, int o2, int o3, int s2, int s3)
53 | {
54 |     const size_t C_SIZE = (THREADS_X+2*(fLen0-1))* (THREADS_Y+2*(fLen1-1));
55 |     __shared__ T shrdMem[C_SIZE];
56 |
57 |     const int radius0  = fLen0-1;
58 |     const int radius1  = fLen1-1;
59 |     const int padding0 = 2*radius0;
60 |     const int padding1 = 2*radius1;
61 |     const int shrdLen0 = THREADS_X + padding0;
62 |     const int shrdLen1 = THREADS_Y + padding1;
63 |
64 |     unsigned b0 = blockIdx.x / nBBS0;
65 |     unsigned b1 = (blockIdx.y + blockIdx.z * gridDim.y) / nBBS1;
66 |     T *dst = (T *)out+ (b0 * out_desc.strides[3-2] + /* activated with batched input signal */
67 |                         o2 * out_desc.strides[3-2] + /* activated with batched input filter */
68 |                         b1 * out_desc.strides[3-3] + /* activated with batched input signal */
69 |                         o3 * out_desc.strides[3-3]); /* activated with batched input filter */
70 |
71 |     const T *src = (const T *)signal + (b0 * signal_desc.strides[3-2] + /* activated with batched input signal */
72 |                                         s2 * signal_desc.strides[3-2] + /* activated with batched input filter */
73 |                                         b1 * signal_desc.strides[3-3] + /* activated with batched input signal */
74 |                                         s3 * signal_desc.strides[3-3]); /* activated with batched input filter */
75 |
76 |     const T *impulse = (const T *)cFilter;
77 |
78 |     int lx = threadIdx.x;
79 |     int ly = threadIdx.y;
80 |     int gx = THREADS_X * (blockIdx.x-b0*nBBS0) + lx;
81 |     int gy = THREADS_Y * ((blockIdx.y + blockIdx.z * gridDim.y) -b1*nBBS1) + ly;
82 |
83 |     if(b1 >= out_desc.dims[3-3])
84 |         return;
85 |
86 |     int s0 = signal_desc.strides[3-0];
87 |     int s1 = signal_desc.strides[3-1];
88 |     int d0 = signal_desc.dims[3-0];
89 |     int d1 = signal_desc.dims[3-1];
90 |     // the loops below are traditional loops; they iterate more than once
91 |     // only when the filter length exceeds the launch size
92 | #pragma unroll
93 |     for (int b=ly, gy2=gy; b<shrdLen1; b+=THREADS_Y, gy2+=THREADS_Y) {
94 |         int j = gy2 - radius1;
95 |         bool is_j = j>=0 && j<d1;
96 | #pragma unroll
97 |         for (int a=lx, gx2=gx; a<shrdLen0; a+=THREADS_X, gx2+=THREADS_X) {
98 |             int i = gx2 - radius0;
99 |             bool is_i = i>=0 && i<d0;
100 |             shrdMem[b*shrdLen0 + a] = (is_i && is_j ? src[j*s1 + i*s0] : T(0));
101 |         }
102 |     }
103 |     __syncthreads();
104 |
105 |     if (gx < out_desc.dims[3-0] && gy < out_desc.dims[3-1]) {
106 |         int ci = lx + radius0 + (expand ? 0 : fLen0>>1);
107 |         int cj = ly + radius1 + (expand ? 0 : fLen1>>1);
108 |
109 |         T accum[fLen2];
110 |         for (int fk = 0; fk < fLen2; ++fk) accum[fk] = T(0);
111 | #pragma unroll
112 |         for(int fj=0; fj<fLen1; ++fj) {
113 | #pragma unroll
114 |             for(int fi=0; fi<fLen0; ++fi) {
115 |                 T s_val = shrdMem[(cj-fj)*shrdLen0 + (ci-fi)];
116 | #pragma unroll
117 |                 for (int fk = 0; fk < fLen2; ++fk) {
118 |                     T f_val = impulse[fk*fLen1*fLen0 + fj*fLen0 + fi];
119 |                     accum[fk] = accum[fk] + s_val * f_val;
120 |                 }
121 |             }
122 |         }
123 | #pragma unroll
124 |         for (int fk = 0; fk < fLen2; ++fk) {
125 |             dst[fk*out_desc.strides[3-2] + gy*out_desc.strides[3-1] + gx] = (T)accum[fk];
126 |         }
127 |     }
128 | }
129 |
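// Editor's sizing check (illustration, not part of the original file): for a
// 16x16 thread block and a 5x5 filter the tile above is
// (16 + 2*(5-1)) * (16 + 2*(5-1)) = 24*24 = 576 floats = 2304 bytes, and even
// the largest supported case, 17x17, needs only 48*48 = 2304 floats = 9216
// bytes, well inside the 48KB budget that MAX_CONV2_FILTER_LEN is chosen for.
static_assert((16 + 2*(17-1)) * (16 + 2*(17-1)) * sizeof(float) <= 48*1024,
              "largest conv2 shared-memory tile must fit in 48KB");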
152 | template <typename T>
153 | void prepareKernelArgs(conv_kparam_t &params, const int* oDims, const int* fDims, int baseDim)
154 | {
155 |     int batchDims[4] = {1, 1, 1, 1};
156 |     for(int i=0; i<4-baseDim; ++i) {
157 |         batchDims[i] = (params.launchMoreBlocks ? 1 : oDims[i]);
158 |     }
159 |
160 |     const int maxBlocksY = 64*1024-1; //cuda::getDeviceProp(cuda::getActiveDeviceId()).maxGridSize[1];
161 |     if (baseDim==1) {
162 |         // unsupported
163 |     } else if (baseDim==2) {
164 |         params.mThreads = dim3(THREADS_X, THREADS_Y);
165 |         params.mBlk_x = divup(oDims[3-0], params.mThreads.x);
166 |         params.mBlk_y = divup(oDims[3-1], params.mThreads.y);
167 |         params.mBlocks = dim3(params.mBlk_x * batchDims[3-2], params.mBlk_y * batchDims[3-3]);
168 |         params.mBlocks.z = divup(params.mBlocks.y, maxBlocksY);
169 |         params.mBlocks.y = divup(params.mBlocks.y, params.mBlocks.z);
170 |     } else if (baseDim==3) {
171 |         // unsupported
172 |     }
173 | }
174 |
175 |
176 | template <typename Dtype, bool expand, int f0, int f1, int f2>
177 | void conv2Helper(const conv_kparam_t &p,
178 |                  Dtype* out, const conv2_data_desc& out_desc,
179 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
180 |                  cudaStream_t streamId)
181 | {
182 |     convolve2<Dtype, f0, f1, f2, expand><<<p.mBlocks, p.mThreads, 0, streamId>>>(out, out_desc, sig, sig_desc,
183 |                                          p.mBlk_x, p.mBlk_y, p.o[1], p.o[2], p.s[1], p.s[2]);
184 |
185 |     CUDA_POST_KERNEL_CHECK;
186 | }
187 |
188 | template <typename Dtype, bool expand, int f0, int f1>
189 | void conv2Helper(const conv_kparam_t &p,
190 |                  Dtype* out, const conv2_data_desc& out_desc,
191 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
192 |                  int f2, cudaStream_t streamId)
193 | {
194 |     switch(f2) {
195 |         case 1: conv2Helper<Dtype, expand, f0, f1, 1>(p, out, out_desc, sig, sig_desc, streamId); break;
196 |         case 3: conv2Helper<Dtype, expand, f0, f1, 3>(p, out, out_desc, sig, sig_desc, streamId); break;
197 |         case 4: conv2Helper<Dtype, expand, f0, f1, 4>(p, out, out_desc, sig, sig_desc, streamId); break;
198 |         default: printf("Unsupported batched-filter third dimension. Supported only [1 x K x K], [3 x K x K] and [4 x K x K].\n"); throw std::exception();
199 |     }
200 | }
201 |
202 | template <typename Dtype, bool expand, int f0>
203 | void conv2Helper(const conv_kparam_t &p,
204 |                  Dtype* out, const conv2_data_desc& out_desc,
205 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
206 |                  int f1, int f2, cudaStream_t streamId)
207 | {
208 |     switch(f1) {
209 |         case 1: conv2Helper<Dtype, expand, f0, 1>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
210 |         case 2: conv2Helper<Dtype, expand, f0, 2>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
211 |         case 3: conv2Helper<Dtype, expand, f0, 3>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
212 |         case 4: conv2Helper<Dtype, expand, f0, 4>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
213 |         case 5: conv2Helper<Dtype, expand, f0, 5>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
214 |         default: printf("Unsupported filter size in caffe_gpu_convolve2. Supported only up to 5x5 for mismatched width/height sizes.\n"); throw std::exception();
215 |     }
216 | }
217 |
218 | template <typename Dtype, bool expand>
219 | void conv2Helper(const conv_kparam_t &p,
220 |                  Dtype* out, const conv2_data_desc& out_desc,
221 |                  const Dtype* sig, const conv2_data_desc& sig_desc,
222 |                  int f0, int f1, int f2, cudaStream_t streamId)
223 | {
224 |     switch(f0) {
225 |         case 1: conv2Helper<Dtype, expand, 1>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
226 |         case 2: conv2Helper<Dtype, expand, 2>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
227 |         case 3: conv2Helper<Dtype, expand, 3>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
228 |         case 4: conv2Helper<Dtype, expand, 4>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
229 |         case 5: conv2Helper<Dtype, expand, 5>(p, out, out_desc, sig, sig_desc, f1, f2, streamId); break;
230 |         default: {
231 |             if (f0==f1) {
232 |                 switch(f1) {
233 |                     case 6:  conv2Helper<Dtype, expand, 6, 6>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
234 |                     case 7:  conv2Helper<Dtype, expand, 7, 7>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
235 |                     case 8:  conv2Helper<Dtype, expand, 8, 8>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
236 |                     case 9:  conv2Helper<Dtype, expand, 9, 9>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
237 |                     case 10: conv2Helper<Dtype, expand, 10, 10>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
238 |                     case 11: conv2Helper<Dtype, expand, 11, 11>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
239 |                     case 12: conv2Helper<Dtype, expand, 12, 12>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
240 |                     case 13: conv2Helper<Dtype, expand, 13, 13>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
241 |                     case 14: conv2Helper<Dtype, expand, 14, 14>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
242 |                     case 15: conv2Helper<Dtype, expand, 15, 15>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
243 |                     case 16: conv2Helper<Dtype, expand, 16, 16>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
244 |                     case 17: conv2Helper<Dtype, expand, 17, 17>(p, out, out_desc, sig, sig_desc, f2, streamId); break;
245 |                     default: printf("Unsupported filter size in caffe_gpu_convolve2. Supported up to 17x17.\n"); throw std::exception();
246 |                 }
247 |             } else {
248 |                 printf("Unsupported filter size in caffe_gpu_convolve2. Supported only up to 5x5 for mismatched width/height sizes.\n"); throw std::exception();
249 |             }
250 |         } break;
251 |     }
252 | }
253 |
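// Editor's note (illustration): the switch ladders exist because fLen0/fLen1/
// fLen2 are template parameters of convolve2; the kernel sizes its __shared__
// tile and unrolls its loops with them, and both need compile-time constants.
// A runtime filter shape is therefore funneled into a fixed instantiation,
// e.g. a batch of three 5x5 filters resolves to:
//
//   conv2Helper<float, false, 5, 5, 3>(p, out, out_desc, sig, sig_desc, stream);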
254 | template <typename Dtype, bool expand>
255 | void convolve_2d(conv_kparam_t &p,
256 |                  Dtype* out, const conv2_data_desc& out_desc,
257 |                  const Dtype* signal, const conv2_data_desc& signal_desc,
258 |                  const Dtype* filt, const conv2_data_desc& filt_desc, cudaStream_t streamId)
259 | {
260 |     prepareKernelArgs<Dtype>(p, signal_desc.dims, filt_desc.dims, 2);
261 |
262 |     int filterLen = filt_desc.dims[3-0] * filt_desc.dims[3-1];
263 |
264 |     for (int b3=0; b3<filt_desc.dims[3-3]; ++b3) {
265 |         int f3Off = b3 * filt_desc.strides[3-3];
266 |
267 |         // when the whole third filter dimension fits one of the batched
268 |         // kernel instantiations, upload all slices and convolve them per launch
269 |         if (filt_desc.dims[3-2] == 1 || filt_desc.dims[3-2] == 3 || filt_desc.dims[3-2] == 4) {
270 |             CUDA_CHECK(cudaMemcpyToSymbolAsync(cFilter, filt + f3Off,
271 |                                                filterLen * filt_desc.dims[3-2] * sizeof(Dtype),
272 |                                                0, cudaMemcpyDeviceToDevice, streamId));
273 |
274 |             p.o[1] = 0;
275 |             p.o[2] = (p.outHasNoOffset ? 0 : b3);
276 |             p.s[1] = 0;
277 |             p.s[2] = (p.inHasNoOffset ? b3 : 0);
278 |
279 |             conv2Helper<Dtype, expand>(p, out, out_desc, signal, signal_desc, filt_desc.dims[3-0], filt_desc.dims[3-1], filt_desc.dims[3-2], streamId);
280 |
281 |
282 |         } else {
283 |             for (int b2=0; b2<filt_desc.dims[3-2]; ++b2) {
284 |                 int f2Off = b2 * filt_desc.strides[3-2];
285 |
286 |                 CUDA_CHECK(cudaMemcpyToSymbolAsync(cFilter, filt + (f2Off + f3Off),
287 |                                                    filterLen * sizeof(Dtype),
288 |                                                    0, cudaMemcpyDeviceToDevice, streamId));
289 |
290 |                 p.o[1] = (p.outHasNoOffset ? 0 : b2);
291 |                 p.o[2] = (p.outHasNoOffset ? 0 : b3);
292 |                 p.s[1] = (p.inHasNoOffset ? b2 : 0);
293 |                 p.s[2] = (p.inHasNoOffset ? b3 : 0);
294 |
295 |                 conv2Helper<Dtype, expand>(p, out, out_desc, signal, signal_desc, filt_desc.dims[3-0], filt_desc.dims[3-1], 1, streamId);
299 |             }
300 |         }
301 |
302 |
303 |     }
304 | }
305 |
306 |
307 | template <typename Dtype, int baseDim, bool expand>
308 | void convolve_nd(Dtype* out, const conv2_data_desc& out_desc,
309 |                  const Dtype* signal, const conv2_data_desc& signal_desc,
310 |                  const Dtype* filt, const conv2_data_desc& filt_desc,
311 |                  AF_BATCH_KIND kind, cudaStream_t streamId)
312 | {
313 |     bool callKernel = true;
314 |
315 |
316 |     int MCFL2 = kernel::MAX_CONV2_FILTER_LEN;
317 |     switch(baseDim) {
318 |         case 2: if ((filt_desc.dims[3]*filt_desc.dims[2]) > (MCFL2 * MCFL2)) callKernel = false; break;
319 |     }
320 |
321 |     if (!callKernel) {
322 |         printf("Unsupported filter dimensions. Only 2-dimensional filters (with the third dimension as batch) are supported.\n"); throw std::exception();
323 |     }
324 |
325 |     conv_kparam_t param;
326 |     for (int i=0; i<3; ++i) {
327 |         param.o[i] = 0;
328 |         param.s[i] = 0;
329 |     }
330 |     param.launchMoreBlocks = kind==AF_BATCH_SAME || kind==AF_BATCH_RHS;
331 |     param.outHasNoOffset = kind==AF_BATCH_LHS || kind==AF_BATCH_NONE;
332 |     param.inHasNoOffset = kind!=AF_BATCH_SAME;
333 |
334 |     switch(baseDim) {
335 |         case 2: convolve_2d<Dtype, expand>(param, out, out_desc, signal, signal_desc, filt, filt_desc, streamId); break;
336 |     }
337 |
338 | }
339 |
340 | #define INSTANTIATE(T) \
341 |     template void convolve_nd<T, 2, true>(T* out, const conv2_data_desc& out_desc,\
342 |         const T* signal, const conv2_data_desc& signal_desc,\
343 |         const T* filt, const conv2_data_desc& filt_desc, AF_BATCH_KIND kind, cudaStream_t streamId); \
344 |     template void convolve_nd<T, 2, false>(T* out, const conv2_data_desc& out_desc, \
345 |         const T* signal, const conv2_data_desc& signal_desc, \
346 |         const T* filt, const conv2_data_desc& filt_desc, AF_BATCH_KIND kind, cudaStream_t streamId);\
347 |
348 |
349 | INSTANTIATE(float)
350 |
351 | }
352 |
353 | }
354 |
--------------------------------------------------------------------------------
/src/dau_conv/util/im2col.cpp:
--------------------------------------------------------------------------------
1 | #include <vector>
2 |
3 | #include "dau_conv/util/im2col.hpp"
4 | #include "dau_conv/util/math_functions.hpp"
5 |
6 | using namespace std;
7 |
8 | namespace DAUConvNet {
9 |
10 | // This function casts int to unsigned so that a single comparison checks
11 | // whether the value of parameter a is both greater than or equal to zero and
12 | // lower than the value of parameter b. Parameter b is signed but always
13 | // positive, so its value is always below 0x800..., whereas casting a negative
14 | // a converts it to a value above 0x800....
15 | // The cast therefore allows one condition to be used instead of two.
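// Editor's worked example of the trick above (illustration): for a = -1 the
// cast wraps to the top of the unsigned range, so a single unsigned
// comparison rejects negative indices on a 32-bit int platform:
//   is_a_ge_zero_and_a_lt_b(-1, 5) == (4294967295u < 5u) == false
//   is_a_ge_zero_and_a_lt_b( 3, 5) == (3u < 5u)          == true
static_assert(static_cast<unsigned>(-1) > static_cast<unsigned>(5),
              "casting a negative int to unsigned wraps above any positive b");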
16 | inline bool is_a_ge_zero_and_a_lt_b(int a, int b) {
17 |   return static_cast<unsigned>(a) < static_cast<unsigned>(b);
18 | }
19 |
20 | template <typename Dtype>
21 | void im2col_cpu(const Dtype* data_im, const int channels,
22 |     const int height, const int width, const int kernel_h, const int kernel_w,
23 |     const int pad_h, const int pad_w,
24 |     const int stride_h, const int stride_w,
25 |     const int dilation_h, const int dilation_w,
26 |     Dtype* data_col) {
27 |   const int output_h = (height + 2 * pad_h -
28 |     (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
29 |   const int output_w = (width + 2 * pad_w -
30 |     (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
31 |   const int channel_size = height * width;
32 |   for (int channel = channels; channel--; data_im += channel_size) {
33 |     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
34 |       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
35 |         int input_row = -pad_h + kernel_row * dilation_h;
36 |         for (int output_rows = output_h; output_rows; output_rows--) {
37 |           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
38 |             for (int output_cols = output_w; output_cols; output_cols--) {
39 |               *(data_col++) = 0;
40 |             }
41 |           } else {
42 |             int input_col = -pad_w + kernel_col * dilation_w;
43 |             for (int output_col = output_w; output_col; output_col--) {
44 |               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
45 |                 *(data_col++) = data_im[input_row * width + input_col];
46 |               } else {
47 |                 *(data_col++) = 0;
48 |               }
49 |               input_col += stride_w;
50 |             }
51 |           }
52 |           input_row += stride_h;
53 |         }
54 |       }
55 |     }
56 |   }
57 | }
58 |
59 | // Explicit instantiation
60 | template void im2col_cpu<float>(const float* data_im, const int channels,
61 |     const int height, const int width, const int kernel_h, const int kernel_w,
62 |     const int pad_h, const int pad_w, const int stride_h,
63 |     const int stride_w, const int dilation_h, const int dilation_w,
64 |     float* data_col);
65 | template void im2col_cpu<double>(const double* data_im, const int channels,
66 |     const int height, const int width, const int kernel_h, const int kernel_w,
67 |     const int pad_h, const int pad_w, const int stride_h,
68 |     const int stride_w, const int dilation_h, const int dilation_w,
69 |     double* data_col);
70 |
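// Editor's usage sketch (not part of the original file): unroll one 4x4
// channel with a 3x3 kernel, stride 1, pad 1, dilation 1. The output spatial
// size is (4 + 2*1 - (1*(3-1)+1))/1 + 1 = 4, so data_col must hold
// channels*kernel_h*kernel_w x output_h*output_w = 9 x 16 values.
static void im2col_usage_sketch(const float* data_im /* one 4x4 channel */) {
  float data_col[9 * 16];
  im2col_cpu<float>(data_im, /*channels=*/1, /*height=*/4, /*width=*/4,
                    /*kernel_h=*/3, /*kernel_w=*/3, /*pad_h=*/1, /*pad_w=*/1,
                    /*stride_h=*/1, /*stride_w=*/1,
                    /*dilation_h=*/1, /*dilation_w=*/1, data_col);
  (void)data_col;
}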
71 | template <typename Dtype>
72 | inline void im2col_nd_core_cpu(const Dtype* data_input, const bool im2col,
73 |     const int num_spatial_axes, const int* im_shape, const int* col_shape,
74 |     const int* kernel_shape, const int* pad, const int* stride,
75 |     const int* dilation, Dtype* data_output) {
76 |   if (!im2col) {
77 |     int im_size = im_shape[0];
78 |     for (int i = 0; i < num_spatial_axes; ++i) {
79 |       im_size *= im_shape[1 + i];
80 |     }
81 |     caffe_set(im_size, Dtype(0), data_output);
82 |   }
83 |   int kernel_size = 1;
84 |   for (int i = 0; i < num_spatial_axes; ++i) {
85 |     kernel_size *= kernel_shape[i];
86 |   }
87 |   const int channels_col = col_shape[0];
88 |   vector<int> d_offset(num_spatial_axes, 0);
89 |   vector<int> d_iter(num_spatial_axes, 0);
90 |   for (int c_col = 0; c_col < channels_col; ++c_col) {
91 |     // Loop over spatial axes in reverse order to compute a per-axis offset.
92 |     int offset = c_col;
93 |     for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
94 |       if (d_i < num_spatial_axes - 1) {
95 |         offset /= kernel_shape[d_i + 1];
96 |       }
97 |       d_offset[d_i] = offset % kernel_shape[d_i];
98 |     }
99 |     for (bool incremented = true; incremented; ) {
100 |       // Loop over spatial axes in forward order to compute the indices in the
101 |       // image and column, and whether the index lies in the padding.
102 |       int index_col = c_col;
103 |       int index_im = c_col / kernel_size;
104 |       bool is_padding = false;
105 |       for (int d_i = 0; d_i < num_spatial_axes; ++d_i) {
106 |         const int d = d_iter[d_i];
107 |         const int d_im = d * stride[d_i] - pad[d_i] +
108 |             d_offset[d_i] * dilation[d_i];
109 |         is_padding |= d_im < 0 || d_im >= im_shape[d_i + 1];
110 |         index_col *= col_shape[d_i + 1];
111 |         index_col += d;
112 |         index_im *= im_shape[d_i + 1];
113 |         index_im += d_im;
114 |       }
115 |       if (im2col) {
116 |         if (is_padding) {
117 |           data_output[index_col] = 0;
118 |         } else {
119 |           data_output[index_col] = data_input[index_im];
120 |         }
121 |       } else if (!is_padding) {  // col2im
122 |         data_output[index_im] += data_input[index_col];
123 |       }
124 |       // Loop over spatial axes in reverse order to choose an index,
125 |       // like counting.
126 |       incremented = false;
127 |       for (int d_i = num_spatial_axes - 1; d_i >= 0; --d_i) {
128 |         const int d_max = col_shape[d_i + 1];
129 |         M_Assert(d_iter[d_i] < d_max, "");
130 |         if (d_iter[d_i] == d_max - 1) {
131 |           d_iter[d_i] = 0;
132 |         } else {  // d_iter[d_i] < d_max - 1
133 |           ++d_iter[d_i];
134 |           incremented = true;
135 |           break;
136 |         }
137 |       }
138 |     }  // while(incremented) {
139 |   }  // for (int c = 0; c < channels_col; ++c) {
140 | }
141 |
142 | template <typename Dtype>
143 | void im2col_nd_cpu(const Dtype* data_im, const int num_spatial_axes,
144 |     const int* im_shape, const int* col_shape,
145 |     const int* kernel_shape, const int* pad, const int* stride,
146 |     const int* dilation, Dtype* data_col) {
147 |   const bool kIm2Col = true;
148 |   im2col_nd_core_cpu(data_im, kIm2Col, num_spatial_axes, im_shape, col_shape,
149 |                      kernel_shape, pad, stride, dilation, data_col);
150 | }
151 |
152 | // Explicit instantiation
153 | template void im2col_nd_cpu<float>(const float* data_im,
154 |     const int num_spatial_axes,
155 |     const int* im_shape, const int* col_shape,
156 |     const int* kernel_shape, const int* pad, const int* stride,
157 |     const int* dilation, float* data_col);
158 | template void im2col_nd_cpu<double>(const double* data_im,
159 |     const int num_spatial_axes,
160 |     const int* im_shape, const int* col_shape,
161 |     const int* kernel_shape, const int* pad, const int* stride,
162 |     const int* dilation, double* data_col);
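// Editor's worked example of the per-axis offset decomposition above
// (illustration): for a 2-D kernel_shape = {3, 3}, column channel c_col = 7
// decodes, via the reverse-order loop, to kernel offsets (row 2, col 1) of
// source image channel 0:
//   d_offset[1] = 7 % 3 = 1;   d_offset[0] = (7 / 3) % 3 = 2;
//   c_im        = 7 / (3*3) = 0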
163 |
164 | template <typename Dtype>
165 | void col2im_cpu(const Dtype* data_col, const int channels,
166 |     const int height, const int width, const int kernel_h, const int kernel_w,
167 |     const int pad_h, const int pad_w,
168 |     const int stride_h, const int stride_w,
169 |     const int dilation_h, const int dilation_w,
170 |     Dtype* data_im) {
171 |   caffe_set(height * width * channels, Dtype(0), data_im);
172 |   const int output_h = (height + 2 * pad_h -
173 |     (dilation_h * (kernel_h - 1) + 1)) / stride_h + 1;
174 |   const int output_w = (width + 2 * pad_w -
175 |     (dilation_w * (kernel_w - 1) + 1)) / stride_w + 1;
176 |   const int channel_size = height * width;
177 |   for (int channel = channels; channel--; data_im += channel_size) {
178 |     for (int kernel_row = 0; kernel_row < kernel_h; kernel_row++) {
179 |       for (int kernel_col = 0; kernel_col < kernel_w; kernel_col++) {
180 |         int input_row = -pad_h + kernel_row * dilation_h;
181 |         for (int output_rows = output_h; output_rows; output_rows--) {
182 |           if (!is_a_ge_zero_and_a_lt_b(input_row, height)) {
183 |             data_col += output_w;
184 |           } else {
185 |             int input_col = -pad_w + kernel_col * dilation_w;
186 |             for (int output_col = output_w; output_col; output_col--) {
187 |               if (is_a_ge_zero_and_a_lt_b(input_col, width)) {
188 |                 data_im[input_row * width + input_col] += *data_col;
189 |               }
190 |               data_col++;
191 |               input_col += stride_w;
192 |             }
193 |           }
194 |           input_row += stride_h;
195 |         }
196 |       }
197 |     }
198 |   }
199 | }
200 |
201 | // Explicit instantiation
202 | template void col2im_cpu<float>(const float* data_col, const int channels,
203 |     const int height, const int width, const int kernel_h, const int kernel_w,
204 |     const int pad_h, const int pad_w, const int stride_h,
205 |     const int stride_w, const int dilation_h, const int dilation_w,
206 |     float* data_im);
207 | template void col2im_cpu<double>(const double* data_col, const int channels,
208 |     const int height, const int width, const int kernel_h, const int kernel_w,
209 |     const int pad_h, const int pad_w, const int stride_h,
210 |     const int stride_w, const int dilation_h, const int dilation_w,
211 |     double* data_im);
212 |
213 | template <typename Dtype>
214 | void col2im_nd_cpu(const Dtype* data_col, const int num_spatial_axes,
215 |     const int* im_shape, const int* col_shape,
216 |     const int* kernel_shape, const int* pad, const int* stride,
217 |     const int* dilation, Dtype* data_im) {
218 |   const bool kIm2Col = false;
219 |   im2col_nd_core_cpu(data_col, kIm2Col, num_spatial_axes, im_shape, col_shape,
220 |                      kernel_shape, pad, stride, dilation, data_im);
221 | }
222 |
223 | // Explicit instantiation
224 | template void col2im_nd_cpu<float>(const float* data_col,
225 |     const int num_spatial_axes,
226 |     const int* im_shape, const int* col_shape,
227 |     const int* kernel_shape, const int* pad, const int* stride,
228 |     const int* dilation, float* data_im);
229 | template void col2im_nd_cpu<double>(const double* data_col,
230 |     const int num_spatial_axes,
231 |     const int* im_shape, const int* col_shape,
232 |     const int* kernel_shape, const int* pad, const int* stride,
233 |     const int* dilation, double* data_im);
234 |
235 |
236 | } // namespace DAUConvNet
237 |
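Note that col2im_cpu accumulates with += because a pixel appears in every window that covered it; it is the adjoint of im2col_cpu rather than its inverse, which is exactly what a backward pass needs. A small round-trip sketch (editor's illustration; variable names hypothetical):

float ones[16], cols[9 * 16], back[16];
for (int i = 0; i < 16; ++i) ones[i] = 1.0f;
// unroll an all-ones 4x4 image (3x3 kernel, stride 1, pad 1, dilation 1) ...
DAUConvNet::im2col_cpu<float>(ones, 1, 4, 4, 3, 3, 1, 1, 1, 1, 1, 1, cols);
// ... and scatter it back: each pixel receives one contribution per window
DAUConvNet::col2im_cpu<float>(cols, 1, 4, 4, 3, 3, 1, 1, 1, 1, 1, 1, back);
// back[5] == 9.0f (interior pixel, 9 covering windows); back[0] == 4.0f (corner)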
--------------------------------------------------------------------------------
/src/dau_conv/util/math_functions.cpp:
--------------------------------------------------------------------------------
1 | #include <cstring>
2 |
3 | #include "dau_conv/util/common.hpp"
4 | #include "dau_conv/util/math_functions.hpp"
5 |
6 | namespace DAUConvNet {
7 |
8 |
9 | template<>
10 | void caffe_cpu_gemm<float>(const CBLAS_TRANSPOSE TransA,
11 |     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
12 |     const float alpha, const float* A, const float* B, const float beta,
13 |     float* C) {
14 |   int lda = (TransA == CblasNoTrans) ? K : M;
15 |   int ldb = (TransB == CblasNoTrans) ? N : K;
16 |   cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
17 |               ldb, beta, C, N);
18 | }
19 |
20 | template<>
21 | void caffe_cpu_gemm<double>(const CBLAS_TRANSPOSE TransA,
22 |     const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K,
23 |     const double alpha, const double* A, const double* B, const double beta,
24 |     double* C) {
25 |   int lda = (TransA == CblasNoTrans) ? K : M;
26 |   int ldb = (TransB == CblasNoTrans) ? N : K;
27 |   cblas_dgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B,
28 |               ldb, beta, C, N);
29 | }
30 |
31 | template <>
32 | void caffe_cpu_gemv<float>(const CBLAS_TRANSPOSE TransA, const int M,
33 |     const int N, const float alpha, const float* A, const float* x,
34 |     const float beta, float* y) {
35 |   cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
36 | }
37 |
38 | template <>
39 | void caffe_cpu_gemv<double>(const CBLAS_TRANSPOSE TransA, const int M,
40 |     const int N, const double alpha, const double* A, const double* x,
41 |     const double beta, double* y) {
42 |   cblas_dgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1);
43 | }
44 |
45 | template <>
46 | void caffe_axpy<float>(const int N, const float alpha, const float* X,
47 |     float* Y) { cblas_saxpy(N, alpha, X, 1, Y, 1); }
48 |
49 | template <>
50 | void caffe_axpy<double>(const int N, const double alpha, const double* X,
51 |     double* Y) { cblas_daxpy(N, alpha, X, 1, Y, 1); }
52 |
53 | template <typename Dtype>
54 | void caffe_set(const int N, const Dtype alpha, Dtype* Y) {
55 |   if (alpha == 0) {
56 |     memset(Y, 0, sizeof(Dtype) * N);  // NOLINT(dau_conv_impl/alt_fn)
57 |     return;
58 |   }
59 |   for (int i = 0; i < N; ++i) {
60 |     Y[i] = alpha;
61 |   }
62 | }
63 |
64 | template void caffe_set<int>(const int N, const int alpha, int* Y);
65 | template void caffe_set<float>(const int N, const float alpha, float* Y);
66 | template void caffe_set<double>(const int N, const double alpha, double* Y);
67 |
68 | template <>
69 | void caffe_add_scalar<float>(const int N, const float alpha, float* Y) {
70 |   for (int i = 0; i < N; ++i) {
71 |     Y[i] += alpha;
72 |   }
73 | }
74 |
75 | template <>
76 | void caffe_add_scalar<double>(const int N, const double alpha, double* Y) {
77 |   for (int i = 0; i < N; ++i) {
78 |     Y[i] += alpha;
79 |   }
80 | }
81 | template <>
82 | void caffe_scal<float>(const int N, const float alpha, float *X) {
83 |   cblas_sscal(N, alpha, X, 1);
84 | }
85 |
86 | template <>
87 | void caffe_scal<double>(const int N, const double alpha, double *X) {
88 |   cblas_dscal(N, alpha, X, 1);
89 | }
90 |
91 | template <>
92 | void caffe_cpu_axpby<float>(const int N, const float alpha, const float* X,
93 |     const float beta, float* Y) {
94 |   cblas_saxpby(N, alpha, X, 1, beta, Y, 1);
95 | }
96 |
97 | template <>
98 | void caffe_cpu_axpby<double>(const int N, const double alpha, const double* X,
99 |     const double beta, double* Y) {
100 |   cblas_daxpby(N, alpha, X, 1, beta, Y, 1);
101 | }
102 |
103 | template <>
104 | void caffe_add<float>(const int n, const float* a, const float* b,
105 |     float* y) {
106 |   vsAdd(n, a, b, y);
107 | }
108 |
109 | template <>
110 | void caffe_add<double>(const int n, const double* a, const double* b,
111 |     double* y) {
112 |   vdAdd(n, a, b, y);
113 | }
114 |
115 | template <>
116 | void caffe_sub<float>(const int n, const float* a, const float* b,
117 |     float* y) {
118 |   vsSub(n, a, b, y);
119 | }
120 |
121 | template <>
122 | void caffe_sub<double>(const int n, const double* a, const double* b,
123 |     double* y) {
124 |   vdSub(n, a, b, y);
125 | }
126 |
127 | template <>
128 | void caffe_mul<float>(const int n, const float* a, const float* b,
129 |     float* y) {
130 |   vsMul(n, a, b, y);
131 | }
132 |
133 | template <>
134 | void caffe_mul<double>(const int n, const double* a, const double* b,
135 |     double* y) {
136 |   vdMul(n, a, b, y);
137 | }
138 |
139 | template <>
140 | void caffe_div<float>(const int n, const float* a, const float* b,
141 |     float* y) {
142 |   vsDiv(n, a, b, y);
143 | }
144 |
145 | template <>
146 | void caffe_div<double>(const int n, const double* a, const double* b,
147 |     double* y) {
148 |   vdDiv(n, a, b, y);
149 | }
150 |
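// Editor's usage sketch (not part of the original file): the GEMM wrapper
// above uses row-major BLAS semantics, C = alpha*op(A)*op(B) + beta*C with
// C of size M x N and K the shared inner dimension. The vs*/vd* element-wise
// calls are MKL VML routines, with fallbacks in util/mkl_alternate.hpp.
static void gemm_usage_sketch() {
  const float A[6] = {1, 2, 3, 4, 5, 6};  // 2x3: [[1,2,3],[4,5,6]]
  const float B[6] = {1, 0, 0, 1, 1, 1};  // 3x2: [[1,0],[0,1],[1,1]]
  float C[4];                             // 2x2 result: {4, 5, 10, 11}
  caffe_cpu_gemm<float>(CblasNoTrans, CblasNoTrans,
                        /*M=*/2, /*N=*/2, /*K=*/3, 1.f, A, B, 0.f, C);
}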
151 | template <>
152 | void caffe_powx<float>(const int n, const float* a, const float b,
153 |     float* y) {
154 |   vsPowx(n, a, b, y);
155 | }
156 |
157 | template <>
158 | void caffe_powx<double>(const int n, const double* a, const double b,
159 |     double* y) {
160 |   vdPowx(n, a, b, y);
161 | }
162 |
163 | template <>
164 | void caffe_sqr<float>(const int n, const float* a, float* y) {
165 |   vsSqr(n, a, y);
166 | }
167 |
168 | template <>
169 | void caffe_sqr<double>(const int n, const double* a, double* y) {
170 |   vdSqr(n, a, y);
171 | }
172 |
173 | template <>
174 | void caffe_sqrt<float>(const int n, const float* a, float* y) {
175 |   vsSqrt(n, a, y);
176 | }
177 |
178 | template <>
179 | void caffe_sqrt<double>(const int n, const double* a, double* y) {
180 |   vdSqrt(n, a, y);
181 | }
182 |
183 | template <>
184 | void caffe_exp<float>(const int n, const float* a, float* y) {
185 |   vsExp(n, a, y);
186 | }
187 |
188 | template <>
189 | void caffe_exp<double>(const int n, const double* a, double* y) {
190 |   vdExp(n, a, y);
191 | }
192 |
193 | template <>
194 | void caffe_log<float>(const int n, const float* a, float* y) {
195 |   vsLn(n, a, y);
196 | }
197 |
198 | template <>
199 | void caffe_log<double>(const int n, const double* a, double* y) {
200 |   vdLn(n, a, y);
201 | }
202 |
203 | template <>
204 | void caffe_abs<float>(const int n, const float* a, float* y) {
205 |   vsAbs(n, a, y);
206 | }
207 |
208 | template <>
209 | void caffe_abs<double>(const int n, const double* a, double* y) {
210 |   vdAbs(n, a, y);
211 | }
212 |
213 | template <>
214 | float caffe_cpu_strided_dot<float>(const int n, const float* x, const int incx,
215 |     const float* y, const int incy) {
216 |   return cblas_sdot(n, x, incx, y, incy);
217 | }
218 |
219 | template <>
220 | double caffe_cpu_strided_dot<double>(const int n, const double* x,
221 |     const int incx, const double* y, const int incy) {
222 |   return cblas_ddot(n, x, incx, y, incy);
223 | }
224 |
225 | template <typename Dtype>
226 | Dtype caffe_cpu_dot(const int n, const Dtype* x, const Dtype* y) {
227 |   return caffe_cpu_strided_dot(n, x, 1, y, 1);
228 | }
229 |
230 | template
231 | float caffe_cpu_dot<float>(const int n, const float* x, const float* y);
232 |
233 | template
234 | double caffe_cpu_dot<double>(const int n, const double* x, const double* y);
235 |
236 | template <>
237 | float caffe_cpu_asum<float>(const int n, const float* x) {
238 |   return cblas_sasum(n, x, 1);
239 | }
240 |
241 | template <>
242 | double caffe_cpu_asum<double>(const int n, const double* x) {
243 |   return cblas_dasum(n, x, 1);
244 | }
245 |
246 | template <>
247 | void caffe_cpu_scale<float>(const int n, const float alpha, const float *x,
248 |     float* y) {
249 |   cblas_scopy(n, x, 1, y, 1);
250 |   cblas_sscal(n, alpha, y, 1);
251 | }
252 |
253 | template <>
254 | void caffe_cpu_scale<double>(const int n, const double alpha, const double *x,
255 |     double* y) {
256 |   cblas_dcopy(n, x, 1, y, 1);
257 |   cblas_dscal(n, alpha, y, 1);
258 | }
259 | } // namespace DAUConvNet
260 |
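A quick semantics check for the copy-then-scale and dot wrappers above (editor's illustration, not from the sources); note that caffe_cpu_scale leaves its input untouched:

float x[3] = {1.f, 2.f, 3.f};
float y[3];
DAUConvNet::caffe_cpu_scale<float>(3, 2.f, x, y);      // y = {2, 4, 6}, x unchanged
float d = DAUConvNet::caffe_cpu_dot<float>(3, x, y);   // 1*2 + 2*4 + 3*6 = 28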
--------------------------------------------------------------------------------
/src/main.cpp:
--------------------------------------------------------------------------------
1 | //
2 | // Created by domen on 3/21/18.
3 | //
4 |
5 | #include <vector>
6 | #include "dau_conv/base_dau_conv_layer.hpp"
7 |
8 |
9 | int main(int argc, char** argv) {
10 |
11 |     /*
12 |     DAUConvSettings layer_param;
13 |
14 |     DAUConvLayerCaffeGPU<float> layer;
15 |     const int N = 128;
16 |     const int S = 32;
17 |     const int F = 64;
18 |     const int H = 64;
19 |     const int W = 64;
20 |
21 |     Blob<float> input(N,S,H,W);
22 |     Blob<float> output(N,F,H,W);
23 |
24 |     vector<Blob<float>*> top;
25 |     vector<bool> param_propagate_down;
26 |
27 |     DAUKernelComputeGPU<float>* dau_kernel_compute = new DAUKernelComputeGPU<float>();
28 |     DAUKernelParamsGPU<float>* dau_kernel_params = new DAUKernelParamsGPU<float>();
29 |     DAUKernelOutputGPU<float>* dau_kernel_output = new DAUKernelOutputGPU<float>();
30 |
31 |
32 |     layer.LayerSetUp(layer_param,
33 |                      dau_kernel_compute, dau_kernel_params, dau_kernel_output,
34 |                      param_propagate_down, input.shape());
35 |
36 |     layer.Reshape(input.shape(), output.shape());
37 |
38 |     layer.Forward_gpu(input.gpu_data(), input.shape(), output.mutable_gpu_data(), output.shape());
39 |
40 |     layer.Backward_gpu(output.gpu_data(), output.gpu_diff(), output.shape(), true,
41 |                        input.gpu_data(), input.mutable_gpu_diff(), input.shape());
42 |     */
43 | }
--------------------------------------------------------------------------------