├── .gitignore
├── CMakeLists.txt
├── CMakeModules
│   └── FindTBB.cmake
├── LICENSE
├── README.md
└── src
    ├── CMakeLists.txt
    ├── cuv
    │   ├── CMakeLists.txt
    │   ├── allocators.cu
    │   ├── allocators.hpp
    │   ├── cuda_general.hpp
    │   ├── memory.cu
    │   ├── memory.hpp
    │   ├── meta_programming.hpp
    │   ├── ndarray.hpp
    │   ├── reference.cu
    │   ├── reference.hpp
    │   └── tags.hpp
    └── tests
        ├── CMakeLists.txt
        ├── allocators_test.cpp
        └── ndarray_test.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
3 | *.o
4 | *.so
5 | .*.swo
6 | .*.swp
7 | .*.swn
8 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #######################################################################################
2 | # The MIT License
3 |
4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn
5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn
6 | # Copyright (c) 2008-2009 Sebastian Nowozin
7 |
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in all
16 | # copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | # SOFTWARE.
25 | ####################################################################################### 26 | cmake_minimum_required( VERSION 2.6 FATAL_ERROR ) 27 | 28 | # 29 | # If the user specifies -DCMAKE_BUILD_TYPE on the command line, take their 30 | # definition # and dump it in the cache along with proper documentation, 31 | # otherwise set CMAKE_BUILD_TYPE # to Debug prior to calling PROJECT() 32 | # 33 | IF(DEFINED CMAKE_BUILD_TYPE) 34 | SET(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.") 35 | ELSE() 36 | SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.") 37 | ENDIF() 38 | 39 | PROJECT(ndarray CXX C) 40 | SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_SOURCE_DIR}/CMakeModules ) 41 | 42 | ENABLE_TESTING() 43 | add_subdirectory(src) 44 | 45 | CUDA_BUILD_CLEAN_TARGET() 46 | -------------------------------------------------------------------------------- /CMakeModules/FindTBB.cmake: -------------------------------------------------------------------------------- 1 | # Locate Intel Threading Building Blocks include paths and libraries 2 | # FindTBB.cmake can be found at https://code.google.com/p/findtbb/ 3 | # Written by Hannes Hofmann 4 | # Improvements by Gino van den Bergen , 5 | # Florian Uhlig , 6 | # Jiri Marsik 7 | 8 | # The MIT License 9 | # 10 | # Copyright (c) 2011 Hannes Hofmann 11 | # 12 | # Permission is hereby granted, free of charge, to any person obtaining a copy 13 | # of this software and associated documentation files (the "Software"), to deal 14 | # in the Software without restriction, including without limitation the rights 15 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | # copies of the Software, and to permit persons to whom the Software is 17 | # furnished to do so, subject to the following conditions: 18 | # 19 | # The above copyright notice and this permission notice shall be included in 20 | # all copies or substantial portions of the Software. 21 | # 22 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 28 | # THE SOFTWARE. 29 | 30 | # GvdB: This module uses the environment variable TBB_ARCH_PLATFORM which defines architecture and compiler. 31 | # e.g. "ia32/vc8" or "em64t/cc4.1.0_libc2.4_kernel2.6.16.21" 32 | # TBB_ARCH_PLATFORM is set by the build script tbbvars[.bat|.sh|.csh], which can be found 33 | # in the TBB installation directory (TBB_INSTALL_DIR). 34 | # 35 | # GvdB: Mac OS X distribution places libraries directly in lib directory. 36 | # 37 | # For backwards compatibility, you may explicitely set the CMake variables TBB_ARCHITECTURE and TBB_COMPILER. 38 | # TBB_ARCHITECTURE [ ia32 | em64t | itanium ] 39 | # which architecture to use 40 | # TBB_COMPILER e.g. 
vc9 or cc3.2.3_libc2.3.2_kernel2.4.21 or cc4.0.1_os10.4.9 41 | # which compiler to use (detected automatically on Windows) 42 | 43 | # This module respects 44 | # TBB_INSTALL_DIR or $ENV{TBB21_INSTALL_DIR} or $ENV{TBB_INSTALL_DIR} 45 | 46 | # This module defines 47 | # TBB_INCLUDE_DIRS, where to find task_scheduler_init.h, etc. 48 | # TBB_LIBRARY_DIRS, where to find libtbb, libtbbmalloc 49 | # TBB_DEBUG_LIBRARY_DIRS, where to find libtbb_debug, libtbbmalloc_debug 50 | # TBB_INSTALL_DIR, the base TBB install directory 51 | # TBB_LIBRARIES, the libraries to link against to use TBB. 52 | # TBB_DEBUG_LIBRARIES, the libraries to link against to use TBB with debug symbols. 53 | # TBB_FOUND, If false, don't try to use TBB. 54 | # TBB_INTERFACE_VERSION, as defined in tbb/tbb_stddef.h 55 | 56 | 57 | if (WIN32) 58 | # has em64t/vc8 em64t/vc9 59 | # has ia32/vc7.1 ia32/vc8 ia32/vc9 60 | set(_TBB_DEFAULT_INSTALL_DIR "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB") 61 | set(_TBB_LIB_NAME "tbb") 62 | set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc") 63 | set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug") 64 | set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug") 65 | if (MSVC71) 66 | set (_TBB_COMPILER "vc7.1") 67 | endif(MSVC71) 68 | if (MSVC80) 69 | set(_TBB_COMPILER "vc8") 70 | endif(MSVC80) 71 | if (MSVC90) 72 | set(_TBB_COMPILER "vc9") 73 | endif(MSVC90) 74 | if(MSVC10) 75 | set(_TBB_COMPILER "vc10") 76 | endif(MSVC10) 77 | # Todo: add other Windows compilers such as ICL. 78 | set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE}) 79 | endif (WIN32) 80 | 81 | if (UNIX) 82 | if (APPLE) 83 | # MAC 84 | set(_TBB_DEFAULT_INSTALL_DIR "/Library/Frameworks/Intel_TBB.framework/Versions") 85 | # libs: libtbb.dylib, libtbbmalloc.dylib, *_debug 86 | set(_TBB_LIB_NAME "tbb") 87 | set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc") 88 | set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug") 89 | set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug") 90 | # default flavor on apple: ia32/cc4.0.1_os10.4.9 91 | # Jiri: There is no reason to presume there is only one flavor and 92 | # that user's setting of variables should be ignored. 
93 | if(NOT TBB_COMPILER) 94 | set(_TBB_COMPILER "cc4.0.1_os10.4.9") 95 | elseif (NOT TBB_COMPILER) 96 | set(_TBB_COMPILER ${TBB_COMPILER}) 97 | endif(NOT TBB_COMPILER) 98 | if(NOT TBB_ARCHITECTURE) 99 | set(_TBB_ARCHITECTURE "ia32") 100 | elseif(NOT TBB_ARCHITECTURE) 101 | set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE}) 102 | endif(NOT TBB_ARCHITECTURE) 103 | else (APPLE) 104 | # LINUX 105 | set(_TBB_DEFAULT_INSTALL_DIR "/opt/intel/tbb" "/usr/local/include" "/usr/include") 106 | set(_TBB_LIB_NAME "tbb") 107 | set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc") 108 | set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug") 109 | set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug") 110 | # has em64t/cc3.2.3_libc2.3.2_kernel2.4.21 em64t/cc3.3.3_libc2.3.3_kernel2.6.5 em64t/cc3.4.3_libc2.3.4_kernel2.6.9 em64t/cc4.1.0_libc2.4_kernel2.6.16.21 111 | # has ia32/* 112 | # has itanium/* 113 | set(_TBB_COMPILER ${TBB_COMPILER}) 114 | set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE}) 115 | endif (APPLE) 116 | endif (UNIX) 117 | 118 | if (CMAKE_SYSTEM MATCHES "SunOS.*") 119 | # SUN 120 | # not yet supported 121 | # has em64t/cc3.4.3_kernel5.10 122 | # has ia32/* 123 | endif (CMAKE_SYSTEM MATCHES "SunOS.*") 124 | 125 | 126 | #-- Clear the public variables 127 | set (TBB_FOUND "NO") 128 | 129 | 130 | #-- Find TBB install dir and set ${_TBB_INSTALL_DIR} and cached ${TBB_INSTALL_DIR} 131 | # first: use CMake variable TBB_INSTALL_DIR 132 | if (TBB_INSTALL_DIR) 133 | set (_TBB_INSTALL_DIR ${TBB_INSTALL_DIR}) 134 | endif (TBB_INSTALL_DIR) 135 | # second: use environment variable 136 | if (NOT _TBB_INSTALL_DIR) 137 | if (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "") 138 | set (_TBB_INSTALL_DIR $ENV{TBB_INSTALL_DIR}) 139 | endif (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "") 140 | # Intel recommends setting TBB21_INSTALL_DIR 141 | if (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "") 142 | set (_TBB_INSTALL_DIR $ENV{TBB21_INSTALL_DIR}) 143 | endif (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "") 144 | if (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "") 145 | set (_TBB_INSTALL_DIR $ENV{TBB22_INSTALL_DIR}) 146 | endif (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "") 147 | if (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "") 148 | set (_TBB_INSTALL_DIR $ENV{TBB30_INSTALL_DIR}) 149 | endif (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "") 150 | endif (NOT _TBB_INSTALL_DIR) 151 | # third: try to find path automatically 152 | if (NOT _TBB_INSTALL_DIR) 153 | if (_TBB_DEFAULT_INSTALL_DIR) 154 | set (_TBB_INSTALL_DIR ${_TBB_DEFAULT_INSTALL_DIR}) 155 | endif (_TBB_DEFAULT_INSTALL_DIR) 156 | endif (NOT _TBB_INSTALL_DIR) 157 | # sanity check 158 | if (NOT _TBB_INSTALL_DIR) 159 | message ("ERROR: Unable to find Intel TBB install directory. ${_TBB_INSTALL_DIR}") 160 | else (NOT _TBB_INSTALL_DIR) 161 | # finally: set the cached CMake variable TBB_INSTALL_DIR 162 | if (NOT TBB_INSTALL_DIR) 163 | set (TBB_INSTALL_DIR ${_TBB_INSTALL_DIR} CACHE PATH "Intel TBB install directory") 164 | mark_as_advanced(TBB_INSTALL_DIR) 165 | endif (NOT TBB_INSTALL_DIR) 166 | 167 | 168 | #-- A macro to rewrite the paths of the library. 
This is necessary, because 169 | # find_library() always found the em64t/vc9 version of the TBB libs 170 | macro(TBB_CORRECT_LIB_DIR var_name) 171 | # if (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t") 172 | string(REPLACE em64t "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}}) 173 | # endif (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t") 174 | string(REPLACE ia32 "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}}) 175 | string(REPLACE vc7.1 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 176 | string(REPLACE vc8 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 177 | string(REPLACE vc9 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 178 | string(REPLACE vc10 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 179 | endmacro(TBB_CORRECT_LIB_DIR var_content) 180 | 181 | 182 | #-- Look for include directory and set ${TBB_INCLUDE_DIR} 183 | set (TBB_INC_SEARCH_DIR ${_TBB_INSTALL_DIR}/include) 184 | # Jiri: tbbvars now sets the CPATH environment variable to the directory 185 | # containing the headers. 186 | find_path(TBB_INCLUDE_DIR 187 | tbb/task_scheduler_init.h 188 | PATHS ${TBB_INC_SEARCH_DIR} ENV CPATH 189 | ) 190 | mark_as_advanced(TBB_INCLUDE_DIR) 191 | 192 | 193 | #-- Look for libraries 194 | # GvdB: $ENV{TBB_ARCH_PLATFORM} is set by the build script tbbvars[.bat|.sh|.csh] 195 | if (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL "") 196 | set (_TBB_LIBRARY_DIR 197 | ${_TBB_INSTALL_DIR}/lib/$ENV{TBB_ARCH_PLATFORM} 198 | ${_TBB_INSTALL_DIR}/$ENV{TBB_ARCH_PLATFORM}/lib 199 | ) 200 | endif (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL "") 201 | # Jiri: This block isn't mutually exclusive with the previous one 202 | # (hence no else), instead I test if the user really specified 203 | # the variables in question. 204 | if ((NOT ${TBB_ARCHITECTURE} STREQUAL "") AND (NOT ${TBB_COMPILER} STREQUAL "")) 205 | # HH: deprecated 206 | message(STATUS "[Warning] FindTBB.cmake: The use of TBB_ARCHITECTURE and TBB_COMPILER is deprecated and may not be supported in future versions. Please set \$ENV{TBB_ARCH_PLATFORM} (using tbbvars.[bat|csh|sh]).") 207 | # Jiri: It doesn't hurt to look in more places, so I store the hints from 208 | # ENV{TBB_ARCH_PLATFORM} and the TBB_ARCHITECTURE and TBB_COMPILER 209 | # variables and search them both. 210 | set (_TBB_LIBRARY_DIR "${_TBB_INSTALL_DIR}/${_TBB_ARCHITECTURE}/${_TBB_COMPILER}/lib" ${_TBB_LIBRARY_DIR}) 211 | endif ((NOT ${TBB_ARCHITECTURE} STREQUAL "") AND (NOT ${TBB_COMPILER} STREQUAL "")) 212 | 213 | # GvdB: Mac OS X distribution places libraries directly in lib directory. 214 | list(APPEND _TBB_LIBRARY_DIR ${_TBB_INSTALL_DIR}/lib) 215 | 216 | # Jiri: No reason not to check the default paths. From recent versions, 217 | # tbbvars has started exporting the LIBRARY_PATH and LD_LIBRARY_PATH 218 | # variables, which now point to the directories of the lib files. 219 | # It all makes more sense to use the ${_TBB_LIBRARY_DIR} as a HINTS 220 | # argument instead of the implicit PATHS as it isn't hard-coded 221 | # but computed by system introspection. Searching the LIBRARY_PATH 222 | # and LD_LIBRARY_PATH environment variables is now even more important 223 | # that tbbvars doesn't export TBB_ARCH_PLATFORM and it facilitates 224 | # the use of TBB built from sources. 
225 | find_library(TBB_LIBRARY ${_TBB_LIB_NAME} HINTS ${_TBB_LIBRARY_DIR} 226 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 227 | find_library(TBB_MALLOC_LIBRARY ${_TBB_LIB_MALLOC_NAME} HINTS ${_TBB_LIBRARY_DIR} 228 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 229 | 230 | #Extract path from TBB_LIBRARY name 231 | get_filename_component(TBB_LIBRARY_DIR ${TBB_LIBRARY} PATH) 232 | 233 | #TBB_CORRECT_LIB_DIR(TBB_LIBRARY) 234 | #TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY) 235 | mark_as_advanced(TBB_LIBRARY TBB_MALLOC_LIBRARY) 236 | 237 | #-- Look for debug libraries 238 | # Jiri: Changed the same way as for the release libraries. 239 | find_library(TBB_LIBRARY_DEBUG ${_TBB_LIB_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR} 240 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 241 | find_library(TBB_MALLOC_LIBRARY_DEBUG ${_TBB_LIB_MALLOC_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR} 242 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 243 | 244 | # Jiri: Self-built TBB stores the debug libraries in a separate directory. 245 | # Extract path from TBB_LIBRARY_DEBUG name 246 | get_filename_component(TBB_LIBRARY_DEBUG_DIR ${TBB_LIBRARY_DEBUG} PATH) 247 | 248 | #TBB_CORRECT_LIB_DIR(TBB_LIBRARY_DEBUG) 249 | #TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY_DEBUG) 250 | mark_as_advanced(TBB_LIBRARY_DEBUG TBB_MALLOC_LIBRARY_DEBUG) 251 | 252 | 253 | if (TBB_INCLUDE_DIR) 254 | if (TBB_LIBRARY) 255 | set (TBB_FOUND "YES") 256 | set (TBB_LIBRARIES ${TBB_LIBRARY} ${TBB_MALLOC_LIBRARY} ${TBB_LIBRARIES}) 257 | set (TBB_DEBUG_LIBRARIES ${TBB_LIBRARY_DEBUG} ${TBB_MALLOC_LIBRARY_DEBUG} ${TBB_DEBUG_LIBRARIES}) 258 | set (TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR} CACHE PATH "TBB include directory" FORCE) 259 | set (TBB_LIBRARY_DIRS ${TBB_LIBRARY_DIR} CACHE PATH "TBB library directory" FORCE) 260 | # Jiri: Self-built TBB stores the debug libraries in a separate directory. 261 | set (TBB_DEBUG_LIBRARY_DIRS ${TBB_LIBRARY_DEBUG_DIR} CACHE PATH "TBB debug library directory" FORCE) 262 | mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARY_DIRS TBB_DEBUG_LIBRARY_DIRS TBB_LIBRARIES TBB_DEBUG_LIBRARIES) 263 | message(STATUS "Found Intel TBB") 264 | endif (TBB_LIBRARY) 265 | endif (TBB_INCLUDE_DIR) 266 | 267 | if (NOT TBB_FOUND) 268 | message("ERROR: Intel TBB NOT found!") 269 | message(STATUS "Looked for Threading Building Blocks in ${_TBB_INSTALL_DIR}") 270 | # do only throw fatal, if this pkg is REQUIRED 271 | if (TBB_FIND_REQUIRED) 272 | message(FATAL_ERROR "Could NOT find TBB library.") 273 | endif (TBB_FIND_REQUIRED) 274 | endif (NOT TBB_FOUND) 275 | 276 | endif (NOT _TBB_INSTALL_DIR) 277 | 278 | if (TBB_FOUND) 279 | set(TBB_INTERFACE_VERSION 0) 280 | FILE(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _TBB_VERSION_CONTENTS) 281 | STRING(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_TBB_VERSION_CONTENTS}") 282 | set(TBB_INTERFACE_VERSION "${TBB_INTERFACE_VERSION}") 283 | endif (TBB_FOUND) 284 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, University of Bonn, Institute for Computer Science VI 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 | * Neither the name of the University of Bonn
13 | nor the names of its contributors may be used to endorse or promote
14 | products derived from this software without specific prior written
15 | permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ndarray Documentation
2 | =====================
3 |
4 | Summary
5 | -------
6 |
7 | ndarray is a C++ template library for n-dimensional arrays on CPU and GPU using NVIDIA CUDA™. It is extracted from the [CUV library][cuv].
8 |
9 | Features
10 | --------
11 |
12 | ### Supported Platforms ###
13 |
14 | - This library was only tested on Ubuntu Karmic, Lucid and Maverick. It uses
15 | mostly standard components and should run without major
16 | modification on any current Linux system.
17 |
18 | ### Supported GPUs ###
19 |
20 | - By default, code is generated for the lowest compute architecture. We
21 | recommend you change this to match your hardware. Using ccmake you can set
22 | the build variable "CUDA_ARCHITECTURE" for example to -arch=compute_20
23 | - All GT 9800 and GTX 280 and above
24 | - GT 9200 without convolutions. It might need some minor modifications to
25 | make the rest work. If you want to use that card and have problems, just
26 | get in contact.
27 | - On 8800GTS, random numbers and convolutions won't work.
28 |
29 |
30 | Installation
31 | ------------
32 |
33 | ### Dependencies ###
34 |
35 | To build the C++ lib, you will need:
36 |
37 | - cmake (and cmake-curses-gui for easy configuration)
38 | - libboost-dev >= 1.37
39 | - NVIDIA CUDA (tm), including SDK. We support versions 3.X, 4.X and 5.X
40 | - [thrust library][thrust] - included in CUDA since 4.0
41 |
42 |
43 | ### Building a debug version ###
44 |
45 | ```bash
46 | mkdir -p build/debug
47 | cd build/debug
48 | cmake -DCMAKE_BUILD_TYPE=Debug ../../
49 | ccmake . # adjust paths to your system (cuda, thrust, ...)!
50 | make -j
51 | ctest # run tests to see if it went well
52 | sudo make install
53 | ```
54 |
55 | ### Building a release version ###
56 |
57 | ```bash
58 | mkdir -p build/release
59 | cd build/release
60 | cmake -DCMAKE_BUILD_TYPE=Release ../../
61 | ccmake . # adjust paths to your system (cuda, thrust, ...)!
62 | make -j
63 | ctest # run tests to see if it went well
64 | sudo make install
65 | ```
66 |
67 | Usage
68 | -----
69 |
70 | ### Example ###
71 |
72 | ```c++
73 | #include <cuv/ndarray.hpp>
74 |
75 | int main(void) {
76 |
77 | // allocate a 10×20 array of ints in row-major order on host (CPU)
78 | cuv::ndarray<int, cuv::host_memory_space> a_host(10, 20);
79 |
80 | assert(a_host.ndim() == 2); // a_host is a two-dimensional array
81 | assert(a_host.size() == 10 * 20);
82 |
83 | // initialize the array
84 | int x = 0;
85 | for(int i=0; i < a_host.shape(0); i++) { // shape(0) == 10
86 | for(int j=0; j < a_host.shape(1); j++) { // shape(1) == 20
87 | a_host(i, j) = x++;
88 | }
89 | }
90 |
91 | // reshape to a 20×10 array
92 | a_host.reshape(20, 10);
93 | assert(a_host.shape(0) == 20);
94 | assert(a_host.shape(1) == 10);
95 |
96 | // copy the array to the GPU
97 | cuv::ndarray<int, cuv::dev_memory_space> a_device = a_host;
98 |
99 | // get the pointer to global device memory
100 | int* device_ptr = a_device.ptr();
101 |
102 | return 0;
103 | }
104 | ```
105 |
106 | [thrust]: http://code.google.com/p/thrust/
107 | [cuv]: https://github.com/deeplearningais/CUV
108 |
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #######################################################################################
2 | # The MIT License
3 |
4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn
5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn
6 | # Copyright (c) 2008-2009 Sebastian Nowozin
7 |
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in all
16 | # copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | # SOFTWARE.
25 | ####################################################################################### 26 | cmake_minimum_required( VERSION 2.6 FATAL_ERROR ) 27 | 28 | FIND_PACKAGE(CUDA) 29 | 30 | if ( NOT CUDA_ARCHITECTURE ) 31 | SET( CUDA_ARCHITECTURE -gencode;arch=compute_13,code=sm_13;-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35 ) 32 | endif() 33 | 34 | # ---------- Find Boost Headers/Libraries ----------------------- 35 | SET (Boost_FIND_REQUIRED TRUE) 36 | SET (Boost_FIND_QUIETLY TRUE) 37 | SET (Boost_USE_MULTITHREADED TRUE) 38 | SET (Boost_USE_STATIC_LIBS FALSE) 39 | SET (Boost_ADDITIONAL_VERSIONS "1.39" "1.39.0" "1.40" "1.42" "1.43" "1.44" "1.45" "1.46" "1.47" "1.48") 40 | FIND_PACKAGE( Boost 1.37 COMPONENTS unit_test_framework serialization system REQUIRED ) 41 | INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) 42 | LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) 43 | 44 | FIND_PATH(THRUST_PATH thrust/device_vector.h /usr/include /usr/local/include ${CUDA_INCLUDE_DIRS} "$ENV{THRUST_ROOT}") 45 | IF(NOT THRUST_PATH) 46 | MESSAGE(FATAL_ERROR "Could not find the thrust library. Please install in standard locations or set THRUST_ROOT environment variable.") 47 | ENDIF(NOT THRUST_PATH) 48 | 49 | SET(CUDA_ARCHITECTURE "" CACHE STRING "The CUDA architecture to compile for, i.e. -arch=sm_20") 50 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_ARCHITECTURE}") 51 | MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") 52 | 53 | CUDA_INCLUDE_DIRECTORIES( ${THRUST_PATH} ) 54 | INCLUDE_DIRECTORIES( ${THRUST_PATH} ) 55 | 56 | add_subdirectory(cuv) 57 | add_subdirectory(tests) 58 | -------------------------------------------------------------------------------- /src/cuv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # The MIT License 3 | 4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2008-2009 Sebastian Nowozin 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | CUDA_ADD_LIBRARY("ndarray${LIB_SUFFIX}" SHARED allocators.cu memory.cu reference.cu) 27 | 28 | install(TARGETS "ndarray${LIB_SUFFIX}" 29 | RUNTIME DESTINATION bin 30 | LIBRARY DESTINATION lib 31 | ARCHIVE DESTINATION lib/static ) 32 | 33 | INSTALL(FILES ndarray.hpp tags.hpp allocators.hpp cuda_general.hpp memory.hpp meta_programming.hpp reference.hpp 34 | DESTINATION "include/cuv" 35 | ) 36 | -------------------------------------------------------------------------------- /src/cuv/allocators.cu: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #include "allocators.hpp" 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "cuda_general.hpp" 38 | 39 | namespace cuv { 40 | 41 | void default_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space) { 42 | assert(*ptr == 0); 43 | *ptr = malloc(memsize * valueSize); 44 | assert(*ptr); 45 | } 46 | 47 | void default_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space) { 48 | assert(*ptr == 0); 49 | cuvSafeCall(cudaMalloc(ptr, memsize * valueSize)); 50 | assert(*ptr); 51 | } 52 | 53 | void default_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 54 | host_memory_space m) { 55 | pitch = width * valueSize; 56 | alloc(ptr, height * width, valueSize, m); 57 | assert(*ptr); 58 | } 59 | 60 | void default_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 61 | dev_memory_space) { 62 | cuvSafeCall(cudaMallocPitch(ptr, &pitch, valueSize * width, height)); 63 | assert(*ptr); 64 | } 65 | 66 | void default_allocator::dealloc(void** ptr, host_memory_space) { 67 | assert(*ptr != 0); 68 | free(*ptr); 69 | *ptr = 0; 70 | } 71 | 72 | void default_allocator::dealloc(void** ptr, dev_memory_space) { 73 | assert(*ptr != 0); 74 | cuvSafeCall(cudaFree(*ptr)); 75 | *ptr = 0; 76 | } 77 | 78 | void cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space) { 79 | assert(*ptr == 0); 80 | cuvSafeCall(cudaMallocHost(ptr, memsize * valueSize)); 81 | assert(*ptr != 0); 82 | } 83 | 84 | void cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space m) { 85 | default_allocator::alloc(ptr, memsize, valueSize, m); 86 | } 87 | 88 | void cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 89 | host_memory_space m) { 90 | pitch = width * valueSize; 91 | alloc(ptr, height * width, valueSize, m); 92 | } 93 | 94 | void cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 95 | dev_memory_space) { 96 | cuvSafeCall(cudaMallocPitch(ptr, &pitch, valueSize * width, height)); 97 | } 98 | 99 | void cuda_allocator::dealloc(void** ptr, host_memory_space) { 100 | assert(*ptr != 0); 101 | cuvSafeCall(cudaFreeHost(*ptr)); 102 | *ptr = 0; 103 | } 104 | 105 | void cuda_allocator::dealloc(void** ptr, dev_memory_space m) { 106 | default_allocator::dealloc(ptr, m); 107 | } 108 | 109 | template 110 | void pooled_cuda_allocator::collect_garbage(memory_space m) { 111 | 112 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 113 | std::map& pool = get_pool(m); 114 | std::map& pool_sizes = get_pool_sizes(m); 115 | 116 | std::vector to_delete; 117 | std::map::iterator it; 118 | for (it = pool.begin(); it != pool.end(); it++) { 119 | if (it->second) { 120 | to_delete.push_back(it->first); 121 | } 122 | } 123 | 124 | for (size_t i = 0; i < to_delete.size(); i++) { 125 | void* ptr = to_delete[i]; 126 | pool.erase(ptr); 127 | pool_sizes.erase(ptr); 128 | cuda_alloc.dealloc(&ptr, m); 129 | } 130 | 131 | assert(pool_free_count(m) == 0); 132 | 133 | CUV_LOG_DEBUG("garbage collection in memory pool " << m_name << " (" << memtype(m) << 134 | "): removed " << to_delete.size() << " elements"); 135 | } 136 | 137 | template<> 138 | boost::recursive_mutex& 
pooled_cuda_allocator::get_pool_mutex(dev_memory_space) const { 139 | // locking/unlocking a mutex does not violate constness of this object 140 | // unfortunately, the design of the scoped_lock and mutex class requires this hack of a const_cast 141 | return *(const_cast(&m_dev_pool_mutex)); 142 | } 143 | 144 | template<> 145 | boost::recursive_mutex& pooled_cuda_allocator::get_pool_mutex(host_memory_space) const { 146 | // locking/unlocking a mutex does not violate constness of this object 147 | // unfortunately, the design of the scoped_lock and mutex class requires this hack of a const_cast 148 | return *(const_cast(&m_host_pool_mutex)); 149 | } 150 | 151 | template<> 152 | std::map& pooled_cuda_allocator::get_pool(dev_memory_space) { 153 | return m_dev_pool; 154 | } 155 | 156 | template<> 157 | std::map& pooled_cuda_allocator::get_pool(host_memory_space) { 158 | return m_host_pool; 159 | } 160 | 161 | template<> 162 | const std::map& pooled_cuda_allocator::get_pool(dev_memory_space) const { 163 | return m_dev_pool; 164 | } 165 | 166 | template<> 167 | const std::map& pooled_cuda_allocator::get_pool(host_memory_space) const { 168 | return m_host_pool; 169 | } 170 | 171 | template<> 172 | std::map& pooled_cuda_allocator::get_pool_sizes(dev_memory_space) { 173 | return m_dev_pool_sizes; 174 | } 175 | 176 | template<> 177 | std::map& pooled_cuda_allocator::get_pool_sizes(host_memory_space) { 178 | return m_host_pool_sizes; 179 | } 180 | 181 | template<> 182 | const std::map& pooled_cuda_allocator::get_pool_sizes(dev_memory_space) const { 183 | return m_dev_pool_sizes; 184 | } 185 | 186 | template<> 187 | const std::map& pooled_cuda_allocator::get_pool_sizes(host_memory_space) const { 188 | return m_host_pool_sizes; 189 | } 190 | 191 | template 192 | void pooled_cuda_allocator::delete_pool(memory_space m) { 193 | 194 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 195 | std::map& pool = get_pool(m); 196 | std::map& pool_sizes = get_pool_sizes(m); 197 | 198 | #ifndef NDEBUG 199 | size_t free_count = pool_free_count(m); 200 | size_t count = pool_count(m); 201 | if (free_count != count) { 202 | throw std::runtime_error( 203 | (boost::format("detected potential memory leak in memory pool '%s' (%s): free: %d, count: %d") 204 | % m_name % memtype(m) % free_count % count).str()); 205 | } 206 | #endif 207 | 208 | std::map::iterator it; 209 | for (it = pool.begin(); it != pool.end(); it++) { 210 | if (!it->second) { 211 | throw std::runtime_error( 212 | "misuse of allocator. memory was not deallocated before allocator is destroyed. 
this is a programming failure."); 213 | } 214 | void* ptr = it->first; 215 | cuda_alloc.dealloc(&ptr, m); 216 | } 217 | pool.clear(); 218 | pool_sizes.clear(); 219 | 220 | CUV_LOG_DEBUG("deleted memory pool " << m_name << " (" << memtype(m) << ")"); 221 | } 222 | 223 | pooled_cuda_allocator::pooled_cuda_allocator(const std::string& _name) : 224 | m_name(_name), 225 | m_dev_pool_mutex(), m_host_pool_mutex(), 226 | m_dev_pool(), m_dev_pool_sizes(), 227 | m_host_pool(), m_host_pool_sizes() { 228 | if (m_name.empty()) { 229 | std::ostringstream o; 230 | o << this; 231 | m_name = o.str(); 232 | } 233 | } 234 | 235 | pooled_cuda_allocator::~pooled_cuda_allocator() { 236 | delete_pool(dev_memory_space()); 237 | delete_pool(host_memory_space()); 238 | } 239 | 240 | template 241 | size_t pooled_cuda_allocator::pool_size(memory_space m) const { 242 | size_t sum = 0; 243 | 244 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 245 | const std::map& pool_sizes = get_pool_sizes(m); 246 | 247 | std::map::const_iterator it; 248 | for (it = pool_sizes.begin(); it != pool_sizes.end(); it++) { 249 | sum += it->second; 250 | } 251 | return sum; 252 | } 253 | 254 | template 255 | size_t pooled_cuda_allocator::pool_count(memory_space m) const { 256 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 257 | return get_pool_sizes(m).size(); 258 | } 259 | 260 | template 261 | size_t pooled_cuda_allocator::pool_free_count(memory_space m) const { 262 | size_t free = 0; 263 | 264 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 265 | const std::map& pool = get_pool(m); 266 | 267 | std::map::const_iterator it; 268 | for (it = pool.begin(); it != pool.end(); it++) { 269 | if (it->second) { 270 | free++; 271 | } 272 | } 273 | return free; 274 | } 275 | 276 | size_t pooled_cuda_allocator::pool_free_count() const { 277 | return pool_free_count(dev_memory_space()) + pool_free_count(host_memory_space()); 278 | } 279 | 280 | size_t pooled_cuda_allocator::pool_size() const { 281 | return pool_size(dev_memory_space()) + pool_size(host_memory_space()); 282 | } 283 | 284 | size_t pooled_cuda_allocator::pool_count() const { 285 | return pool_count(dev_memory_space()) + pool_count(host_memory_space()); 286 | } 287 | 288 | void pooled_cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space m) { 289 | if (memsize * valueSize < MIN_SIZE_DEV) { 290 | default_alloc.alloc(ptr, memsize, valueSize, m); 291 | } else { 292 | alloc_pooled(ptr, memsize, valueSize, m); 293 | } 294 | } 295 | 296 | template 297 | void pooled_cuda_allocator::alloc_pooled(void** ptr, size_t memsize, size_t valueSize, memory_space m) { 298 | 299 | assert(memsize > 0); 300 | 301 | // try to find memory in the pool that is available and large enough but not too large 302 | size_t bestSize = 0; 303 | void* bestPtr = 0; 304 | 305 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 306 | std::map& pool = get_pool(m); 307 | std::map& pool_sizes = get_pool_sizes(m); 308 | 309 | std::map::iterator it; 310 | { 311 | for (it = pool.begin(); it != pool.end(); it++) { 312 | // available? 313 | if (!it->second) { 314 | continue; 315 | } 316 | 317 | size_t size = pool_sizes[it->first]; 318 | // large enough? 
319 | if (size > memsize * valueSize) { 320 | if (bestPtr == 0 || size < bestSize) { 321 | bestPtr = it->first; 322 | bestSize = size; 323 | } 324 | } 325 | // can’t get better 326 | else if (size == memsize * valueSize) { 327 | bestPtr = it->first; 328 | bestSize = size; 329 | break; 330 | } 331 | } 332 | 333 | if (bestPtr) { 334 | // we take it 335 | assert(pool[bestPtr]); 336 | pool[bestPtr] = false; 337 | *ptr = bestPtr; 338 | 339 | CUV_LOG_DEBUG("reusing " << memsize * valueSize << "/" << pool_sizes[bestPtr] << " bytes in pool " 340 | << m_name << " (" << memtype(m) << ")"); 341 | 342 | return; 343 | } 344 | } 345 | 346 | CUV_LOG_DEBUG("allocating " << memsize << "x" << valueSize << " bytes in pool " << m_name << 347 | " (" << memtype(m) << ")"); 348 | 349 | // nothing found? 350 | // allocate new memory 351 | cuda_alloc.alloc(ptr, memsize, valueSize, m); 352 | 353 | pool[*ptr] = false; 354 | pool_sizes[*ptr] = memsize * valueSize; 355 | 356 | CUV_LOG_DEBUG("allocated in pool " << m_name << " (" << memtype(m) << 357 | "). total bytes: " << pool_size(m) << ". count: " << pool_count(m) << ". free: " 358 | << pool_free_count(m)); 359 | 360 | assert(!pool.empty()); 361 | } 362 | 363 | void pooled_cuda_allocator::dealloc(void** ptr, dev_memory_space m) { 364 | do_dealloc(ptr, m); 365 | } 366 | 367 | void pooled_cuda_allocator::dealloc(void** ptr, host_memory_space m) { 368 | do_dealloc(ptr, m); 369 | } 370 | 371 | template 372 | void pooled_cuda_allocator::do_dealloc(void** ptr, memory_space m) { 373 | 374 | assert(*ptr); 375 | 376 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 377 | std::map& pool = get_pool(m); 378 | 379 | std::map::iterator it = pool.find(*ptr); 380 | if (it == pool.end()) { 381 | default_alloc.dealloc(ptr, m); 382 | return; 383 | } 384 | 385 | // mark the memory as available 386 | assert(it->second == false); 387 | it->second = true; 388 | 389 | #ifndef NDEBUG 390 | std::map& pool_sizes = get_pool_sizes(m); 391 | 392 | assert(pool_sizes[*ptr] > 0); 393 | 394 | CUV_LOG_DEBUG( 395 | "released " << pool_sizes[*ptr] << " bytes in pool " << m_name << " (" 396 | << memtype(m) << "). total bytes: " << pool_size(m) << ". 
count: " << pool_count(m) <<", free: " << pool_free_count(m)); 397 | #endif 398 | 399 | *ptr = 0; 400 | } 401 | 402 | void pooled_cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space m) { 403 | if (memsize * valueSize < MIN_SIZE_HOST) { 404 | default_alloc.alloc(ptr, memsize, valueSize, m); 405 | } else { 406 | alloc_pooled(ptr, memsize, valueSize, m); 407 | } 408 | } 409 | 410 | void pooled_cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 411 | host_memory_space m) { 412 | // not yet pooled 413 | default_alloc.alloc2d(ptr, pitch, height, width, valueSize, m); 414 | } 415 | 416 | void pooled_cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 417 | dev_memory_space m) { 418 | // not yet pooled 419 | default_alloc.alloc2d(ptr, pitch, height, width, valueSize, m); 420 | } 421 | 422 | } 423 | 424 | #define CUV_POOLED_CUDA_ALLOCATOR_INST(X) \ 425 | template size_t cuv::pooled_cuda_allocator::pool_count(X) const; \ 426 | template size_t cuv::pooled_cuda_allocator::pool_free_count(X) const; \ 427 | template size_t cuv::pooled_cuda_allocator::pool_size(X) const; 428 | 429 | CUV_POOLED_CUDA_ALLOCATOR_INST(cuv::dev_memory_space); 430 | CUV_POOLED_CUDA_ALLOCATOR_INST(cuv::host_memory_space); 431 | -------------------------------------------------------------------------------- /src/cuv/allocators.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_ALLOCATORS_HPP__ 28 | #define __CUV_ALLOCATORS_HPP__ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #ifdef DEBUG_POOLING 37 | #include 38 | #define CUV_LOG_DEBUG(X) std::cout << X << std::endl; 39 | #else 40 | #define CUV_LOG_DEBUG(X) 41 | #endif 42 | 43 | #include "tags.hpp" 44 | #include "meta_programming.hpp" 45 | 46 | namespace cuv { 47 | 48 | class allocator { 49 | 50 | public: 51 | 52 | virtual ~allocator() { 53 | } 54 | 55 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space) = 0; 56 | 57 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space) = 0; 58 | 59 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 60 | host_memory_space) = 0; 61 | 62 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 63 | dev_memory_space) = 0; 64 | 65 | virtual void dealloc(void** ptr, host_memory_space) = 0; 66 | 67 | virtual void dealloc(void** ptr, dev_memory_space) = 0; 68 | 69 | }; 70 | 71 | /** 72 | * Allocator allows allocation, deallocation and copying depending on memory_space_type 73 | * 74 | * \ingroup tools 75 | */ 76 | class default_allocator: public allocator { 77 | 78 | public: 79 | 80 | virtual ~default_allocator() { 81 | } 82 | 83 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space); 84 | 85 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space); 86 | 87 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 88 | host_memory_space); 89 | 90 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 91 | dev_memory_space); 92 | 93 | virtual void dealloc(void** ptr, host_memory_space); 94 | 95 | virtual void dealloc(void** ptr, dev_memory_space); 96 | 97 | }; 98 | 99 | /** 100 | * @brief allocator that uses cudaMallocHost for allocations in host_memory_space 101 | */ 102 | class cuda_allocator: public default_allocator { 103 | 104 | public: 105 | 106 | virtual ~cuda_allocator() { 107 | } 108 | 109 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space); 110 | 111 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space); 112 | 113 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 114 | host_memory_space); 115 | 116 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 117 | dev_memory_space); 118 | 119 | virtual void dealloc(void** ptr, host_memory_space); 120 | 121 | virtual void dealloc(void** ptr, dev_memory_space); 122 | 123 | } 124 | ; 125 | 126 | /** 127 | * @brief allocator that naively pools device and host memory 128 | */ 129 | class pooled_cuda_allocator: public allocator { 130 | private: 131 | 132 | std::string m_name; 133 | 134 | boost::recursive_mutex m_dev_pool_mutex; 135 | boost::recursive_mutex m_host_pool_mutex; 136 | 137 | // maps pointers to flag: true means memory is available. 
false means: currently in use 138 | std::map m_dev_pool; 139 | std::map m_dev_pool_sizes; 140 | 141 | std::map m_host_pool; 142 | std::map m_host_pool_sizes; 143 | 144 | default_allocator default_alloc; 145 | cuda_allocator cuda_alloc; 146 | 147 | pooled_cuda_allocator(const pooled_cuda_allocator& o); 148 | pooled_cuda_allocator& operator=(const pooled_cuda_allocator& o); 149 | 150 | // for logging 151 | std::string memtype(host_memory_space) const { 152 | return "host space"; 153 | } 154 | 155 | // for logging 156 | std::string memtype(dev_memory_space) const { 157 | return "dev space"; 158 | } 159 | 160 | template 161 | boost::recursive_mutex& get_pool_mutex(memory_space m) const; 162 | 163 | template 164 | std::map& get_pool(memory_space m); 165 | 166 | template 167 | const std::map& get_pool(memory_space m) const; 168 | 169 | template 170 | std::map& get_pool_sizes(memory_space m); 171 | 172 | template 173 | const std::map& get_pool_sizes(memory_space m) const; 174 | 175 | template 176 | void collect_garbage(memory_space m); 177 | 178 | template 179 | void alloc_pooled(void** ptr, size_t memsize, size_t valueSize, memory_space m); 180 | 181 | template 182 | void delete_pool(memory_space); 183 | 184 | template 185 | void do_dealloc(void** ptr, memory_space m); 186 | 187 | public: 188 | static const size_t MIN_SIZE_HOST = 8192; 189 | static const size_t MIN_SIZE_DEV = 1; 190 | explicit pooled_cuda_allocator(const std::string& _name = ""); 191 | 192 | virtual ~pooled_cuda_allocator(); 193 | 194 | virtual void garbage_collection() { 195 | collect_garbage(host_memory_space()); 196 | collect_garbage(dev_memory_space()); 197 | } 198 | 199 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space); 200 | 201 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space); 202 | 203 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 204 | host_memory_space); 205 | 206 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 207 | dev_memory_space); 208 | 209 | virtual void dealloc(void** ptr, host_memory_space); 210 | 211 | virtual void dealloc(void** ptr, dev_memory_space); 212 | 213 | template 214 | size_t pool_free_count(memory_space m) const; 215 | 216 | template 217 | size_t pool_size(memory_space m) const; 218 | 219 | template 220 | size_t pool_count(memory_space m) const; 221 | 222 | size_t pool_free_count() const; 223 | 224 | size_t pool_size() const; 225 | 226 | size_t pool_count() const; 227 | 228 | }; 229 | 230 | } 231 | 232 | #endif 233 | -------------------------------------------------------------------------------- /src/cuv/cuda_general.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 
15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_GENERAL_HPP__ 28 | #define __CUV_GENERAL_HPP__ 29 | 30 | #include 31 | #include 32 | 33 | #ifndef CUDA_TEST_DEVICE 34 | # define CUDA_TEST_DEVICE 0 35 | #endif 36 | 37 | namespace cuv { 38 | 39 | /** check whether cuda thinks there was an error and fail with msg, if this is the case 40 | * @ingroup tools 41 | */ 42 | static inline void checkCudaError(const char *msg) { 43 | cudaError_t err = cudaGetLastError(); 44 | if (cudaSuccess != err) { 45 | throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(err)); 46 | } 47 | } 48 | 49 | // use this macro to make sure no error occurs when cuda functions are called 50 | #ifdef NDEBUG 51 | # define cuvSafeCall(X) \ 52 | if(strcmp(#X,"cudaThreadSynchronize()")!=0){ X; cuv::checkCudaError(#X); } 53 | #else 54 | # define cuvSafeCall(X) X; cuv::checkCudaError(#X); 55 | #endif 56 | 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /src/cuv/memory.cu: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
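checkCudaError and the cuvSafeCall macro in src/cuv/cuda_general.hpp above turn a failing CUDA runtime call into a std::runtime_error carrying the stringified call plus the cudaGetErrorString message (with NDEBUG defined, the macro additionally drops explicit cudaThreadSynchronize() calls). A small sketch of a typical call site, again assuming an installed include/cuv prefix; it is an illustration, not code from the library:

```c++
#include <iostream>
#include <stdexcept>
#include <cuda_runtime.h>
#include <cuv/cuda_general.hpp>   // assumed install location of cuda_general.hpp

int main() {
    try {
        void* d_buf = 0;
        // each macro invocation runs the call, then checks cudaGetLastError()
        cuvSafeCall(cudaMalloc(&d_buf, 1024));
        cuvSafeCall(cudaMemset(d_buf, 0, 1024));
        cuvSafeCall(cudaFree(d_buf));
    } catch (const std::runtime_error& e) {
        // e.what() names the failing call and the CUDA error string
        std::cerr << "CUDA error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
```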
25 | ####################################################################################### 26 | #endif 27 | #include "memory.hpp" 28 | 29 | #include 30 | 31 | #include "cuda_general.hpp" 32 | #include "meta_programming.hpp" 33 | 34 | namespace cuv { 35 | 36 | namespace detail { 37 | 38 | template 39 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, host_memory_space, 40 | cudaStream_t stream) { 41 | memcpy(dst, src, size * sizeof(value_type)); 42 | } 43 | 44 | template 45 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, host_memory_space, 46 | cudaStream_t stream) { 47 | for (size_t i = 0; i < size; i++) 48 | dst[i] = static_cast(src[i]); 49 | } 50 | 51 | template 52 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, dev_memory_space, 53 | cudaStream_t stream) { 54 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToHost, stream)); 55 | if (stream == 0) { 56 | cuvSafeCall(cudaStreamSynchronize(stream)); 57 | } 58 | } 59 | template 60 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, dev_memory_space, 61 | cudaStream_t stream) { 62 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToDevice, stream)); 63 | if (stream == 0) { 64 | cuvSafeCall(cudaStreamSynchronize(stream)); 65 | } 66 | } 67 | 68 | template 69 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, dev_memory_space, 70 | cudaStream_t stream) { 71 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToHost, stream)); 72 | if (stream == 0) { 73 | cuvSafeCall(cudaStreamSynchronize(stream)); 74 | } 75 | } 76 | 77 | template 78 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, host_memory_space, 79 | cudaStream_t stream) { 80 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyHostToDevice, stream)); 81 | if (stream == 0) { 82 | cuvSafeCall(cudaStreamSynchronize(stream)); 83 | } 84 | } 85 | 86 | template 87 | void copy(value_type* dst, const value_type2* src, size_t size, dev_memory_space, dev_memory_space, 88 | cudaStream_t stream) { 89 | if (IsSame::Result::value) { 90 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToDevice, stream)); 91 | if (stream == 0) { 92 | cuvSafeCall(cudaStreamSynchronize(stream)); 93 | } 94 | } else { 95 | thrust::copy(thrust::device_ptr(const_cast(src)), 96 | thrust::device_ptr(const_cast(src)) + size, 97 | thrust::device_ptr(dst)); 98 | cuvSafeCall(cudaThreadSynchronize()); 99 | } 100 | } 101 | 102 | template 103 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 104 | host_memory_space, host_memory_space, cudaStream_t stream) { 105 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), 106 | src, spitch * sizeof(value_type2), 107 | w * sizeof(value_type), h, cudaMemcpyHostToHost, stream)); 108 | if (stream == 0) { 109 | cuvSafeCall(cudaStreamSynchronize(stream)); 110 | } 111 | } 112 | 113 | template 114 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, 115 | size_t w, host_memory_space, dev_memory_space, cudaStream_t stream) { 116 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), src, spitch * sizeof(value_type2), 117 | w * sizeof(value_type), h, cudaMemcpyDeviceToHost, stream)); 118 | if (stream == 0) { 119 | 
cuvSafeCall(cudaStreamSynchronize(stream)); 120 | } 121 | } 122 | 123 | template 124 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, 125 | size_t w, dev_memory_space, host_memory_space, cudaStream_t stream) { 126 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), src, spitch * sizeof(value_type2), 127 | w * sizeof(value_type), h, cudaMemcpyHostToDevice, stream)); 128 | if (stream == 0) { 129 | cuvSafeCall(cudaStreamSynchronize(stream)); 130 | } 131 | } 132 | 133 | template 134 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, 135 | size_t w, dev_memory_space, dev_memory_space, cudaStream_t stream) { 136 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), 137 | src, spitch * sizeof(value_type2), 138 | w * sizeof(value_type), h, cudaMemcpyDeviceToDevice, stream)); 139 | if (stream == 0) { 140 | cuvSafeCall(cudaStreamSynchronize(stream)); 141 | } 142 | } 143 | 144 | #define CUV_MEMORY_COPY(TYPE) \ 145 | template void copy(TYPE*, const TYPE*, size_t, host_memory_space, host_memory_space, cudaStream_t); \ 146 | template void copy(TYPE*, const TYPE*, size_t, host_memory_space, dev_memory_space, cudaStream_t); \ 147 | template void copy(TYPE*, const TYPE*, size_t, dev_memory_space, host_memory_space, cudaStream_t); \ 148 | template void copy(TYPE*, const TYPE*, size_t, dev_memory_space, dev_memory_space, cudaStream_t); \ 149 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, host_memory_space, host_memory_space, cudaStream_t); \ 150 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, host_memory_space, dev_memory_space, cudaStream_t); \ 151 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, dev_memory_space, host_memory_space, cudaStream_t); \ 152 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, dev_memory_space, dev_memory_space, cudaStream_t); 153 | 154 | CUV_MEMORY_COPY(signed char); 155 | CUV_MEMORY_COPY(unsigned char); 156 | CUV_MEMORY_COPY(short); 157 | CUV_MEMORY_COPY(unsigned short); 158 | CUV_MEMORY_COPY(int); 159 | CUV_MEMORY_COPY(unsigned int); 160 | CUV_MEMORY_COPY(float); 161 | CUV_MEMORY_COPY(double); 162 | 163 | } 164 | 165 | } 166 | -------------------------------------------------------------------------------- /src/cuv/memory.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
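/* Illustrative call of the detail::copy() overloads implemented in memory.cu above -- a
 * minimal sketch only (buffer names are made up). The first memory-space tag describes the
 * destination, the second the source, so a host-to-device transfer on the default stream reads:
 * @code
 * float host_buf[16];
 * float* dev_buf = NULL;
 * cuvSafeCall(cudaMalloc((void**) &dev_buf, sizeof(host_buf)));
 * cuv::detail::copy(dev_buf, host_buf, 16, cuv::dev_memory_space(), cuv::host_memory_space(), 0);
 * @endcode
 */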
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_MEMORY_HPP__ 28 | #define __CUV_MEMORY_HPP__ 29 | 30 | #include <boost/make_shared.hpp> 31 | #include <boost/shared_ptr.hpp> 32 | #include <cuda_runtime_api.h> 33 | #include <limits> 34 | #include <stdexcept> 35 | 36 | #include "allocators.hpp" 37 | #include "reference.hpp" 38 | 39 | namespace boost { 40 | namespace serialization { 41 | class access; 42 | } 43 | } 44 | 45 | namespace cuv { 46 | 47 | /** 48 | * @addtogroup data_structures 49 | * @{ 50 | */ 51 | 52 | /** 53 | * @addtogroup tags 54 | * @{ 55 | */ 56 | /// Tag for column major matrices 57 | struct column_major { 58 | }; 59 | /// Tag for row major matrices 60 | struct row_major { 61 | }; 62 | 63 | /// tag for linear memory 64 | struct linear_memory_tag { 65 | }; 66 | 67 | /// tag for pitched memory 68 | struct pitched_memory_tag { 69 | }; 70 | 71 | /** @} */ // tags 72 | namespace detail { 73 | 74 | /// copy from host to host 75 | template<class value_type> 76 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, host_memory_space, cudaStream_t); 77 | 78 | /// copy from device to host 79 | template<class value_type> 80 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, dev_memory_space, cudaStream_t); 81 | 82 | /// copy from host to host 83 | template<class value_type, class value_type2> 84 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, host_memory_space, cudaStream_t); 85 | 86 | /// copy from device to host 87 | template<class value_type, class value_type2> 88 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, dev_memory_space, cudaStream_t); 89 | 90 | /// copy from host to device 91 | template<class value_type> 92 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, host_memory_space, cudaStream_t); 93 | 94 | /// copy from device to device 95 | template<class value_type> 96 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, dev_memory_space, cudaStream_t); 97 | 98 | /// copy from host to device 99 | template<class value_type, class value_type2> 100 | void copy(value_type* dst, const value_type2* src, size_t size, dev_memory_space, host_memory_space, cudaStream_t); 101 | 102 | /// copy from device to device 103 | template<class value_type, class value_type2> 104 | void copy(value_type* dst, const value_type2* src, size_t size, dev_memory_space, dev_memory_space, cudaStream_t); 105 | 106 | /// copy from host to host 107 | template<class value_type, class value_type2> 108 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 109 | host_memory_space, host_memory_space, cudaStream_t); 110 | 111 | /// copy from device to host 112 | template<class value_type, class value_type2> 113 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 114 | host_memory_space, dev_memory_space, cudaStream_t); 115 | 116 | /// copy from host to device 117 | template<class value_type, class value_type2> 118 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 119 | dev_memory_space, host_memory_space, cudaStream_t); 120 | 121 | /// copy from device to device 122 | template<class value_type, class value_type2> 123 | void 
copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 124 | dev_memory_space, dev_memory_space, cudaStream_t); 125 | } 126 | 127 | /** 128 | * simply keeps a pointer and deallocates it when destroyed 129 | */ 130 | template 131 | class memory { 132 | 133 | public: 134 | typedef typename unconst::type value_type; ///< type of contained values 135 | typedef const V const_value_type; ///< const version of value_type 136 | typedef M memory_space_type; ///< host or dev memory_space 137 | typedef unsigned int size_type; ///< type of shapes 138 | typedef int index_type; ///< how to index values 139 | typedef reference reference_type; ///< type of reference you get using operator[] 140 | typedef const reference const_reference_type; ///< type of reference you get using operator[] 141 | 142 | private: 143 | friend class boost::serialization::access; 144 | 145 | /// prohibit copying 146 | memory(const memory&); 147 | 148 | /// prohibit copying 149 | memory& operator=(const memory& o); 150 | 151 | protected: 152 | V* m_ptr; ///< points to allocated memory 153 | size_type m_size; ///< size (for serialization) 154 | boost::shared_ptr m_allocator; ///< how stored memory was allocated 155 | bool m_owned; ///< flag is this instance owns the memory (m_ptr) and is responsibly for destroying 156 | 157 | void check_size_limit(size_t size) const { 158 | if (size > static_cast(std::numeric_limits::max())) { 159 | throw std::runtime_error("maximum memory size exceeded"); 160 | } 161 | } 162 | 163 | public: 164 | 165 | /// @return pointer to allocated memory 166 | V* ptr() { 167 | return m_ptr; 168 | } 169 | 170 | /// @return pointer to allocated memory (const) 171 | const V* ptr() const { 172 | return m_ptr; 173 | } 174 | 175 | /// @return number of stored elements 176 | size_type size() const { 177 | return m_size; 178 | } 179 | 180 | /// @return number of stored bytes 181 | size_type memsize() const { 182 | return size() * sizeof(V); 183 | } 184 | 185 | /// reset information (use with care, for deserialization) 186 | void reset(V* p, size_type s) { 187 | m_ptr = p; 188 | m_size = s; 189 | } 190 | 191 | /// default constructor (just sets ptr to NULL) 192 | explicit memory(const boost::shared_ptr& _allocator) : 193 | m_ptr(NULL), m_size(0), m_allocator(_allocator), m_owned(true) { 194 | } 195 | 196 | /// construct with pointer (takes /ownership/ of this pointer and deletes it when destroyed!) 
197 | explicit memory(value_type* ptr, size_type size, const boost::shared_ptr& _allocator, bool owned = true) : 198 | m_ptr(ptr), m_size(size), m_allocator(_allocator), m_owned(owned) { 199 | } 200 | 201 | /// destructor (deallocates the memory) 202 | ~memory() { 203 | dealloc(); 204 | } 205 | 206 | /// dellocate space 207 | void dealloc() { 208 | if (m_ptr && m_owned) { 209 | m_allocator->dealloc(reinterpret_cast(&this->m_ptr), memory_space_type()); 210 | } 211 | m_ptr = NULL; 212 | m_size = 0; 213 | } 214 | 215 | template 216 | void copy_from(V* dst, const value_type2* src, size_t size, memory_space m, cudaStream_t stream) { 217 | detail::copy(dst, src, size, M(), m, stream); 218 | } 219 | 220 | template 221 | void copy_from(const value_type2* src, size_t size, memory_space m, cudaStream_t stream) { 222 | copy_from(m_ptr, src, size, m, stream); 223 | } 224 | 225 | template 226 | void copy2d_from(V* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 227 | memory_space m, cudaStream_t stream) { 228 | detail::copy2d(dst, src, dpitch, spitch, h, w, M(), m, stream); 229 | } 230 | 231 | template 232 | void copy2d_from(const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 233 | memory_space m, cudaStream_t stream) { 234 | copy2d_from(m_ptr, src, dpitch, spitch, h, w, m, stream); 235 | } 236 | 237 | }; 238 | 239 | /** 240 | * represents contiguous memory 241 | */ 242 | template 243 | class linear_memory: public memory { 244 | private: 245 | typedef memory super; 246 | public: 247 | typedef typename super::value_type value_type; ///< type of contained values 248 | typedef typename super::const_value_type const_value_type; ///< const version of value_type 249 | typedef typename super::memory_space_type memory_space_type; ///< host or dev memory_space 250 | typedef typename super::index_type index_type; ///< how to index values 251 | typedef typename super::size_type size_type; ///< type of shapes 252 | typedef typename super::reference_type reference_type; ///< type of reference you get using operator[] 253 | typedef typename super::const_reference_type const_reference_type; ///< type of reference you get using operator[] 254 | 255 | private: 256 | 257 | friend class boost::serialization::access; 258 | typedef linear_memory my_type; ///< my own type 259 | using super::m_ptr; 260 | using super::m_size; 261 | using super::m_allocator; 262 | 263 | public: 264 | 265 | /// default constructor: does nothing 266 | explicit linear_memory(const boost::shared_ptr _allocator = boost::make_shared()) : 267 | memory(_allocator) { 268 | } 269 | 270 | /** constructor: reserves space for i elements 271 | * @param i number of elements 272 | */ 273 | explicit linear_memory(size_type i, const boost::shared_ptr _allocator = 274 | boost::make_shared()) : 275 | memory(_allocator) { 276 | m_size = i; 277 | alloc(); 278 | } 279 | 280 | /// releases ownership of pointer (for storage in memory class) 281 | value_type* release() { 282 | value_type* ptr = m_ptr; 283 | m_ptr = NULL; 284 | return ptr; 285 | } 286 | 287 | /// sets the size (reallocates if necessary) 288 | void set_size(size_type s) { 289 | if (s != this->size()) { 290 | this->dealloc(); 291 | m_size = s; 292 | alloc(); 293 | } 294 | } 295 | 296 | /// allocate space according to size() 297 | void alloc() { 298 | assert(this->m_ptr == NULL); 299 | if (m_size > 0) 300 | m_allocator->alloc(reinterpret_cast(&m_ptr), m_size, sizeof(V), memory_space_type()); 301 | } 302 | 303 | /** 304 | * @brief Copy linear_memory. 
305 | * 306 | * @param o Source linear_memory 307 | * 308 | * @return *this 309 | * 310 | */ 311 | my_type& operator=(const my_type& o) { 312 | if (this == &o) 313 | return *this; 314 | 315 | if (this->size() != o.size()) { 316 | this->dealloc(); 317 | m_size = o.size(); 318 | this->alloc(); 319 | } 320 | 321 | // TODO async copy 322 | cudaStream_t stream = 0; 323 | this->copy_from(o, stream); 324 | 325 | return *this; 326 | } 327 | 328 | /** 329 | * @overload 330 | * 331 | * @brief Copy linear_memory from other memory type. 332 | * 333 | * @param o Source linear_memory 334 | * 335 | * @return *this 336 | * 337 | */ 338 | template 339 | my_type& operator=(const linear_memory& o) { 340 | if (this->size() != o.size()) { 341 | this->dealloc(); 342 | m_size = o.size(); 343 | this->alloc(); 344 | } 345 | 346 | // TODO async copy 347 | cudaStream_t stream = 0; 348 | this->copy_from(o, stream); 349 | return *this; 350 | } 351 | 352 | /** 353 | * construct from other linear memory 354 | */ 355 | explicit linear_memory(const my_type& o) : 356 | memory(o.m_allocator) { 357 | operator=(o); 358 | } 359 | 360 | /** 361 | * construct from other linear memory 362 | */ 363 | template 364 | explicit linear_memory(const linear_memory& o) : 365 | memory(o.m_allocator) { 366 | operator=(o); 367 | } 368 | 369 | /** 370 | * @return a reference to memory at a position 371 | * @param idx position 372 | */ 373 | reference_type operator[](const index_type& idx) { 374 | assert(idx >= 0); 375 | assert((size_type) idx < m_size); 376 | return reference_type(this->m_ptr + idx); 377 | } 378 | 379 | /** 380 | * @overload 381 | * 382 | * @return a reference to memory at a position 383 | * @param idx position 384 | */ 385 | const_reference_type operator[](const index_type& idx) const { 386 | assert(idx >= 0); 387 | assert((size_type) idx < m_size); 388 | return const_reference_type(this->m_ptr + idx); 389 | } 390 | 391 | /// deallocates memory 392 | ~linear_memory() { 393 | this->dealloc(); 394 | } 395 | 396 | /// set strides for this memory 397 | void set_strides(linear_memory& strides, 398 | const linear_memory& shape, row_major) { 399 | size_t size = 1; 400 | for (int i = shape.size() - 1; i >= 0; --i) { 401 | strides[i] = (shape[i] == 1) ? 0 : size; 402 | size *= shape[i]; 403 | } 404 | this->check_size_limit(size); 405 | } 406 | 407 | /// set strides for this memory 408 | void set_strides(linear_memory& strides, 409 | const linear_memory& shape, column_major) { 410 | size_t size = 1; 411 | for (size_t i = 0; i < shape.size(); ++i) { 412 | strides[i] = (shape[i] == 1) ? 
0 : size; 413 | size *= shape[i]; 414 | } 415 | this->check_size_limit(size); 416 | } 417 | 418 | /** reverse the array (for transposing etc) 419 | * 420 | * currently only enabled for host memory space arrays 421 | */ 422 | void reverse() { 423 | if (IsSame::Result::value) 424 | throw std::runtime_error("reverse of dev linear memory not implemented"); 425 | value_type* __first = m_ptr, *__last = m_ptr + this->size(); 426 | while (true) 427 | if (__first == __last || __first == --__last) 428 | return; 429 | else { 430 | std::iter_swap(__first, __last); 431 | ++__first; 432 | } 433 | } 434 | 435 | template 436 | void copy_from(const value_type2* src, size_t size, memory_space m, cudaStream_t stream) { 437 | memory::copy_from(src, size, m, stream); 438 | } 439 | 440 | template 441 | void copy_from(const linear_memory& src, cudaStream_t stream) const { 442 | detail::copy(m_ptr, src.ptr(), src.size(), M(), OM(), stream); 443 | } 444 | 445 | }; 446 | 447 | /** 448 | * represents 2D non-contiguous ("pitched") memory 449 | */ 450 | template 451 | class pitched_memory: public memory { 452 | 453 | private: 454 | typedef memory super; 455 | 456 | public: 457 | 458 | typedef typename super::value_type value_type; ///< type of contained values 459 | typedef typename super::const_value_type const_value_type; ///< const version of value_type 460 | typedef typename super::memory_space_type memory_space_type; ///< host or dev memory_space 461 | typedef typename super::index_type index_type; ///< how to index values 462 | typedef typename super::size_type size_type; ///< type of shapes 463 | typedef typename super::reference_type reference_type; ///< type of reference you get using operator[] 464 | typedef typename super::const_reference_type const_reference_type; ///< type of reference you get using operator[] 465 | 466 | private: 467 | friend class boost::serialization::access; 468 | typedef pitched_memory my_type; ///< my own type 469 | size_type m_rows; ///< number of rows 470 | size_type m_cols; ///< number of columns 471 | size_type m_pitch; ///< pitch (multiples of sizeof(V)) 472 | using super::m_ptr; 473 | using super::m_size; 474 | using super::m_allocator; 475 | public: 476 | 477 | /// @return the number of rows 478 | size_type rows() const { 479 | return m_rows; 480 | } 481 | 482 | /// @return the number of cols 483 | size_type cols() const { 484 | return m_cols; 485 | } 486 | 487 | /// @return the number of allocated cols 488 | size_type pitch() const { 489 | return m_pitch; 490 | } 491 | 492 | /// @return number of stored elements 493 | size_type size() const { 494 | return m_rows * m_pitch; 495 | } 496 | 497 | /// @return number of stored bytes 498 | size_type memsize() const { 499 | return size() * sizeof(V); 500 | } 501 | 502 | /// default constructor: does nothing 503 | explicit pitched_memory(const boost::shared_ptr _allocator = boost::make_shared()) : 504 | memory(_allocator), m_rows(0), m_cols(0), m_pitch(0) { 505 | } 506 | 507 | /** constructor: reserves space for at least i*j elements 508 | * @param i number of rows 509 | * @param j minimum number of elements per row 510 | */ 511 | explicit pitched_memory(index_type i, index_type j, const boost::shared_ptr _allocator = 512 | boost::make_shared()) : 513 | memory(_allocator), m_rows(i), m_cols(j), m_pitch(0) { 514 | alloc(); 515 | } 516 | 517 | /** 518 | * allocate space according to size() 519 | */ 520 | void alloc() { 521 | assert(this->m_ptr == NULL); 522 | size_t pitch; 523 | m_allocator->alloc2d(reinterpret_cast(&this->m_ptr), 
pitch, m_rows, m_cols, sizeof(V), 524 | memory_space_type()); 525 | assert(this->m_ptr != NULL); 526 | m_pitch = pitch; 527 | assert(m_pitch % sizeof(value_type) == 0); 528 | m_pitch /= sizeof(value_type); 529 | m_size = m_rows * m_pitch; // in class memory 530 | } 531 | 532 | /// releases ownership of pointer (for storage in memory class) 533 | value_type* release() { 534 | value_type* ptr = m_ptr; 535 | m_ptr = NULL; 536 | return ptr; 537 | } 538 | 539 | /** 540 | * set the size (reallocating, if necessary) 541 | * @param rows number of desired rows 542 | * @param cols number of desired columns 543 | */ 544 | void set_size(size_type rows, size_type cols) { 545 | if (cols > m_pitch || rows > m_rows) { 546 | this->dealloc(); 547 | m_rows = rows; 548 | m_cols = cols; 549 | this->alloc(); 550 | } else { 551 | m_rows = rows; 552 | m_cols = cols; 553 | } 554 | } 555 | 556 | /** 557 | * @brief Copy pitched_memory. 558 | * 559 | * @param o Source pitched_memory 560 | * 561 | * @return *this 562 | * 563 | */ 564 | my_type& operator=(const my_type& o) { 565 | if (this == &o) 566 | return *this; 567 | 568 | if (m_pitch < o.m_cols || m_rows < o.m_rows) { 569 | this->dealloc(); 570 | m_cols = o.m_cols; 571 | m_rows = o.m_rows; 572 | this->alloc(); 573 | } 574 | m_cols = o.m_cols; 575 | m_rows = o.m_rows; 576 | this->copy_from(o); 577 | return *this; 578 | } 579 | 580 | /** 581 | * @overload 582 | * 583 | * @brief Copy pitched_memory from other memory type. 584 | * 585 | * @param o Source pitched_memory 586 | * 587 | * @return *this 588 | * 589 | */ 590 | template 591 | my_type& 592 | operator=(const pitched_memory& o) { 593 | if (m_pitch < o.m_cols || m_rows < o.m_rows) { 594 | this->dealloc(); 595 | m_cols = o.m_cols; 596 | m_rows = o.m_rows; 597 | this->alloc(); 598 | } 599 | m_cols = o.m_cols; 600 | m_rows = o.m_rows; 601 | this->copy_from(o); 602 | return *this; 603 | } 604 | 605 | /** 606 | * @return a reference to memory at a position as if this were pitched memory 607 | * @param idx position 608 | */ 609 | reference_type operator[](const index_type& idx) { 610 | assert(idx >= 0); 611 | index_type row = idx / m_cols; 612 | index_type col = idx % m_cols; 613 | assert((size_type) row < m_rows); 614 | assert((size_type) col < m_cols); 615 | return reference_type(this->m_ptr + row * m_pitch + col); 616 | } 617 | 618 | /** 619 | * @overload 620 | * 621 | * @return a reference to memory at a position 622 | * @param idx position 623 | */ 624 | const_reference_type operator[](const index_type& idx) const { 625 | return const_cast(*this)(idx); 626 | } 627 | 628 | /** 629 | * get a reference to a datum in memory 630 | * 631 | * @param i first (slow-changing) dimension index 632 | * @param j second (fast-changing) dimension index 633 | * @return reference to datum at index i,j 634 | */ 635 | reference_type operator()(const index_type& i, const index_type& j) { 636 | assert(i >= 0); 637 | assert(j >= 0); 638 | assert((size_type) i < m_rows); 639 | assert((size_type) j < m_cols); 640 | return reference_type(this->m_ptr + i * m_pitch + j); 641 | } 642 | /** @overload */ 643 | const_reference_type operator()(const index_type& i, const index_type& j) const { 644 | return const_cast(*this)(i, j); 645 | } 646 | 647 | /** 648 | * set strides for this memory 649 | * 650 | * determines the strides for a given shape, with special consideration to pitched dimension 651 | * 652 | * @param strides output vector 653 | * @param shape shape of the vector 654 | * 655 | * row major version 656 | */ 657 | void 
set_strides(linear_memory& strides, 658 | const linear_memory& shape, row_major) { 659 | size_type size = 1; 660 | assert(shape.size() >= 2); 661 | const int pitched_dim = shape.size() - 1; 662 | for (int i = shape.size() - 1; i >= 0; --i) { 663 | if (shape[i] == 1) { 664 | strides[i] = 0; 665 | } else if (i == pitched_dim) { 666 | strides[i] = 1; 667 | size *= pitch(); 668 | } else { 669 | strides[i] = size; 670 | size *= shape[i]; 671 | } 672 | } 673 | } 674 | /** 675 | * @overload 676 | * 677 | * column major version 678 | */ 679 | void set_strides(linear_memory& strides, 680 | const linear_memory& shape, column_major) { 681 | size_type size = 1; 682 | assert(shape.size() >= 2); 683 | const size_type pitched_dim = 0; 684 | for (unsigned int i = 0; i < shape.size(); ++i) { 685 | if (shape[i] == 1) { 686 | strides[i] = 0; 687 | } else if (i == pitched_dim) { 688 | strides[i] = 1; 689 | size *= pitch(); 690 | } else { 691 | strides[i] = size; 692 | size *= shape[i]; 693 | } 694 | } 695 | } 696 | 697 | template 698 | void copy2d_from(const memory src, cudaStream_t stream) const { 699 | memory::copy2d_from(m_ptr, src.ptr(), m_pitch / sizeof(value_type), src.m_pitch / sizeof(V2), 700 | m_rows, m_cols, M(), OM(), stream); 701 | } 702 | 703 | template 704 | void copy_from(const pitched_memory& src, cudaStream_t stream) const { 705 | detail::copy(m_ptr, src.ptr(), src.size(), M(), OM(), stream); 706 | } 707 | 708 | }; 709 | 710 | /** @} */ // data_structures 711 | namespace detail { 712 | 713 | /** 714 | * true iff there are no "holes" in memory 715 | */ 716 | inline bool is_c_contiguous(row_major, const linear_memory& shape, 717 | const linear_memory& stride) { 718 | bool c_contiguous = true; 719 | int size = 1; 720 | for (int i = shape.size() - 1; (i >= 0) && c_contiguous; --i) { 721 | if (shape[i] == 1) 722 | continue; 723 | if (stride[i] != size) 724 | c_contiguous = false; 725 | size = size * shape[i]; 726 | } 727 | return c_contiguous; 728 | } 729 | 730 | /** 731 | * @overload 732 | */ 733 | inline bool is_c_contiguous(column_major, const linear_memory& shape, 734 | const linear_memory& stride) { 735 | bool c_contiguous = true; 736 | int size = 1; 737 | for (unsigned int i = 0; i < shape.size() && c_contiguous; ++i) { 738 | if (shape[i] == 1) 739 | continue; 740 | if (stride[i] != size) 741 | c_contiguous = false; 742 | size = size * shape[i]; 743 | } 744 | return c_contiguous; 745 | } 746 | 747 | /// returns true iff memory can be copied using copy2d 748 | inline bool is_2dcopyable(row_major, const linear_memory& shape, 749 | const linear_memory& stride) { 750 | bool c_contiguous = shape.size() > 1; 751 | int pitched_dim = shape.size() - 1; // last dim 752 | while (shape[pitched_dim] == 1 && stride[pitched_dim] == 1) 753 | pitched_dim--; 754 | int size = 1; 755 | for (int i = shape.size() - 1; (i >= 0) && c_contiguous; --i) { 756 | if (shape[i] == 1) { 757 | continue; 758 | } else if (i == pitched_dim) { 759 | size *= stride[i - 1]; 760 | } else if (stride[i] != size) { 761 | c_contiguous = false; 762 | } else { 763 | size *= shape[i]; 764 | } 765 | } 766 | return c_contiguous; 767 | } 768 | 769 | /// @overload 770 | inline bool is_2dcopyable(column_major, const linear_memory& shape, 771 | const linear_memory& stride) { 772 | bool c_contiguous = shape.size() > 1; 773 | unsigned int pitched_dim = 0; 774 | while (shape[pitched_dim] == 1 && stride[pitched_dim] == 1) 775 | pitched_dim++; 776 | int size = 1; 777 | for (unsigned int i = 0; (i < shape.size()) && c_contiguous; ++i) { 778 | if 
(shape[i] == 1) { 779 | continue; 780 | } else if (i == pitched_dim) { 781 | size *= stride[i]; 782 | } else if (stride[i] != size) { 783 | c_contiguous = false; 784 | } else { 785 | size *= shape[i]; 786 | } 787 | } 788 | return c_contiguous; 789 | } 790 | 791 | } 792 | } 793 | 794 | #endif 795 | -------------------------------------------------------------------------------- /src/cuv/meta_programming.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
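/* Worked example for the detail::is_c_contiguous()/is_2dcopyable() helpers at the end of
 * memory.hpp above -- a sketch only (shapes, strides and element types are made up; the
 * template arguments mirror how ndarray.hpp calls these helpers). For a row-major shape
 * (4,10) with strides (12,1) every row is followed by a gap of two elements, so the memory
 * is not c-contiguous, but it can still be moved with a single 2D copy of pitch 12:
 * @code
 * cuv::linear_memory<unsigned int, cuv::host_memory_space> shape(2);
 * cuv::linear_memory<int, cuv::host_memory_space> stride(2);
 * shape[0] = 4;   shape[1] = 10;
 * stride[0] = 12; stride[1] = 1;
 * bool contiguous = cuv::detail::is_c_contiguous(cuv::row_major(), shape, stride); // false
 * bool copyable2d = cuv::detail::is_2dcopyable(cuv::row_major(), shape, stride);   // true
 * @endcode
 */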
25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_META_PROGRAMMING_HPP__ 28 | #define __CUV_META_PROGRAMMING_HPP__ 29 | 30 | namespace cuv { 31 | 32 | /** 33 | * @addtogroup MetaProgramming 34 | * @{ 35 | */ 36 | 37 | /// defines "False" 38 | struct FalseType { 39 | enum { 40 | value = false 41 | }; 42 | }; 43 | /// defines "True" 44 | struct TrueType { 45 | enum { 46 | value = true 47 | }; 48 | }; 49 | 50 | /** 51 | * @brief Checks whether two types are equal 52 | */ 53 | template 54 | struct IsSame 55 | { 56 | /// is true only if T1==T2 57 | typedef FalseType Result; 58 | }; 59 | 60 | /** 61 | * @see IsSame 62 | */ 63 | template 64 | struct IsSame 65 | { 66 | /// T==T, therefore Result==TrueType 67 | typedef TrueType Result; 68 | }; 69 | 70 | /** 71 | * @brief Checks whether two types are different 72 | */ 73 | template 74 | struct IsDifferent 75 | { 76 | /// is true only if T1!=T2 77 | typedef TrueType Result; 78 | }; 79 | 80 | /** 81 | * @see IsDifferent 82 | */ 83 | template 84 | struct IsDifferent 85 | { 86 | /// T==T, therefore Result==FalseType 87 | typedef FalseType Result; 88 | }; 89 | 90 | /** 91 | * @brief Remove "const" from a type 92 | */ 93 | template 94 | struct unconst { 95 | /// no change 96 | typedef T type; 97 | }; 98 | 99 | /** 100 | * @see unconst 101 | */ 102 | template 103 | struct unconst { 104 | /// T without the const 105 | typedef T type; 106 | }; 107 | 108 | /** 109 | * @brief Switch result depending on Condition 110 | */ 111 | template 112 | struct If { 113 | /// assume condition is true 114 | typedef Then result; 115 | }; 116 | /** 117 | * @see If 118 | */ 119 | template 120 | struct If { 121 | /// condition is false 122 | typedef Else result; 123 | }; 124 | 125 | /** 126 | * @brief enable-if controlled creation of SFINAE conditions 127 | */ 128 | template 129 | struct EnableIfC { 130 | typedef T type; /// enabling succeeded :-) 131 | }; 132 | 133 | /// @see EnableIfC 134 | template 135 | struct EnableIfC { 136 | }; 137 | 138 | /// @see EnableIfC 139 | template 140 | struct EnableIf: public EnableIfC { 141 | }; 142 | 143 | /// @see EnableIfC 144 | template 145 | struct DisableIf: public EnableIfC { 146 | }; 147 | 148 | /** 149 | * @} 150 | */ 151 | } 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /src/cuv/ndarray.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
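/* Illustrative use of the meta-programming helpers from meta_programming.hpp above -- a
 * minimal sketch only (the template parameter lists are inferred from the surrounding code;
 * expected values are given in the comments):
 * @code
 * bool same = cuv::IsSame<float, float>::Result::value;      // true
 * bool diff = cuv::IsDifferent<float, int>::Result::value;   // true
 * typedef cuv::unconst<const float>::type plain_float;       // float
 * typedef cuv::If<false, int, double>::result fallback_type; // double
 * @endcode
 */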
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_NDARRAY_HPP__ 28 | #define __CUV_NDARRAY_HPP__ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "allocators.hpp" 40 | #include "memory.hpp" 41 | #include "meta_programming.hpp" 42 | 43 | namespace cuv { 44 | 45 | /** fail with an error message, a stack trace and a runtime_exception (the nicest failures you've seen ^^!) 46 | * @ingroup tools 47 | */ 48 | static inline void cuvAssertFailed(const char *msg) { 49 | throw std::runtime_error(std::string(msg)); 50 | } 51 | 52 | /** 53 | * @def cuvAssert 54 | * @ingroup tools 55 | * use this macro to ensure that a condition is true. 56 | * in contrast to assert(), this will throw a runtime_exception, 57 | * which can be translated to python. 58 | * Additionally, when using Linux, you get a full stack trace printed 59 | */ 60 | #define cuvAssert(X) \ 61 | if(!(X)){ cuv::cuvAssertFailed(#X); } 62 | 63 | using boost::detail::multi_array::extent_gen; 64 | using boost::detail::multi_array::index_gen; 65 | 66 | /** 67 | * defines an index range, stolen from boost::multi_array 68 | * 69 | * examples: 70 | * @code 71 | * index_range(1,3) 72 | * index(1) <= index_range() < index(3) 73 | * @endcode 74 | */ 75 | typedef boost::detail::multi_array::index_range index_range; 76 | 77 | /** 78 | * the index type used in index_range, useful for comparator syntax in @see index_range 79 | */ 80 | typedef index_range::index index; 81 | 82 | #ifndef CUV_DONT_CREATE_EXTENTS_OBJ 83 | 84 | namespace { 85 | /** 86 | * extents object, can be used to generate a multi-dimensional array conveniently. 87 | * 88 | * stolen from boost::multi_array. 89 | * 90 | * Example: 91 | * @code 92 | * ndarray<...> v(extents[5][6][7]); // 3-dimensional ndarray 93 | * @endcode 94 | */ 95 | extent_gen<0> extents; 96 | 97 | /** 98 | * indices object, can be used to generate multi-dimensional views conveniently. 99 | * 100 | * stolen form boost::multi_array. 101 | * 102 | * Example: 103 | * @code 104 | * ndarray_view<...> v(indices[index_range(1,3)][index_range()], other_ndarray); 105 | * // or, equivalently 106 | * other_ndarray[indices[index_range(1,3)][index_range()]]; 107 | * @endcode 108 | */ 109 | index_gen<0, 0> indices; 110 | } 111 | #endif 112 | 113 | /** 114 | * @addtogroup data_structures Basic datastructures 115 | * @{ 116 | */ 117 | 118 | template class ndarray; 119 | template class ndarray_view; 120 | 121 | /// used in implementation of ndarray.operator= for value_type argument 122 | template 123 | void fill(ndarray& v, const V& p); 124 | 125 | namespace detail { 126 | 127 | /** 128 | * this is intended for copying pitched memory. 
129 | * 130 | * given shape, stride and a memory layout, we can determine the number of 131 | * rows, columns and the pitch of a 132 | */ 133 | template 134 | void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch, 135 | const linear_memory& shape, 136 | const linear_memory& stride, row_major) { 137 | // strided dimension is the LAST one 138 | rows = std::accumulate(shape[0].ptr, shape[0].ptr + shape.size() - 1, 1, std::multiplies()); 139 | cols = shape[shape.size() - 1]; 140 | pitch = stride[shape.size() - 2]; 141 | } 142 | 143 | /** 144 | * @overload 145 | */ 146 | template 147 | void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch, 148 | const linear_memory& shape, 149 | const linear_memory& stride, column_major) { 150 | // strided dimension is the FIRST one 151 | rows = std::accumulate(shape[0].ptr + 1, shape[0].ptr + shape.size(), 1, std::multiplies()); 152 | cols = shape[0]; 153 | pitch = stride[1]; 154 | } 155 | 156 | } 157 | 158 | /** 159 | * contains infos about shape and stride on host and in the ndarray data space. 160 | */ 161 | template 162 | class ndarray_info { 163 | 164 | public: 165 | 166 | typedef unsigned int size_type; ///< type of shapes of the ndarray 167 | typedef int index_type; ///< type of indices in ndarray 168 | typedef M data_memory_space; ///< this is where the data lies 169 | 170 | boost::shared_ptr m_allocator; 171 | 172 | /// shape stored in host memory 173 | linear_memory host_shape; 174 | 175 | /// strides stored in host memory 176 | linear_memory host_stride; 177 | 178 | /// shape stored in data memory 179 | linear_memory data_shape; 180 | 181 | /// strides stored in data memory 182 | linear_memory data_stride; 183 | 184 | /// default constructor: does nothing 185 | ndarray_info(const boost::shared_ptr& _allocator) : 186 | m_allocator(_allocator), host_shape(_allocator), host_stride(_allocator), 187 | data_shape(_allocator), data_stride(_allocator) 188 | { 189 | } 190 | 191 | /// @return the size of the arrays (should all be the same) 192 | size_type size() { 193 | return host_shape.size(); 194 | } 195 | 196 | /// construct with known shape 197 | ndarray_info(size_type s, const boost::shared_ptr& _allocator) : 198 | m_allocator(_allocator), host_shape(_allocator), host_stride(_allocator), 199 | data_shape(_allocator), data_stride(_allocator) 200 | { 201 | resize(s); 202 | } 203 | 204 | /// resize all memories 205 | void resize(size_type s) { 206 | host_shape.set_size(s); 207 | host_stride.set_size(s); 208 | } 209 | 210 | /// copy-constructor 211 | ndarray_info(const ndarray_info& o) : 212 | m_allocator(o.m_allocator), host_shape(o.host_shape), host_stride(o.host_stride), 213 | data_shape(m_allocator), data_stride(m_allocator) 214 | { 215 | } 216 | 217 | /// copy-construct from other memory space 218 | template 219 | ndarray_info(const ndarray_info& o) : 220 | m_allocator(o.m_allocator), host_shape(o.host_shape), host_stride(o.host_stride), 221 | data_shape(m_allocator), data_stride(m_allocator) 222 | { 223 | } 224 | 225 | }; 226 | 227 | /** 228 | * represents an n-dimensional array on GPU or CPU. 
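 *
 * A minimal usage sketch (shape and values are made up; `extents` is the helper object
 * defined above, and the third template parameter is assumed to default to row_major):
 * @code
 * cuv::ndarray<float, cuv::dev_memory_space> a(cuv::extents[5][6]);
 * a(0, 3) = 1.5f;                                    // element access via reference_type
 * cuv::ndarray<float, cuv::host_memory_space> h(a);  // copies the data to the host
 * @endcode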
229 | */ 230 | template 231 | class ndarray { 232 | 233 | public: 234 | 235 | typedef memory memory_type; ///< type of stored memory 236 | typedef typename memory_type::reference_type reference_type; ///< values returned by operator() and [] 237 | typedef typename memory_type::const_reference_type const_reference_type; ///< values returned by operator() 238 | typedef typename memory_type::memory_space_type memory_space_type; ///< dev/host 239 | typedef typename memory_type::value_type value_type; ///< type of stored values 240 | typedef typename memory_type::size_type size_type; ///< type shapes 241 | typedef typename memory_type::index_type index_type; ///< type strides 242 | typedef L memory_layout_type; ///< column/row major 243 | 244 | typedef ndarray_info info_type; ///< type of shape info struct 245 | typedef ndarray_view view_type; ///< type of views on this ndarray 246 | 247 | public: 248 | boost::shared_ptr m_allocator; 249 | 250 | private: 251 | void check_size_limit(size_t size) const { 252 | if (size > static_cast(std::numeric_limits::max())) { 253 | throw std::runtime_error("maximum ndarray size exceeded"); 254 | } 255 | } 256 | 257 | /// ndarray views are our friends 258 | template 259 | friend class ndarray_view; 260 | 261 | protected: 262 | 263 | /// information about shape, strides 264 | info_type m_info; 265 | 266 | /// points to (possibly shared) memory 267 | boost::shared_ptr m_memory; 268 | 269 | /// points to start of actually referenced memory (within m_memory) 270 | V* m_ptr; 271 | 272 | /** 273 | * determine linear index in memory of an index array 274 | * 275 | * this function takes strides etc. into account, so that indices 276 | * are interpreted as relative to the (strided) sub-ndarray we're 277 | * referring to. 278 | * 279 | * @param D size of index array 280 | * @param arr index array 281 | * @return linear index in memory of index array 282 | * 283 | */ 284 | size_type index_of(int D, index_type* arr) const { 285 | index_type pos = 0; 286 | for (int i = 0; i < D; i++) { 287 | index_type temp = arr[i]; 288 | if (temp < 0) 289 | temp = m_info.host_shape[i] + temp; 290 | pos += temp * m_info.host_stride[i]; 291 | } 292 | return pos; 293 | } 294 | 295 | /** 296 | * allocate linear memory (c-contiguous version) 297 | * 298 | * @param t ndarray to allocate 299 | */ 300 | void allocate(ndarray& t, linear_memory_tag) { 301 | linear_memory mem(t.size(), t.m_allocator); 302 | mem.set_strides(t.m_info.host_stride, t.m_info.host_shape, L()); 303 | t.m_ptr = mem.ptr(); 304 | t.m_memory.reset(new memory(mem.release(), mem.size(), t.m_allocator)); 305 | } 306 | 307 | /** 308 | * @overload 309 | * 310 | * pitched version 311 | */ 312 | void allocate(ndarray& t, pitched_memory_tag) { 313 | typename ndarray::size_type row, col, pitch; 314 | detail::get_pitched_params(row, col, pitch, t.m_info.host_shape, t.m_info.host_stride, L()); 315 | pitched_memory d(row, col); 316 | d.set_strides(t.m_info.host_stride, t.m_info.host_shape, L()); 317 | t.m_ptr = d.ptr(); 318 | t.m_memory.reset(new memory(d.release(), d.size(), t.m_allocator)); 319 | } 320 | 321 | public: 322 | 323 | /** 324 | * determine linear index in memory of an index array 325 | * 326 | * this function takes strides etc. into account, so that indices 327 | * are interpreted as relative to the (strided) sub-ndarray we're 328 | * referring to. 
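 * (For instance, with hypothetical host strides (6,1), index_of(extents[2][3]) evaluates to
 * 2*6 + 3*1 = 15.)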
329 | * 330 | * @tparam D size of index array 331 | * @param eg position in array 332 | * @return linear index in memory of index array 333 | * 334 | */ 335 | template 336 | size_type index_of(const extent_gen& eg) const { 337 | index_type pos = 0; 338 | for (size_t i = 0; i < D; i++) { 339 | index_type temp = eg.ranges_[i].finish(); 340 | if (temp < 0) 341 | temp = m_info.host_shape[i] + temp; 342 | pos += temp * m_info.host_stride[i]; 343 | } 344 | return pos; 345 | } 346 | 347 | /** 348 | * @name Accessors 349 | * @{ 350 | */ 351 | /// return the number of dimensions 352 | index_type ndim() const { 353 | return m_info.host_shape.size(); 354 | } 355 | 356 | /** return the size of the i-th dimension 357 | * @param i the index of the queried dimension 358 | */ 359 | size_type shape(const size_t i) const { 360 | return m_info.host_shape[i]; 361 | } 362 | 363 | /** return the stride of the i-th dimension 364 | * @param i the index of the queried dimension 365 | */ 366 | index_type stride(const size_t i) const { 367 | return m_info.host_stride[i]; 368 | } 369 | 370 | /** @return the pointer to the referenced memory */ 371 | V* ptr() { 372 | return m_ptr; 373 | } 374 | 375 | /** 376 | * @overload 377 | * @return the const pointer to the referenced memory 378 | * */ 379 | const V* ptr() const { 380 | return m_ptr; 381 | } 382 | 383 | /** set the pointer offset (used in deserialization) */ 384 | void set_ptr_offset(long int i) { 385 | m_ptr = m_memory->ptr() + i; 386 | } 387 | 388 | /** * @return pointer to allocated memory */ 389 | boost::shared_ptr& mem() { 390 | return m_memory; 391 | } 392 | /** 393 | * @overload 394 | * @return the const pointer to the allocated memory 395 | * */ 396 | const boost::shared_ptr& mem() const { 397 | return m_memory; 398 | } 399 | 400 | /** @return the number of stored elements 401 | */ 402 | size_type size() const { 403 | size_t size = std::accumulate(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 1, 404 | std::multiplies()); 405 | 406 | check_size_limit(size); 407 | 408 | return static_cast(size); 409 | } 410 | 411 | /** 412 | * determine size in bytes 413 | * 414 | * assumes that the memory is c_contiguous! 415 | * 416 | * @return the size in bytes 417 | */ 418 | size_type memsize() const { 419 | #ifndef NDEBUG 420 | cuvAssert(is_c_contiguous()); 421 | #endif 422 | size_t size = std::accumulate(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 1, 423 | std::multiplies()); 424 | 425 | check_size_limit(size); 426 | 427 | return static_cast(size); 428 | } 429 | 430 | /// return the shape of the ndarray (as a vector for backward compatibility) 431 | std::vector shape() const { 432 | if (ndim() == 0) 433 | return std::vector(); 434 | return std::vector(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size()); 435 | } 436 | 437 | /** 438 | * return the effective shape of the ndarray (as a vector for backward compatibility) 439 | * 440 | * the effective shape removes all degenerate dimensions (i.e. shape(i)==1). 
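 * For instance, an ndarray with (hypothetical) shape (1,5,1,3) has effective shape (5,3).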
441 | */ 442 | std::vector effective_shape() const { 443 | std::vector shape; 444 | shape.reserve(ndim()); 445 | if (ndim() == 0) 446 | return shape; 447 | std::remove_copy_if(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 448 | std::back_inserter(shape), std::bind2nd(std::equal_to(), 1)); 449 | return shape; 450 | } 451 | 452 | /// @return the ndarray info struct (const) 453 | const info_type& info() const { 454 | return m_info; 455 | } 456 | 457 | /// @return the ndarray info struct 458 | info_type& info() { 459 | return m_info; 460 | } 461 | 462 | /// true iff there are no "holes" in memory 463 | bool is_c_contiguous() const { 464 | return detail::is_c_contiguous(memory_layout_type(), m_info.host_shape, m_info.host_stride); 465 | } 466 | 467 | /// true iff it can be copied as a 2d array (only one dimension is pitched) 468 | bool is_2dcopyable() const { 469 | return detail::is_2dcopyable(memory_layout_type(), m_info.host_shape, m_info.host_stride); 470 | } 471 | 472 | /** @} */ // accessors 473 | /** 474 | * @name accessing stored values 475 | * @{ 476 | */ 477 | 478 | /** 479 | * member access: "flat" access as if memory was linear 480 | */ 481 | reference_type operator[](index_type idx) { 482 | size_type ndim = m_info.host_shape.size(); 483 | size_type* virtualstride = new size_type[ndim]; 484 | size_type pos = 0; 485 | if (IsSame::Result::value) { 486 | // row major 487 | { 488 | size_type virt_size = 1; 489 | for (int i = ndim - 1; i >= 0; --i) { 490 | virtualstride[i] = virt_size; 491 | virt_size *= m_info.host_shape[i]; 492 | } 493 | } 494 | for (size_type i = 0; i < ndim; ++i) { 495 | pos += (idx / virtualstride[i]) * m_info.host_stride[i]; 496 | idx -= (idx / virtualstride[i]) * virtualstride[i]; 497 | } 498 | } else { 499 | // column major 500 | { 501 | size_type virt_size = 1; 502 | for (unsigned int i = 0; i < ndim; ++i) { 503 | virtualstride[i] = virt_size; 504 | virt_size *= m_info.host_shape[i]; 505 | } 506 | } 507 | for (int i = ndim - 1; i >= 0; --i) { 508 | pos += (idx / virtualstride[i]) * m_info.host_stride[i]; 509 | idx -= (idx / virtualstride[i]) * virtualstride[i]; 510 | } 511 | } 512 | delete[] virtualstride; 513 | return reference_type(m_ptr + pos); 514 | } 515 | 516 | /** @overload */ 517 | const_reference_type operator[](index_type idx) const { 518 | return const_cast(*this)[idx]; 519 | } 520 | 521 | /** 522 | * get a reference to the datum at an index 523 | * @param i0 index for a 1-dimensional ndarray 524 | * @return reference to datum at i0 525 | */ 526 | reference_type operator()(index_type i0) { 527 | #ifndef NDEBUG 528 | cuvAssert(ndim()==1); 529 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)(-i0)= 0) { 532 | return reference_type(m_ptr + i0); 533 | } else { 534 | return reference_type(m_ptr + shape(0) - i0); 535 | } 536 | } 537 | 538 | /** @overload */ 539 | const_reference_type operator()(index_type i0) const { 540 | return const_cast(*this)(i0); 541 | } 542 | 543 | /** @overload */ 544 | const_reference_type operator()(index_type i0, index_type i1) const { 545 | return const_cast(*this)(i0, i1); 546 | } 547 | 548 | /** @overload */ 549 | reference_type operator()(index_type i0, index_type i1) { 550 | #ifndef NDEBUG 551 | cuvAssert(ndim()==2); 552 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)(-i0)=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)(-i1)(*this)(i0, i1, i2); 562 | } 563 | 564 | /** @overload */ 565 | reference_type operator()(index_type i0, index_type i1, 
index_type i2) { 566 | #ifndef NDEBUG 567 | cuvAssert(ndim()==3); 568 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2(*this)(i0, i1, i2, i3); 579 | } 580 | 581 | /** @overload */ 582 | reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3) { 583 | #ifndef NDEBUG 584 | cuvAssert(ndim()==4); 585 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2=0 && (size_type)i3 < shape(3)) || (i3<0 && (size_type)-i3(*this)(i0, i1, i2, i3, i4); 598 | } 599 | 600 | /** @overload */ 601 | reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3, index_type i4) { 602 | #ifndef NDEBUG 603 | cuvAssert(ndim()==5); 604 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2=0 && (size_type)i3 < shape(3)) || (i3<0 && (size_type)-i3=0 && (size_type)i4 < shape(4)) || (i4<0 && (size_type)-i4 _allocator = boost::make_shared()) : 623 | m_allocator(_allocator), m_info(_allocator), m_ptr(NULL) { 624 | } 625 | 626 | // **************************************************************** 627 | // Constructing from other ndarray 628 | // **************************************************************** 629 | 630 | /** 631 | * construct ndarray from ndarray of exact same type 632 | * 633 | * time O(1) 634 | */ 635 | ndarray(const ndarray& o) : 636 | m_allocator(o.m_allocator), 637 | m_info(o.m_info), // copy only shape 638 | m_memory(o.m_memory), // increase ref counter 639 | m_ptr(o.m_ptr) { 640 | } // same pointer in memory 641 | 642 | /** 643 | * construct ndarray from ndarray of other memory space 644 | * in (dense) /linear/ memory. Note: this /copies/ the memory! 645 | */ 646 | template 647 | ndarray(const ndarray& o, cudaStream_t stream = 0) : 648 | m_allocator(o.m_allocator), 649 | m_info(o.info()), // primarily to copy shape 650 | m_ptr(NULL) { 651 | copy_memory(o, linear_memory_tag(), stream); 652 | m_ptr = m_memory->ptr(); 653 | } 654 | 655 | /** 656 | * construct ndarray from ndarray of same memory space 657 | * in /pitched/ memory. Note: this /copies/ the memory! 658 | */ 659 | explicit ndarray(const ndarray& o, pitched_memory_tag, cudaStream_t stream = 0) : 660 | m_allocator(o.m_allocator), 661 | m_info(o.m_info), // primarily to copy shape 662 | m_ptr(NULL) { 663 | copy_memory(o, pitched_memory_tag(), stream); 664 | m_ptr = m_memory->ptr(); 665 | } 666 | 667 | /** 668 | * construct ndarray from ndarray of other memory space 669 | * in /pitched/ memory. Note: this /copies/ the memory! 670 | */ 671 | template 672 | explicit ndarray(const ndarray& o, pitched_memory_tag, cudaStream_t stream = 0) : 673 | m_allocator(o.m_allocator), 674 | m_info(o.info()), // primarily to copy shape 675 | m_ptr(NULL) { 676 | copy_memory(o, pitched_memory_tag(), stream); 677 | m_ptr = m_memory->ptr(); 678 | } 679 | 680 | /** 681 | * construct ndarray from ndarray of same memory space 682 | * in (dense) /linear/ memory. Note: this /copies/ the memory! 
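 *
 * A minimal sketch (name and shape made up):
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[4][4]);
 * cuv::ndarray<float, cuv::host_memory_space> b(a, cuv::linear_memory_tag()); // deep copy of a
 * @endcode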
683 | */ 684 | explicit ndarray(const ndarray& o, linear_memory_tag, cudaStream_t stream = 0) : 685 | m_allocator(o.m_allocator), 686 | m_info(o.m_info), // primarily to copy shape 687 | m_ptr(NULL) { 688 | copy_memory(o, linear_memory_tag(), stream); 689 | m_ptr = m_memory->ptr(); 690 | } 691 | 692 | /** 693 | * construct ndarray from ndarray of other memory space 694 | * in (dense) /linear/ memory. Note: this /copies/ the memory! 695 | */ 696 | template 697 | explicit ndarray(const ndarray& o, linear_memory_tag, cudaStream_t stream = 0) : 698 | m_allocator(o.m_allocator), 699 | m_info(o.info()), // primarily to copy shape 700 | m_ptr(NULL) { 701 | copy_memory(o, linear_memory_tag(), stream); 702 | m_ptr = m_memory->ptr(); 703 | } 704 | 705 | /** 706 | * construct ndarray from other memory layout 707 | * 708 | * this does not copy memory, but reverses dimensions and strides 709 | * (and therefore only takes O(1) time) 710 | */ 711 | template 712 | explicit ndarray(const ndarray& o) : 713 | m_allocator(o.m_allocator), 714 | m_info(o.m_allocator), 715 | m_memory(o.mem()), // increase ref counter 716 | m_ptr(const_cast(o.ptr())) { // same pointer in memory 717 | m_info.host_shape = o.info().host_shape; 718 | m_info.host_shape.reverse(); 719 | m_info.host_stride = o.info().host_stride; 720 | m_info.host_stride.reverse(); 721 | } 722 | 723 | // **************************************************************** 724 | // Constructing from SHAPE 725 | // **************************************************************** 726 | 727 | /** 728 | * construct one-dimensional ndarray 729 | */ 730 | explicit ndarray(const size_type i, 731 | const boost::shared_ptr _allocator = boost::make_shared()) : 732 | m_allocator(_allocator), 733 | m_info(_allocator), 734 | m_ptr(NULL) { 735 | m_info.resize(1); 736 | m_info.host_shape[0] = i; 737 | allocate(*this, linear_memory_tag()); 738 | } 739 | 740 | /** 741 | * construct two-dimensional ndarray 742 | */ 743 | explicit ndarray(const size_type i, const int j, const boost::shared_ptr _allocator = 744 | boost::make_shared()) : 745 | m_allocator(_allocator), 746 | m_info(_allocator), 747 | m_ptr(NULL) { 748 | m_info.resize(2); 749 | m_info.host_shape[0] = i; 750 | m_info.host_shape[1] = j; 751 | allocate(*this, linear_memory_tag()); 752 | } 753 | 754 | /** 755 | * construct ndarray from a shape 756 | */ 757 | template 758 | explicit ndarray(const extent_gen& eg, 759 | const boost::shared_ptr _allocator = boost::make_shared()) : 760 | m_allocator(_allocator), 761 | m_info(_allocator), 762 | m_ptr(NULL) { 763 | m_info.resize(D); 764 | for (size_t i = 0; i < D; i++) 765 | m_info.host_shape[i] = eg.ranges_[i].finish(); 766 | allocate(*this, linear_memory_tag()); 767 | } 768 | 769 | /** 770 | * construct ndarray from a shape 771 | * 772 | * @deprecated 773 | */ 774 | explicit ndarray(const std::vector& eg, 775 | const boost::shared_ptr _allocator = boost::make_shared()) : 776 | m_allocator(_allocator), 777 | m_info(_allocator), 778 | m_ptr(NULL) { 779 | m_info.resize(eg.size()); 780 | for (size_t i = 0; i < eg.size(); i++) 781 | m_info.host_shape[i] = eg[i]; 782 | allocate(*this, linear_memory_tag()); 783 | } 784 | 785 | /** 786 | * construct ndarray from a shape 787 | * 788 | * @deprecated 789 | */ 790 | explicit ndarray(const std::vector& eg, pitched_memory_tag, 791 | const boost::shared_ptr _allocator = boost::make_shared()) : 792 | m_allocator(_allocator), 793 | m_info(_allocator), 794 | m_ptr(NULL) { 795 | m_info.resize(eg.size()); 796 | for (size_t i = 0; i < 
eg.size(); i++) 797 | m_info.host_shape[i] = eg[i]; 798 | allocate(*this, pitched_memory_tag()); 799 | } 800 | 801 | /** 802 | * construct ndarray from a shape (pitched) 803 | */ 804 | template 805 | explicit ndarray(const extent_gen& eg, pitched_memory_tag, const boost::shared_ptr _allocator = 806 | boost::make_shared()) : 807 | m_allocator(_allocator), 808 | m_info(_allocator), 809 | m_ptr(NULL) { 810 | m_info.resize(D); 811 | for (size_t i = 0; i < D; i++) 812 | m_info.host_shape[i] = eg.ranges_[i].finish(); 813 | allocate(*this, pitched_memory_tag()); 814 | } 815 | 816 | // **************************************************************** 817 | // Constructing from shape and raw pointer 818 | // **************************************************************** 819 | 820 | /** 821 | * construct ndarray from a shape and a pointer (does not copy memory) 822 | * 823 | * @warning You have to ensure that the memory lives as long as this object. 824 | */ 825 | template 826 | explicit ndarray(const extent_gen& eg, value_type* ptr, const boost::shared_ptr _allocator = 827 | boost::make_shared()) : 828 | m_allocator(_allocator), 829 | m_info(_allocator), 830 | m_ptr(ptr) { 831 | m_info.resize(D); 832 | size_t size = 1; 833 | if (IsSame::Result::value) { 834 | for (int i = D - 1; i >= 0; i--) { 835 | m_info.host_shape[i] = eg.ranges_[i].finish(); 836 | m_info.host_stride[i] = size; 837 | size *= eg.ranges_[i].finish(); 838 | } 839 | } else { 840 | for (size_t i = 0; i < D; i++) { 841 | m_info.host_shape[i] = eg.ranges_[i].finish(); 842 | m_info.host_stride[i] = size; 843 | size *= eg.ranges_[i].finish(); 844 | } 845 | } 846 | m_memory.reset(new memory(ptr, size, m_allocator, false)); 847 | } 848 | 849 | explicit ndarray(const std::vector& shape, value_type* ptr, 850 | const boost::shared_ptr _allocator = boost::make_shared()) : 851 | m_allocator(_allocator), 852 | m_info(_allocator), 853 | m_ptr(ptr) { 854 | unsigned int D = shape.size(); 855 | m_info.resize(D); 856 | size_type size = 1; 857 | if (IsSame::Result::value) 858 | for (int i = D - 1; i >= 0; i--) { 859 | m_info.host_shape[i] = shape[i]; 860 | m_info.host_stride[i] = size; 861 | size *= shape[i]; 862 | } 863 | else 864 | for (size_t i = 0; i < D; i++) { 865 | m_info.host_shape[i] = shape[i]; 866 | m_info.host_stride[i] = size; 867 | size *= shape[i]; 868 | } 869 | } 870 | /** 871 | * construct ndarray from a shape and a pointer (does not copy memory) 872 | * 873 | * @warning You have to ensure that the memory lives as long as this object. 
874 | * @deprecated 875 | */ 876 | template 877 | explicit ndarray(const index_gen& idx, value_type* ptr, const boost::shared_ptr _allocator = 878 | boost::make_shared()) : 879 | m_allocator(_allocator), 880 | m_info(_allocator), 881 | m_ptr(ptr) { 882 | m_info.resize(D); 883 | size_type size = 1; 884 | if (IsSame::Result::value) 885 | for (int i = D - 1; i >= 0; i--) { 886 | m_info.host_shape[i] = idx.ranges_[i].finish(); 887 | m_info.host_stride[i] = size; 888 | size *= idx.ranges_[i].finish(); 889 | } 890 | else 891 | for (size_t i = 0; i < D; i++) { 892 | m_info.host_shape[i] = idx.ranges_[i].finish(); 893 | m_info.host_stride[i] = size; 894 | size *= idx.ranges_[i].finish(); 895 | } 896 | } 897 | // @} // constructors 898 | 899 | // **************************************************************** 900 | // assignment operators (try not to reallocate if shapes match) 901 | // **************************************************************** 902 | 903 | /** 904 | * @name assigning other values to a ndarray object 905 | * @{ 906 | */ 907 | 908 | /** 909 | * explicitly assign by copying memory 910 | */ 911 | template 912 | ndarray& assign(const ndarray& o, cudaStream_t stream = 0) { 913 | if (!copy_memory(o, false, stream)) 914 | throw std::runtime_error("copying ndarray did not succeed. Maybe a shape mismatch?"); 915 | return *this; 916 | } 917 | 918 | /** 919 | * assign from ndarray of same type 920 | * 921 | * always an O(1) operation. 922 | */ 923 | ndarray& operator=(const ndarray& o) { 924 | if (this == &o) 925 | return *this; // check for self-assignment 926 | 927 | // TODO make use of copy-and-swap idiom 928 | m_memory = o.mem(); 929 | m_ptr = const_cast(o.ptr()); 930 | m_info = o.info(); 931 | return *this; 932 | } 933 | 934 | /** 935 | * assign from value (sets all elements equal to one scalar) 936 | */ 937 | template 938 | typename boost::enable_if_c::value, ndarray&>::type operator=( 939 | const _V& scalar) { 940 | fill(*this, scalar); 941 | return *this; 942 | } 943 | 944 | /** 945 | * assign from ndarray of different memory space type. 946 | * 947 | * If shapes do not match, it defaults to linear memory. 948 | * 949 | * this copies memory (obviously) but tries to avoid reallocation 950 | */ 951 | template 952 | ndarray& assign(const ndarray& o, cudaStream_t stream = 0) { 953 | if (!copy_memory(o, false, stream)) 954 | copy_memory(o, linear_memory_tag(), stream); 955 | if (mem()) 956 | // if mem() does not exist, we're just wrapping a pointer 957 | // of a std::vector or so -> simply keep it 958 | m_ptr = mem()->ptr(); 959 | return *this; 960 | } 961 | 962 | /** 963 | * assign from ndarray of different memory space type. 964 | * 965 | * If shapes do not match, it defaults to linear memory. 966 | * 967 | * this copies memory (obviously) but tries to avoid reallocation 968 | */ 969 | template 970 | ndarray& operator=(const ndarray& o) { 971 | return assign(o); 972 | } 973 | 974 | /** 975 | * assign from ndarray of different memory layout type. 976 | * 977 | * this does not copy memory, but reverses strides and shapes. 
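 *
 * Example (a minimal sketch; assumes the row_major/column_major layout tags used
 * elsewhere in this library):
 * @code
 * cuv::ndarray<float, cuv::host_memory_space, cuv::row_major>    r(cuv::extents[3][4]);
 * cuv::ndarray<float, cuv::host_memory_space, cuv::column_major> c;
 * c = r;   // O(1): memory is shared, only shape and strides are reversed
 * // afterwards c.shape(0) == 4 and c.shape(1) == 3
 * @endcode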
978 | */ 979 | template 980 | ndarray& operator=(const ndarray& o) { 981 | return assign(o); 982 | } 983 | 984 | /** @} */ // assignment 985 | /** 986 | * copy memory using given allocator tag (linear/pitched) 987 | */ 988 | template 989 | ndarray copy(T tag = linear_memory_tag(), cudaStream_t stream = 0) const { 990 | ndarray t(m_allocator); 991 | const ndarray& o = *this; 992 | t.m_info = o.info(); 993 | t.copy_memory(o, tag, stream); 994 | t.m_ptr = t.mem()->ptr(); 995 | return t; 996 | } 997 | 998 | /** 999 | * copy memory using linear memory 1000 | */ 1001 | ndarray copy() const { 1002 | return copy(linear_memory_tag()); 1003 | } 1004 | 1005 | /** 1006 | * create a sub-ndarray of the current ndarray 1007 | * 1008 | * this works in O(1). 1009 | */ 1010 | template 1011 | ndarray_view operator[](const index_gen& idx) const { 1012 | 1013 | ndarray_view t(m_allocator); 1014 | const ndarray& o = *this; 1015 | t.m_memory = o.mem(); 1016 | t.m_ptr = const_cast(o.ptr()); 1017 | 1018 | std::vector shapes; 1019 | std::vector strides; 1020 | shapes.reserve(D); 1021 | strides.reserve(D); 1022 | cuvAssert(o.ndim()==D); 1023 | 1024 | for (size_t i = 0; i < D; i++) { 1025 | int start = idx.ranges_[i].get_start(0); 1026 | int finish = idx.ranges_[i].get_finish(o.shape(i)); 1027 | int stride = idx.ranges_[i].stride(); 1028 | if (start < 0) 1029 | start += o.shape(i); 1030 | if (finish < 0) 1031 | finish += o.shape(i); 1032 | #ifndef NDEBUG 1033 | cuvAssert(finish>start); 1034 | #endif 1035 | t.m_ptr += start * o.stride(i); 1036 | if (idx.ranges_[i].is_degenerate()) { 1037 | // skip dimension 1038 | } else { 1039 | shapes.push_back((finish - start) / stride); 1040 | strides.push_back(o.stride(i) * stride); 1041 | } 1042 | } 1043 | 1044 | // store in m_info 1045 | t.m_info.resize(shapes.size()); 1046 | 1047 | std::copy(shapes.begin(), shapes.end(), t.m_info.host_shape[0].ptr); 1048 | std::copy(strides.begin(), strides.end(), t.m_info.host_stride[0].ptr); 1049 | return t; // should not copy mem, only m_info 1050 | } 1051 | 1052 | /** 1053 | * reshape the ndarray (in place) 1054 | * 1055 | * works only for c_contiguous memory! 1056 | * 1057 | * @param eg new shape 1058 | */ 1059 | template 1060 | void reshape(const extent_gen& eg) { 1061 | std::vector shape(D); 1062 | for (size_t i = 0; i < D; i++) 1063 | shape[i] = eg.ranges_[i].finish(); 1064 | reshape(shape); 1065 | } 1066 | /** 1067 | * reshape the ndarray (in place) 1068 | * 1069 | * works only for c_contiguous memory! 
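 *
 * Example (a minimal sketch; either this overload or the extent_gen overload above
 * can be used, the array name is hypothetical):
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[6][4]);
 * a.reshape(cuv::extents[2][3][4]);  // ok: 6*4 == 2*3*4, strides recomputed, no copy
 * a.reshape(4, 6);                   // convenience overload for two dimensions
 * // a new shape whose product differs from size() throws std::runtime_error
 * @endcode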
1070 | * 1071 | * @param shape new shape 1072 | */ 1073 | void reshape(const std::vector& shape) { 1074 | size_type new_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); 1075 | if (!is_c_contiguous()) 1076 | throw std::runtime_error("cannot reshape: ndarray is not c_contiguous"); 1077 | if (size() != new_size) 1078 | throw std::runtime_error("cannot reshape: products do not match"); 1079 | m_info.resize(shape.size()); 1080 | size_type size = 1; 1081 | if (IsSame::Result::value) 1082 | for (int i = shape.size() - 1; i >= 0; i--) { 1083 | m_info.host_shape[i] = shape[i]; 1084 | m_info.host_stride[i] = size; 1085 | size *= shape[i]; 1086 | } 1087 | else 1088 | for (size_t i = 0; i < shape.size(); i++) { 1089 | m_info.host_shape[i] = shape[i]; 1090 | m_info.host_stride[i] = size; 1091 | size *= shape[i]; 1092 | } 1093 | } 1094 | /** 1095 | * convenience wrapper for reshape(extents[r][c]) 1096 | * @param r leading index of new shape 1097 | * @param c following index of new shape 1098 | */ 1099 | void reshape(size_type r, size_type c) { 1100 | reshape(extents[r][c]); 1101 | } 1102 | 1103 | /** 1104 | * resize the ndarray (deallocates memory if product changes, otherwise equivalent to reshape) 1105 | * 1106 | * @param shape new shape 1107 | */ 1108 | void resize(const std::vector& shape) { 1109 | if (ndim() != 0) { 1110 | size_type new_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); 1111 | if (is_c_contiguous() && size() == new_size) { 1112 | reshape(shape); 1113 | return; 1114 | } 1115 | } 1116 | 1117 | // free memory before we allocate new memory (important if pooling is active) 1118 | m_memory.reset(new memory(0, 0, m_allocator)); 1119 | *this = ndarray(shape, m_allocator); 1120 | } 1121 | /** 1122 | * resize the ndarray (deallocates memory if product changes, otherwise equivalent to reshape) 1123 | * 1124 | * @overload 1125 | * 1126 | * @param eg new shape 1127 | */ 1128 | template 1129 | void resize(const extent_gen& eg) { 1130 | std::vector shape(D); 1131 | for (size_t i = 0; i < D; i++) 1132 | shape[i] = eg.ranges_[i].finish(); 1133 | resize(shape); 1134 | } 1135 | 1136 | /** 1137 | * convenience wrapper for resize(extents[size]) 1138 | * @param size size of the new shape 1139 | */ 1140 | void resize(size_type size) { 1141 | resize(extents[size]); 1142 | } 1143 | 1144 | /** 1145 | * convenience wrapper for resize(extents[r][c]) 1146 | * @param r leading index of new shape 1147 | * @param c following index of new shape 1148 | */ 1149 | void resize(size_type r, size_type c) { 1150 | resize(extents[r][c]); 1151 | } 1152 | 1153 | /** 1154 | * force deallocation of memory if possible 1155 | */ 1156 | void dealloc() { 1157 | m_memory.reset(); 1158 | m_ptr = NULL; 1159 | m_info.host_shape.set_size(0); 1160 | } 1161 | 1162 | /// tries to copy memory, succeeds if shapes match AND both ndarrays are c_contiguous or 2d-copyable. 1163 | template 1164 | bool copy_memory(const ndarray& src, bool force_dst_contiguous, cudaStream_t stream) { 1165 | if (effective_shape() != src.effective_shape() || !ptr()) { 1166 | return false; 1167 | } 1168 | 1169 | assert(m_memory.get()); 1170 | // ATTENTION: m_ptr might be different than m_memory->ptr()! 
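// Overview of the dispatch below, as written: a single flat copy_from() when both
// arrays are c_contiguous; copy2d_from() with the source pitch when only the
// destination is contiguous; copy2d_from() with the destination pitch when the
// destination is 2d-copyable and the source is contiguous (skipped when
// force_dst_contiguous is set). The final branch computes both pitches and looks
// intended for the case where both sides are 2d-copyable, although its condition
// repeats the previous one. Any other stride pattern throws std::runtime_error.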
1171 | 1172 | // TODO: this could be probably implemented in the memory classes as well 1173 | 1174 | if (is_c_contiguous() && src.is_c_contiguous()) { 1175 | // can copy w/o bothering about m_memory 1176 | m_memory->copy_from(m_ptr, src.ptr(), src.size(), OM(), stream); 1177 | } else if (is_c_contiguous() && src.is_2dcopyable()) { 1178 | size_type row, col, pitch; 1179 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1180 | m_memory->copy2d_from(m_ptr, src.ptr(), col, pitch, row, col, OM(), stream); 1181 | } else if (!force_dst_contiguous && is_2dcopyable() && src.is_c_contiguous()) { 1182 | size_type row, col, pitch; 1183 | detail::get_pitched_params(row, col, pitch, info().host_shape, info().host_stride, L()); 1184 | m_memory->copy2d_from(m_ptr, src.ptr(), pitch, col, row, col, OM(), stream); 1185 | } else if (!force_dst_contiguous && is_2dcopyable() && src.is_c_contiguous()) { 1186 | size_type srow, scol, spitch; 1187 | size_type drow, dcol, dpitch; 1188 | detail::get_pitched_params(drow, dcol, dpitch, info().host_shape, info().host_stride, L()); 1189 | detail::get_pitched_params(srow, scol, spitch, src.info().host_shape, src.info().host_stride, OL()); 1190 | cuvAssert(scol==srow); 1191 | cuvAssert(dcol==drow); 1192 | m_memory->copy2d_from(m_ptr, src.ptr(), dpitch, spitch, srow, scol, OM(), stream); 1193 | } else { 1194 | throw std::runtime_error("copying of generic strides not implemented yet"); 1195 | } 1196 | 1197 | if (!IsSame::Result::value) { 1198 | info().host_stride.reverse(); 1199 | info().host_shape.reverse(); 1200 | } 1201 | return true; 1202 | } 1203 | 1204 | /// copies between different memory spaces 1205 | template 1206 | void copy_memory(const ndarray& src, linear_memory_tag, cudaStream_t stream) { 1207 | if (copy_memory(src, true, stream)) // destination must be contiguous 1208 | return; 1209 | info().resize(src.ndim()); 1210 | info().host_shape = src.info().host_shape; 1211 | 1212 | // free old memory 1213 | m_memory.reset(new memory(m_allocator)); 1214 | 1215 | linear_memory d(src.size(), m_allocator); 1216 | d.set_strides(info().host_stride, info().host_shape, L()); 1217 | if (src.is_c_contiguous()) { 1218 | // easiest case: both linear, simply copy 1219 | d.copy_from(src.ptr(), src.size(), OM(), stream); 1220 | } else if (src.is_2dcopyable()) { 1221 | // other memory is probably a pitched memory or some view onto an array 1222 | size_type row, col, pitch; 1223 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1224 | d.copy2d_from(src.ptr(), col, pitch, row, col, OM(), stream); 1225 | } else { 1226 | throw std::runtime_error("copying arbitrarily strided memory not implemented"); 1227 | } 1228 | mem().reset(new memory(d.release(), d.size(), m_allocator)); 1229 | if (!IsSame::Result::value) { 1230 | info().host_stride.reverse(); 1231 | info().host_shape.reverse(); 1232 | } 1233 | } 1234 | 1235 | /// copies between different memory spaces 1236 | template 1237 | void copy_memory(const ndarray& src, pitched_memory_tag, cudaStream_t stream) { 1238 | assert(src.ndim()>=2); 1239 | if (copy_memory(src, false, stream)) // destination need not be contiguous 1240 | return; 1241 | info().resize(src.ndim()); 1242 | info().host_shape = src.info().host_shape; 1243 | size_type row, col, pitch; 1244 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1245 | pitched_memory d(row, col); 1246 | //dst.mem().reset(d); 1247 | 
d->set_strides(info().host_stride, info().host_shape, L()); 1248 | if (src.is_2dcopyable()) { 1249 | // other memory is probably a pitched memory or some view onto an array 1250 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1251 | d.copy2d_from(src, stream); 1252 | } else { 1253 | throw std::runtime_error("copying arbitrarily strided memory not implemented"); 1254 | } 1255 | mem().reset(new memory(d.release(), d.size(), m_allocator)); 1256 | 1257 | if (!IsSame::Result::value) { 1258 | info().host_stride.reverse(); 1259 | info().host_shape.reverse(); 1260 | } 1261 | } 1262 | 1263 | }; 1264 | 1265 | /** 1266 | * primarily used as result of ndarray::operator[] 1267 | */ 1268 | template 1269 | class ndarray_view: public ndarray 1270 | { 1271 | private: 1272 | typedef ndarray super; 1273 | using super::m_memory; 1274 | using super::m_ptr; 1275 | using super::m_info; 1276 | 1277 | template 1278 | friend class ndarray; 1279 | 1280 | public: 1281 | 1282 | /** default constructor does nothing */ 1283 | ndarray_view(const boost::shared_ptr& allocator) : 1284 | ndarray(allocator) { 1285 | } 1286 | 1287 | /** 1288 | * /always/ try to copy memory 1289 | */ 1290 | ndarray_view& assign(const ndarray& o, cudaStream_t stream = 0) { 1291 | if (!this->copy_memory(o, false, stream)) 1292 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?"); 1293 | return *this; 1294 | } 1295 | 1296 | /** 1297 | * /always/ try to copy memory 1298 | */ 1299 | ndarray_view& assign(const ndarray_view& o, cudaStream_t stream = 0) { 1300 | if (!this->copy_memory(o, false, stream)) 1301 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?"); 1302 | return *this; 1303 | } 1304 | 1305 | /** 1306 | * assignment operator for other memory space type 1307 | * 1308 | * @param o a ndarray of another memory space type 1309 | */ 1310 | template 1311 | ndarray_view& assign(const ndarray& o, cudaStream_t stream = 0) { 1312 | if (!this->copy_memory(o, false, stream)) 1313 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?"); 1314 | return *this; 1315 | } 1316 | 1317 | /** 1318 | * assignment operator for views in other memory space types 1319 | * 1320 | * @param o a ndarray_view of another memory space type 1321 | */ 1322 | template 1323 | ndarray_view& assign(const ndarray_view& o, cudaStream_t stream = 0) { 1324 | if (!this->copy_memory(o, false, stream)) 1325 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. 
Maybe a shape mismatch?"); 1326 | return *this; 1327 | } 1328 | 1329 | /** 1330 | * /always/ try to copy memory 1331 | */ 1332 | ndarray_view& operator=(const ndarray& o) { 1333 | return assign(o); 1334 | } 1335 | 1336 | /** 1337 | * /always/ try to copy memory 1338 | */ 1339 | ndarray_view& operator=(const ndarray_view& o) { 1340 | return assign(o); 1341 | } 1342 | 1343 | /** 1344 | * assign from value (sets all elements equal to one scalar) 1345 | * 1346 | * @param scalar value which should be assigned to all elements 1347 | */ 1348 | template 1349 | typename boost::enable_if_c::value, ndarray_view&>::type operator=( 1350 | const _V& scalar) { 1351 | super::operator=(scalar); 1352 | return *this; 1353 | } 1354 | 1355 | /** 1356 | * assignment operator for other memory space type 1357 | * 1358 | * @param o a ndarray of another memory space type 1359 | */ 1360 | template 1361 | ndarray_view& operator=(const ndarray& o) { 1362 | return assign(o); 1363 | } 1364 | 1365 | /** 1366 | * assignment operator for views in other memory space types 1367 | * 1368 | * @param o a ndarray_view of another memory space type 1369 | */ 1370 | template 1371 | ndarray_view& operator=(const ndarray_view& o) { 1372 | return assign(o); 1373 | } 1374 | 1375 | /** 1376 | * construct ndarray_view 1377 | * 1378 | * @warning if a dimension has size 1, the resulting ndarray has fewer dimensions than the original one. 1379 | * 1380 | * @warning most operations in CUV on ndarrays currently only work 1381 | * if the sub-ndarray is a connected area in memory. Basically this 1382 | * means that you can only slice in the first dimension which has 1383 | * size>1. 1384 | * 1385 | * @param idx the indices of the sub-ndarray 1386 | * @param o the original ndarray 1387 | * 1388 | * Example: 1389 | * @code 1390 | * ndarray v(extents[5][10]); 1391 | * 1392 | * // these are equivalent: 1393 | * ndarray w0(v,indices[index_range(2,3)][index_range(0,10)]); 1394 | * ndarray w0(v,indices[index_range(2,3)][index_range()]); 1395 | * ndarray w0(v,indices[index_range(2,3)][index_range() < index(10)]); 1396 | * ndarray w0(v,indices[index_range(2,3)][index(0) < index_range() < index(10)]); 1397 | * 1398 | * // yields a 1D-ndarray corresponding to the 2nd slice in the 1st dimension: 1399 | * ndarray w0(indices[1][index_range()]); 1400 | * @endcode 1401 | */ 1402 | template 1403 | explicit ndarray_view(const ndarray& o, const index_gen& idx) : 1404 | ndarray(o.m_allocator) 1405 | { 1406 | m_memory = o.mem(); 1407 | m_ptr = const_cast(o.ptr()); 1408 | std::vector shapes; 1409 | std::vector strides; 1410 | shapes.reserve(D); 1411 | strides.reserve(D); 1412 | cuvAssert(o.ndim()==D); 1413 | for (size_t i = 0; i < D; i++) { 1414 | int start = idx.ranges_[i].get_start(0); 1415 | int finish = idx.ranges_[i].get_finish(o.shape(i)); 1416 | int stride = idx.ranges_[i].stride(); 1417 | if (start < 0) 1418 | start += o.shape(i); 1419 | if (finish < 0) 1420 | finish += o.shape(i); 1421 | #ifndef NDEBUG 1422 | cuvAssert(finish>start); 1423 | #endif 1424 | m_ptr += start * o.stride(i); 1425 | if (idx.ranges_[i].is_degenerate()) { 1426 | // skip dimension 1427 | } else { 1428 | shapes.push_back((finish - start) / stride); 1429 | strides.push_back(o.stride(i) * stride); 1430 | } 1431 | } 1432 | // store in m_info 1433 | m_info.resize(shapes.size()); 1434 | std::copy(shapes.begin(), shapes.end(), m_info.host_shape[0].ptr); 1435 | std::copy(strides.begin(), strides.end(), m_info.host_stride[0].ptr); 1436 | } 1437 | 1438 | /** 1439 | * different order of arguments 
as above, all else being equal. 1440 | * 1441 | * @deprecated 1442 | * @param idx a set of index ranges into o 1443 | * @param o other ndarray 1444 | */ 1445 | template 1446 | explicit ndarray_view(const index_gen& idx, const ndarray& o) : 1447 | ndarray(o.m_allocator) 1448 | { 1449 | m_memory = o.mem(); 1450 | m_ptr = const_cast(o.ptr()); 1451 | std::vector shapes; 1452 | std::vector strides; 1453 | shapes.reserve(D); 1454 | strides.reserve(D); 1455 | cuvAssert(o.ndim()==D); 1456 | for (size_t i = 0; i < D; i++) { 1457 | int start = idx.ranges_[i].get_start(0); 1458 | int finish = idx.ranges_[i].get_finish(o.shape(i)); 1459 | int stride = idx.ranges_[i].stride(); 1460 | if (start < 0) 1461 | start += o.shape(i); 1462 | if (finish < 0) 1463 | finish += o.shape(i); 1464 | #ifndef NDEBUG 1465 | cuvAssert(finish>start); 1466 | #endif 1467 | m_ptr += start * o.stride(i); 1468 | if (idx.ranges_[i].is_degenerate()) { 1469 | // skip dimension 1470 | } else { 1471 | shapes.push_back((finish - start) / stride); 1472 | strides.push_back(o.stride(i) * stride); 1473 | } 1474 | } 1475 | // store in m_info 1476 | m_info.resize(shapes.size()); 1477 | std::copy(shapes.begin(), shapes.end(), m_info.host_shape[0].ptr); 1478 | std::copy(strides.begin(), strides.end(), m_info.host_stride[0].ptr); 1479 | } 1480 | }; 1481 | 1482 | /** @} */ // data_structures 1483 | /** 1484 | * test whether two ndarrays have the same shape 1485 | * @ingroup tools 1486 | * @param a first ndarray 1487 | * @param a second ndarray 1488 | */ 1489 | template 1490 | bool equal_shape(const ndarray& a, const ndarray& b) { 1491 | return a.effective_shape() == b.effective_shape(); 1492 | } 1493 | 1494 | /** 1495 | * @addtogroup MetaProgramming 1496 | */ 1497 | /// create a ndarray type with the same template parameters, but with switched value type 1498 | template 1499 | struct switch_value_type { 1500 | typedef ndarray type; ///< new ndarray type after switch 1501 | }; 1502 | /// create a ndarray type with the same template parameters, but with switched memory_layout_type 1503 | template 1504 | struct switch_memory_layout_type { 1505 | typedef ndarray type; ///< new ndarray type after switch 1506 | }; 1507 | /// create a ndarray type with the same template parameters, but with switched memory_space_type 1508 | template 1509 | struct switch_memory_space_type { 1510 | typedef ndarray type; ///< new ndarray type after switch 1511 | }; 1512 | 1513 | /** @} */ 1514 | 1515 | } 1516 | 1517 | /** 1518 | * input and output operations 1519 | * 1520 | * @addtogroup io 1521 | * @{ 1522 | */ 1523 | namespace std { 1524 | 1525 | /** 1526 | * print a host linear memory to a stream 1527 | * @param o the stream 1528 | * @param t the ndarray 1529 | */ 1530 | template 1531 | ostream& operator<<(ostream& o, const cuv::linear_memory& t) { 1532 | o << "[ "; 1533 | for (unsigned int i = 0; i < t.size(); i++) 1534 | o << t[i] << " "; 1535 | o << "]"; 1536 | return o; 1537 | } 1538 | 1539 | /** 1540 | * print a dev linear memory to a stream (copies first) 1541 | * @param o the stream 1542 | * @param t_ the ndarray 1543 | */ 1544 | template 1545 | ostream& operator<<(ostream& o, const cuv::linear_memory& t_) { 1546 | cuv::linear_memory t = t_; // pull 1547 | o << "[ "; 1548 | for (unsigned int i = 0; i < t.size(); i++) 1549 | o << t[i] << " "; 1550 | o << "]"; 1551 | return o; 1552 | } 1553 | 1554 | /** 1555 | * print a host pitched memory to a stream 1556 | * @param o the stream 1557 | * @param t the ndarray 1558 | */ 1559 | template 1560 | ostream& 
operator<<(ostream& o, const cuv::pitched_memory& t) { 1561 | o << "[ "; 1562 | for (unsigned int i = 0; i < t.rows(); i++) { 1563 | for (unsigned int j = 0; j < t.rows(); j++) { 1564 | o << t(i, j) << " "; 1565 | } 1566 | if (i < t.rows() - 1) 1567 | o << std::endl; 1568 | } 1569 | o << "]"; 1570 | return o; 1571 | } 1572 | 1573 | /** 1574 | * print a dev pitched memory to a stream (copies first) 1575 | * @param o the stream 1576 | * @param t_ the ndarray 1577 | */ 1578 | template 1579 | ostream& operator<<(ostream& o, const cuv::pitched_memory& t_) { 1580 | cuv::pitched_memory t = t_; // pull 1581 | o << "[ "; 1582 | for (unsigned int i = 0; i < t.rows(); i++) { 1583 | for (unsigned int j = 0; j < t.rows(); j++) { 1584 | o << t(i, j) << " "; 1585 | } 1586 | if (i < t.rows() - 1) 1587 | o << std::endl; 1588 | } 1589 | o << "]"; 1590 | return o; 1591 | } 1592 | 1593 | /** 1594 | * print a dev ndarray to a stream (copying to host first) 1595 | * 1596 | * @param o the stream 1597 | * @param t the ndarray 1598 | */ 1599 | template 1600 | ostream& operator<<(ostream& o, const cuv::ndarray& t) { 1601 | return o << cuv::ndarray(t); 1602 | } 1603 | 1604 | /** 1605 | * print a host ndarray to a stream 1606 | * 1607 | * @param o the stream 1608 | * @param t the ndarray 1609 | */ 1610 | template 1611 | ostream& operator<<(ostream& o, const cuv::ndarray& t) { 1612 | if (t.ndim() == 0) 1613 | return o << "[]"; 1614 | 1615 | if (t.ndim() == 1) { 1616 | o << "[ "; 1617 | for (unsigned int i = 0; i < t.shape(0); i++) 1618 | o << t[i] << " "; 1619 | return o << "]"; 1620 | } 1621 | if (t.ndim() == 2) { 1622 | o << "["; 1623 | for (unsigned int i = 0; i < t.shape(0); ++i) { 1624 | if (i > 0) 1625 | o << " "; 1626 | o << "[ "; 1627 | for (unsigned int j = 0; j < t.shape(1); j++) 1628 | o << t(i, j) << " "; 1629 | o << "]"; 1630 | if (i != t.shape(0) - 1) 1631 | o << std::endl; 1632 | } 1633 | return o << "]"; 1634 | } 1635 | if (t.ndim() == 3) { 1636 | o << "[" << std::endl; 1637 | for (unsigned int l = 0; l < t.shape(0); l++) { 1638 | o << "["; 1639 | for (unsigned int i = 0; i < t.shape(1); ++i) { 1640 | if (i > 0) 1641 | o << " "; 1642 | o << "[ "; 1643 | //for(unsigned int j=0;j3 dimensions not implemented"); 1657 | } 1658 | } 1659 | /** @} */ // io 1660 | #endif 1661 | -------------------------------------------------------------------------------- /src/cuv/reference.cu: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #include "reference.hpp" 28 | 29 | #include 30 | 31 | namespace cuv { 32 | namespace detail { 33 | 34 | template 35 | void entry_set(value_type* ptr, size_t idx, value_type val, host_memory_space) { 36 | ptr[idx] = val; 37 | } 38 | 39 | template 40 | value_type entry_get(const value_type* ptr, size_t idx, host_memory_space) { 41 | return ptr[idx]; 42 | } 43 | 44 | template 45 | void entry_set(value_type* ptr, size_t idx, value_type val, dev_memory_space) { 46 | thrust::device_ptr dev_ptr(ptr); 47 | dev_ptr[idx] = val; 48 | } 49 | 50 | template 51 | value_type entry_get(const value_type* ptr, size_t idx, dev_memory_space) { 52 | const thrust::device_ptr dev_ptr(ptr); 53 | return static_cast(*(dev_ptr + idx)); 54 | } 55 | 56 | } 57 | } 58 | 59 | template 60 | std::ostream& operator<<(std::ostream& os, const cuv::reference& reference) { 61 | os << static_cast(reference); 62 | return os; 63 | } 64 | 65 | 66 | #define CUV_REFERENCE_INST(TYPE) \ 67 | template void cuv::detail::entry_set(TYPE*, size_t, TYPE, cuv::host_memory_space); \ 68 | template void cuv::detail::entry_set(TYPE*, size_t, TYPE, cuv::dev_memory_space); \ 69 | template TYPE cuv::detail::entry_get(const TYPE*, size_t, cuv::host_memory_space); \ 70 | template TYPE cuv::detail::entry_get(const TYPE*, size_t, cuv::dev_memory_space); \ 71 | template std::ostream& operator<<(std::ostream& os, const cuv::reference& reference); \ 72 | template std::ostream& operator<<(std::ostream& os, const cuv::reference& reference); 73 | 74 | CUV_REFERENCE_INST(signed char); 75 | CUV_REFERENCE_INST(unsigned char); 76 | CUV_REFERENCE_INST(short); 77 | CUV_REFERENCE_INST(unsigned short); 78 | CUV_REFERENCE_INST(int); 79 | CUV_REFERENCE_INST(unsigned int); 80 | CUV_REFERENCE_INST(float); 81 | CUV_REFERENCE_INST(double); 82 | -------------------------------------------------------------------------------- /src/cuv/reference.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_REFERENCE_HPP__ 28 | #define __CUV_REFERENCE_HPP__ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include "meta_programming.hpp" 35 | #include "tags.hpp" 36 | 37 | namespace cuv { 38 | 39 | namespace detail { 40 | 41 | /** 42 | * @brief Setting entry of host linear_memory at ptr at index idx to value val 43 | * 44 | * @param ptr Address of array in memory 45 | * @param idx Index of value to set 46 | * @param val Value to set linear_memory entry to 47 | * 48 | */ 49 | template 50 | void entry_set(value_type* ptr, size_t idx, value_type val, host_memory_space); 51 | 52 | /** 53 | * @brief Getting entry of host linear_memory at ptr at index idx 54 | * 55 | * @param ptr Address of array in memory 56 | * @param idx Index of value to get 57 | * 58 | * @return 59 | */ 60 | template 61 | value_type entry_get(const value_type* ptr, size_t idx, host_memory_space); 62 | 63 | template 64 | void entry_set(value_type* ptr, size_t idx, value_type val, dev_memory_space); 65 | 66 | /** 67 | * Set the value at *(ptr+idx) to val, when ptr is in dev_memory_space. 68 | */ 69 | template 70 | value_type entry_get(const value_type* ptr, size_t idx, dev_memory_space); 71 | 72 | } 73 | 74 | /** 75 | * This objects acts like a reference to the object stored at the wrapped pointer. 
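 *
 * Minimal usage sketch (host side, so it runs without a device; variable names
 * are hypothetical):
 * \code
 * float v = 0.f;
 * cuv::reference<float, cuv::host_memory_space> r(&v);
 * r = 3.f;         // entry_set: writes through the wrapped pointer
 * r += 1.f;        // read-modify-write, v is now 4
 * float x = r;     // entry_get: implicit conversion reads the value back
 * \endcode
 * For dev_memory_space the same operations are routed through thrust::device_ptr
 * (see reference.cu).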
76 | * \ingroup data_structures 77 | */ 78 | template 79 | class reference 80 | { 81 | 82 | public: 83 | 84 | typedef typename unconst::type value_type; ///< the type of the pointer 85 | typedef M memory_space_type; ///< the memory space of the pointer 86 | typedef reference my_type; ///< the type of this reference 87 | 88 | value_type* ptr; ///< the wrapped pointer 89 | 90 | /// convert to the stored value 91 | operator value_type() const { 92 | return detail::entry_get(ptr, 0, memory_space_type()); 93 | } 94 | 95 | /// assign a new value 96 | void operator=(const value_type& v) { 97 | detail::entry_set(ptr, 0, v, memory_space_type()); 98 | } 99 | 100 | /// assign a value of a different (but convertible) value type 101 | template 102 | typename boost::enable_if_c::value>::type operator=(const _T& v) { 103 | detail::entry_set(ptr, 0, (value_type) v, memory_space_type()); 104 | } 105 | 106 | /// assignment from reference of same type 107 | reference& operator=(const reference& o) { 108 | if (&o == &(*this)) // operator & is overloaded and returns value_type* 109 | return *this; 110 | (*this) = (value_type) o; 111 | return *this; 112 | } 113 | 114 | /// assignment from reference of other memory type 115 | template 116 | reference& operator=(const reference& o) { 117 | (*this) = static_cast(o); 118 | return *this; 119 | } 120 | 121 | /// get the wrapped pointer 122 | const value_type* operator&() const { 123 | return ptr; 124 | } 125 | 126 | /// get the wrapped pointer 127 | value_type* operator&() { 128 | return ptr; 129 | } 130 | 131 | /// construct using a pointer 132 | reference(const T* p) : 133 | ptr(p) { 134 | } 135 | 136 | /// construct using a pointer 137 | reference(T* p) : 138 | ptr(p) { 139 | } 140 | 141 | /// implicit construction using value 142 | reference(value_type& p) : 143 | ptr(&p) { 144 | } 145 | 146 | /// implicit construction using value 147 | reference(const value_type& p) : 148 | ptr(&p) { 149 | } 150 | 151 | /// add to the value stored at ptr 152 | my_type& operator+=(const value_type& v) { 153 | *this = (value_type) (*this) + v; 154 | return *this; 155 | } 156 | 157 | /// subtract from the value stored at ptr 158 | my_type& operator-=(const value_type& v) { 159 | *this = (value_type) (*this) - v; 160 | return *this; 161 | } 162 | 163 | /// multiply with the value stored at ptr 164 | my_type& operator*=(const value_type& v) { 165 | *this = (value_type) (*this) * v; 166 | return *this; 167 | } 168 | 169 | /// divide by the value stored at ptr 170 | my_type& operator/=(const value_type& v) { 171 | *this = (value_type) (*this) / v; 172 | return *this; 173 | } 174 | 175 | /// increment value at ptr 176 | value_type operator++(int) { 177 | value_type v = *this; 178 | *this = v + 1; 179 | return v; 180 | } 181 | 182 | /// decrement value at ptr 183 | value_type operator--(int) { 184 | value_type v = *this; 185 | *this = v - 1; 186 | return v; 187 | } 188 | 189 | /// increment value at ptr 190 | value_type operator++() { 191 | value_type v = *this; 192 | *this = v + 1; 193 | return v + 1; 194 | } 195 | 196 | /// decrement value at ptr 197 | value_type operator--() { 198 | value_type v = *this; 199 | *this = v - 1; 200 | return v - 1; 201 | } 202 | 203 | /// compare value at ptr with another 204 | bool operator==(const value_type& v) { 205 | return ((value_type) *this) == v; 206 | } 207 | 208 | /// compare value at ptr with another 209 | bool operator<=(const value_type& v) { 210 | return ((value_type) *this) <= v; 211 | } 212 | 213 | /// compare value at ptr with another 
214 | bool operator<(const value_type& v) { 215 | return ((value_type) *this) < v; 216 | } 217 | 218 | /// compare value at ptr with another 219 | bool operator>=(const value_type& v) { 220 | return ((value_type) *this) >= v; 221 | } 222 | 223 | /// compare value at ptr with another 224 | bool operator>(const value_type& v) { 225 | return ((value_type) *this) > v; 226 | } 227 | }; 228 | 229 | } 230 | 231 | template 232 | std::ostream& operator<<(std::ostream& os, const cuv::reference& reference); 233 | 234 | #endif 235 | -------------------------------------------------------------------------------- /src/cuv/tags.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_TAGS_HPP__ 28 | #define __CUV_TAGS_HPP__ 29 | 30 | namespace cuv { 31 | /** 32 | * @addtogroup tags 33 | * @{ 34 | */ 35 | 36 | /** Tag for host memory 37 | * @ingroup basics 38 | */ 39 | struct host_memory_space { 40 | }; 41 | 42 | /** Tag for device memory 43 | * @ingroup basics 44 | */ 45 | struct dev_memory_space { 46 | }; 47 | 48 | /** 49 | * @} 50 | */ 51 | } 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /src/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # The MIT License 3 | 4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2008-2009 Sebastian Nowozin 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR}) 27 | INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/src/cuv) 28 | INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/src/) 29 | 30 | FIND_PACKAGE(TBB REQUIRED) 31 | 32 | CUDA_INCLUDE_DIRECTORIES( ${TBB_INCLUDE_DIRS} ) 33 | INCLUDE_DIRECTORIES( ${TBB_INCLUDE_DIRS} ) 34 | 35 | SET (TEST_LINK_LIBS ${Boost_LIBRARIES} ${TBB_LIBRARIES} ndarray${LIB_SUFFIX}) 36 | 37 | CUDA_ADD_EXECUTABLE(allocators_test allocators_test.cpp) 38 | TARGET_LINK_LIBRARIES(allocators_test ${TEST_LINK_LIBS}) 39 | 40 | CUDA_ADD_EXECUTABLE(ndarray_test ndarray_test.cpp) 41 | TARGET_LINK_LIBRARIES(ndarray_test ${TEST_LINK_LIBS}) 42 | 43 | ADD_TEST(allocators_test "${CMAKE_BINARY_DIR}/src/tests/allocators_test") 44 | ADD_TEST(ndarray_test "${CMAKE_BINARY_DIR}/src/tests/ndarray_test") 45 | -------------------------------------------------------------------------------- /src/tests/allocators_test.cpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #define BOOST_TEST_MODULE example 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | using namespace cuv; 37 | 38 | BOOST_AUTO_TEST_SUITE(allocators_test) 39 | 40 | template 41 | static void test_pooled_allocator() { 42 | memory_space m; 43 | pooled_cuda_allocator allocator; 44 | int* ptr1 = 0; 45 | int* ptr2 = 0; 46 | 47 | const int NUM_ELEMENTS = 10000; 48 | 49 | allocator.alloc(reinterpret_cast(&ptr1), NUM_ELEMENTS, sizeof(int), m); 50 | allocator.alloc(reinterpret_cast(&ptr2), NUM_ELEMENTS, sizeof(int), m); 51 | BOOST_CHECK(ptr1); 52 | BOOST_CHECK(ptr2); 53 | BOOST_CHECK_NE(ptr1, ptr2); 54 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 55 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 0); 56 | BOOST_CHECK_EQUAL(allocator.pool_size(m), 2 * NUM_ELEMENTS * sizeof(int)); 57 | 58 | for (size_t i = 0; i < 10000; i++) { 59 | reference ref(ptr1 + i); 60 | ref = i; 61 | BOOST_CHECK_EQUAL(static_cast(ref), i); 62 | } 63 | 64 | allocator.dealloc(reinterpret_cast(&ptr1), m); 65 | BOOST_CHECK(ptr1 == 0); 66 | 67 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 68 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 1); 69 | BOOST_CHECK_EQUAL(allocator.pool_size(m), 2 * NUM_ELEMENTS * sizeof(int)); 70 | 71 | for (size_t i = 0; i < 10000; i++) { 72 | reference ref(ptr2 + i); 73 | ref = i + 100; 74 | BOOST_CHECK_EQUAL(static_cast(ref), i + 100); 75 | } 76 | 77 | allocator.dealloc(reinterpret_cast(&ptr2), m); 78 | 79 | BOOST_CHECK_EQUAL(allocator.pool_free_count(), allocator.pool_count()); 80 | } 81 | 82 | template 83 | class allocate { 84 | private: 85 | pooled_cuda_allocator& allocator; 86 | int allocSize; 87 | boost::mutex& mutex; 88 | 89 | 90 | public: 91 | allocate(pooled_cuda_allocator& allocator, int allocSize, boost::mutex& mutex) 92 | : allocator(allocator), allocSize(allocSize), mutex(mutex) {} 93 | 94 | void operator()(void*& ptr) const { 95 | memory_space m; 96 | size_t pool_size = allocator.pool_size(m); 97 | void* ptr1 = NULL; 98 | void* ptr2 = NULL; 99 | allocator.alloc(&ptr1, allocSize, 1, m); 100 | allocator.alloc(&ptr2, 1, 1, m); 101 | allocator.alloc(&ptr, allocSize, 1, m); 102 | 103 | { 104 | boost::mutex::scoped_lock lock(mutex); 105 | BOOST_REQUIRE(ptr1); 106 | BOOST_REQUIRE(ptr2); 107 | BOOST_REQUIRE(ptr); 108 | 109 | BOOST_REQUIRE_NE(ptr1, ptr2); 110 | BOOST_REQUIRE_NE(ptr2, ptr); 111 | BOOST_REQUIRE_NE(ptr1, ptr); 112 | 113 | BOOST_REQUIRE_GE(allocator.pool_count(m), 2lu); 114 | } 115 | 116 | allocator.dealloc(&ptr1, m); 117 | allocator.dealloc(&ptr2, m); 118 | 119 | { 120 | boost::mutex::scoped_lock lock(mutex); 121 | BOOST_REQUIRE_GE(allocator.pool_size(m), pool_size); 122 | BOOST_REQUIRE_GE(allocator.pool_free_count(m), 0lu); 123 | } 124 | } 125 | }; 126 | 127 | template 128 | class deallocate { 129 | private: 130 | pooled_cuda_allocator& allocator; 131 | boost::mutex& mutex; 132 | 133 | public: 134 | deallocate(pooled_cuda_allocator &allocator, boost::mutex &mutex) 135 | : allocator(allocator), mutex(mutex) {} 136 | 137 | void operator()(void*& ptr) const { 138 | allocator.dealloc(&ptr, memory_space()); 139 | 140 | { 141 | boost::mutex::scoped_lock lock(mutex); 142 | BOOST_CHECK(!ptr); 143 | } 144 | } 145 | }; 146 | 147 | template 148 | static void test_pooled_allocator_multi_threaded() { 149 | memory_space m; 150 | pooled_cuda_allocator allocator("allocator_multi_threaded"); 151 | 152 | const int allocSize = 
pooled_cuda_allocator::MIN_SIZE_HOST; 153 | 154 | // boost-test is not thread-safe 155 | boost::mutex boost_mutex; 156 | 157 | std::vector pointers(1000, NULL); 158 | tbb::parallel_for_each(pointers.begin(), pointers.end(), 159 | allocate(allocator, allocSize, boost_mutex)); 160 | 161 | for (size_t i = 0; i < pointers.size(); i++) { 162 | BOOST_REQUIRE(pointers[i]); 163 | } 164 | 165 | BOOST_CHECK_GE(allocator.pool_size(m), pointers.size() * allocSize); 166 | BOOST_CHECK_LE(allocator.pool_count(m), 10 * pointers.size()); 167 | 168 | size_t count = allocator.pool_count(m); 169 | BOOST_CHECK_GE(count, pointers.size()); 170 | 171 | tbb::parallel_for_each(pointers.begin(), pointers.end(), 172 | deallocate(allocator, boost_mutex)); 173 | 174 | BOOST_CHECK_EQUAL(allocator.pool_free_count(), allocator.pool_count()); 175 | } 176 | 177 | template 178 | static void test_pooled_allocator_garbage_collection() { 179 | memory_space m; 180 | pooled_cuda_allocator allocator; 181 | int* ptr1 = 0; 182 | int* ptr2 = 0; 183 | allocator.alloc(reinterpret_cast(&ptr1), 10000, sizeof(int), m); 184 | allocator.alloc(reinterpret_cast(&ptr2), 10000, sizeof(int), m); 185 | 186 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 187 | 188 | allocator.dealloc(reinterpret_cast(&ptr1), m); 189 | 190 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 191 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 1); 192 | 193 | allocator.garbage_collection(); 194 | 195 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 1); 196 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 0); 197 | 198 | allocator.dealloc(reinterpret_cast(&ptr2), m); 199 | 200 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 1); 201 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 1); 202 | 203 | allocator.garbage_collection(); 204 | 205 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 0); 206 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 0); 207 | } 208 | 209 | BOOST_AUTO_TEST_CASE( pooled_cuda_allocator_test_simple ) { 210 | test_pooled_allocator(); 211 | test_pooled_allocator(); 212 | } 213 | 214 | BOOST_AUTO_TEST_CASE( pooled_cuda_allocator_test_multithreaded ) { 215 | test_pooled_allocator_multi_threaded(); 216 | test_pooled_allocator_multi_threaded(); 217 | } 218 | 219 | BOOST_AUTO_TEST_CASE( pooled_cuda_allocator_test_garbage_collection ) { 220 | test_pooled_allocator_garbage_collection(); 221 | test_pooled_allocator_garbage_collection(); 222 | } 223 | BOOST_AUTO_TEST_SUITE_END() 224 | -------------------------------------------------------------------------------- /src/tests/ndarray_test.cpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #define BOOST_TEST_MODULE example 28 | #include 29 | #include 30 | 31 | using namespace cuv; 32 | 33 | BOOST_AUTO_TEST_SUITE(ndarray_test) 34 | 35 | /** 36 | * @test 37 | * @brief create ndarray 38 | */ 39 | BOOST_AUTO_TEST_CASE( create_ndarray ) { 40 | // column_major 41 | ndarray m(extents[2][3][4]); 42 | BOOST_CHECK_EQUAL(24, m.size()); 43 | BOOST_CHECK_EQUAL(2ul, m.shape(0)); 44 | BOOST_CHECK_EQUAL(3ul, m.shape(1)); 45 | BOOST_CHECK_EQUAL(4ul, m.shape(2)); 46 | 47 | BOOST_CHECK_EQUAL(0ul, m.index_of(extents[0][0][0])); 48 | // column major test 49 | BOOST_CHECK_EQUAL(1ul, m.index_of(extents[1][0][0])); 50 | BOOST_CHECK_EQUAL(2ul, m.index_of(extents[0][1][0])); 51 | 52 | // row_major 53 | ndarray n(extents[2][3][4]); 54 | BOOST_CHECK_EQUAL(24, m.size()); 55 | BOOST_CHECK_EQUAL(2ul, n.shape(0)); 56 | BOOST_CHECK_EQUAL(3ul, n.shape(1)); 57 | BOOST_CHECK_EQUAL(4ul, n.shape(2)); 58 | 59 | BOOST_CHECK_EQUAL(0ul, n.index_of(extents[0][0][0])); 60 | // row major test 61 | BOOST_CHECK_EQUAL(1ul, n.index_of(extents[0][0][1])); 62 | BOOST_CHECK_EQUAL(2ul, n.index_of(extents[0][0][2])); 63 | BOOST_CHECK_EQUAL(4ul, n.index_of(extents[0][1][0])); 64 | } 65 | 66 | BOOST_AUTO_TEST_CASE( ndarray_data_access ) { 67 | ndarray m(extents[2][3][4]); 68 | ndarray n(extents[2][3][4]); 69 | 70 | ndarray o(extents[2][3][4]); 71 | ndarray p(extents[2][3][4]); 72 | for (int i = 0; i < 2; ++i) { 73 | for (int j = 0; j < 3; ++j) { 74 | for (int k = 0; k < 4; ++k) { 75 | m(i, j, k) = i * j + k; 76 | n(i, j, k) = i * j + k; 77 | 78 | o(i, j, k) = i * j + k; 79 | p(i, j, k) = i * j + k; 80 | } 81 | } 82 | } 83 | BOOST_CHECK_EQUAL(1*2+3, m(1,2,3)); 84 | BOOST_CHECK_EQUAL(1*2+3, n(1,2,3)); 85 | BOOST_CHECK_EQUAL(1*2+3, o(1,2,3)); 86 | BOOST_CHECK_EQUAL(1*2+3, p(1,2,3)); 87 | 88 | BOOST_CHECK_EQUAL(1*2+3-1, --p(1,2,3)); 89 | BOOST_CHECK_EQUAL(1*2+3, p(1,2,3)+=1); 90 | } 91 | 92 | BOOST_AUTO_TEST_CASE( ndarray_assignment ) { 93 | ndarray m(extents[2][3][4]); 94 | ndarray n(extents[2][3][4]); 95 | 96 | ndarray o(extents[2][3][4]); 97 | 98 | for (int i = 0; i < 2 * 3 * 4; ++i) 99 | m[i] = i; 100 | n = m; 101 | o = m; 102 | 103 | ndarray s(n); 104 | ndarray t(n); 105 | 106 | for (int i = 0; i < 2 * 3 * 4; ++i) { 107 | BOOST_CHECK_EQUAL(m[i], i); 108 | BOOST_CHECK_EQUAL(n[i], i); 109 | BOOST_CHECK_EQUAL(o[i], i); 110 | BOOST_CHECK_EQUAL(s[i], i); 111 | BOOST_CHECK_EQUAL(t[i], i); 112 | } 113 | 114 | } 115 | 116 | BOOST_AUTO_TEST_CASE( ndarray_zero_copy_assignment ) { 117 | ndarray x(extents[4][5][6]); 118 | for (int i = 0; i < 4 * 5 * 6; i++) { 119 | x[i] = i; 120 | } 121 | 122 | ndarray y = x; 123 | 124 | for (int i = 0; i < 4 * 5 * 6; i++) { 125 | BOOST_CHECK_EQUAL(x[i], y[i]); 126 | y[i] = i + 1; // change the copy results in change of original! 
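            // note: ndarray::operator= for the same template type only copies the
            // shared memory pointer plus shape/stride info (see ndarray.hpp), so x
            // and y alias one buffer and the write above is visible through x too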
127 | BOOST_CHECK_EQUAL(x[i], y[i]); 128 | } 129 | } 130 | 131 | BOOST_AUTO_TEST_CASE( ndarray_copy ) { 132 | boost::shared_ptr allocator(new pooled_cuda_allocator("ndarray_copy")); 133 | ndarray x(extents[4][5][6], allocator); 134 | for (int i = 0; i < 4 * 5 * 6; i++) { 135 | x[i] = i; 136 | } 137 | 138 | ndarray y = x.copy(); 139 | BOOST_CHECK_NE(x.ptr(), y.ptr()); 140 | 141 | for (int i = 0; i < 4; i++) { 142 | BOOST_CHECK_NE(x[indices[i][index_range()][index_range()]].ptr(), 143 | y[indices[i][index_range()][index_range()]].ptr()); 144 | } 145 | 146 | ndarray y2(x.copy()); 147 | BOOST_CHECK_NE(x.ptr(), y2.ptr()); 148 | 149 | for (int i = 0; i < 4; i++) { 150 | BOOST_CHECK_NE(x[indices[i][index_range()][index_range()]].ptr(), 151 | y2[indices[i][index_range()][index_range()]].ptr()); 152 | } 153 | 154 | for (int i = 0; i < 4 * 5 * 6; i++) { 155 | BOOST_CHECK_EQUAL(x[i], y[i]); 156 | y[i]++; // change must not change original! 157 | BOOST_CHECK_NE(x[i], y[i]); 158 | } 159 | } 160 | 161 | BOOST_AUTO_TEST_CASE( ndarray_copy_assignment ) 162 | { 163 | ndarray x(extents[10][10][10]); 164 | ndarray y(extents[1][10][10]); 165 | 166 | // avoid fill() dependency 167 | for (int i = 0; i < x.size(); i++) { 168 | x.ptr()[i] = 0; 169 | } 170 | for (int i = 0; i < y.size(); i++) { 171 | y.ptr()[i] = 1; 172 | } 173 | 174 | // assign to a view should copy the array 175 | x[indices[1][index_range(0, 10)][index_range(0, 10)]] = y[indices[0][index_range()][index_range()]]; 176 | 177 | // x[0, ...] must remain unchanged 178 | for (int i = 0; i < 10 * 10; i++) { 179 | BOOST_REQUIRE_EQUAL(x.ptr()[i], 0); 180 | } 181 | 182 | // x[1, ...] must have changed to 1 183 | for (int i = 0; i < 10 * 10; i++) { 184 | BOOST_REQUIRE_EQUAL(x.ptr()[100 + i], 1); 185 | } 186 | 187 | // changing y must not influence x 188 | for (int i = 0; i < y.size(); i++) { 189 | y.ptr()[i] = 2; 190 | } 191 | 192 | for (int i = 0; i < 10 * 10; i++) { 193 | BOOST_REQUIRE_EQUAL(x.ptr()[100 + i], 1); 194 | } 195 | } 196 | 197 | BOOST_AUTO_TEST_CASE( ndarray_out_of_scope_view ) { 198 | // sub-ndarray views should persist when original ndarray falls out of scope 199 | ndarray y; 200 | { 201 | ndarray x(extents[4][5][6]); 202 | for (int i = 0; i < 4; ++i) 203 | for (int j = 0; j < 5; ++j) 204 | for (int k = 0; k < 6; ++k) 205 | x(i, j, k) = i + j + k; 206 | y = x[indices[index_range(1, 3)][index_range()][index_range()]]; 207 | } 208 | for (int i = 1; i < 3; ++i) 209 | for (int j = 0; j < 5; ++j) 210 | for (int k = 0; k < 6; ++k) { 211 | BOOST_CHECK_EQUAL(y(i-1,j,k), i+j+k); 212 | } 213 | } 214 | 215 | BOOST_AUTO_TEST_CASE( ndarray_slice1col ) { 216 | ndarray y; 217 | ndarray x(extents[4][5][6]); 218 | 219 | for (int i = 0; i < 4; ++i) { 220 | for (int j = 0; j < 5; ++j) { 221 | for (int k = 0; k < 6; ++k) { 222 | x(i, j, k) = i + j + k; 223 | } 224 | } 225 | } 226 | 227 | // accessing strided memory 228 | y = x[indices[index_range(0,1)][index_range()][index_range()]]; 229 | for (int i = 0; i < 1; ++i) { 230 | for (int j = 0; j < 5; ++j) { 231 | for (int k = 0; k < 6; ++k) { 232 | BOOST_CHECK_EQUAL(y(i,j,k), i+j+k); 233 | } 234 | } 235 | } 236 | x[indices[index_range(0,1)][index_range()][index_range()]] = y.copy(); 237 | } 238 | 239 | BOOST_AUTO_TEST_CASE( ndarray_slice1row ) { 240 | ndarray y; 241 | ndarray x(extents[4][5][6]); 242 | 243 | for (int i = 0; i < 4; ++i) { 244 | for (int j = 0; j < 5; ++j) { 245 | for (int k = 0; k < 6; ++k) { 246 | x(i, j, k) = i + j + k; 247 | } 248 | } 249 | } 250 | 251 | // accessing strided memory 252 | y = 
x[indices[index_range()][index_range()][index_range(0,1)]]; 253 | for (int i = 0; i < 4; ++i) { 254 | for (int j = 0; j < 5; ++j) { 255 | for (int k = 0; k < 1; ++k) { 256 | BOOST_CHECK_EQUAL(y(i,j,k), i+j+k); 257 | } 258 | } 259 | } 260 | x[indices[index_range()][index_range()][index_range(0,1)]] = y.copy(); 261 | } 262 | 263 | BOOST_AUTO_TEST_CASE( ndarray_memcpy2d ) { 264 | ndarray y; 265 | ndarray x(extents[4][5][6]); 266 | 267 | for (int i = 0; i < 4; ++i) { 268 | for (int j = 0; j < 5; ++j) { 269 | for (int k = 0; k < 6; ++k) { 270 | x(i, j, k) = i + j + k; 271 | } 272 | } 273 | } 274 | 275 | // accessing strided memory 276 | y = x[indices[index_range()][index_range()][index_range(0, 1)]]; 277 | for (int i = 0; i < 4; ++i) { 278 | for (int j = 0; j < 5; ++j) { 279 | for (int k = 0; k < 1; ++k) { 280 | BOOST_CHECK_EQUAL(y(i,j,k), i+j+k); 281 | } 282 | } 283 | } 284 | 285 | // copying strided memory 286 | y = y.copy(); // y in R^(4,5,1) 287 | for (size_t k = 0; k < y.size(); k++) { // avoid fill() dependency in this file (speed up compiling...) 288 | y[k] = 0.f; 289 | } 290 | 291 | ndarray_view m(x, indices[index_range()][index_range()][index_range(0, 1)]); 292 | m = y; 293 | for (int i = 0; i < 4; ++i) { 294 | for (int j = 0; j < 5; ++j) { 295 | for (int k = 0; k < 1; ++k) { 296 | if (k != 0) { 297 | BOOST_CHECK_EQUAL(x(i,j,k), i+j+k); 298 | } else { 299 | BOOST_CHECK_EQUAL(x(i,j,k), 0.f); 300 | } 301 | } 302 | } 303 | } 304 | } 305 | 306 | template 307 | void test_resize() { 308 | 309 | // resize with default allocator 310 | ndarray a(100, 100); 311 | V* p0 = a.ptr(); 312 | a.resize(100, 100); 313 | BOOST_CHECK_EQUAL(p0, a.ptr()); 314 | // no size change. pointer must not change 315 | 316 | boost::shared_ptr allocator(new pooled_cuda_allocator("test_resize")); 317 | { 318 | ndarray a(200, 300, allocator); 319 | 320 | BOOST_CHECK_EQUAL(a.shape(0), 200); 321 | BOOST_CHECK_EQUAL(a.shape(1), 300); 322 | 323 | BOOST_CHECK_EQUAL(allocator->pool_count(M()), 1); 324 | BOOST_CHECK_EQUAL(allocator->pool_free_count(M()), 0); 325 | BOOST_CHECK_EQUAL(allocator->pool_size(M()), 200 * 300 * sizeof(V)); 326 | 327 | a.resize(100, 100); 328 | 329 | // make sure the memory is freed before new memory is allocated 330 | 331 | BOOST_CHECK_EQUAL(allocator->pool_count(M()), 1); 332 | BOOST_CHECK_EQUAL(allocator->pool_free_count(M()), 0); 333 | BOOST_CHECK_EQUAL(allocator->pool_size(M()), 200 * 300 * sizeof(V)); 334 | 335 | BOOST_CHECK_EQUAL(a.shape(0), 100); 336 | BOOST_CHECK_EQUAL(a.shape(1), 100); 337 | } 338 | 339 | BOOST_CHECK_EQUAL(allocator->pool_count(M()), 1); 340 | BOOST_CHECK_EQUAL(allocator->pool_free_count(M()), 1); 341 | } 342 | 343 | template 344 | void test_pushpull_2d() { 345 | static const int h = 123, w = 247; 346 | ndarray t1; 347 | ndarray t2(extents[h][w]); 348 | 349 | for (int i = 0; i < h; i++) 350 | for (int j = 0; j < w; j++) { 351 | t2(i, j) = (float) drand48(); 352 | } 353 | t1 = t2; 354 | BOOST_CHECK(equal_shape(t1,t2)); 355 | for (int i = 0; i < h; i++) { 356 | for (int j = 0; j < w; j++) { 357 | BOOST_CHECK_EQUAL( (V) t1(i,j), (V) t2(i,j)); 358 | } 359 | } 360 | } 361 | 362 | template 363 | void test_pushpull_3d() { 364 | static const int d = 3, h = 123, w = 247; 365 | ndarray t1; 366 | ndarray t2(extents[d][h][w]); 367 | 368 | // *************************************** 369 | // assignment 2D --> 1D 370 | // *************************************** 371 | for (int k = 0; k < d; k++) 372 | for (int i = 0; i < h; i++) 373 | for (int j = 0; j < w; j++) { 374 | t2(k, i, j) = (float) 
template
void test_pushpull_3d() {
    static const int d = 3, h = 123, w = 247;
    ndarray t1;
    ndarray t2(extents[d][h][w]);

    // ***************************************
    // assignment 3D --> 3D
    // ***************************************
    for (int k = 0; k < d; k++)
        for (int i = 0; i < h; i++)
            for (int j = 0; j < w; j++) {
                t2(k, i, j) = (float) drand48();
            }
    t1 = t2;
    BOOST_CHECK(equal_shape(t1,t2));
    for (int k = 0; k < d; ++k) {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                BOOST_CHECK_EQUAL( (V) t1(k,i,j), (V) t2(k,i,j));
            }
        }
    }
}

template
void test_lowdim_views() {
    static const int d = 3, h = 123, w = 247;
    ndarray t1d(extents[d][h][w]);
    ndarray t2d(extents[d][h][w]);

    for (int k = 0; k < d; k++) {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                t2d(k, i, j) = (float) drand48();
            }
        }
    }

    // ***************************************
    // 2D view on 3D ndarray
    // ***************************************
    for (int k = 0; k < d; ++k) {
        ndarray_view view(indices[k][index_range(0, h)][index_range(0, w)], t2d);
        BOOST_CHECK_EQUAL( view.ndim(), 2);
        BOOST_CHECK_EQUAL( view.shape(0), h);
        BOOST_CHECK_EQUAL( view.shape(1), w);
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                BOOST_CHECK_EQUAL( (V) view(i,j), (V) t2d(k,i,j));
            }
        }

        // alternative specification of the same ranges
        ndarray_view view_(indices[k][index_range()][index_range() < cuv::index(w)], t2d);
        BOOST_CHECK_EQUAL( view_.ndim(), 2);
        BOOST_CHECK_EQUAL( view_.shape(0), h);
        BOOST_CHECK_EQUAL( view_.shape(1), w);
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                BOOST_CHECK_EQUAL( (V) view_(i,j), (V) t2d(k,i,j));
            }
        }
    }

    // ***************************************
    // 1D view on 3D ndarray
    // ***************************************
    for (int k = 0; k < d; ++k) {
        for (int i = 0; i < h; ++i) {
            ndarray_view view(indices[k][i][index_range(0, w)], t2d);
            for (int j = 0; j < w; j++) {
                BOOST_REQUIRE_EQUAL( (V) view(j), (V) t2d(k,i,j));
            }
        }
    }
}

BOOST_AUTO_TEST_CASE( lowdim_views ) {
    test_lowdim_views();
    test_lowdim_views();
}

BOOST_AUTO_TEST_CASE( ndarray_wrapping ) {
    {
        std::vector v_orig(10, 0.f);
        ndarray v(extents[10], &v_orig[0]);
        ndarray w(extents[10]);
        for (unsigned int i = 0; i < 10; i++)
            w[i] = 1.f;

        // overwrite the wrapped memory (needs copying)
        v = w;
    }
    {
        std::vector v_orig(10, 0.f);
        ndarray v(extents[10], &v_orig[0]);
        ndarray w(extents[10]);
        for (unsigned int i = 0; i < 10; i++)
            w[i] = 1.f;

        // overwrite the wrapped memory (needs copying)
        v = w;
    }
}

BOOST_AUTO_TEST_CASE( pushpull_nd ) {
    // same memory space, linear container
    test_pushpull_2d();
    test_pushpull_2d();

    // same memory space, 2d container
    test_pushpull_2d();
    test_pushpull_2d();

    // same memory space, 2d vs. 1d
    test_pushpull_2d();
    test_pushpull_2d();
    test_pushpull_2d();
    test_pushpull_2d();
}
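// A minimal sketch of the slicing pattern used in test_lowdim_views above:
// fixing the leading index of a 3D array yields a 2D view that aliases the
// original memory. It assumes an ndarray/ndarray_view<value_type, memory_space>
// signature and host_memory_space from cuv/tags.hpp; the real template
// parameters may differ.
static void lowdim_view_sketch() {
    ndarray<float, host_memory_space> vol(extents[3][4][5]);
    // 2D view on the slice k == 1; index_range() keeps a whole dimension
    ndarray_view<float, host_memory_space> plane(indices[1][index_range()][index_range()], vol);
    // plane.ndim() == 2, plane.shape(0) == 4, plane.shape(1) == 5
    plane(0, 0) = 42.f;                     // writes through to vol(1, 0, 0)
}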
BOOST_AUTO_TEST_CASE( ndarray_resize ) {
    test_resize();
    test_resize();
}

BOOST_AUTO_TEST_CASE( create_lm )
{
    unsigned int N = 54;
    {
        linear_memory v(N);
        BOOST_CHECK_EQUAL(v.size(), N);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
    {
        linear_memory v(N);
        BOOST_CHECK_EQUAL(v.size(), N);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
}

BOOST_AUTO_TEST_CASE( readwrite_lm )
{
    unsigned int N = 54;
    {
        linear_memory v(N);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }
    {
        linear_memory v(N);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }
}

BOOST_AUTO_TEST_CASE( create_pm )
{
    unsigned int N = 54, M = 97;
    {
        pitched_memory v(N, M);
        BOOST_CHECK_EQUAL(v.size(), N*M);
        BOOST_CHECK_EQUAL(v.rows(), N);
        BOOST_CHECK_EQUAL(v.cols(), M);
        BOOST_CHECK_GE(v.pitch(), M);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
    {
        pitched_memory v(N, M);
        BOOST_CHECK_GE(v.size(), N*M);
        BOOST_CHECK_EQUAL(v.rows(), N);
        BOOST_CHECK_EQUAL(v.cols(), M);
        BOOST_CHECK_GE(v.pitch(), M);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
}

BOOST_AUTO_TEST_CASE( readwrite_pm )
{
    unsigned int N = 54, M = 97;
    {
        pitched_memory v(N, M);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }
    {
        pitched_memory v(N, M);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }

    {
        pitched_memory v(N, M);
        v(3, 4) = 0;
        BOOST_CHECK_EQUAL(v(3,4), 0);
        v(3, 4) = 1;
        BOOST_CHECK_EQUAL(v(3,4), 1);
    }
    {
        pitched_memory v(N, M);
        v(3, 4) = 0;
        BOOST_CHECK_EQUAL(v(3,4), 0);
        v(3, 4) = 1;
        BOOST_CHECK_EQUAL(v(3,4), 1);
    }
}
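// The pitched-memory checks above rely on each row being padded to the pitch:
// pitch() >= cols(), and element (r, c) lives at linear offset r * pitch + c.
// The helper below only illustrates that arithmetic; it is not part of the
// pitched_memory interface.
static size_t pitched_offset_sketch(size_t r, size_t c, size_t pitch) {
    return r * pitch + c;   // row-padded layout, as checked by BOOST_CHECK_GE(v.pitch(), M)
}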
/**
 * @test
 * @brief create dense matrix.
 */
BOOST_AUTO_TEST_CASE( create_linear )
{
    unsigned int N = 16, M = 32;
    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_EQUAL(m.stride(1), N);
    }

    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_EQUAL(m.stride(1), N);
    }
}

/**
 * @test
 * @brief create pitched matrix.
 */
BOOST_AUTO_TEST_CASE( create_pitched )
{
    unsigned int N = 16, M = 32;
    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_GE(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_GE(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_GE(m.stride(1), N);
    }

    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_GE(m.stride(1), N);
    }
}
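// The two tests above pin down the stride convention: a dense row-major N x M
// array has stride(0) == M and stride(1) == 1, a column-major one has
// stride(0) == 1 and stride(1) == N, and with pitched_memory_tag the larger
// stride is only required to be >= the corresponding extent. In all cases the
// linear offset of element (i, j) is i * stride(0) + j * stride(1). The helper
// below is illustrative only and not part of the ndarray interface.
static size_t strided_offset_sketch(size_t i, size_t j, size_t stride0, size_t stride1) {
    return i * stride0 + j * stride1;
}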
/**
 * @test
 * @brief setting and getting for device and host vectors.
 */
BOOST_AUTO_TEST_CASE( set_vector_elements )
{
    static const unsigned int N = 145;
    static const unsigned int M = 97;
    ndarray v(extents[N][M]); // linear memory
    ndarray w(extents[N][M], pitched_memory_tag()); // pitched memory
    for (unsigned int i = 0; i < N; i++) {
        v[i] = (float) i / N;
        w[i] = (float) i / N;
    }
    //convert(w,v);
    for (unsigned int i = 0; i < N; i++) {
        BOOST_CHECK_EQUAL(v[i], (float) i/N);
        BOOST_CHECK_EQUAL(w[i], (float) i/N);
    }
}

BOOST_AUTO_TEST_CASE( assign_func )
{
    static const unsigned int N = 145;
    static const unsigned int M = 97;
    ndarray v(extents[N][M]);
    ndarray w(extents[N][M]);
    v[5] = 5;
    w[5] = 0;
    w.assign(v);
    // assign() copies the contents; the two arrays keep separate storage
    BOOST_CHECK_NE(w.ptr(), v.ptr());
    BOOST_CHECK_EQUAL(v[5], 5);
    BOOST_CHECK_EQUAL(w[5], 5);
}

BOOST_AUTO_TEST_CASE( stream_values )
{
    ndarray v(3, 2);
    for (size_t i = 0; i < v.size(); i++) {
        v[i] = i;
    }
    std::ostringstream o;
    for (size_t i = 0; i < v.size(); i++) {
        o << v[i];
    }
    BOOST_CHECK_EQUAL(o.str(), "012345");
}

BOOST_AUTO_TEST_SUITE_END()
--------------------------------------------------------------------------------