├── .gitignore
├── CMakeLists.txt
├── CMakeModules
│   └── FindTBB.cmake
├── LICENSE
├── README.md
└── src
    ├── CMakeLists.txt
    ├── cuv
    │   ├── CMakeLists.txt
    │   ├── allocators.cu
    │   ├── allocators.hpp
    │   ├── cuda_general.hpp
    │   ├── memory.cu
    │   ├── memory.hpp
    │   ├── meta_programming.hpp
    │   ├── ndarray.hpp
    │   ├── reference.cu
    │   ├── reference.hpp
    │   └── tags.hpp
    └── tests
        ├── CMakeLists.txt
        ├── allocators_test.cpp
        └── ndarray_test.cpp
/.gitignore:
--------------------------------------------------------------------------------
1 | build/
2 |
3 | *.o
4 | *.so
5 | .*.swo
6 | .*.swp
7 | .*.swn
8 |
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #######################################################################################
2 | # The MIT License
3 |
4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn
5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn
6 | # Copyright (c) 2008-2009 Sebastian Nowozin
7 |
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in all
16 | # copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | # SOFTWARE.
25 | ####################################################################################### 26 | cmake_minimum_required( VERSION 2.6 FATAL_ERROR ) 27 | 28 | # 29 | # If the user specifies -DCMAKE_BUILD_TYPE on the command line, take their 30 | # definition # and dump it in the cache along with proper documentation, 31 | # otherwise set CMAKE_BUILD_TYPE # to Debug prior to calling PROJECT() 32 | # 33 | IF(DEFINED CMAKE_BUILD_TYPE) 34 | SET(CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE} CACHE STRING "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.") 35 | ELSE() 36 | SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None(CMAKE_CXX_FLAGS or CMAKE_C_FLAGS used) Debug Release RelWithDebInfo MinSizeRel.") 37 | ENDIF() 38 | 39 | PROJECT(ndarray CXX C) 40 | SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_SOURCE_DIR}/CMakeModules ) 41 | 42 | ENABLE_TESTING() 43 | add_subdirectory(src) 44 | 45 | CUDA_BUILD_CLEAN_TARGET() 46 | -------------------------------------------------------------------------------- /CMakeModules/FindTBB.cmake: -------------------------------------------------------------------------------- 1 | # Locate Intel Threading Building Blocks include paths and libraries 2 | # FindTBB.cmake can be found at https://code.google.com/p/findtbb/ 3 | # Written by Hannes Hofmann 4 | # Improvements by Gino van den Bergen , 5 | # Florian Uhlig , 6 | # Jiri Marsik 7 | 8 | # The MIT License 9 | # 10 | # Copyright (c) 2011 Hannes Hofmann 11 | # 12 | # Permission is hereby granted, free of charge, to any person obtaining a copy 13 | # of this software and associated documentation files (the "Software"), to deal 14 | # in the Software without restriction, including without limitation the rights 15 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 16 | # copies of the Software, and to permit persons to whom the Software is 17 | # furnished to do so, subject to the following conditions: 18 | # 19 | # The above copyright notice and this permission notice shall be included in 20 | # all copies or substantial portions of the Software. 21 | # 22 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 23 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 24 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 25 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 26 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 27 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 28 | # THE SOFTWARE. 29 | 30 | # GvdB: This module uses the environment variable TBB_ARCH_PLATFORM which defines architecture and compiler. 31 | # e.g. "ia32/vc8" or "em64t/cc4.1.0_libc2.4_kernel2.6.16.21" 32 | # TBB_ARCH_PLATFORM is set by the build script tbbvars[.bat|.sh|.csh], which can be found 33 | # in the TBB installation directory (TBB_INSTALL_DIR). 34 | # 35 | # GvdB: Mac OS X distribution places libraries directly in lib directory. 36 | # 37 | # For backwards compatibility, you may explicitely set the CMake variables TBB_ARCHITECTURE and TBB_COMPILER. 38 | # TBB_ARCHITECTURE [ ia32 | em64t | itanium ] 39 | # which architecture to use 40 | # TBB_COMPILER e.g. 
vc9 or cc3.2.3_libc2.3.2_kernel2.4.21 or cc4.0.1_os10.4.9 41 | # which compiler to use (detected automatically on Windows) 42 | 43 | # This module respects 44 | # TBB_INSTALL_DIR or $ENV{TBB21_INSTALL_DIR} or $ENV{TBB_INSTALL_DIR} 45 | 46 | # This module defines 47 | # TBB_INCLUDE_DIRS, where to find task_scheduler_init.h, etc. 48 | # TBB_LIBRARY_DIRS, where to find libtbb, libtbbmalloc 49 | # TBB_DEBUG_LIBRARY_DIRS, where to find libtbb_debug, libtbbmalloc_debug 50 | # TBB_INSTALL_DIR, the base TBB install directory 51 | # TBB_LIBRARIES, the libraries to link against to use TBB. 52 | # TBB_DEBUG_LIBRARIES, the libraries to link against to use TBB with debug symbols. 53 | # TBB_FOUND, If false, don't try to use TBB. 54 | # TBB_INTERFACE_VERSION, as defined in tbb/tbb_stddef.h 55 | 56 | 57 | if (WIN32) 58 | # has em64t/vc8 em64t/vc9 59 | # has ia32/vc7.1 ia32/vc8 ia32/vc9 60 | set(_TBB_DEFAULT_INSTALL_DIR "C:/Program Files/Intel/TBB" "C:/Program Files (x86)/Intel/TBB") 61 | set(_TBB_LIB_NAME "tbb") 62 | set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc") 63 | set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug") 64 | set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug") 65 | if (MSVC71) 66 | set (_TBB_COMPILER "vc7.1") 67 | endif(MSVC71) 68 | if (MSVC80) 69 | set(_TBB_COMPILER "vc8") 70 | endif(MSVC80) 71 | if (MSVC90) 72 | set(_TBB_COMPILER "vc9") 73 | endif(MSVC90) 74 | if(MSVC10) 75 | set(_TBB_COMPILER "vc10") 76 | endif(MSVC10) 77 | # Todo: add other Windows compilers such as ICL. 78 | set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE}) 79 | endif (WIN32) 80 | 81 | if (UNIX) 82 | if (APPLE) 83 | # MAC 84 | set(_TBB_DEFAULT_INSTALL_DIR "/Library/Frameworks/Intel_TBB.framework/Versions") 85 | # libs: libtbb.dylib, libtbbmalloc.dylib, *_debug 86 | set(_TBB_LIB_NAME "tbb") 87 | set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc") 88 | set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug") 89 | set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug") 90 | # default flavor on apple: ia32/cc4.0.1_os10.4.9 91 | # Jiri: There is no reason to presume there is only one flavor and 92 | # that user's setting of variables should be ignored. 
93 | if(NOT TBB_COMPILER) 94 | set(_TBB_COMPILER "cc4.0.1_os10.4.9") 95 | elseif (NOT TBB_COMPILER) 96 | set(_TBB_COMPILER ${TBB_COMPILER}) 97 | endif(NOT TBB_COMPILER) 98 | if(NOT TBB_ARCHITECTURE) 99 | set(_TBB_ARCHITECTURE "ia32") 100 | elseif(NOT TBB_ARCHITECTURE) 101 | set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE}) 102 | endif(NOT TBB_ARCHITECTURE) 103 | else (APPLE) 104 | # LINUX 105 | set(_TBB_DEFAULT_INSTALL_DIR "/opt/intel/tbb" "/usr/local/include" "/usr/include") 106 | set(_TBB_LIB_NAME "tbb") 107 | set(_TBB_LIB_MALLOC_NAME "${_TBB_LIB_NAME}malloc") 108 | set(_TBB_LIB_DEBUG_NAME "${_TBB_LIB_NAME}_debug") 109 | set(_TBB_LIB_MALLOC_DEBUG_NAME "${_TBB_LIB_MALLOC_NAME}_debug") 110 | # has em64t/cc3.2.3_libc2.3.2_kernel2.4.21 em64t/cc3.3.3_libc2.3.3_kernel2.6.5 em64t/cc3.4.3_libc2.3.4_kernel2.6.9 em64t/cc4.1.0_libc2.4_kernel2.6.16.21 111 | # has ia32/* 112 | # has itanium/* 113 | set(_TBB_COMPILER ${TBB_COMPILER}) 114 | set(_TBB_ARCHITECTURE ${TBB_ARCHITECTURE}) 115 | endif (APPLE) 116 | endif (UNIX) 117 | 118 | if (CMAKE_SYSTEM MATCHES "SunOS.*") 119 | # SUN 120 | # not yet supported 121 | # has em64t/cc3.4.3_kernel5.10 122 | # has ia32/* 123 | endif (CMAKE_SYSTEM MATCHES "SunOS.*") 124 | 125 | 126 | #-- Clear the public variables 127 | set (TBB_FOUND "NO") 128 | 129 | 130 | #-- Find TBB install dir and set ${_TBB_INSTALL_DIR} and cached ${TBB_INSTALL_DIR} 131 | # first: use CMake variable TBB_INSTALL_DIR 132 | if (TBB_INSTALL_DIR) 133 | set (_TBB_INSTALL_DIR ${TBB_INSTALL_DIR}) 134 | endif (TBB_INSTALL_DIR) 135 | # second: use environment variable 136 | if (NOT _TBB_INSTALL_DIR) 137 | if (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "") 138 | set (_TBB_INSTALL_DIR $ENV{TBB_INSTALL_DIR}) 139 | endif (NOT "$ENV{TBB_INSTALL_DIR}" STREQUAL "") 140 | # Intel recommends setting TBB21_INSTALL_DIR 141 | if (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "") 142 | set (_TBB_INSTALL_DIR $ENV{TBB21_INSTALL_DIR}) 143 | endif (NOT "$ENV{TBB21_INSTALL_DIR}" STREQUAL "") 144 | if (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "") 145 | set (_TBB_INSTALL_DIR $ENV{TBB22_INSTALL_DIR}) 146 | endif (NOT "$ENV{TBB22_INSTALL_DIR}" STREQUAL "") 147 | if (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "") 148 | set (_TBB_INSTALL_DIR $ENV{TBB30_INSTALL_DIR}) 149 | endif (NOT "$ENV{TBB30_INSTALL_DIR}" STREQUAL "") 150 | endif (NOT _TBB_INSTALL_DIR) 151 | # third: try to find path automatically 152 | if (NOT _TBB_INSTALL_DIR) 153 | if (_TBB_DEFAULT_INSTALL_DIR) 154 | set (_TBB_INSTALL_DIR ${_TBB_DEFAULT_INSTALL_DIR}) 155 | endif (_TBB_DEFAULT_INSTALL_DIR) 156 | endif (NOT _TBB_INSTALL_DIR) 157 | # sanity check 158 | if (NOT _TBB_INSTALL_DIR) 159 | message ("ERROR: Unable to find Intel TBB install directory. ${_TBB_INSTALL_DIR}") 160 | else (NOT _TBB_INSTALL_DIR) 161 | # finally: set the cached CMake variable TBB_INSTALL_DIR 162 | if (NOT TBB_INSTALL_DIR) 163 | set (TBB_INSTALL_DIR ${_TBB_INSTALL_DIR} CACHE PATH "Intel TBB install directory") 164 | mark_as_advanced(TBB_INSTALL_DIR) 165 | endif (NOT TBB_INSTALL_DIR) 166 | 167 | 168 | #-- A macro to rewrite the paths of the library. 
This is necessary, because 169 | # find_library() always found the em64t/vc9 version of the TBB libs 170 | macro(TBB_CORRECT_LIB_DIR var_name) 171 | # if (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t") 172 | string(REPLACE em64t "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}}) 173 | # endif (NOT "${_TBB_ARCHITECTURE}" STREQUAL "em64t") 174 | string(REPLACE ia32 "${_TBB_ARCHITECTURE}" ${var_name} ${${var_name}}) 175 | string(REPLACE vc7.1 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 176 | string(REPLACE vc8 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 177 | string(REPLACE vc9 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 178 | string(REPLACE vc10 "${_TBB_COMPILER}" ${var_name} ${${var_name}}) 179 | endmacro(TBB_CORRECT_LIB_DIR var_content) 180 | 181 | 182 | #-- Look for include directory and set ${TBB_INCLUDE_DIR} 183 | set (TBB_INC_SEARCH_DIR ${_TBB_INSTALL_DIR}/include) 184 | # Jiri: tbbvars now sets the CPATH environment variable to the directory 185 | # containing the headers. 186 | find_path(TBB_INCLUDE_DIR 187 | tbb/task_scheduler_init.h 188 | PATHS ${TBB_INC_SEARCH_DIR} ENV CPATH 189 | ) 190 | mark_as_advanced(TBB_INCLUDE_DIR) 191 | 192 | 193 | #-- Look for libraries 194 | # GvdB: $ENV{TBB_ARCH_PLATFORM} is set by the build script tbbvars[.bat|.sh|.csh] 195 | if (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL "") 196 | set (_TBB_LIBRARY_DIR 197 | ${_TBB_INSTALL_DIR}/lib/$ENV{TBB_ARCH_PLATFORM} 198 | ${_TBB_INSTALL_DIR}/$ENV{TBB_ARCH_PLATFORM}/lib 199 | ) 200 | endif (NOT $ENV{TBB_ARCH_PLATFORM} STREQUAL "") 201 | # Jiri: This block isn't mutually exclusive with the previous one 202 | # (hence no else), instead I test if the user really specified 203 | # the variables in question. 204 | if ((NOT ${TBB_ARCHITECTURE} STREQUAL "") AND (NOT ${TBB_COMPILER} STREQUAL "")) 205 | # HH: deprecated 206 | message(STATUS "[Warning] FindTBB.cmake: The use of TBB_ARCHITECTURE and TBB_COMPILER is deprecated and may not be supported in future versions. Please set \$ENV{TBB_ARCH_PLATFORM} (using tbbvars.[bat|csh|sh]).") 207 | # Jiri: It doesn't hurt to look in more places, so I store the hints from 208 | # ENV{TBB_ARCH_PLATFORM} and the TBB_ARCHITECTURE and TBB_COMPILER 209 | # variables and search them both. 210 | set (_TBB_LIBRARY_DIR "${_TBB_INSTALL_DIR}/${_TBB_ARCHITECTURE}/${_TBB_COMPILER}/lib" ${_TBB_LIBRARY_DIR}) 211 | endif ((NOT ${TBB_ARCHITECTURE} STREQUAL "") AND (NOT ${TBB_COMPILER} STREQUAL "")) 212 | 213 | # GvdB: Mac OS X distribution places libraries directly in lib directory. 214 | list(APPEND _TBB_LIBRARY_DIR ${_TBB_INSTALL_DIR}/lib) 215 | 216 | # Jiri: No reason not to check the default paths. From recent versions, 217 | # tbbvars has started exporting the LIBRARY_PATH and LD_LIBRARY_PATH 218 | # variables, which now point to the directories of the lib files. 219 | # It all makes more sense to use the ${_TBB_LIBRARY_DIR} as a HINTS 220 | # argument instead of the implicit PATHS as it isn't hard-coded 221 | # but computed by system introspection. Searching the LIBRARY_PATH 222 | # and LD_LIBRARY_PATH environment variables is now even more important 223 | # that tbbvars doesn't export TBB_ARCH_PLATFORM and it facilitates 224 | # the use of TBB built from sources. 
225 | find_library(TBB_LIBRARY ${_TBB_LIB_NAME} HINTS ${_TBB_LIBRARY_DIR} 226 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 227 | find_library(TBB_MALLOC_LIBRARY ${_TBB_LIB_MALLOC_NAME} HINTS ${_TBB_LIBRARY_DIR} 228 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 229 | 230 | #Extract path from TBB_LIBRARY name 231 | get_filename_component(TBB_LIBRARY_DIR ${TBB_LIBRARY} PATH) 232 | 233 | #TBB_CORRECT_LIB_DIR(TBB_LIBRARY) 234 | #TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY) 235 | mark_as_advanced(TBB_LIBRARY TBB_MALLOC_LIBRARY) 236 | 237 | #-- Look for debug libraries 238 | # Jiri: Changed the same way as for the release libraries. 239 | find_library(TBB_LIBRARY_DEBUG ${_TBB_LIB_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR} 240 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 241 | find_library(TBB_MALLOC_LIBRARY_DEBUG ${_TBB_LIB_MALLOC_DEBUG_NAME} HINTS ${_TBB_LIBRARY_DIR} 242 | PATHS ENV LIBRARY_PATH ENV LD_LIBRARY_PATH) 243 | 244 | # Jiri: Self-built TBB stores the debug libraries in a separate directory. 245 | # Extract path from TBB_LIBRARY_DEBUG name 246 | get_filename_component(TBB_LIBRARY_DEBUG_DIR ${TBB_LIBRARY_DEBUG} PATH) 247 | 248 | #TBB_CORRECT_LIB_DIR(TBB_LIBRARY_DEBUG) 249 | #TBB_CORRECT_LIB_DIR(TBB_MALLOC_LIBRARY_DEBUG) 250 | mark_as_advanced(TBB_LIBRARY_DEBUG TBB_MALLOC_LIBRARY_DEBUG) 251 | 252 | 253 | if (TBB_INCLUDE_DIR) 254 | if (TBB_LIBRARY) 255 | set (TBB_FOUND "YES") 256 | set (TBB_LIBRARIES ${TBB_LIBRARY} ${TBB_MALLOC_LIBRARY} ${TBB_LIBRARIES}) 257 | set (TBB_DEBUG_LIBRARIES ${TBB_LIBRARY_DEBUG} ${TBB_MALLOC_LIBRARY_DEBUG} ${TBB_DEBUG_LIBRARIES}) 258 | set (TBB_INCLUDE_DIRS ${TBB_INCLUDE_DIR} CACHE PATH "TBB include directory" FORCE) 259 | set (TBB_LIBRARY_DIRS ${TBB_LIBRARY_DIR} CACHE PATH "TBB library directory" FORCE) 260 | # Jiri: Self-built TBB stores the debug libraries in a separate directory. 261 | set (TBB_DEBUG_LIBRARY_DIRS ${TBB_LIBRARY_DEBUG_DIR} CACHE PATH "TBB debug library directory" FORCE) 262 | mark_as_advanced(TBB_INCLUDE_DIRS TBB_LIBRARY_DIRS TBB_DEBUG_LIBRARY_DIRS TBB_LIBRARIES TBB_DEBUG_LIBRARIES) 263 | message(STATUS "Found Intel TBB") 264 | endif (TBB_LIBRARY) 265 | endif (TBB_INCLUDE_DIR) 266 | 267 | if (NOT TBB_FOUND) 268 | message("ERROR: Intel TBB NOT found!") 269 | message(STATUS "Looked for Threading Building Blocks in ${_TBB_INSTALL_DIR}") 270 | # do only throw fatal, if this pkg is REQUIRED 271 | if (TBB_FIND_REQUIRED) 272 | message(FATAL_ERROR "Could NOT find TBB library.") 273 | endif (TBB_FIND_REQUIRED) 274 | endif (NOT TBB_FOUND) 275 | 276 | endif (NOT _TBB_INSTALL_DIR) 277 | 278 | if (TBB_FOUND) 279 | set(TBB_INTERFACE_VERSION 0) 280 | FILE(READ "${TBB_INCLUDE_DIRS}/tbb/tbb_stddef.h" _TBB_VERSION_CONTENTS) 281 | STRING(REGEX REPLACE ".*#define TBB_INTERFACE_VERSION ([0-9]+).*" "\\1" TBB_INTERFACE_VERSION "${_TBB_VERSION_CONTENTS}") 282 | set(TBB_INTERFACE_VERSION "${TBB_INTERFACE_VERSION}") 283 | endif (TBB_FOUND) 284 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2013, University of Bonn, Institute for Computer Science VI 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions are met: 6 | 7 | * Redistributions of source code must retain the above copyright notice, 8 | this list of conditions and the following disclaimer. 
9 | * Redistributions in binary form must reproduce the above copyright notice,
10 | this list of conditions and the following disclaimer in the documentation
11 | and/or other materials provided with the distribution.
12 | * Neither the name of the University of Bonn
13 | nor the names of its contributors may be used to endorse or promote
14 | products derived from this software without specific prior written
15 | permission.
16 |
17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
21 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
23 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
24 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
25 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ndarray Documentation
2 | =====================
3 |
4 | Summary
5 | -------
6 |
7 | ndarray is a C++ template library for n-dimensional arrays on CPU and GPU using NVIDIA CUDA™. It is extracted from the [CUV library][cuv].
8 |
9 | Features
10 | --------
11 |
12 | ### Supported Platforms ###
13 |
14 | - This library was only tested on Ubuntu Karmic, Lucid and Maverick. It uses
15 | mostly standard components and should run without major
16 | modification on any current Linux system.
17 |
18 | ### Supported GPUs ###
19 |
20 | - By default, code is generated for the lowest compute architecture. We
21 | recommend you change this to match your hardware. Using ccmake you can set
22 | the build variable "CUDA_ARCHITECTURE" for example to -arch=compute_20
23 | - All GT 9800 and GTX 280 and above
24 | - GT 9200 without convolutions. It might need some minor modifications to
25 | make the rest work. If you want to use that card and have problems, just
26 | get in contact.
27 | - On 8800GTS, random numbers and convolutions won't work.
28 |
29 |
30 | Installation
31 | ------------
32 |
33 | ### Dependencies ###
34 |
35 | To build the C++ lib, you will need:
36 |
37 | - cmake (and cmake-curses-gui for easy configuration)
38 | - libboost-dev >= 1.37
39 | - NVIDIA CUDA (tm), including SDK. We support versions 3.X, 4.X and 5.X
40 | - [thrust library][thrust] - included in CUDA since 4.0
41 |
42 |
43 | ### Building a debug version ###
44 |
45 | ```bash
46 | mkdir -p build/debug
47 | cd build/debug
48 | cmake -DCMAKE_BUILD_TYPE=Debug ../../
49 | ccmake . # adjust paths to your system (cuda, thrust, ...)!
50 | make -j
51 | ctest # run tests to see if it went well
52 | sudo make install
53 | ```
54 |
55 | ### Building a release version ###
56 |
57 | ```bash
58 | mkdir -p build/release
59 | cd build/release
60 | cmake -DCMAKE_BUILD_TYPE=Release ../../
61 | ccmake . # adjust paths to your system (cuda, thrust, ...)!
62 | make -j
63 | ctest # run tests to see if it went well
64 | sudo make install
65 | ```
66 |
67 | Usage
68 | -----
69 |
70 | ### Example ###
71 |
72 | ```c++
73 | #include <cuv/ndarray.hpp>
74 |
75 | int main(void) {
76 |
77 | // allocate a 10×20 array of ints in row-major order on host (CPU)
78 | cuv::ndarray<int, cuv::host_memory_space> a_host(10, 20);
79 |
80 | assert(a_host.ndim() == 2); // a_host is a two-dimensional array
81 | assert(a_host.size() == 10 * 20);
82 |
83 | // initialize the array
84 | int x = 0;
85 | for(int i=0; i < a_host.shape(0); i++) { // shape(0) == 10
86 | for(int j=0; j < a_host.shape(1); j++) { // shape(1) == 20
87 | a_host(i, j) = x++;
88 | }
89 | }
90 |
91 | // reshape to a 20×10 array
92 | a_host.reshape(20, 10);
93 | assert(a_host.shape(0) == 20);
94 | assert(a_host.shape(1) == 10);
95 |
96 | // copy the array to the GPU
97 | cuv::ndarray<int, cuv::dev_memory_space> a_device = a_host;
98 |
99 | // get the pointer to global device memory
100 | int* device_ptr = a_device.ptr();
101 |
102 | return 0;
103 | }
104 | ```
105 |
106 | [thrust]: http://code.google.com/p/thrust/
107 | [cuv]: https://github.com/deeplearningais/CUV
108 |
--------------------------------------------------------------------------------
/src/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | #######################################################################################
2 | # The MIT License
3 |
4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn
5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn
6 | # Copyright (c) 2008-2009 Sebastian Nowozin
7 |
8 | # Permission is hereby granted, free of charge, to any person obtaining a copy
9 | # of this software and associated documentation files (the "Software"), to deal
10 | # in the Software without restriction, including without limitation the rights
11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 | # copies of the Software, and to permit persons to whom the Software is
13 | # furnished to do so, subject to the following conditions:
14 | #
15 | # The above copyright notice and this permission notice shall be included in all
16 | # copies or substantial portions of the Software.
17 | #
18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24 | # SOFTWARE.
25 | ####################################################################################### 26 | cmake_minimum_required( VERSION 2.6 FATAL_ERROR ) 27 | 28 | FIND_PACKAGE(CUDA) 29 | 30 | if ( NOT CUDA_ARCHITECTURE ) 31 | SET( CUDA_ARCHITECTURE -gencode;arch=compute_13,code=sm_13;-gencode;arch=compute_20,code=sm_20;-gencode;arch=compute_30,code=sm_30;-gencode;arch=compute_35,code=sm_35 ) 32 | endif() 33 | 34 | # ---------- Find Boost Headers/Libraries ----------------------- 35 | SET (Boost_FIND_REQUIRED TRUE) 36 | SET (Boost_FIND_QUIETLY TRUE) 37 | SET (Boost_USE_MULTITHREADED TRUE) 38 | SET (Boost_USE_STATIC_LIBS FALSE) 39 | SET (Boost_ADDITIONAL_VERSIONS "1.39" "1.39.0" "1.40" "1.42" "1.43" "1.44" "1.45" "1.46" "1.47" "1.48") 40 | FIND_PACKAGE( Boost 1.37 COMPONENTS unit_test_framework serialization system REQUIRED ) 41 | INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIRS}) 42 | LINK_DIRECTORIES(${Boost_LIBRARY_DIRS}) 43 | 44 | FIND_PATH(THRUST_PATH thrust/device_vector.h /usr/include /usr/local/include ${CUDA_INCLUDE_DIRS} "$ENV{THRUST_ROOT}") 45 | IF(NOT THRUST_PATH) 46 | MESSAGE(FATAL_ERROR "Could not find the thrust library. Please install in standard locations or set THRUST_ROOT environment variable.") 47 | ENDIF(NOT THRUST_PATH) 48 | 49 | SET(CUDA_ARCHITECTURE "" CACHE STRING "The CUDA architecture to compile for, i.e. -arch=sm_20") 50 | SET(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS};${CUDA_ARCHITECTURE}") 51 | MESSAGE(STATUS "CUDA_NVCC_FLAGS: ${CUDA_NVCC_FLAGS}") 52 | 53 | CUDA_INCLUDE_DIRECTORIES( ${THRUST_PATH} ) 54 | INCLUDE_DIRECTORIES( ${THRUST_PATH} ) 55 | 56 | add_subdirectory(cuv) 57 | add_subdirectory(tests) 58 | -------------------------------------------------------------------------------- /src/cuv/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # The MIT License 3 | 4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2008-2009 Sebastian Nowozin 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | CUDA_ADD_LIBRARY("ndarray${LIB_SUFFIX}" SHARED allocators.cu memory.cu reference.cu) 27 | 28 | install(TARGETS "ndarray${LIB_SUFFIX}" 29 | RUNTIME DESTINATION bin 30 | LIBRARY DESTINATION lib 31 | ARCHIVE DESTINATION lib/static ) 32 | 33 | INSTALL(FILES ndarray.hpp tags.hpp allocators.hpp cuda_general.hpp memory.hpp meta_programming.hpp reference.hpp 34 | DESTINATION "include/cuv" 35 | ) 36 | -------------------------------------------------------------------------------- /src/cuv/allocators.cu: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #include "allocators.hpp" 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | 37 | #include "cuda_general.hpp" 38 | 39 | namespace cuv { 40 | 41 | void default_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space) { 42 | assert(*ptr == 0); 43 | *ptr = malloc(memsize * valueSize); 44 | assert(*ptr); 45 | } 46 | 47 | void default_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space) { 48 | assert(*ptr == 0); 49 | cuvSafeCall(cudaMalloc(ptr, memsize * valueSize)); 50 | assert(*ptr); 51 | } 52 | 53 | void default_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 54 | host_memory_space m) { 55 | pitch = width * valueSize; 56 | alloc(ptr, height * width, valueSize, m); 57 | assert(*ptr); 58 | } 59 | 60 | void default_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 61 | dev_memory_space) { 62 | cuvSafeCall(cudaMallocPitch(ptr, &pitch, valueSize * width, height)); 63 | assert(*ptr); 64 | } 65 | 66 | void default_allocator::dealloc(void** ptr, host_memory_space) { 67 | assert(*ptr != 0); 68 | free(*ptr); 69 | *ptr = 0; 70 | } 71 | 72 | void default_allocator::dealloc(void** ptr, dev_memory_space) { 73 | assert(*ptr != 0); 74 | cuvSafeCall(cudaFree(*ptr)); 75 | *ptr = 0; 76 | } 77 | 78 | void cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space) { 79 | assert(*ptr == 0); 80 | cuvSafeCall(cudaMallocHost(ptr, memsize * valueSize)); 81 | assert(*ptr != 0); 82 | } 83 | 84 | void cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space m) { 85 | default_allocator::alloc(ptr, memsize, valueSize, m); 86 | } 87 | 88 | void cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 89 | host_memory_space m) { 90 | pitch = width * valueSize; 91 | alloc(ptr, height * width, valueSize, m); 92 | } 93 | 94 | void cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 95 | dev_memory_space) { 96 | cuvSafeCall(cudaMallocPitch(ptr, &pitch, valueSize * width, height)); 97 | } 98 | 99 | void cuda_allocator::dealloc(void** ptr, host_memory_space) { 100 | assert(*ptr != 0); 101 | cuvSafeCall(cudaFreeHost(*ptr)); 102 | *ptr = 0; 103 | } 104 | 105 | void cuda_allocator::dealloc(void** ptr, dev_memory_space m) { 106 | default_allocator::dealloc(ptr, m); 107 | } 108 | 109 | template 110 | void pooled_cuda_allocator::collect_garbage(memory_space m) { 111 | 112 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 113 | std::map& pool = get_pool(m); 114 | std::map& pool_sizes = get_pool_sizes(m); 115 | 116 | std::vector to_delete; 117 | std::map::iterator it; 118 | for (it = pool.begin(); it != pool.end(); it++) { 119 | if (it->second) { 120 | to_delete.push_back(it->first); 121 | } 122 | } 123 | 124 | for (size_t i = 0; i < to_delete.size(); i++) { 125 | void* ptr = to_delete[i]; 126 | pool.erase(ptr); 127 | pool_sizes.erase(ptr); 128 | cuda_alloc.dealloc(&ptr, m); 129 | } 130 | 131 | assert(pool_free_count(m) == 0); 132 | 133 | CUV_LOG_DEBUG("garbage collection in memory pool " << m_name << " (" << memtype(m) << 134 | "): removed " << to_delete.size() << " elements"); 135 | } 136 | 137 | template<> 138 | boost::recursive_mutex& 
pooled_cuda_allocator::get_pool_mutex(dev_memory_space) const { 139 | // locking/unlocking a mutex does not violate constness of this object 140 | // unfortunately, the design of the scoped_lock and mutex class requires this hack of a const_cast 141 | return *(const_cast(&m_dev_pool_mutex)); 142 | } 143 | 144 | template<> 145 | boost::recursive_mutex& pooled_cuda_allocator::get_pool_mutex(host_memory_space) const { 146 | // locking/unlocking a mutex does not violate constness of this object 147 | // unfortunately, the design of the scoped_lock and mutex class requires this hack of a const_cast 148 | return *(const_cast(&m_host_pool_mutex)); 149 | } 150 | 151 | template<> 152 | std::map& pooled_cuda_allocator::get_pool(dev_memory_space) { 153 | return m_dev_pool; 154 | } 155 | 156 | template<> 157 | std::map& pooled_cuda_allocator::get_pool(host_memory_space) { 158 | return m_host_pool; 159 | } 160 | 161 | template<> 162 | const std::map& pooled_cuda_allocator::get_pool(dev_memory_space) const { 163 | return m_dev_pool; 164 | } 165 | 166 | template<> 167 | const std::map& pooled_cuda_allocator::get_pool(host_memory_space) const { 168 | return m_host_pool; 169 | } 170 | 171 | template<> 172 | std::map& pooled_cuda_allocator::get_pool_sizes(dev_memory_space) { 173 | return m_dev_pool_sizes; 174 | } 175 | 176 | template<> 177 | std::map& pooled_cuda_allocator::get_pool_sizes(host_memory_space) { 178 | return m_host_pool_sizes; 179 | } 180 | 181 | template<> 182 | const std::map& pooled_cuda_allocator::get_pool_sizes(dev_memory_space) const { 183 | return m_dev_pool_sizes; 184 | } 185 | 186 | template<> 187 | const std::map& pooled_cuda_allocator::get_pool_sizes(host_memory_space) const { 188 | return m_host_pool_sizes; 189 | } 190 | 191 | template 192 | void pooled_cuda_allocator::delete_pool(memory_space m) { 193 | 194 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 195 | std::map& pool = get_pool(m); 196 | std::map& pool_sizes = get_pool_sizes(m); 197 | 198 | #ifndef NDEBUG 199 | size_t free_count = pool_free_count(m); 200 | size_t count = pool_count(m); 201 | if (free_count != count) { 202 | throw std::runtime_error( 203 | (boost::format("detected potential memory leak in memory pool '%s' (%s): free: %d, count: %d") 204 | % m_name % memtype(m) % free_count % count).str()); 205 | } 206 | #endif 207 | 208 | std::map::iterator it; 209 | for (it = pool.begin(); it != pool.end(); it++) { 210 | if (!it->second) { 211 | throw std::runtime_error( 212 | "misuse of allocator. memory was not deallocated before allocator is destroyed. 
this is a programming failure."); 213 | } 214 | void* ptr = it->first; 215 | cuda_alloc.dealloc(&ptr, m); 216 | } 217 | pool.clear(); 218 | pool_sizes.clear(); 219 | 220 | CUV_LOG_DEBUG("deleted memory pool " << m_name << " (" << memtype(m) << ")"); 221 | } 222 | 223 | pooled_cuda_allocator::pooled_cuda_allocator(const std::string& _name) : 224 | m_name(_name), 225 | m_dev_pool_mutex(), m_host_pool_mutex(), 226 | m_dev_pool(), m_dev_pool_sizes(), 227 | m_host_pool(), m_host_pool_sizes() { 228 | if (m_name.empty()) { 229 | std::ostringstream o; 230 | o << this; 231 | m_name = o.str(); 232 | } 233 | } 234 | 235 | pooled_cuda_allocator::~pooled_cuda_allocator() { 236 | delete_pool(dev_memory_space()); 237 | delete_pool(host_memory_space()); 238 | } 239 | 240 | template 241 | size_t pooled_cuda_allocator::pool_size(memory_space m) const { 242 | size_t sum = 0; 243 | 244 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 245 | const std::map& pool_sizes = get_pool_sizes(m); 246 | 247 | std::map::const_iterator it; 248 | for (it = pool_sizes.begin(); it != pool_sizes.end(); it++) { 249 | sum += it->second; 250 | } 251 | return sum; 252 | } 253 | 254 | template 255 | size_t pooled_cuda_allocator::pool_count(memory_space m) const { 256 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 257 | return get_pool_sizes(m).size(); 258 | } 259 | 260 | template 261 | size_t pooled_cuda_allocator::pool_free_count(memory_space m) const { 262 | size_t free = 0; 263 | 264 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 265 | const std::map& pool = get_pool(m); 266 | 267 | std::map::const_iterator it; 268 | for (it = pool.begin(); it != pool.end(); it++) { 269 | if (it->second) { 270 | free++; 271 | } 272 | } 273 | return free; 274 | } 275 | 276 | size_t pooled_cuda_allocator::pool_free_count() const { 277 | return pool_free_count(dev_memory_space()) + pool_free_count(host_memory_space()); 278 | } 279 | 280 | size_t pooled_cuda_allocator::pool_size() const { 281 | return pool_size(dev_memory_space()) + pool_size(host_memory_space()); 282 | } 283 | 284 | size_t pooled_cuda_allocator::pool_count() const { 285 | return pool_count(dev_memory_space()) + pool_count(host_memory_space()); 286 | } 287 | 288 | void pooled_cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space m) { 289 | if (memsize * valueSize < MIN_SIZE_DEV) { 290 | default_alloc.alloc(ptr, memsize, valueSize, m); 291 | } else { 292 | alloc_pooled(ptr, memsize, valueSize, m); 293 | } 294 | } 295 | 296 | template 297 | void pooled_cuda_allocator::alloc_pooled(void** ptr, size_t memsize, size_t valueSize, memory_space m) { 298 | 299 | assert(memsize > 0); 300 | 301 | // try to find memory in the pool that is available and large enough but not too large 302 | size_t bestSize = 0; 303 | void* bestPtr = 0; 304 | 305 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 306 | std::map& pool = get_pool(m); 307 | std::map& pool_sizes = get_pool_sizes(m); 308 | 309 | std::map::iterator it; 310 | { 311 | for (it = pool.begin(); it != pool.end(); it++) { 312 | // available? 313 | if (!it->second) { 314 | continue; 315 | } 316 | 317 | size_t size = pool_sizes[it->first]; 318 | // large enough? 
319 | if (size > memsize * valueSize) { 320 | if (bestPtr == 0 || size < bestSize) { 321 | bestPtr = it->first; 322 | bestSize = size; 323 | } 324 | } 325 | // can’t get better 326 | else if (size == memsize * valueSize) { 327 | bestPtr = it->first; 328 | bestSize = size; 329 | break; 330 | } 331 | } 332 | 333 | if (bestPtr) { 334 | // we take it 335 | assert(pool[bestPtr]); 336 | pool[bestPtr] = false; 337 | *ptr = bestPtr; 338 | 339 | CUV_LOG_DEBUG("reusing " << memsize * valueSize << "/" << pool_sizes[bestPtr] << " bytes in pool " 340 | << m_name << " (" << memtype(m) << ")"); 341 | 342 | return; 343 | } 344 | } 345 | 346 | CUV_LOG_DEBUG("allocating " << memsize << "x" << valueSize << " bytes in pool " << m_name << 347 | " (" << memtype(m) << ")"); 348 | 349 | // nothing found? 350 | // allocate new memory 351 | cuda_alloc.alloc(ptr, memsize, valueSize, m); 352 | 353 | pool[*ptr] = false; 354 | pool_sizes[*ptr] = memsize * valueSize; 355 | 356 | CUV_LOG_DEBUG("allocated in pool " << m_name << " (" << memtype(m) << 357 | "). total bytes: " << pool_size(m) << ". count: " << pool_count(m) << ". free: " 358 | << pool_free_count(m)); 359 | 360 | assert(!pool.empty()); 361 | } 362 | 363 | void pooled_cuda_allocator::dealloc(void** ptr, dev_memory_space m) { 364 | do_dealloc(ptr, m); 365 | } 366 | 367 | void pooled_cuda_allocator::dealloc(void** ptr, host_memory_space m) { 368 | do_dealloc(ptr, m); 369 | } 370 | 371 | template 372 | void pooled_cuda_allocator::do_dealloc(void** ptr, memory_space m) { 373 | 374 | assert(*ptr); 375 | 376 | boost::recursive_mutex::scoped_lock pool_lock(get_pool_mutex(m)); 377 | std::map& pool = get_pool(m); 378 | 379 | std::map::iterator it = pool.find(*ptr); 380 | if (it == pool.end()) { 381 | default_alloc.dealloc(ptr, m); 382 | return; 383 | } 384 | 385 | // mark the memory as available 386 | assert(it->second == false); 387 | it->second = true; 388 | 389 | #ifndef NDEBUG 390 | std::map& pool_sizes = get_pool_sizes(m); 391 | 392 | assert(pool_sizes[*ptr] > 0); 393 | 394 | CUV_LOG_DEBUG( 395 | "released " << pool_sizes[*ptr] << " bytes in pool " << m_name << " (" 396 | << memtype(m) << "). total bytes: " << pool_size(m) << ". 
count: " << pool_count(m) <<", free: " << pool_free_count(m)); 397 | #endif 398 | 399 | *ptr = 0; 400 | } 401 | 402 | void pooled_cuda_allocator::alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space m) { 403 | if (memsize * valueSize < MIN_SIZE_HOST) { 404 | default_alloc.alloc(ptr, memsize, valueSize, m); 405 | } else { 406 | alloc_pooled(ptr, memsize, valueSize, m); 407 | } 408 | } 409 | 410 | void pooled_cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 411 | host_memory_space m) { 412 | // not yet pooled 413 | default_alloc.alloc2d(ptr, pitch, height, width, valueSize, m); 414 | } 415 | 416 | void pooled_cuda_allocator::alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 417 | dev_memory_space m) { 418 | // not yet pooled 419 | default_alloc.alloc2d(ptr, pitch, height, width, valueSize, m); 420 | } 421 | 422 | } 423 | 424 | #define CUV_POOLED_CUDA_ALLOCATOR_INST(X) \ 425 | template size_t cuv::pooled_cuda_allocator::pool_count(X) const; \ 426 | template size_t cuv::pooled_cuda_allocator::pool_free_count(X) const; \ 427 | template size_t cuv::pooled_cuda_allocator::pool_size(X) const; 428 | 429 | CUV_POOLED_CUDA_ALLOCATOR_INST(cuv::dev_memory_space); 430 | CUV_POOLED_CUDA_ALLOCATOR_INST(cuv::host_memory_space); 431 | -------------------------------------------------------------------------------- /src/cuv/allocators.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_ALLOCATORS_HPP__ 28 | #define __CUV_ALLOCATORS_HPP__ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | #ifdef DEBUG_POOLING 37 | #include 38 | #define CUV_LOG_DEBUG(X) std::cout << X << std::endl; 39 | #else 40 | #define CUV_LOG_DEBUG(X) 41 | #endif 42 | 43 | #include "tags.hpp" 44 | #include "meta_programming.hpp" 45 | 46 | namespace cuv { 47 | 48 | class allocator { 49 | 50 | public: 51 | 52 | virtual ~allocator() { 53 | } 54 | 55 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space) = 0; 56 | 57 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space) = 0; 58 | 59 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 60 | host_memory_space) = 0; 61 | 62 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 63 | dev_memory_space) = 0; 64 | 65 | virtual void dealloc(void** ptr, host_memory_space) = 0; 66 | 67 | virtual void dealloc(void** ptr, dev_memory_space) = 0; 68 | 69 | }; 70 | 71 | /** 72 | * Allocator allows allocation, deallocation and copying depending on memory_space_type 73 | * 74 | * \ingroup tools 75 | */ 76 | class default_allocator: public allocator { 77 | 78 | public: 79 | 80 | virtual ~default_allocator() { 81 | } 82 | 83 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space); 84 | 85 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space); 86 | 87 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 88 | host_memory_space); 89 | 90 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 91 | dev_memory_space); 92 | 93 | virtual void dealloc(void** ptr, host_memory_space); 94 | 95 | virtual void dealloc(void** ptr, dev_memory_space); 96 | 97 | }; 98 | 99 | /** 100 | * @brief allocator that uses cudaMallocHost for allocations in host_memory_space 101 | */ 102 | class cuda_allocator: public default_allocator { 103 | 104 | public: 105 | 106 | virtual ~cuda_allocator() { 107 | } 108 | 109 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space); 110 | 111 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space); 112 | 113 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 114 | host_memory_space); 115 | 116 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 117 | dev_memory_space); 118 | 119 | virtual void dealloc(void** ptr, host_memory_space); 120 | 121 | virtual void dealloc(void** ptr, dev_memory_space); 122 | 123 | } 124 | ; 125 | 126 | /** 127 | * @brief allocator that naively pools device and host memory 128 | */ 129 | class pooled_cuda_allocator: public allocator { 130 | private: 131 | 132 | std::string m_name; 133 | 134 | boost::recursive_mutex m_dev_pool_mutex; 135 | boost::recursive_mutex m_host_pool_mutex; 136 | 137 | // maps pointers to flag: true means memory is available. 
false means: currently in use 138 | std::map m_dev_pool; 139 | std::map m_dev_pool_sizes; 140 | 141 | std::map m_host_pool; 142 | std::map m_host_pool_sizes; 143 | 144 | default_allocator default_alloc; 145 | cuda_allocator cuda_alloc; 146 | 147 | pooled_cuda_allocator(const pooled_cuda_allocator& o); 148 | pooled_cuda_allocator& operator=(const pooled_cuda_allocator& o); 149 | 150 | // for logging 151 | std::string memtype(host_memory_space) const { 152 | return "host space"; 153 | } 154 | 155 | // for logging 156 | std::string memtype(dev_memory_space) const { 157 | return "dev space"; 158 | } 159 | 160 | template 161 | boost::recursive_mutex& get_pool_mutex(memory_space m) const; 162 | 163 | template 164 | std::map& get_pool(memory_space m); 165 | 166 | template 167 | const std::map& get_pool(memory_space m) const; 168 | 169 | template 170 | std::map& get_pool_sizes(memory_space m); 171 | 172 | template 173 | const std::map& get_pool_sizes(memory_space m) const; 174 | 175 | template 176 | void collect_garbage(memory_space m); 177 | 178 | template 179 | void alloc_pooled(void** ptr, size_t memsize, size_t valueSize, memory_space m); 180 | 181 | template 182 | void delete_pool(memory_space); 183 | 184 | template 185 | void do_dealloc(void** ptr, memory_space m); 186 | 187 | public: 188 | static const size_t MIN_SIZE_HOST = 8192; 189 | static const size_t MIN_SIZE_DEV = 1; 190 | explicit pooled_cuda_allocator(const std::string& _name = ""); 191 | 192 | virtual ~pooled_cuda_allocator(); 193 | 194 | virtual void garbage_collection() { 195 | collect_garbage(host_memory_space()); 196 | collect_garbage(dev_memory_space()); 197 | } 198 | 199 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, host_memory_space); 200 | 201 | virtual void alloc(void** ptr, size_t memsize, size_t valueSize, dev_memory_space); 202 | 203 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 204 | host_memory_space); 205 | 206 | virtual void alloc2d(void** ptr, size_t& pitch, size_t height, size_t width, size_t valueSize, 207 | dev_memory_space); 208 | 209 | virtual void dealloc(void** ptr, host_memory_space); 210 | 211 | virtual void dealloc(void** ptr, dev_memory_space); 212 | 213 | template 214 | size_t pool_free_count(memory_space m) const; 215 | 216 | template 217 | size_t pool_size(memory_space m) const; 218 | 219 | template 220 | size_t pool_count(memory_space m) const; 221 | 222 | size_t pool_free_count() const; 223 | 224 | size_t pool_size() const; 225 | 226 | size_t pool_count() const; 227 | 228 | }; 229 | 230 | } 231 | 232 | #endif 233 | -------------------------------------------------------------------------------- /src/cuv/cuda_general.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 
15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_GENERAL_HPP__ 28 | #define __CUV_GENERAL_HPP__ 29 | 30 | #include 31 | #include 32 | 33 | #ifndef CUDA_TEST_DEVICE 34 | # define CUDA_TEST_DEVICE 0 35 | #endif 36 | 37 | namespace cuv { 38 | 39 | /** check whether cuda thinks there was an error and fail with msg, if this is the case 40 | * @ingroup tools 41 | */ 42 | static inline void checkCudaError(const char *msg) { 43 | cudaError_t err = cudaGetLastError(); 44 | if (cudaSuccess != err) { 45 | throw std::runtime_error(std::string(msg) + ": " + cudaGetErrorString(err)); 46 | } 47 | } 48 | 49 | // use this macro to make sure no error occurs when cuda functions are called 50 | #ifdef NDEBUG 51 | # define cuvSafeCall(X) \ 52 | if(strcmp(#X,"cudaThreadSynchronize()")!=0){ X; cuv::checkCudaError(#X); } 53 | #else 54 | # define cuvSafeCall(X) X; cuv::checkCudaError(#X); 55 | #endif 56 | 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /src/cuv/memory.cu: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
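checkCudaError and the cuvSafeCall macro in src/cuv/cuda_general.hpp above turn a failing CUDA runtime call into a std::runtime_error carrying the stringified call plus the cudaGetErrorString message (with NDEBUG defined, the macro additionally drops explicit cudaThreadSynchronize() calls). A small sketch of a typical call site, again assuming an installed include/cuv prefix; it is an illustration, not code from the library:

```c++
#include <iostream>
#include <stdexcept>
#include <cuda_runtime.h>
#include <cuv/cuda_general.hpp>   // assumed install location of cuda_general.hpp

int main() {
    try {
        void* d_buf = 0;
        // each macro invocation runs the call, then checks cudaGetLastError()
        cuvSafeCall(cudaMalloc(&d_buf, 1024));
        cuvSafeCall(cudaMemset(d_buf, 0, 1024));
        cuvSafeCall(cudaFree(d_buf));
    } catch (const std::runtime_error& e) {
        // e.what() names the failing call and the CUDA error string
        std::cerr << "CUDA error: " << e.what() << std::endl;
        return 1;
    }
    return 0;
}
```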
25 | ####################################################################################### 26 | #endif 27 | #include "memory.hpp" 28 | 29 | #include 30 | 31 | #include "cuda_general.hpp" 32 | #include "meta_programming.hpp" 33 | 34 | namespace cuv { 35 | 36 | namespace detail { 37 | 38 | template 39 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, host_memory_space, 40 | cudaStream_t stream) { 41 | memcpy(dst, src, size * sizeof(value_type)); 42 | } 43 | 44 | template 45 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, host_memory_space, 46 | cudaStream_t stream) { 47 | for (size_t i = 0; i < size; i++) 48 | dst[i] = static_cast(src[i]); 49 | } 50 | 51 | template 52 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, dev_memory_space, 53 | cudaStream_t stream) { 54 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToHost, stream)); 55 | if (stream == 0) { 56 | cuvSafeCall(cudaStreamSynchronize(stream)); 57 | } 58 | } 59 | template 60 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, dev_memory_space, 61 | cudaStream_t stream) { 62 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToDevice, stream)); 63 | if (stream == 0) { 64 | cuvSafeCall(cudaStreamSynchronize(stream)); 65 | } 66 | } 67 | 68 | template 69 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, dev_memory_space, 70 | cudaStream_t stream) { 71 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToHost, stream)); 72 | if (stream == 0) { 73 | cuvSafeCall(cudaStreamSynchronize(stream)); 74 | } 75 | } 76 | 77 | template 78 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, host_memory_space, 79 | cudaStream_t stream) { 80 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyHostToDevice, stream)); 81 | if (stream == 0) { 82 | cuvSafeCall(cudaStreamSynchronize(stream)); 83 | } 84 | } 85 | 86 | template 87 | void copy(value_type* dst, const value_type2* src, size_t size, dev_memory_space, dev_memory_space, 88 | cudaStream_t stream) { 89 | if (IsSame::Result::value) { 90 | cuvSafeCall(cudaMemcpyAsync(dst, src, size * sizeof(value_type), cudaMemcpyDeviceToDevice, stream)); 91 | if (stream == 0) { 92 | cuvSafeCall(cudaStreamSynchronize(stream)); 93 | } 94 | } else { 95 | thrust::copy(thrust::device_ptr(const_cast(src)), 96 | thrust::device_ptr(const_cast(src)) + size, 97 | thrust::device_ptr(dst)); 98 | cuvSafeCall(cudaThreadSynchronize()); 99 | } 100 | } 101 | 102 | template 103 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 104 | host_memory_space, host_memory_space, cudaStream_t stream) { 105 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), 106 | src, spitch * sizeof(value_type2), 107 | w * sizeof(value_type), h, cudaMemcpyHostToHost, stream)); 108 | if (stream == 0) { 109 | cuvSafeCall(cudaStreamSynchronize(stream)); 110 | } 111 | } 112 | 113 | template 114 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, 115 | size_t w, host_memory_space, dev_memory_space, cudaStream_t stream) { 116 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), src, spitch * sizeof(value_type2), 117 | w * sizeof(value_type), h, cudaMemcpyDeviceToHost, stream)); 118 | if (stream == 0) { 119 | 
cuvSafeCall(cudaStreamSynchronize(stream)); 120 | } 121 | } 122 | 123 | template 124 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, 125 | size_t w, dev_memory_space, host_memory_space, cudaStream_t stream) { 126 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), src, spitch * sizeof(value_type2), 127 | w * sizeof(value_type), h, cudaMemcpyHostToDevice, stream)); 128 | if (stream == 0) { 129 | cuvSafeCall(cudaStreamSynchronize(stream)); 130 | } 131 | } 132 | 133 | template 134 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, 135 | size_t w, dev_memory_space, dev_memory_space, cudaStream_t stream) { 136 | cuvSafeCall(cudaMemcpy2DAsync(dst, dpitch * sizeof(value_type), 137 | src, spitch * sizeof(value_type2), 138 | w * sizeof(value_type), h, cudaMemcpyDeviceToDevice, stream)); 139 | if (stream == 0) { 140 | cuvSafeCall(cudaStreamSynchronize(stream)); 141 | } 142 | } 143 | 144 | #define CUV_MEMORY_COPY(TYPE) \ 145 | template void copy(TYPE*, const TYPE*, size_t, host_memory_space, host_memory_space, cudaStream_t); \ 146 | template void copy(TYPE*, const TYPE*, size_t, host_memory_space, dev_memory_space, cudaStream_t); \ 147 | template void copy(TYPE*, const TYPE*, size_t, dev_memory_space, host_memory_space, cudaStream_t); \ 148 | template void copy(TYPE*, const TYPE*, size_t, dev_memory_space, dev_memory_space, cudaStream_t); \ 149 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, host_memory_space, host_memory_space, cudaStream_t); \ 150 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, host_memory_space, dev_memory_space, cudaStream_t); \ 151 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, dev_memory_space, host_memory_space, cudaStream_t); \ 152 | template void copy2d(TYPE*, const TYPE*, size_t, size_t, size_t, size_t, dev_memory_space, dev_memory_space, cudaStream_t); 153 | 154 | CUV_MEMORY_COPY(signed char); 155 | CUV_MEMORY_COPY(unsigned char); 156 | CUV_MEMORY_COPY(short); 157 | CUV_MEMORY_COPY(unsigned short); 158 | CUV_MEMORY_COPY(int); 159 | CUV_MEMORY_COPY(unsigned int); 160 | CUV_MEMORY_COPY(float); 161 | CUV_MEMORY_COPY(double); 162 | 163 | } 164 | 165 | } 166 | -------------------------------------------------------------------------------- /src/cuv/memory.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
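/* Illustrative call of the detail::copy() overloads implemented in memory.cu above -- a
 * minimal sketch only (buffer names are made up). The first memory-space tag describes the
 * destination, the second the source, so a host-to-device transfer on the default stream reads:
 * @code
 * float host_buf[16];
 * float* dev_buf = NULL;
 * cuvSafeCall(cudaMalloc((void**) &dev_buf, sizeof(host_buf)));
 * cuv::detail::copy(dev_buf, host_buf, 16, cuv::dev_memory_space(), cuv::host_memory_space(), 0);
 * @endcode
 */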
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_MEMORY_HPP__ 28 | #define __CUV_MEMORY_HPP__ 29 | 30 | #include <boost/make_shared.hpp> 31 | #include <boost/shared_ptr.hpp> 32 | #include <cuda_runtime_api.h> 33 | #include <limits> 34 | #include <stdexcept> 35 | 36 | #include "allocators.hpp" 37 | #include "reference.hpp" 38 | 39 | namespace boost { 40 | namespace serialization { 41 | class access; 42 | } 43 | } 44 | 45 | namespace cuv { 46 | 47 | /** 48 | * @addtogroup data_structures 49 | * @{ 50 | */ 51 | 52 | /** 53 | * @addtogroup tags 54 | * @{ 55 | */ 56 | /// Tag for column major matrices 57 | struct column_major { 58 | }; 59 | /// Tag for row major matrices 60 | struct row_major { 61 | }; 62 | 63 | /// tag for linear memory 64 | struct linear_memory_tag { 65 | }; 66 | 67 | /// tag for pitched memory 68 | struct pitched_memory_tag { 69 | }; 70 | 71 | /** @} */ // tags 72 | namespace detail { 73 | 74 | /// copy from host to host 75 | template<class value_type> 76 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, host_memory_space, cudaStream_t); 77 | 78 | /// copy from device to host 79 | template<class value_type> 80 | void copy(value_type* dst, const value_type* src, size_t size, host_memory_space, dev_memory_space, cudaStream_t); 81 | 82 | /// copy from host to host 83 | template<class value_type, class value_type2> 84 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, host_memory_space, cudaStream_t); 85 | 86 | /// copy from device to host 87 | template<class value_type, class value_type2> 88 | void copy(value_type* dst, const value_type2* src, size_t size, host_memory_space, dev_memory_space, cudaStream_t); 89 | 90 | /// copy from host to device 91 | template<class value_type> 92 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, host_memory_space, cudaStream_t); 93 | 94 | /// copy from device to device 95 | template<class value_type> 96 | void copy(value_type* dst, const value_type* src, size_t size, dev_memory_space, dev_memory_space, cudaStream_t); 97 | 98 | /// copy from host to device 99 | template<class value_type, class value_type2> 100 | void copy(value_type* dst, const value_type2* src, size_t size, dev_memory_space, host_memory_space, cudaStream_t); 101 | 102 | /// copy from device to device 103 | template<class value_type, class value_type2> 104 | void copy(value_type* dst, const value_type2* src, size_t size, dev_memory_space, dev_memory_space, cudaStream_t); 105 | 106 | /// copy from host to host 107 | template<class value_type, class value_type2> 108 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 109 | host_memory_space, host_memory_space, cudaStream_t); 110 | 111 | /// copy from device to host 112 | template<class value_type, class value_type2> 113 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 114 | host_memory_space, dev_memory_space, cudaStream_t); 115 | 116 | /// copy from host to device 117 | template<class value_type, class value_type2> 118 | void copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 119 | dev_memory_space, host_memory_space, cudaStream_t); 120 | 121 | /// copy from device to device 122 | template<class value_type, class value_type2> 123 | void 
copy2d(value_type* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 124 | dev_memory_space, dev_memory_space, cudaStream_t); 125 | } 126 | 127 | /** 128 | * simply keeps a pointer and deallocates it when destroyed 129 | */ 130 | template 131 | class memory { 132 | 133 | public: 134 | typedef typename unconst::type value_type; ///< type of contained values 135 | typedef const V const_value_type; ///< const version of value_type 136 | typedef M memory_space_type; ///< host or dev memory_space 137 | typedef unsigned int size_type; ///< type of shapes 138 | typedef int index_type; ///< how to index values 139 | typedef reference reference_type; ///< type of reference you get using operator[] 140 | typedef const reference const_reference_type; ///< type of reference you get using operator[] 141 | 142 | private: 143 | friend class boost::serialization::access; 144 | 145 | /// prohibit copying 146 | memory(const memory&); 147 | 148 | /// prohibit copying 149 | memory& operator=(const memory& o); 150 | 151 | protected: 152 | V* m_ptr; ///< points to allocated memory 153 | size_type m_size; ///< size (for serialization) 154 | boost::shared_ptr m_allocator; ///< how stored memory was allocated 155 | bool m_owned; ///< flag is this instance owns the memory (m_ptr) and is responsibly for destroying 156 | 157 | void check_size_limit(size_t size) const { 158 | if (size > static_cast(std::numeric_limits::max())) { 159 | throw std::runtime_error("maximum memory size exceeded"); 160 | } 161 | } 162 | 163 | public: 164 | 165 | /// @return pointer to allocated memory 166 | V* ptr() { 167 | return m_ptr; 168 | } 169 | 170 | /// @return pointer to allocated memory (const) 171 | const V* ptr() const { 172 | return m_ptr; 173 | } 174 | 175 | /// @return number of stored elements 176 | size_type size() const { 177 | return m_size; 178 | } 179 | 180 | /// @return number of stored bytes 181 | size_type memsize() const { 182 | return size() * sizeof(V); 183 | } 184 | 185 | /// reset information (use with care, for deserialization) 186 | void reset(V* p, size_type s) { 187 | m_ptr = p; 188 | m_size = s; 189 | } 190 | 191 | /// default constructor (just sets ptr to NULL) 192 | explicit memory(const boost::shared_ptr& _allocator) : 193 | m_ptr(NULL), m_size(0), m_allocator(_allocator), m_owned(true) { 194 | } 195 | 196 | /// construct with pointer (takes /ownership/ of this pointer and deletes it when destroyed!) 
197 | explicit memory(value_type* ptr, size_type size, const boost::shared_ptr& _allocator, bool owned = true) : 198 | m_ptr(ptr), m_size(size), m_allocator(_allocator), m_owned(owned) { 199 | } 200 | 201 | /// destructor (deallocates the memory) 202 | ~memory() { 203 | dealloc(); 204 | } 205 | 206 | /// dellocate space 207 | void dealloc() { 208 | if (m_ptr && m_owned) { 209 | m_allocator->dealloc(reinterpret_cast(&this->m_ptr), memory_space_type()); 210 | } 211 | m_ptr = NULL; 212 | m_size = 0; 213 | } 214 | 215 | template 216 | void copy_from(V* dst, const value_type2* src, size_t size, memory_space m, cudaStream_t stream) { 217 | detail::copy(dst, src, size, M(), m, stream); 218 | } 219 | 220 | template 221 | void copy_from(const value_type2* src, size_t size, memory_space m, cudaStream_t stream) { 222 | copy_from(m_ptr, src, size, m, stream); 223 | } 224 | 225 | template 226 | void copy2d_from(V* dst, const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 227 | memory_space m, cudaStream_t stream) { 228 | detail::copy2d(dst, src, dpitch, spitch, h, w, M(), m, stream); 229 | } 230 | 231 | template 232 | void copy2d_from(const value_type2* src, size_t dpitch, size_t spitch, size_t h, size_t w, 233 | memory_space m, cudaStream_t stream) { 234 | copy2d_from(m_ptr, src, dpitch, spitch, h, w, m, stream); 235 | } 236 | 237 | }; 238 | 239 | /** 240 | * represents contiguous memory 241 | */ 242 | template 243 | class linear_memory: public memory { 244 | private: 245 | typedef memory super; 246 | public: 247 | typedef typename super::value_type value_type; ///< type of contained values 248 | typedef typename super::const_value_type const_value_type; ///< const version of value_type 249 | typedef typename super::memory_space_type memory_space_type; ///< host or dev memory_space 250 | typedef typename super::index_type index_type; ///< how to index values 251 | typedef typename super::size_type size_type; ///< type of shapes 252 | typedef typename super::reference_type reference_type; ///< type of reference you get using operator[] 253 | typedef typename super::const_reference_type const_reference_type; ///< type of reference you get using operator[] 254 | 255 | private: 256 | 257 | friend class boost::serialization::access; 258 | typedef linear_memory my_type; ///< my own type 259 | using super::m_ptr; 260 | using super::m_size; 261 | using super::m_allocator; 262 | 263 | public: 264 | 265 | /// default constructor: does nothing 266 | explicit linear_memory(const boost::shared_ptr _allocator = boost::make_shared()) : 267 | memory(_allocator) { 268 | } 269 | 270 | /** constructor: reserves space for i elements 271 | * @param i number of elements 272 | */ 273 | explicit linear_memory(size_type i, const boost::shared_ptr _allocator = 274 | boost::make_shared()) : 275 | memory(_allocator) { 276 | m_size = i; 277 | alloc(); 278 | } 279 | 280 | /// releases ownership of pointer (for storage in memory class) 281 | value_type* release() { 282 | value_type* ptr = m_ptr; 283 | m_ptr = NULL; 284 | return ptr; 285 | } 286 | 287 | /// sets the size (reallocates if necessary) 288 | void set_size(size_type s) { 289 | if (s != this->size()) { 290 | this->dealloc(); 291 | m_size = s; 292 | alloc(); 293 | } 294 | } 295 | 296 | /// allocate space according to size() 297 | void alloc() { 298 | assert(this->m_ptr == NULL); 299 | if (m_size > 0) 300 | m_allocator->alloc(reinterpret_cast(&m_ptr), m_size, sizeof(V), memory_space_type()); 301 | } 302 | 303 | /** 304 | * @brief Copy linear_memory. 
305 | * 306 | * @param o Source linear_memory 307 | * 308 | * @return *this 309 | * 310 | */ 311 | my_type& operator=(const my_type& o) { 312 | if (this == &o) 313 | return *this; 314 | 315 | if (this->size() != o.size()) { 316 | this->dealloc(); 317 | m_size = o.size(); 318 | this->alloc(); 319 | } 320 | 321 | // TODO async copy 322 | cudaStream_t stream = 0; 323 | this->copy_from(o, stream); 324 | 325 | return *this; 326 | } 327 | 328 | /** 329 | * @overload 330 | * 331 | * @brief Copy linear_memory from other memory type. 332 | * 333 | * @param o Source linear_memory 334 | * 335 | * @return *this 336 | * 337 | */ 338 | template 339 | my_type& operator=(const linear_memory& o) { 340 | if (this->size() != o.size()) { 341 | this->dealloc(); 342 | m_size = o.size(); 343 | this->alloc(); 344 | } 345 | 346 | // TODO async copy 347 | cudaStream_t stream = 0; 348 | this->copy_from(o, stream); 349 | return *this; 350 | } 351 | 352 | /** 353 | * construct from other linear memory 354 | */ 355 | explicit linear_memory(const my_type& o) : 356 | memory(o.m_allocator) { 357 | operator=(o); 358 | } 359 | 360 | /** 361 | * construct from other linear memory 362 | */ 363 | template 364 | explicit linear_memory(const linear_memory& o) : 365 | memory(o.m_allocator) { 366 | operator=(o); 367 | } 368 | 369 | /** 370 | * @return a reference to memory at a position 371 | * @param idx position 372 | */ 373 | reference_type operator[](const index_type& idx) { 374 | assert(idx >= 0); 375 | assert((size_type) idx < m_size); 376 | return reference_type(this->m_ptr + idx); 377 | } 378 | 379 | /** 380 | * @overload 381 | * 382 | * @return a reference to memory at a position 383 | * @param idx position 384 | */ 385 | const_reference_type operator[](const index_type& idx) const { 386 | assert(idx >= 0); 387 | assert((size_type) idx < m_size); 388 | return const_reference_type(this->m_ptr + idx); 389 | } 390 | 391 | /// deallocates memory 392 | ~linear_memory() { 393 | this->dealloc(); 394 | } 395 | 396 | /// set strides for this memory 397 | void set_strides(linear_memory& strides, 398 | const linear_memory& shape, row_major) { 399 | size_t size = 1; 400 | for (int i = shape.size() - 1; i >= 0; --i) { 401 | strides[i] = (shape[i] == 1) ? 0 : size; 402 | size *= shape[i]; 403 | } 404 | this->check_size_limit(size); 405 | } 406 | 407 | /// set strides for this memory 408 | void set_strides(linear_memory& strides, 409 | const linear_memory& shape, column_major) { 410 | size_t size = 1; 411 | for (size_t i = 0; i < shape.size(); ++i) { 412 | strides[i] = (shape[i] == 1) ? 
0 : size; 413 | size *= shape[i]; 414 | } 415 | this->check_size_limit(size); 416 | } 417 | 418 | /** reverse the array (for transposing etc) 419 | * 420 | * currently only enabled for host memory space arrays 421 | */ 422 | void reverse() { 423 | if (IsSame::Result::value) 424 | throw std::runtime_error("reverse of dev linear memory not implemented"); 425 | value_type* __first = m_ptr, *__last = m_ptr + this->size(); 426 | while (true) 427 | if (__first == __last || __first == --__last) 428 | return; 429 | else { 430 | std::iter_swap(__first, __last); 431 | ++__first; 432 | } 433 | } 434 | 435 | template 436 | void copy_from(const value_type2* src, size_t size, memory_space m, cudaStream_t stream) { 437 | memory::copy_from(src, size, m, stream); 438 | } 439 | 440 | template 441 | void copy_from(const linear_memory& src, cudaStream_t stream) const { 442 | detail::copy(m_ptr, src.ptr(), src.size(), M(), OM(), stream); 443 | } 444 | 445 | }; 446 | 447 | /** 448 | * represents 2D non-contiguous ("pitched") memory 449 | */ 450 | template 451 | class pitched_memory: public memory { 452 | 453 | private: 454 | typedef memory super; 455 | 456 | public: 457 | 458 | typedef typename super::value_type value_type; ///< type of contained values 459 | typedef typename super::const_value_type const_value_type; ///< const version of value_type 460 | typedef typename super::memory_space_type memory_space_type; ///< host or dev memory_space 461 | typedef typename super::index_type index_type; ///< how to index values 462 | typedef typename super::size_type size_type; ///< type of shapes 463 | typedef typename super::reference_type reference_type; ///< type of reference you get using operator[] 464 | typedef typename super::const_reference_type const_reference_type; ///< type of reference you get using operator[] 465 | 466 | private: 467 | friend class boost::serialization::access; 468 | typedef pitched_memory my_type; ///< my own type 469 | size_type m_rows; ///< number of rows 470 | size_type m_cols; ///< number of columns 471 | size_type m_pitch; ///< pitch (multiples of sizeof(V)) 472 | using super::m_ptr; 473 | using super::m_size; 474 | using super::m_allocator; 475 | public: 476 | 477 | /// @return the number of rows 478 | size_type rows() const { 479 | return m_rows; 480 | } 481 | 482 | /// @return the number of cols 483 | size_type cols() const { 484 | return m_cols; 485 | } 486 | 487 | /// @return the number of allocated cols 488 | size_type pitch() const { 489 | return m_pitch; 490 | } 491 | 492 | /// @return number of stored elements 493 | size_type size() const { 494 | return m_rows * m_pitch; 495 | } 496 | 497 | /// @return number of stored bytes 498 | size_type memsize() const { 499 | return size() * sizeof(V); 500 | } 501 | 502 | /// default constructor: does nothing 503 | explicit pitched_memory(const boost::shared_ptr _allocator = boost::make_shared()) : 504 | memory(_allocator), m_rows(0), m_cols(0), m_pitch(0) { 505 | } 506 | 507 | /** constructor: reserves space for at least i*j elements 508 | * @param i number of rows 509 | * @param j minimum number of elements per row 510 | */ 511 | explicit pitched_memory(index_type i, index_type j, const boost::shared_ptr _allocator = 512 | boost::make_shared()) : 513 | memory(_allocator), m_rows(i), m_cols(j), m_pitch(0) { 514 | alloc(); 515 | } 516 | 517 | /** 518 | * allocate space according to size() 519 | */ 520 | void alloc() { 521 | assert(this->m_ptr == NULL); 522 | size_t pitch; 523 | m_allocator->alloc2d(reinterpret_cast(&this->m_ptr), 
pitch, m_rows, m_cols, sizeof(V), 524 | memory_space_type()); 525 | assert(this->m_ptr != NULL); 526 | m_pitch = pitch; 527 | assert(m_pitch % sizeof(value_type) == 0); 528 | m_pitch /= sizeof(value_type); 529 | m_size = m_rows * m_pitch; // in class memory 530 | } 531 | 532 | /// releases ownership of pointer (for storage in memory class) 533 | value_type* release() { 534 | value_type* ptr = m_ptr; 535 | m_ptr = NULL; 536 | return ptr; 537 | } 538 | 539 | /** 540 | * set the size (reallocating, if necessary) 541 | * @param rows number of desired rows 542 | * @param cols number of desired columns 543 | */ 544 | void set_size(size_type rows, size_type cols) { 545 | if (cols > m_pitch || rows > m_rows) { 546 | this->dealloc(); 547 | m_rows = rows; 548 | m_cols = cols; 549 | this->alloc(); 550 | } else { 551 | m_rows = rows; 552 | m_cols = cols; 553 | } 554 | } 555 | 556 | /** 557 | * @brief Copy pitched_memory. 558 | * 559 | * @param o Source pitched_memory 560 | * 561 | * @return *this 562 | * 563 | */ 564 | my_type& operator=(const my_type& o) { 565 | if (this == &o) 566 | return *this; 567 | 568 | if (m_pitch < o.m_cols || m_rows < o.m_rows) { 569 | this->dealloc(); 570 | m_cols = o.m_cols; 571 | m_rows = o.m_rows; 572 | this->alloc(); 573 | } 574 | m_cols = o.m_cols; 575 | m_rows = o.m_rows; 576 | this->copy_from(o); 577 | return *this; 578 | } 579 | 580 | /** 581 | * @overload 582 | * 583 | * @brief Copy pitched_memory from other memory type. 584 | * 585 | * @param o Source pitched_memory 586 | * 587 | * @return *this 588 | * 589 | */ 590 | template 591 | my_type& 592 | operator=(const pitched_memory& o) { 593 | if (m_pitch < o.m_cols || m_rows < o.m_rows) { 594 | this->dealloc(); 595 | m_cols = o.m_cols; 596 | m_rows = o.m_rows; 597 | this->alloc(); 598 | } 599 | m_cols = o.m_cols; 600 | m_rows = o.m_rows; 601 | this->copy_from(o); 602 | return *this; 603 | } 604 | 605 | /** 606 | * @return a reference to memory at a position as if this were pitched memory 607 | * @param idx position 608 | */ 609 | reference_type operator[](const index_type& idx) { 610 | assert(idx >= 0); 611 | index_type row = idx / m_cols; 612 | index_type col = idx % m_cols; 613 | assert((size_type) row < m_rows); 614 | assert((size_type) col < m_cols); 615 | return reference_type(this->m_ptr + row * m_pitch + col); 616 | } 617 | 618 | /** 619 | * @overload 620 | * 621 | * @return a reference to memory at a position 622 | * @param idx position 623 | */ 624 | const_reference_type operator[](const index_type& idx) const { 625 | return const_cast(*this)(idx); 626 | } 627 | 628 | /** 629 | * get a reference to a datum in memory 630 | * 631 | * @param i first (slow-changing) dimension index 632 | * @param j second (fast-changing) dimension index 633 | * @return reference to datum at index i,j 634 | */ 635 | reference_type operator()(const index_type& i, const index_type& j) { 636 | assert(i >= 0); 637 | assert(j >= 0); 638 | assert((size_type) i < m_rows); 639 | assert((size_type) j < m_cols); 640 | return reference_type(this->m_ptr + i * m_pitch + j); 641 | } 642 | /** @overload */ 643 | const_reference_type operator()(const index_type& i, const index_type& j) const { 644 | return const_cast(*this)(i, j); 645 | } 646 | 647 | /** 648 | * set strides for this memory 649 | * 650 | * determines the strides for a given shape, with special consideration to pitched dimension 651 | * 652 | * @param strides output vector 653 | * @param shape shape of the vector 654 | * 655 | * row major version 656 | */ 657 | void 
set_strides(linear_memory& strides, 658 | const linear_memory& shape, row_major) { 659 | size_type size = 1; 660 | assert(shape.size() >= 2); 661 | const int pitched_dim = shape.size() - 1; 662 | for (int i = shape.size() - 1; i >= 0; --i) { 663 | if (shape[i] == 1) { 664 | strides[i] = 0; 665 | } else if (i == pitched_dim) { 666 | strides[i] = 1; 667 | size *= pitch(); 668 | } else { 669 | strides[i] = size; 670 | size *= shape[i]; 671 | } 672 | } 673 | } 674 | /** 675 | * @overload 676 | * 677 | * column major version 678 | */ 679 | void set_strides(linear_memory& strides, 680 | const linear_memory& shape, column_major) { 681 | size_type size = 1; 682 | assert(shape.size() >= 2); 683 | const size_type pitched_dim = 0; 684 | for (unsigned int i = 0; i < shape.size(); ++i) { 685 | if (shape[i] == 1) { 686 | strides[i] = 0; 687 | } else if (i == pitched_dim) { 688 | strides[i] = 1; 689 | size *= pitch(); 690 | } else { 691 | strides[i] = size; 692 | size *= shape[i]; 693 | } 694 | } 695 | } 696 | 697 | template 698 | void copy2d_from(const memory src, cudaStream_t stream) const { 699 | memory::copy2d_from(m_ptr, src.ptr(), m_pitch / sizeof(value_type), src.m_pitch / sizeof(V2), 700 | m_rows, m_cols, M(), OM(), stream); 701 | } 702 | 703 | template 704 | void copy_from(const pitched_memory& src, cudaStream_t stream) const { 705 | detail::copy(m_ptr, src.ptr(), src.size(), M(), OM(), stream); 706 | } 707 | 708 | }; 709 | 710 | /** @} */ // data_structures 711 | namespace detail { 712 | 713 | /** 714 | * true iff there are no "holes" in memory 715 | */ 716 | inline bool is_c_contiguous(row_major, const linear_memory& shape, 717 | const linear_memory& stride) { 718 | bool c_contiguous = true; 719 | int size = 1; 720 | for (int i = shape.size() - 1; (i >= 0) && c_contiguous; --i) { 721 | if (shape[i] == 1) 722 | continue; 723 | if (stride[i] != size) 724 | c_contiguous = false; 725 | size = size * shape[i]; 726 | } 727 | return c_contiguous; 728 | } 729 | 730 | /** 731 | * @overload 732 | */ 733 | inline bool is_c_contiguous(column_major, const linear_memory& shape, 734 | const linear_memory& stride) { 735 | bool c_contiguous = true; 736 | int size = 1; 737 | for (unsigned int i = 0; i < shape.size() && c_contiguous; ++i) { 738 | if (shape[i] == 1) 739 | continue; 740 | if (stride[i] != size) 741 | c_contiguous = false; 742 | size = size * shape[i]; 743 | } 744 | return c_contiguous; 745 | } 746 | 747 | /// returns true iff memory can be copied using copy2d 748 | inline bool is_2dcopyable(row_major, const linear_memory& shape, 749 | const linear_memory& stride) { 750 | bool c_contiguous = shape.size() > 1; 751 | int pitched_dim = shape.size() - 1; // last dim 752 | while (shape[pitched_dim] == 1 && stride[pitched_dim] == 1) 753 | pitched_dim--; 754 | int size = 1; 755 | for (int i = shape.size() - 1; (i >= 0) && c_contiguous; --i) { 756 | if (shape[i] == 1) { 757 | continue; 758 | } else if (i == pitched_dim) { 759 | size *= stride[i - 1]; 760 | } else if (stride[i] != size) { 761 | c_contiguous = false; 762 | } else { 763 | size *= shape[i]; 764 | } 765 | } 766 | return c_contiguous; 767 | } 768 | 769 | /// @overload 770 | inline bool is_2dcopyable(column_major, const linear_memory& shape, 771 | const linear_memory& stride) { 772 | bool c_contiguous = shape.size() > 1; 773 | unsigned int pitched_dim = 0; 774 | while (shape[pitched_dim] == 1 && stride[pitched_dim] == 1) 775 | pitched_dim++; 776 | int size = 1; 777 | for (unsigned int i = 0; (i < shape.size()) && c_contiguous; ++i) { 778 | if 
(shape[i] == 1) { 779 | continue; 780 | } else if (i == pitched_dim) { 781 | size *= stride[i]; 782 | } else if (stride[i] != size) { 783 | c_contiguous = false; 784 | } else { 785 | size *= shape[i]; 786 | } 787 | } 788 | return c_contiguous; 789 | } 790 | 791 | } 792 | } 793 | 794 | #endif 795 | -------------------------------------------------------------------------------- /src/cuv/meta_programming.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
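/* Worked example for the detail::is_c_contiguous()/is_2dcopyable() helpers at the end of
 * memory.hpp above -- a sketch only (shapes, strides and element types are made up; the
 * template arguments mirror how ndarray.hpp calls these helpers). For a row-major shape
 * (4,10) with strides (12,1) every row is followed by a gap of two elements, so the memory
 * is not c-contiguous, but it can still be moved with a single 2D copy of pitch 12:
 * @code
 * cuv::linear_memory<unsigned int, cuv::host_memory_space> shape(2);
 * cuv::linear_memory<int, cuv::host_memory_space> stride(2);
 * shape[0] = 4;   shape[1] = 10;
 * stride[0] = 12; stride[1] = 1;
 * bool contiguous = cuv::detail::is_c_contiguous(cuv::row_major(), shape, stride); // false
 * bool copyable2d = cuv::detail::is_2dcopyable(cuv::row_major(), shape, stride);   // true
 * @endcode
 */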
25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_META_PROGRAMMING_HPP__ 28 | #define __CUV_META_PROGRAMMING_HPP__ 29 | 30 | namespace cuv { 31 | 32 | /** 33 | * @addtogroup MetaProgramming 34 | * @{ 35 | */ 36 | 37 | /// defines "False" 38 | struct FalseType { 39 | enum { 40 | value = false 41 | }; 42 | }; 43 | /// defines "True" 44 | struct TrueType { 45 | enum { 46 | value = true 47 | }; 48 | }; 49 | 50 | /** 51 | * @brief Checks whether two types are equal 52 | */ 53 | template 54 | struct IsSame 55 | { 56 | /// is true only if T1==T2 57 | typedef FalseType Result; 58 | }; 59 | 60 | /** 61 | * @see IsSame 62 | */ 63 | template 64 | struct IsSame 65 | { 66 | /// T==T, therefore Result==TrueType 67 | typedef TrueType Result; 68 | }; 69 | 70 | /** 71 | * @brief Checks whether two types are different 72 | */ 73 | template 74 | struct IsDifferent 75 | { 76 | /// is true only if T1!=T2 77 | typedef TrueType Result; 78 | }; 79 | 80 | /** 81 | * @see IsDifferent 82 | */ 83 | template 84 | struct IsDifferent 85 | { 86 | /// T==T, therefore Result==FalseType 87 | typedef FalseType Result; 88 | }; 89 | 90 | /** 91 | * @brief Remove "const" from a type 92 | */ 93 | template 94 | struct unconst { 95 | /// no change 96 | typedef T type; 97 | }; 98 | 99 | /** 100 | * @see unconst 101 | */ 102 | template 103 | struct unconst { 104 | /// T without the const 105 | typedef T type; 106 | }; 107 | 108 | /** 109 | * @brief Switch result depending on Condition 110 | */ 111 | template 112 | struct If { 113 | /// assume condition is true 114 | typedef Then result; 115 | }; 116 | /** 117 | * @see If 118 | */ 119 | template 120 | struct If { 121 | /// condition is false 122 | typedef Else result; 123 | }; 124 | 125 | /** 126 | * @brief enable-if controlled creation of SFINAE conditions 127 | */ 128 | template 129 | struct EnableIfC { 130 | typedef T type; /// enabling succeeded :-) 131 | }; 132 | 133 | /// @see EnableIfC 134 | template 135 | struct EnableIfC { 136 | }; 137 | 138 | /// @see EnableIfC 139 | template 140 | struct EnableIf: public EnableIfC { 141 | }; 142 | 143 | /// @see EnableIfC 144 | template 145 | struct DisableIf: public EnableIfC { 146 | }; 147 | 148 | /** 149 | * @} 150 | */ 151 | } 152 | 153 | #endif 154 | -------------------------------------------------------------------------------- /src/cuv/ndarray.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
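/* Illustrative use of the meta-programming helpers from meta_programming.hpp above -- a
 * minimal sketch only (the template parameter lists are inferred from the surrounding code;
 * expected values are given in the comments):
 * @code
 * bool same = cuv::IsSame<float, float>::Result::value;      // true
 * bool diff = cuv::IsDifferent<float, int>::Result::value;   // true
 * typedef cuv::unconst<const float>::type plain_float;       // float
 * typedef cuv::If<false, int, double>::result fallback_type; // double
 * @endcode
 */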
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_NDARRAY_HPP__ 28 | #define __CUV_NDARRAY_HPP__ 29 | 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | #include "allocators.hpp" 40 | #include "memory.hpp" 41 | #include "meta_programming.hpp" 42 | 43 | namespace cuv { 44 | 45 | /** fail with an error message, a stack trace and a runtime_exception (the nicest failures you've seen ^^!) 46 | * @ingroup tools 47 | */ 48 | static inline void cuvAssertFailed(const char *msg) { 49 | throw std::runtime_error(std::string(msg)); 50 | } 51 | 52 | /** 53 | * @def cuvAssert 54 | * @ingroup tools 55 | * use this macro to ensure that a condition is true. 56 | * in contrast to assert(), this will throw a runtime_exception, 57 | * which can be translated to python. 58 | * Additionally, when using Linux, you get a full stack trace printed 59 | */ 60 | #define cuvAssert(X) \ 61 | if(!(X)){ cuv::cuvAssertFailed(#X); } 62 | 63 | using boost::detail::multi_array::extent_gen; 64 | using boost::detail::multi_array::index_gen; 65 | 66 | /** 67 | * defines an index range, stolen from boost::multi_array 68 | * 69 | * examples: 70 | * @code 71 | * index_range(1,3) 72 | * index(1) <= index_range() < index(3) 73 | * @endcode 74 | */ 75 | typedef boost::detail::multi_array::index_range index_range; 76 | 77 | /** 78 | * the index type used in index_range, useful for comparator syntax in @see index_range 79 | */ 80 | typedef index_range::index index; 81 | 82 | #ifndef CUV_DONT_CREATE_EXTENTS_OBJ 83 | 84 | namespace { 85 | /** 86 | * extents object, can be used to generate a multi-dimensional array conveniently. 87 | * 88 | * stolen from boost::multi_array. 89 | * 90 | * Example: 91 | * @code 92 | * ndarray<...> v(extents[5][6][7]); // 3-dimensional ndarray 93 | * @endcode 94 | */ 95 | extent_gen<0> extents; 96 | 97 | /** 98 | * indices object, can be used to generate multi-dimensional views conveniently. 99 | * 100 | * stolen form boost::multi_array. 101 | * 102 | * Example: 103 | * @code 104 | * ndarray_view<...> v(indices[index_range(1,3)][index_range()], other_ndarray); 105 | * // or, equivalently 106 | * other_ndarray[indices[index_range(1,3)][index_range()]]; 107 | * @endcode 108 | */ 109 | index_gen<0, 0> indices; 110 | } 111 | #endif 112 | 113 | /** 114 | * @addtogroup data_structures Basic datastructures 115 | * @{ 116 | */ 117 | 118 | template class ndarray; 119 | template class ndarray_view; 120 | 121 | /// used in implementation of ndarray.operator= for value_type argument 122 | template 123 | void fill(ndarray& v, const V& p); 124 | 125 | namespace detail { 126 | 127 | /** 128 | * this is intended for copying pitched memory. 
129 | * 130 | * given shape, stride and a memory layout, we can determine the number of 131 | * rows, columns and the pitch of a 132 | */ 133 | template 134 | void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch, 135 | const linear_memory& shape, 136 | const linear_memory& stride, row_major) { 137 | // strided dimension is the LAST one 138 | rows = std::accumulate(shape[0].ptr, shape[0].ptr + shape.size() - 1, 1, std::multiplies()); 139 | cols = shape[shape.size() - 1]; 140 | pitch = stride[shape.size() - 2]; 141 | } 142 | 143 | /** 144 | * @overload 145 | */ 146 | template 147 | void get_pitched_params(size_type& rows, size_type& cols, size_type& pitch, 148 | const linear_memory& shape, 149 | const linear_memory& stride, column_major) { 150 | // strided dimension is the FIRST one 151 | rows = std::accumulate(shape[0].ptr + 1, shape[0].ptr + shape.size(), 1, std::multiplies()); 152 | cols = shape[0]; 153 | pitch = stride[1]; 154 | } 155 | 156 | } 157 | 158 | /** 159 | * contains infos about shape and stride on host and in the ndarray data space. 160 | */ 161 | template 162 | class ndarray_info { 163 | 164 | public: 165 | 166 | typedef unsigned int size_type; ///< type of shapes of the ndarray 167 | typedef int index_type; ///< type of indices in ndarray 168 | typedef M data_memory_space; ///< this is where the data lies 169 | 170 | boost::shared_ptr m_allocator; 171 | 172 | /// shape stored in host memory 173 | linear_memory host_shape; 174 | 175 | /// strides stored in host memory 176 | linear_memory host_stride; 177 | 178 | /// shape stored in data memory 179 | linear_memory data_shape; 180 | 181 | /// strides stored in data memory 182 | linear_memory data_stride; 183 | 184 | /// default constructor: does nothing 185 | ndarray_info(const boost::shared_ptr& _allocator) : 186 | m_allocator(_allocator), host_shape(_allocator), host_stride(_allocator), 187 | data_shape(_allocator), data_stride(_allocator) 188 | { 189 | } 190 | 191 | /// @return the size of the arrays (should all be the same) 192 | size_type size() { 193 | return host_shape.size(); 194 | } 195 | 196 | /// construct with known shape 197 | ndarray_info(size_type s, const boost::shared_ptr& _allocator) : 198 | m_allocator(_allocator), host_shape(_allocator), host_stride(_allocator), 199 | data_shape(_allocator), data_stride(_allocator) 200 | { 201 | resize(s); 202 | } 203 | 204 | /// resize all memories 205 | void resize(size_type s) { 206 | host_shape.set_size(s); 207 | host_stride.set_size(s); 208 | } 209 | 210 | /// copy-constructor 211 | ndarray_info(const ndarray_info& o) : 212 | m_allocator(o.m_allocator), host_shape(o.host_shape), host_stride(o.host_stride), 213 | data_shape(m_allocator), data_stride(m_allocator) 214 | { 215 | } 216 | 217 | /// copy-construct from other memory space 218 | template 219 | ndarray_info(const ndarray_info& o) : 220 | m_allocator(o.m_allocator), host_shape(o.host_shape), host_stride(o.host_stride), 221 | data_shape(m_allocator), data_stride(m_allocator) 222 | { 223 | } 224 | 225 | }; 226 | 227 | /** 228 | * represents an n-dimensional array on GPU or CPU. 
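 *
 * A minimal usage sketch (shape and values are made up; `extents` is the helper object
 * defined above, and the third template parameter is assumed to default to row_major):
 * @code
 * cuv::ndarray<float, cuv::dev_memory_space> a(cuv::extents[5][6]);
 * a(0, 3) = 1.5f;                                    // element access via reference_type
 * cuv::ndarray<float, cuv::host_memory_space> h(a);  // copies the data to the host
 * @endcode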
229 | */ 230 | template 231 | class ndarray { 232 | 233 | public: 234 | 235 | typedef memory memory_type; ///< type of stored memory 236 | typedef typename memory_type::reference_type reference_type; ///< values returned by operator() and [] 237 | typedef typename memory_type::const_reference_type const_reference_type; ///< values returned by operator() 238 | typedef typename memory_type::memory_space_type memory_space_type; ///< dev/host 239 | typedef typename memory_type::value_type value_type; ///< type of stored values 240 | typedef typename memory_type::size_type size_type; ///< type shapes 241 | typedef typename memory_type::index_type index_type; ///< type strides 242 | typedef L memory_layout_type; ///< column/row major 243 | 244 | typedef ndarray_info info_type; ///< type of shape info struct 245 | typedef ndarray_view view_type; ///< type of views on this ndarray 246 | 247 | public: 248 | boost::shared_ptr m_allocator; 249 | 250 | private: 251 | void check_size_limit(size_t size) const { 252 | if (size > static_cast(std::numeric_limits::max())) { 253 | throw std::runtime_error("maximum ndarray size exceeded"); 254 | } 255 | } 256 | 257 | /// ndarray views are our friends 258 | template 259 | friend class ndarray_view; 260 | 261 | protected: 262 | 263 | /// information about shape, strides 264 | info_type m_info; 265 | 266 | /// points to (possibly shared) memory 267 | boost::shared_ptr m_memory; 268 | 269 | /// points to start of actually referenced memory (within m_memory) 270 | V* m_ptr; 271 | 272 | /** 273 | * determine linear index in memory of an index array 274 | * 275 | * this function takes strides etc. into account, so that indices 276 | * are interpreted as relative to the (strided) sub-ndarray we're 277 | * referring to. 278 | * 279 | * @param D size of index array 280 | * @param arr index array 281 | * @return linear index in memory of index array 282 | * 283 | */ 284 | size_type index_of(int D, index_type* arr) const { 285 | index_type pos = 0; 286 | for (int i = 0; i < D; i++) { 287 | index_type temp = arr[i]; 288 | if (temp < 0) 289 | temp = m_info.host_shape[i] + temp; 290 | pos += temp * m_info.host_stride[i]; 291 | } 292 | return pos; 293 | } 294 | 295 | /** 296 | * allocate linear memory (c-contiguous version) 297 | * 298 | * @param t ndarray to allocate 299 | */ 300 | void allocate(ndarray& t, linear_memory_tag) { 301 | linear_memory mem(t.size(), t.m_allocator); 302 | mem.set_strides(t.m_info.host_stride, t.m_info.host_shape, L()); 303 | t.m_ptr = mem.ptr(); 304 | t.m_memory.reset(new memory(mem.release(), mem.size(), t.m_allocator)); 305 | } 306 | 307 | /** 308 | * @overload 309 | * 310 | * pitched version 311 | */ 312 | void allocate(ndarray& t, pitched_memory_tag) { 313 | typename ndarray::size_type row, col, pitch; 314 | detail::get_pitched_params(row, col, pitch, t.m_info.host_shape, t.m_info.host_stride, L()); 315 | pitched_memory d(row, col); 316 | d.set_strides(t.m_info.host_stride, t.m_info.host_shape, L()); 317 | t.m_ptr = d.ptr(); 318 | t.m_memory.reset(new memory(d.release(), d.size(), t.m_allocator)); 319 | } 320 | 321 | public: 322 | 323 | /** 324 | * determine linear index in memory of an index array 325 | * 326 | * this function takes strides etc. into account, so that indices 327 | * are interpreted as relative to the (strided) sub-ndarray we're 328 | * referring to. 
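 * (For instance, with hypothetical host strides (6,1), index_of(extents[2][3]) evaluates to
 * 2*6 + 3*1 = 15.)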
329 | * 330 | * @tparam D size of index array 331 | * @param eg position in array 332 | * @return linear index in memory of index array 333 | * 334 | */ 335 | template 336 | size_type index_of(const extent_gen& eg) const { 337 | index_type pos = 0; 338 | for (size_t i = 0; i < D; i++) { 339 | index_type temp = eg.ranges_[i].finish(); 340 | if (temp < 0) 341 | temp = m_info.host_shape[i] + temp; 342 | pos += temp * m_info.host_stride[i]; 343 | } 344 | return pos; 345 | } 346 | 347 | /** 348 | * @name Accessors 349 | * @{ 350 | */ 351 | /// return the number of dimensions 352 | index_type ndim() const { 353 | return m_info.host_shape.size(); 354 | } 355 | 356 | /** return the size of the i-th dimension 357 | * @param i the index of the queried dimension 358 | */ 359 | size_type shape(const size_t i) const { 360 | return m_info.host_shape[i]; 361 | } 362 | 363 | /** return the stride of the i-th dimension 364 | * @param i the index of the queried dimension 365 | */ 366 | index_type stride(const size_t i) const { 367 | return m_info.host_stride[i]; 368 | } 369 | 370 | /** @return the pointer to the referenced memory */ 371 | V* ptr() { 372 | return m_ptr; 373 | } 374 | 375 | /** 376 | * @overload 377 | * @return the const pointer to the referenced memory 378 | * */ 379 | const V* ptr() const { 380 | return m_ptr; 381 | } 382 | 383 | /** set the pointer offset (used in deserialization) */ 384 | void set_ptr_offset(long int i) { 385 | m_ptr = m_memory->ptr() + i; 386 | } 387 | 388 | /** * @return pointer to allocated memory */ 389 | boost::shared_ptr& mem() { 390 | return m_memory; 391 | } 392 | /** 393 | * @overload 394 | * @return the const pointer to the allocated memory 395 | * */ 396 | const boost::shared_ptr& mem() const { 397 | return m_memory; 398 | } 399 | 400 | /** @return the number of stored elements 401 | */ 402 | size_type size() const { 403 | size_t size = std::accumulate(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 1, 404 | std::multiplies()); 405 | 406 | check_size_limit(size); 407 | 408 | return static_cast(size); 409 | } 410 | 411 | /** 412 | * determine size in bytes 413 | * 414 | * assumes that the memory is c_contiguous! 415 | * 416 | * @return the size in bytes 417 | */ 418 | size_type memsize() const { 419 | #ifndef NDEBUG 420 | cuvAssert(is_c_contiguous()); 421 | #endif 422 | size_t size = std::accumulate(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 1, 423 | std::multiplies()); 424 | 425 | check_size_limit(size); 426 | 427 | return static_cast(size); 428 | } 429 | 430 | /// return the shape of the ndarray (as a vector for backward compatibility) 431 | std::vector shape() const { 432 | if (ndim() == 0) 433 | return std::vector(); 434 | return std::vector(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size()); 435 | } 436 | 437 | /** 438 | * return the effective shape of the ndarray (as a vector for backward compatibility) 439 | * 440 | * the effective shape removes all degenerate dimensions (i.e. shape(i)==1). 
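 * For instance, an ndarray with (hypothetical) shape (1,5,1,3) has effective shape (5,3).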
441 | */ 442 | std::vector effective_shape() const { 443 | std::vector shape; 444 | shape.reserve(ndim()); 445 | if (ndim() == 0) 446 | return shape; 447 | std::remove_copy_if(m_info.host_shape[0].ptr, m_info.host_shape[0].ptr + m_info.host_shape.size(), 448 | std::back_inserter(shape), std::bind2nd(std::equal_to(), 1)); 449 | return shape; 450 | } 451 | 452 | /// @return the ndarray info struct (const) 453 | const info_type& info() const { 454 | return m_info; 455 | } 456 | 457 | /// @return the ndarray info struct 458 | info_type& info() { 459 | return m_info; 460 | } 461 | 462 | /// true iff there are no "holes" in memory 463 | bool is_c_contiguous() const { 464 | return detail::is_c_contiguous(memory_layout_type(), m_info.host_shape, m_info.host_stride); 465 | } 466 | 467 | /// true iff it can be copied as a 2d array (only one dimension is pitched) 468 | bool is_2dcopyable() const { 469 | return detail::is_2dcopyable(memory_layout_type(), m_info.host_shape, m_info.host_stride); 470 | } 471 | 472 | /** @} */ // accessors 473 | /** 474 | * @name accessing stored values 475 | * @{ 476 | */ 477 | 478 | /** 479 | * member access: "flat" access as if memory was linear 480 | */ 481 | reference_type operator[](index_type idx) { 482 | size_type ndim = m_info.host_shape.size(); 483 | size_type* virtualstride = new size_type[ndim]; 484 | size_type pos = 0; 485 | if (IsSame::Result::value) { 486 | // row major 487 | { 488 | size_type virt_size = 1; 489 | for (int i = ndim - 1; i >= 0; --i) { 490 | virtualstride[i] = virt_size; 491 | virt_size *= m_info.host_shape[i]; 492 | } 493 | } 494 | for (size_type i = 0; i < ndim; ++i) { 495 | pos += (idx / virtualstride[i]) * m_info.host_stride[i]; 496 | idx -= (idx / virtualstride[i]) * virtualstride[i]; 497 | } 498 | } else { 499 | // column major 500 | { 501 | size_type virt_size = 1; 502 | for (unsigned int i = 0; i < ndim; ++i) { 503 | virtualstride[i] = virt_size; 504 | virt_size *= m_info.host_shape[i]; 505 | } 506 | } 507 | for (int i = ndim - 1; i >= 0; --i) { 508 | pos += (idx / virtualstride[i]) * m_info.host_stride[i]; 509 | idx -= (idx / virtualstride[i]) * virtualstride[i]; 510 | } 511 | } 512 | delete[] virtualstride; 513 | return reference_type(m_ptr + pos); 514 | } 515 | 516 | /** @overload */ 517 | const_reference_type operator[](index_type idx) const { 518 | return const_cast(*this)[idx]; 519 | } 520 | 521 | /** 522 | * get a reference to the datum at an index 523 | * @param i0 index for a 1-dimensional ndarray 524 | * @return reference to datum at i0 525 | */ 526 | reference_type operator()(index_type i0) { 527 | #ifndef NDEBUG 528 | cuvAssert(ndim()==1); 529 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)(-i0)= 0) { 532 | return reference_type(m_ptr + i0); 533 | } else { 534 | return reference_type(m_ptr + shape(0) - i0); 535 | } 536 | } 537 | 538 | /** @overload */ 539 | const_reference_type operator()(index_type i0) const { 540 | return const_cast(*this)(i0); 541 | } 542 | 543 | /** @overload */ 544 | const_reference_type operator()(index_type i0, index_type i1) const { 545 | return const_cast(*this)(i0, i1); 546 | } 547 | 548 | /** @overload */ 549 | reference_type operator()(index_type i0, index_type i1) { 550 | #ifndef NDEBUG 551 | cuvAssert(ndim()==2); 552 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)(-i0)=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)(-i1)(*this)(i0, i1, i2); 562 | } 563 | 564 | /** @overload */ 565 | reference_type operator()(index_type i0, index_type i1, 
index_type i2) { 566 | #ifndef NDEBUG 567 | cuvAssert(ndim()==3); 568 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2(*this)(i0, i1, i2, i3); 579 | } 580 | 581 | /** @overload */ 582 | reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3) { 583 | #ifndef NDEBUG 584 | cuvAssert(ndim()==4); 585 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2=0 && (size_type)i3 < shape(3)) || (i3<0 && (size_type)-i3(*this)(i0, i1, i2, i3, i4); 598 | } 599 | 600 | /** @overload */ 601 | reference_type operator()(index_type i0, index_type i1, index_type i2, index_type i3, index_type i4) { 602 | #ifndef NDEBUG 603 | cuvAssert(ndim()==5); 604 | cuvAssert((i0>=0 && (size_type)i0 < shape(0)) || (i0<0 && (size_type)-i0=0 && (size_type)i1 < shape(1)) || (i1<0 && (size_type)-i1=0 && (size_type)i2 < shape(2)) || (i2<0 && (size_type)-i2=0 && (size_type)i3 < shape(3)) || (i3<0 && (size_type)-i3=0 && (size_type)i4 < shape(4)) || (i4<0 && (size_type)-i4 _allocator = boost::make_shared()) : 623 | m_allocator(_allocator), m_info(_allocator), m_ptr(NULL) { 624 | } 625 | 626 | // **************************************************************** 627 | // Constructing from other ndarray 628 | // **************************************************************** 629 | 630 | /** 631 | * construct ndarray from ndarray of exact same type 632 | * 633 | * time O(1) 634 | */ 635 | ndarray(const ndarray& o) : 636 | m_allocator(o.m_allocator), 637 | m_info(o.m_info), // copy only shape 638 | m_memory(o.m_memory), // increase ref counter 639 | m_ptr(o.m_ptr) { 640 | } // same pointer in memory 641 | 642 | /** 643 | * construct ndarray from ndarray of other memory space 644 | * in (dense) /linear/ memory. Note: this /copies/ the memory! 645 | */ 646 | template 647 | ndarray(const ndarray& o, cudaStream_t stream = 0) : 648 | m_allocator(o.m_allocator), 649 | m_info(o.info()), // primarily to copy shape 650 | m_ptr(NULL) { 651 | copy_memory(o, linear_memory_tag(), stream); 652 | m_ptr = m_memory->ptr(); 653 | } 654 | 655 | /** 656 | * construct ndarray from ndarray of same memory space 657 | * in /pitched/ memory. Note: this /copies/ the memory! 658 | */ 659 | explicit ndarray(const ndarray& o, pitched_memory_tag, cudaStream_t stream = 0) : 660 | m_allocator(o.m_allocator), 661 | m_info(o.m_info), // primarily to copy shape 662 | m_ptr(NULL) { 663 | copy_memory(o, pitched_memory_tag(), stream); 664 | m_ptr = m_memory->ptr(); 665 | } 666 | 667 | /** 668 | * construct ndarray from ndarray of other memory space 669 | * in /pitched/ memory. Note: this /copies/ the memory! 670 | */ 671 | template 672 | explicit ndarray(const ndarray& o, pitched_memory_tag, cudaStream_t stream = 0) : 673 | m_allocator(o.m_allocator), 674 | m_info(o.info()), // primarily to copy shape 675 | m_ptr(NULL) { 676 | copy_memory(o, pitched_memory_tag(), stream); 677 | m_ptr = m_memory->ptr(); 678 | } 679 | 680 | /** 681 | * construct ndarray from ndarray of same memory space 682 | * in (dense) /linear/ memory. Note: this /copies/ the memory! 
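 *
 * A minimal sketch (name and shape made up):
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[4][4]);
 * cuv::ndarray<float, cuv::host_memory_space> b(a, cuv::linear_memory_tag()); // deep copy of a
 * @endcode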
683 | */ 684 | explicit ndarray(const ndarray& o, linear_memory_tag, cudaStream_t stream = 0) : 685 | m_allocator(o.m_allocator), 686 | m_info(o.m_info), // primarily to copy shape 687 | m_ptr(NULL) { 688 | copy_memory(o, linear_memory_tag(), stream); 689 | m_ptr = m_memory->ptr(); 690 | } 691 | 692 | /** 693 | * construct ndarray from ndarray of other memory space 694 | * in (dense) /linear/ memory. Note: this /copies/ the memory! 695 | */ 696 | template 697 | explicit ndarray(const ndarray& o, linear_memory_tag, cudaStream_t stream = 0) : 698 | m_allocator(o.m_allocator), 699 | m_info(o.info()), // primarily to copy shape 700 | m_ptr(NULL) { 701 | copy_memory(o, linear_memory_tag(), stream); 702 | m_ptr = m_memory->ptr(); 703 | } 704 | 705 | /** 706 | * construct ndarray from other memory layout 707 | * 708 | * this does not copy memory, but reverses dimensions and strides 709 | * (and therefore only takes O(1) time) 710 | */ 711 | template 712 | explicit ndarray(const ndarray& o) : 713 | m_allocator(o.m_allocator), 714 | m_info(o.m_allocator), 715 | m_memory(o.mem()), // increase ref counter 716 | m_ptr(const_cast(o.ptr())) { // same pointer in memory 717 | m_info.host_shape = o.info().host_shape; 718 | m_info.host_shape.reverse(); 719 | m_info.host_stride = o.info().host_stride; 720 | m_info.host_stride.reverse(); 721 | } 722 | 723 | // **************************************************************** 724 | // Constructing from SHAPE 725 | // **************************************************************** 726 | 727 | /** 728 | * construct one-dimensional ndarray 729 | */ 730 | explicit ndarray(const size_type i, 731 | const boost::shared_ptr _allocator = boost::make_shared()) : 732 | m_allocator(_allocator), 733 | m_info(_allocator), 734 | m_ptr(NULL) { 735 | m_info.resize(1); 736 | m_info.host_shape[0] = i; 737 | allocate(*this, linear_memory_tag()); 738 | } 739 | 740 | /** 741 | * construct two-dimensional ndarray 742 | */ 743 | explicit ndarray(const size_type i, const int j, const boost::shared_ptr _allocator = 744 | boost::make_shared()) : 745 | m_allocator(_allocator), 746 | m_info(_allocator), 747 | m_ptr(NULL) { 748 | m_info.resize(2); 749 | m_info.host_shape[0] = i; 750 | m_info.host_shape[1] = j; 751 | allocate(*this, linear_memory_tag()); 752 | } 753 | 754 | /** 755 | * construct ndarray from a shape 756 | */ 757 | template 758 | explicit ndarray(const extent_gen& eg, 759 | const boost::shared_ptr _allocator = boost::make_shared()) : 760 | m_allocator(_allocator), 761 | m_info(_allocator), 762 | m_ptr(NULL) { 763 | m_info.resize(D); 764 | for (size_t i = 0; i < D; i++) 765 | m_info.host_shape[i] = eg.ranges_[i].finish(); 766 | allocate(*this, linear_memory_tag()); 767 | } 768 | 769 | /** 770 | * construct ndarray from a shape 771 | * 772 | * @deprecated 773 | */ 774 | explicit ndarray(const std::vector& eg, 775 | const boost::shared_ptr _allocator = boost::make_shared()) : 776 | m_allocator(_allocator), 777 | m_info(_allocator), 778 | m_ptr(NULL) { 779 | m_info.resize(eg.size()); 780 | for (size_t i = 0; i < eg.size(); i++) 781 | m_info.host_shape[i] = eg[i]; 782 | allocate(*this, linear_memory_tag()); 783 | } 784 | 785 | /** 786 | * construct ndarray from a shape 787 | * 788 | * @deprecated 789 | */ 790 | explicit ndarray(const std::vector& eg, pitched_memory_tag, 791 | const boost::shared_ptr _allocator = boost::make_shared()) : 792 | m_allocator(_allocator), 793 | m_info(_allocator), 794 | m_ptr(NULL) { 795 | m_info.resize(eg.size()); 796 | for (size_t i = 0; i < 
eg.size(); i++) 797 | m_info.host_shape[i] = eg[i]; 798 | allocate(*this, pitched_memory_tag()); 799 | } 800 | 801 | /** 802 | * construct ndarray from a shape (pitched) 803 | */ 804 | template 805 | explicit ndarray(const extent_gen& eg, pitched_memory_tag, const boost::shared_ptr _allocator = 806 | boost::make_shared()) : 807 | m_allocator(_allocator), 808 | m_info(_allocator), 809 | m_ptr(NULL) { 810 | m_info.resize(D); 811 | for (size_t i = 0; i < D; i++) 812 | m_info.host_shape[i] = eg.ranges_[i].finish(); 813 | allocate(*this, pitched_memory_tag()); 814 | } 815 | 816 | // **************************************************************** 817 | // Constructing from shape and raw pointer 818 | // **************************************************************** 819 | 820 | /** 821 | * construct ndarray from a shape and a pointer (does not copy memory) 822 | * 823 | * @warning You have to ensure that the memory lives as long as this object. 824 | */ 825 | template 826 | explicit ndarray(const extent_gen& eg, value_type* ptr, const boost::shared_ptr _allocator = 827 | boost::make_shared()) : 828 | m_allocator(_allocator), 829 | m_info(_allocator), 830 | m_ptr(ptr) { 831 | m_info.resize(D); 832 | size_t size = 1; 833 | if (IsSame::Result::value) { 834 | for (int i = D - 1; i >= 0; i--) { 835 | m_info.host_shape[i] = eg.ranges_[i].finish(); 836 | m_info.host_stride[i] = size; 837 | size *= eg.ranges_[i].finish(); 838 | } 839 | } else { 840 | for (size_t i = 0; i < D; i++) { 841 | m_info.host_shape[i] = eg.ranges_[i].finish(); 842 | m_info.host_stride[i] = size; 843 | size *= eg.ranges_[i].finish(); 844 | } 845 | } 846 | m_memory.reset(new memory(ptr, size, m_allocator, false)); 847 | } 848 | 849 | explicit ndarray(const std::vector& shape, value_type* ptr, 850 | const boost::shared_ptr _allocator = boost::make_shared()) : 851 | m_allocator(_allocator), 852 | m_info(_allocator), 853 | m_ptr(ptr) { 854 | unsigned int D = shape.size(); 855 | m_info.resize(D); 856 | size_type size = 1; 857 | if (IsSame::Result::value) 858 | for (int i = D - 1; i >= 0; i--) { 859 | m_info.host_shape[i] = shape[i]; 860 | m_info.host_stride[i] = size; 861 | size *= shape[i]; 862 | } 863 | else 864 | for (size_t i = 0; i < D; i++) { 865 | m_info.host_shape[i] = shape[i]; 866 | m_info.host_stride[i] = size; 867 | size *= shape[i]; 868 | } 869 | } 870 | /** 871 | * construct ndarray from a shape and a pointer (does not copy memory) 872 | * 873 | * @warning You have to ensure that the memory lives as long as this object. 
874 | * @deprecated 875 | */ 876 | template 877 | explicit ndarray(const index_gen& idx, value_type* ptr, const boost::shared_ptr _allocator = 878 | boost::make_shared()) : 879 | m_allocator(_allocator), 880 | m_info(_allocator), 881 | m_ptr(ptr) { 882 | m_info.resize(D); 883 | size_type size = 1; 884 | if (IsSame::Result::value) 885 | for (int i = D - 1; i >= 0; i--) { 886 | m_info.host_shape[i] = idx.ranges_[i].finish(); 887 | m_info.host_stride[i] = size; 888 | size *= idx.ranges_[i].finish(); 889 | } 890 | else 891 | for (size_t i = 0; i < D; i++) { 892 | m_info.host_shape[i] = idx.ranges_[i].finish(); 893 | m_info.host_stride[i] = size; 894 | size *= idx.ranges_[i].finish(); 895 | } 896 | } 897 | // @} // constructors 898 | 899 | // **************************************************************** 900 | // assignment operators (try not to reallocate if shapes match) 901 | // **************************************************************** 902 | 903 | /** 904 | * @name assigning other values to a ndarray object 905 | * @{ 906 | */ 907 | 908 | /** 909 | * explicitly assign by copying memory 910 | */ 911 | template 912 | ndarray& assign(const ndarray& o, cudaStream_t stream = 0) { 913 | if (!copy_memory(o, false, stream)) 914 | throw std::runtime_error("copying ndarray did not succeed. Maybe a shape mismatch?"); 915 | return *this; 916 | } 917 | 918 | /** 919 | * assign from ndarray of same type 920 | * 921 | * always an O(1) operation. 922 | */ 923 | ndarray& operator=(const ndarray& o) { 924 | if (this == &o) 925 | return *this; // check for self-assignment 926 | 927 | // TODO make use of copy-and-swap idiom 928 | m_memory = o.mem(); 929 | m_ptr = const_cast(o.ptr()); 930 | m_info = o.info(); 931 | return *this; 932 | } 933 | 934 | /** 935 | * assign from value (sets all elements equal to one scalar) 936 | */ 937 | template 938 | typename boost::enable_if_c::value, ndarray&>::type operator=( 939 | const _V& scalar) { 940 | fill(*this, scalar); 941 | return *this; 942 | } 943 | 944 | /** 945 | * assign from ndarray of different memory space type. 946 | * 947 | * If shapes do not match, it defaults to linear memory. 948 | * 949 | * this copies memory (obviously) but tries to avoid reallocation 950 | */ 951 | template 952 | ndarray& assign(const ndarray& o, cudaStream_t stream = 0) { 953 | if (!copy_memory(o, false, stream)) 954 | copy_memory(o, linear_memory_tag(), stream); 955 | if (mem()) 956 | // if mem() does not exist, we're just wrapping a pointer 957 | // of a std::vector or so -> simply keep it 958 | m_ptr = mem()->ptr(); 959 | return *this; 960 | } 961 | 962 | /** 963 | * assign from ndarray of different memory space type. 964 | * 965 | * If shapes do not match, it defaults to linear memory. 966 | * 967 | * this copies memory (obviously) but tries to avoid reallocation 968 | */ 969 | template 970 | ndarray& operator=(const ndarray& o) { 971 | return assign(o); 972 | } 973 | 974 | /** 975 | * assign from ndarray of different memory layout type. 976 | * 977 | * this does not copy memory, but reverses strides and shapes. 
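 *
 * Example (a minimal sketch; assumes the row_major/column_major layout tags used
 * elsewhere in this library):
 * @code
 * cuv::ndarray<float, cuv::host_memory_space, cuv::row_major>    r(cuv::extents[3][4]);
 * cuv::ndarray<float, cuv::host_memory_space, cuv::column_major> c;
 * c = r;   // O(1): memory is shared, only shape and strides are reversed
 * // afterwards c.shape(0) == 4 and c.shape(1) == 3
 * @endcode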
978 | */ 979 | template 980 | ndarray& operator=(const ndarray& o) { 981 | return assign(o); 982 | } 983 | 984 | /** @} */ // assignment 985 | /** 986 | * copy memory using given allocator tag (linear/pitched) 987 | */ 988 | template 989 | ndarray copy(T tag = linear_memory_tag(), cudaStream_t stream = 0) const { 990 | ndarray t(m_allocator); 991 | const ndarray& o = *this; 992 | t.m_info = o.info(); 993 | t.copy_memory(o, tag, stream); 994 | t.m_ptr = t.mem()->ptr(); 995 | return t; 996 | } 997 | 998 | /** 999 | * copy memory using linear memory 1000 | */ 1001 | ndarray copy() const { 1002 | return copy(linear_memory_tag()); 1003 | } 1004 | 1005 | /** 1006 | * create a sub-ndarray of the current ndarray 1007 | * 1008 | * this works in O(1). 1009 | */ 1010 | template 1011 | ndarray_view operator[](const index_gen& idx) const { 1012 | 1013 | ndarray_view t(m_allocator); 1014 | const ndarray& o = *this; 1015 | t.m_memory = o.mem(); 1016 | t.m_ptr = const_cast(o.ptr()); 1017 | 1018 | std::vector shapes; 1019 | std::vector strides; 1020 | shapes.reserve(D); 1021 | strides.reserve(D); 1022 | cuvAssert(o.ndim()==D); 1023 | 1024 | for (size_t i = 0; i < D; i++) { 1025 | int start = idx.ranges_[i].get_start(0); 1026 | int finish = idx.ranges_[i].get_finish(o.shape(i)); 1027 | int stride = idx.ranges_[i].stride(); 1028 | if (start < 0) 1029 | start += o.shape(i); 1030 | if (finish < 0) 1031 | finish += o.shape(i); 1032 | #ifndef NDEBUG 1033 | cuvAssert(finish>start); 1034 | #endif 1035 | t.m_ptr += start * o.stride(i); 1036 | if (idx.ranges_[i].is_degenerate()) { 1037 | // skip dimension 1038 | } else { 1039 | shapes.push_back((finish - start) / stride); 1040 | strides.push_back(o.stride(i) * stride); 1041 | } 1042 | } 1043 | 1044 | // store in m_info 1045 | t.m_info.resize(shapes.size()); 1046 | 1047 | std::copy(shapes.begin(), shapes.end(), t.m_info.host_shape[0].ptr); 1048 | std::copy(strides.begin(), strides.end(), t.m_info.host_stride[0].ptr); 1049 | return t; // should not copy mem, only m_info 1050 | } 1051 | 1052 | /** 1053 | * reshape the ndarray (in place) 1054 | * 1055 | * works only for c_contiguous memory! 1056 | * 1057 | * @param eg new shape 1058 | */ 1059 | template 1060 | void reshape(const extent_gen& eg) { 1061 | std::vector shape(D); 1062 | for (size_t i = 0; i < D; i++) 1063 | shape[i] = eg.ranges_[i].finish(); 1064 | reshape(shape); 1065 | } 1066 | /** 1067 | * reshape the ndarray (in place) 1068 | * 1069 | * works only for c_contiguous memory! 
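 *
 * Example (a minimal sketch; either this overload or the extent_gen overload above
 * can be used, the array name is hypothetical):
 * @code
 * cuv::ndarray<float, cuv::host_memory_space> a(cuv::extents[6][4]);
 * a.reshape(cuv::extents[2][3][4]);  // ok: 6*4 == 2*3*4, strides recomputed, no copy
 * a.reshape(4, 6);                   // convenience overload for two dimensions
 * // a new shape whose product differs from size() throws std::runtime_error
 * @endcode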
1070 | * 1071 | * @param shape new shape 1072 | */ 1073 | void reshape(const std::vector& shape) { 1074 | size_type new_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); 1075 | if (!is_c_contiguous()) 1076 | throw std::runtime_error("cannot reshape: ndarray is not c_contiguous"); 1077 | if (size() != new_size) 1078 | throw std::runtime_error("cannot reshape: products do not match"); 1079 | m_info.resize(shape.size()); 1080 | size_type size = 1; 1081 | if (IsSame::Result::value) 1082 | for (int i = shape.size() - 1; i >= 0; i--) { 1083 | m_info.host_shape[i] = shape[i]; 1084 | m_info.host_stride[i] = size; 1085 | size *= shape[i]; 1086 | } 1087 | else 1088 | for (size_t i = 0; i < shape.size(); i++) { 1089 | m_info.host_shape[i] = shape[i]; 1090 | m_info.host_stride[i] = size; 1091 | size *= shape[i]; 1092 | } 1093 | } 1094 | /** 1095 | * convenience wrapper for reshape(extents[r][c]) 1096 | * @param r leading index of new shape 1097 | * @param c following index of new shape 1098 | */ 1099 | void reshape(size_type r, size_type c) { 1100 | reshape(extents[r][c]); 1101 | } 1102 | 1103 | /** 1104 | * resize the ndarray (deallocates memory if product changes, otherwise equivalent to reshape) 1105 | * 1106 | * @param shape new shape 1107 | */ 1108 | void resize(const std::vector& shape) { 1109 | if (ndim() != 0) { 1110 | size_type new_size = std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()); 1111 | if (is_c_contiguous() && size() == new_size) { 1112 | reshape(shape); 1113 | return; 1114 | } 1115 | } 1116 | 1117 | // free memory before we allocate new memory (important if pooling is active) 1118 | m_memory.reset(new memory(0, 0, m_allocator)); 1119 | *this = ndarray(shape, m_allocator); 1120 | } 1121 | /** 1122 | * resize the ndarray (deallocates memory if product changes, otherwise equivalent to reshape) 1123 | * 1124 | * @overload 1125 | * 1126 | * @param eg new shape 1127 | */ 1128 | template 1129 | void resize(const extent_gen& eg) { 1130 | std::vector shape(D); 1131 | for (size_t i = 0; i < D; i++) 1132 | shape[i] = eg.ranges_[i].finish(); 1133 | resize(shape); 1134 | } 1135 | 1136 | /** 1137 | * convenience wrapper for resize(extents[size]) 1138 | * @param size size of the new shape 1139 | */ 1140 | void resize(size_type size) { 1141 | resize(extents[size]); 1142 | } 1143 | 1144 | /** 1145 | * convenience wrapper for resize(extents[r][c]) 1146 | * @param r leading index of new shape 1147 | * @param c following index of new shape 1148 | */ 1149 | void resize(size_type r, size_type c) { 1150 | resize(extents[r][c]); 1151 | } 1152 | 1153 | /** 1154 | * force deallocation of memory if possible 1155 | */ 1156 | void dealloc() { 1157 | m_memory.reset(); 1158 | m_ptr = NULL; 1159 | m_info.host_shape.set_size(0); 1160 | } 1161 | 1162 | /// tries to copy memory, succeeds if shapes match AND both ndarrays are c_contiguous or 2d-copyable. 1163 | template 1164 | bool copy_memory(const ndarray& src, bool force_dst_contiguous, cudaStream_t stream) { 1165 | if (effective_shape() != src.effective_shape() || !ptr()) { 1166 | return false; 1167 | } 1168 | 1169 | assert(m_memory.get()); 1170 | // ATTENTION: m_ptr might be different than m_memory->ptr()! 
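// Overview of the dispatch below, as written: a single flat copy_from() when both
// arrays are c_contiguous; copy2d_from() with the source pitch when only the
// destination is contiguous; copy2d_from() with the destination pitch when the
// destination is 2d-copyable and the source is contiguous (skipped when
// force_dst_contiguous is set). The final branch computes both pitches and looks
// intended for the case where both sides are 2d-copyable, although its condition
// repeats the previous one. Any other stride pattern throws std::runtime_error.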
1171 | 1172 | // TODO: this could be probably implemented in the memory classes as well 1173 | 1174 | if (is_c_contiguous() && src.is_c_contiguous()) { 1175 | // can copy w/o bothering about m_memory 1176 | m_memory->copy_from(m_ptr, src.ptr(), src.size(), OM(), stream); 1177 | } else if (is_c_contiguous() && src.is_2dcopyable()) { 1178 | size_type row, col, pitch; 1179 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1180 | m_memory->copy2d_from(m_ptr, src.ptr(), col, pitch, row, col, OM(), stream); 1181 | } else if (!force_dst_contiguous && is_2dcopyable() && src.is_c_contiguous()) { 1182 | size_type row, col, pitch; 1183 | detail::get_pitched_params(row, col, pitch, info().host_shape, info().host_stride, L()); 1184 | m_memory->copy2d_from(m_ptr, src.ptr(), pitch, col, row, col, OM(), stream); 1185 | } else if (!force_dst_contiguous && is_2dcopyable() && src.is_c_contiguous()) { 1186 | size_type srow, scol, spitch; 1187 | size_type drow, dcol, dpitch; 1188 | detail::get_pitched_params(drow, dcol, dpitch, info().host_shape, info().host_stride, L()); 1189 | detail::get_pitched_params(srow, scol, spitch, src.info().host_shape, src.info().host_stride, OL()); 1190 | cuvAssert(scol==srow); 1191 | cuvAssert(dcol==drow); 1192 | m_memory->copy2d_from(m_ptr, src.ptr(), dpitch, spitch, srow, scol, OM(), stream); 1193 | } else { 1194 | throw std::runtime_error("copying of generic strides not implemented yet"); 1195 | } 1196 | 1197 | if (!IsSame::Result::value) { 1198 | info().host_stride.reverse(); 1199 | info().host_shape.reverse(); 1200 | } 1201 | return true; 1202 | } 1203 | 1204 | /// copies between different memory spaces 1205 | template 1206 | void copy_memory(const ndarray& src, linear_memory_tag, cudaStream_t stream) { 1207 | if (copy_memory(src, true, stream)) // destination must be contiguous 1208 | return; 1209 | info().resize(src.ndim()); 1210 | info().host_shape = src.info().host_shape; 1211 | 1212 | // free old memory 1213 | m_memory.reset(new memory(m_allocator)); 1214 | 1215 | linear_memory d(src.size(), m_allocator); 1216 | d.set_strides(info().host_stride, info().host_shape, L()); 1217 | if (src.is_c_contiguous()) { 1218 | // easiest case: both linear, simply copy 1219 | d.copy_from(src.ptr(), src.size(), OM(), stream); 1220 | } else if (src.is_2dcopyable()) { 1221 | // other memory is probably a pitched memory or some view onto an array 1222 | size_type row, col, pitch; 1223 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1224 | d.copy2d_from(src.ptr(), col, pitch, row, col, OM(), stream); 1225 | } else { 1226 | throw std::runtime_error("copying arbitrarily strided memory not implemented"); 1227 | } 1228 | mem().reset(new memory(d.release(), d.size(), m_allocator)); 1229 | if (!IsSame::Result::value) { 1230 | info().host_stride.reverse(); 1231 | info().host_shape.reverse(); 1232 | } 1233 | } 1234 | 1235 | /// copies between different memory spaces 1236 | template 1237 | void copy_memory(const ndarray& src, pitched_memory_tag, cudaStream_t stream) { 1238 | assert(src.ndim()>=2); 1239 | if (copy_memory(src, false, stream)) // destination need not be contiguous 1240 | return; 1241 | info().resize(src.ndim()); 1242 | info().host_shape = src.info().host_shape; 1243 | size_type row, col, pitch; 1244 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1245 | pitched_memory d(row, col); 1246 | //dst.mem().reset(d); 1247 | 
d->set_strides(info().host_stride, info().host_shape, L()); 1248 | if (src.is_2dcopyable()) { 1249 | // other memory is probably a pitched memory or some view onto an array 1250 | detail::get_pitched_params(row, col, pitch, src.info().host_shape, src.info().host_stride, OL()); 1251 | d.copy2d_from(src, stream); 1252 | } else { 1253 | throw std::runtime_error("copying arbitrarily strided memory not implemented"); 1254 | } 1255 | mem().reset(new memory(d.release(), d.size(), m_allocator)); 1256 | 1257 | if (!IsSame::Result::value) { 1258 | info().host_stride.reverse(); 1259 | info().host_shape.reverse(); 1260 | } 1261 | } 1262 | 1263 | }; 1264 | 1265 | /** 1266 | * primarily used as result of ndarray::operator[] 1267 | */ 1268 | template 1269 | class ndarray_view: public ndarray 1270 | { 1271 | private: 1272 | typedef ndarray super; 1273 | using super::m_memory; 1274 | using super::m_ptr; 1275 | using super::m_info; 1276 | 1277 | template 1278 | friend class ndarray; 1279 | 1280 | public: 1281 | 1282 | /** default constructor does nothing */ 1283 | ndarray_view(const boost::shared_ptr& allocator) : 1284 | ndarray(allocator) { 1285 | } 1286 | 1287 | /** 1288 | * /always/ try to copy memory 1289 | */ 1290 | ndarray_view& assign(const ndarray& o, cudaStream_t stream = 0) { 1291 | if (!this->copy_memory(o, false, stream)) 1292 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?"); 1293 | return *this; 1294 | } 1295 | 1296 | /** 1297 | * /always/ try to copy memory 1298 | */ 1299 | ndarray_view& assign(const ndarray_view& o, cudaStream_t stream = 0) { 1300 | if (!this->copy_memory(o, false, stream)) 1301 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?"); 1302 | return *this; 1303 | } 1304 | 1305 | /** 1306 | * assignment operator for other memory space type 1307 | * 1308 | * @param o a ndarray of another memory space type 1309 | */ 1310 | template 1311 | ndarray_view& assign(const ndarray& o, cudaStream_t stream = 0) { 1312 | if (!this->copy_memory(o, false, stream)) 1313 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. Maybe a shape mismatch?"); 1314 | return *this; 1315 | } 1316 | 1317 | /** 1318 | * assignment operator for views in other memory space types 1319 | * 1320 | * @param o a ndarray_view of another memory space type 1321 | */ 1322 | template 1323 | ndarray_view& assign(const ndarray_view& o, cudaStream_t stream = 0) { 1324 | if (!this->copy_memory(o, false, stream)) 1325 | throw std::runtime_error("copying ndarray to ndarray_view did not succeed. 
Maybe a shape mismatch?"); 1326 | return *this; 1327 | } 1328 | 1329 | /** 1330 | * /always/ try to copy memory 1331 | */ 1332 | ndarray_view& operator=(const ndarray& o) { 1333 | return assign(o); 1334 | } 1335 | 1336 | /** 1337 | * /always/ try to copy memory 1338 | */ 1339 | ndarray_view& operator=(const ndarray_view& o) { 1340 | return assign(o); 1341 | } 1342 | 1343 | /** 1344 | * assign from value (sets all elements equal to one scalar) 1345 | * 1346 | * @param scalar value which should be assigned to all elements 1347 | */ 1348 | template 1349 | typename boost::enable_if_c::value, ndarray_view&>::type operator=( 1350 | const _V& scalar) { 1351 | super::operator=(scalar); 1352 | return *this; 1353 | } 1354 | 1355 | /** 1356 | * assignment operator for other memory space type 1357 | * 1358 | * @param o a ndarray of another memory space type 1359 | */ 1360 | template 1361 | ndarray_view& operator=(const ndarray& o) { 1362 | return assign(o); 1363 | } 1364 | 1365 | /** 1366 | * assignment operator for views in other memory space types 1367 | * 1368 | * @param o a ndarray_view of another memory space type 1369 | */ 1370 | template 1371 | ndarray_view& operator=(const ndarray_view& o) { 1372 | return assign(o); 1373 | } 1374 | 1375 | /** 1376 | * construct ndarray_view 1377 | * 1378 | * @warning if a dimension has size 1, the resulting ndarray has fewer dimensions than the original one. 1379 | * 1380 | * @warning most operations in CUV on ndarrays currently only work 1381 | * if the sub-ndarray is a connected area in memory. Basically this 1382 | * means that you can only slice in the first dimension which has 1383 | * size>1. 1384 | * 1385 | * @param idx the indices of the sub-ndarray 1386 | * @param o the original ndarray 1387 | * 1388 | * Example: 1389 | * @code 1390 | * ndarray v(extents[5][10]); 1391 | * 1392 | * // these are equivalent: 1393 | * ndarray w0(v,indices[index_range(2,3)][index_range(0,10)]); 1394 | * ndarray w0(v,indices[index_range(2,3)][index_range()]); 1395 | * ndarray w0(v,indices[index_range(2,3)][index_range() < index(10)]); 1396 | * ndarray w0(v,indices[index_range(2,3)][index(0) < index_range() < index(10)]); 1397 | * 1398 | * // yields a 1D-ndarray corresponding to the 2nd slice in the 1st dimension: 1399 | * ndarray w0(indices[1][index_range()]); 1400 | * @endcode 1401 | */ 1402 | template 1403 | explicit ndarray_view(const ndarray& o, const index_gen& idx) : 1404 | ndarray(o.m_allocator) 1405 | { 1406 | m_memory = o.mem(); 1407 | m_ptr = const_cast(o.ptr()); 1408 | std::vector shapes; 1409 | std::vector strides; 1410 | shapes.reserve(D); 1411 | strides.reserve(D); 1412 | cuvAssert(o.ndim()==D); 1413 | for (size_t i = 0; i < D; i++) { 1414 | int start = idx.ranges_[i].get_start(0); 1415 | int finish = idx.ranges_[i].get_finish(o.shape(i)); 1416 | int stride = idx.ranges_[i].stride(); 1417 | if (start < 0) 1418 | start += o.shape(i); 1419 | if (finish < 0) 1420 | finish += o.shape(i); 1421 | #ifndef NDEBUG 1422 | cuvAssert(finish>start); 1423 | #endif 1424 | m_ptr += start * o.stride(i); 1425 | if (idx.ranges_[i].is_degenerate()) { 1426 | // skip dimension 1427 | } else { 1428 | shapes.push_back((finish - start) / stride); 1429 | strides.push_back(o.stride(i) * stride); 1430 | } 1431 | } 1432 | // store in m_info 1433 | m_info.resize(shapes.size()); 1434 | std::copy(shapes.begin(), shapes.end(), m_info.host_shape[0].ptr); 1435 | std::copy(strides.begin(), strides.end(), m_info.host_stride[0].ptr); 1436 | } 1437 | 1438 | /** 1439 | * different order of arguments 
as above, all else being equal. 1440 | * 1441 | * @deprecated 1442 | * @param idx a set of index ranges into o 1443 | * @param o other ndarray 1444 | */ 1445 | template 1446 | explicit ndarray_view(const index_gen& idx, const ndarray& o) : 1447 | ndarray(o.m_allocator) 1448 | { 1449 | m_memory = o.mem(); 1450 | m_ptr = const_cast(o.ptr()); 1451 | std::vector shapes; 1452 | std::vector strides; 1453 | shapes.reserve(D); 1454 | strides.reserve(D); 1455 | cuvAssert(o.ndim()==D); 1456 | for (size_t i = 0; i < D; i++) { 1457 | int start = idx.ranges_[i].get_start(0); 1458 | int finish = idx.ranges_[i].get_finish(o.shape(i)); 1459 | int stride = idx.ranges_[i].stride(); 1460 | if (start < 0) 1461 | start += o.shape(i); 1462 | if (finish < 0) 1463 | finish += o.shape(i); 1464 | #ifndef NDEBUG 1465 | cuvAssert(finish>start); 1466 | #endif 1467 | m_ptr += start * o.stride(i); 1468 | if (idx.ranges_[i].is_degenerate()) { 1469 | // skip dimension 1470 | } else { 1471 | shapes.push_back((finish - start) / stride); 1472 | strides.push_back(o.stride(i) * stride); 1473 | } 1474 | } 1475 | // store in m_info 1476 | m_info.resize(shapes.size()); 1477 | std::copy(shapes.begin(), shapes.end(), m_info.host_shape[0].ptr); 1478 | std::copy(strides.begin(), strides.end(), m_info.host_stride[0].ptr); 1479 | } 1480 | }; 1481 | 1482 | /** @} */ // data_structures 1483 | /** 1484 | * test whether two ndarrays have the same shape 1485 | * @ingroup tools 1486 | * @param a first ndarray 1487 | * @param a second ndarray 1488 | */ 1489 | template 1490 | bool equal_shape(const ndarray& a, const ndarray& b) { 1491 | return a.effective_shape() == b.effective_shape(); 1492 | } 1493 | 1494 | /** 1495 | * @addtogroup MetaProgramming 1496 | */ 1497 | /// create a ndarray type with the same template parameters, but with switched value type 1498 | template 1499 | struct switch_value_type { 1500 | typedef ndarray type; ///< new ndarray type after switch 1501 | }; 1502 | /// create a ndarray type with the same template parameters, but with switched memory_layout_type 1503 | template 1504 | struct switch_memory_layout_type { 1505 | typedef ndarray type; ///< new ndarray type after switch 1506 | }; 1507 | /// create a ndarray type with the same template parameters, but with switched memory_space_type 1508 | template 1509 | struct switch_memory_space_type { 1510 | typedef ndarray type; ///< new ndarray type after switch 1511 | }; 1512 | 1513 | /** @} */ 1514 | 1515 | } 1516 | 1517 | /** 1518 | * input and output operations 1519 | * 1520 | * @addtogroup io 1521 | * @{ 1522 | */ 1523 | namespace std { 1524 | 1525 | /** 1526 | * print a host linear memory to a stream 1527 | * @param o the stream 1528 | * @param t the ndarray 1529 | */ 1530 | template 1531 | ostream& operator<<(ostream& o, const cuv::linear_memory& t) { 1532 | o << "[ "; 1533 | for (unsigned int i = 0; i < t.size(); i++) 1534 | o << t[i] << " "; 1535 | o << "]"; 1536 | return o; 1537 | } 1538 | 1539 | /** 1540 | * print a dev linear memory to a stream (copies first) 1541 | * @param o the stream 1542 | * @param t_ the ndarray 1543 | */ 1544 | template 1545 | ostream& operator<<(ostream& o, const cuv::linear_memory& t_) { 1546 | cuv::linear_memory t = t_; // pull 1547 | o << "[ "; 1548 | for (unsigned int i = 0; i < t.size(); i++) 1549 | o << t[i] << " "; 1550 | o << "]"; 1551 | return o; 1552 | } 1553 | 1554 | /** 1555 | * print a host pitched memory to a stream 1556 | * @param o the stream 1557 | * @param t the ndarray 1558 | */ 1559 | template 1560 | ostream& 
operator<<(ostream& o, const cuv::pitched_memory& t) { 1561 | o << "[ "; 1562 | for (unsigned int i = 0; i < t.rows(); i++) { 1563 | for (unsigned int j = 0; j < t.rows(); j++) { 1564 | o << t(i, j) << " "; 1565 | } 1566 | if (i < t.rows() - 1) 1567 | o << std::endl; 1568 | } 1569 | o << "]"; 1570 | return o; 1571 | } 1572 | 1573 | /** 1574 | * print a dev pitched memory to a stream (copies first) 1575 | * @param o the stream 1576 | * @param t_ the ndarray 1577 | */ 1578 | template 1579 | ostream& operator<<(ostream& o, const cuv::pitched_memory& t_) { 1580 | cuv::pitched_memory t = t_; // pull 1581 | o << "[ "; 1582 | for (unsigned int i = 0; i < t.rows(); i++) { 1583 | for (unsigned int j = 0; j < t.rows(); j++) { 1584 | o << t(i, j) << " "; 1585 | } 1586 | if (i < t.rows() - 1) 1587 | o << std::endl; 1588 | } 1589 | o << "]"; 1590 | return o; 1591 | } 1592 | 1593 | /** 1594 | * print a dev ndarray to a stream (copying to host first) 1595 | * 1596 | * @param o the stream 1597 | * @param t the ndarray 1598 | */ 1599 | template 1600 | ostream& operator<<(ostream& o, const cuv::ndarray& t) { 1601 | return o << cuv::ndarray(t); 1602 | } 1603 | 1604 | /** 1605 | * print a host ndarray to a stream 1606 | * 1607 | * @param o the stream 1608 | * @param t the ndarray 1609 | */ 1610 | template 1611 | ostream& operator<<(ostream& o, const cuv::ndarray& t) { 1612 | if (t.ndim() == 0) 1613 | return o << "[]"; 1614 | 1615 | if (t.ndim() == 1) { 1616 | o << "[ "; 1617 | for (unsigned int i = 0; i < t.shape(0); i++) 1618 | o << t[i] << " "; 1619 | return o << "]"; 1620 | } 1621 | if (t.ndim() == 2) { 1622 | o << "["; 1623 | for (unsigned int i = 0; i < t.shape(0); ++i) { 1624 | if (i > 0) 1625 | o << " "; 1626 | o << "[ "; 1627 | for (unsigned int j = 0; j < t.shape(1); j++) 1628 | o << t(i, j) << " "; 1629 | o << "]"; 1630 | if (i != t.shape(0) - 1) 1631 | o << std::endl; 1632 | } 1633 | return o << "]"; 1634 | } 1635 | if (t.ndim() == 3) { 1636 | o << "[" << std::endl; 1637 | for (unsigned int l = 0; l < t.shape(0); l++) { 1638 | o << "["; 1639 | for (unsigned int i = 0; i < t.shape(1); ++i) { 1640 | if (i > 0) 1641 | o << " "; 1642 | o << "[ "; 1643 | //for(unsigned int j=0;j3 dimensions not implemented"); 1657 | } 1658 | } 1659 | /** @} */ // io 1660 | #endif 1661 | -------------------------------------------------------------------------------- /src/cuv/reference.cu: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #include "reference.hpp" 28 | 29 | #include 30 | 31 | namespace cuv { 32 | namespace detail { 33 | 34 | template 35 | void entry_set(value_type* ptr, size_t idx, value_type val, host_memory_space) { 36 | ptr[idx] = val; 37 | } 38 | 39 | template 40 | value_type entry_get(const value_type* ptr, size_t idx, host_memory_space) { 41 | return ptr[idx]; 42 | } 43 | 44 | template 45 | void entry_set(value_type* ptr, size_t idx, value_type val, dev_memory_space) { 46 | thrust::device_ptr dev_ptr(ptr); 47 | dev_ptr[idx] = val; 48 | } 49 | 50 | template 51 | value_type entry_get(const value_type* ptr, size_t idx, dev_memory_space) { 52 | const thrust::device_ptr dev_ptr(ptr); 53 | return static_cast(*(dev_ptr + idx)); 54 | } 55 | 56 | } 57 | } 58 | 59 | template 60 | std::ostream& operator<<(std::ostream& os, const cuv::reference& reference) { 61 | os << static_cast(reference); 62 | return os; 63 | } 64 | 65 | 66 | #define CUV_REFERENCE_INST(TYPE) \ 67 | template void cuv::detail::entry_set(TYPE*, size_t, TYPE, cuv::host_memory_space); \ 68 | template void cuv::detail::entry_set(TYPE*, size_t, TYPE, cuv::dev_memory_space); \ 69 | template TYPE cuv::detail::entry_get(const TYPE*, size_t, cuv::host_memory_space); \ 70 | template TYPE cuv::detail::entry_get(const TYPE*, size_t, cuv::dev_memory_space); \ 71 | template std::ostream& operator<<(std::ostream& os, const cuv::reference& reference); \ 72 | template std::ostream& operator<<(std::ostream& os, const cuv::reference& reference); 73 | 74 | CUV_REFERENCE_INST(signed char); 75 | CUV_REFERENCE_INST(unsigned char); 76 | CUV_REFERENCE_INST(short); 77 | CUV_REFERENCE_INST(unsigned short); 78 | CUV_REFERENCE_INST(int); 79 | CUV_REFERENCE_INST(unsigned int); 80 | CUV_REFERENCE_INST(float); 81 | CUV_REFERENCE_INST(double); 82 | -------------------------------------------------------------------------------- /src/cuv/reference.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_REFERENCE_HPP__ 28 | #define __CUV_REFERENCE_HPP__ 29 | 30 | #include 31 | #include 32 | #include 33 | 34 | #include "meta_programming.hpp" 35 | #include "tags.hpp" 36 | 37 | namespace cuv { 38 | 39 | namespace detail { 40 | 41 | /** 42 | * @brief Setting entry of host linear_memory at ptr at index idx to value val 43 | * 44 | * @param ptr Address of array in memory 45 | * @param idx Index of value to set 46 | * @param val Value to set linear_memory entry to 47 | * 48 | */ 49 | template 50 | void entry_set(value_type* ptr, size_t idx, value_type val, host_memory_space); 51 | 52 | /** 53 | * @brief Getting entry of host linear_memory at ptr at index idx 54 | * 55 | * @param ptr Address of array in memory 56 | * @param idx Index of value to get 57 | * 58 | * @return 59 | */ 60 | template 61 | value_type entry_get(const value_type* ptr, size_t idx, host_memory_space); 62 | 63 | template 64 | void entry_set(value_type* ptr, size_t idx, value_type val, dev_memory_space); 65 | 66 | /** 67 | * Set the value at *(ptr+idx) to val, when ptr is in dev_memory_space. 68 | */ 69 | template 70 | value_type entry_get(const value_type* ptr, size_t idx, dev_memory_space); 71 | 72 | } 73 | 74 | /** 75 | * This objects acts like a reference to the object stored at the wrapped pointer. 
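 *
 * Minimal usage sketch (host side, so it runs without a device; variable names
 * are hypothetical):
 * \code
 * float v = 0.f;
 * cuv::reference<float, cuv::host_memory_space> r(&v);
 * r = 3.f;         // entry_set: writes through the wrapped pointer
 * r += 1.f;        // read-modify-write, v is now 4
 * float x = r;     // entry_get: implicit conversion reads the value back
 * \endcode
 * For dev_memory_space the same operations are routed through thrust::device_ptr
 * (see reference.cu).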
76 | * \ingroup data_structures 77 | */ 78 | template 79 | class reference 80 | { 81 | 82 | public: 83 | 84 | typedef typename unconst::type value_type; ///< the type of the pointer 85 | typedef M memory_space_type; ///< the memory space of the pointer 86 | typedef reference my_type; ///< the type of this reference 87 | 88 | value_type* ptr; ///< the wrapped pointer 89 | 90 | /// convert to the stored value 91 | operator value_type() const { 92 | return detail::entry_get(ptr, 0, memory_space_type()); 93 | } 94 | 95 | /// assign a new value 96 | void operator=(const value_type& v) { 97 | detail::entry_set(ptr, 0, v, memory_space_type()); 98 | } 99 | 100 | /// assign a value of a different (but convertible) value type 101 | template 102 | typename boost::enable_if_c::value>::type operator=(const _T& v) { 103 | detail::entry_set(ptr, 0, (value_type) v, memory_space_type()); 104 | } 105 | 106 | /// assignment from reference of same type 107 | reference& operator=(const reference& o) { 108 | if (&o == &(*this)) // operator & is overloaded and returns value_type* 109 | return *this; 110 | (*this) = (value_type) o; 111 | return *this; 112 | } 113 | 114 | /// assignment from reference of other memory type 115 | template 116 | reference& operator=(const reference& o) { 117 | (*this) = static_cast(o); 118 | return *this; 119 | } 120 | 121 | /// get the wrapped pointer 122 | const value_type* operator&() const { 123 | return ptr; 124 | } 125 | 126 | /// get the wrapped pointer 127 | value_type* operator&() { 128 | return ptr; 129 | } 130 | 131 | /// construct using a pointer 132 | reference(const T* p) : 133 | ptr(p) { 134 | } 135 | 136 | /// construct using a pointer 137 | reference(T* p) : 138 | ptr(p) { 139 | } 140 | 141 | /// implicit construction using value 142 | reference(value_type& p) : 143 | ptr(&p) { 144 | } 145 | 146 | /// implicit construction using value 147 | reference(const value_type& p) : 148 | ptr(&p) { 149 | } 150 | 151 | /// add to the value stored at ptr 152 | my_type& operator+=(const value_type& v) { 153 | *this = (value_type) (*this) + v; 154 | return *this; 155 | } 156 | 157 | /// subtract from the value stored at ptr 158 | my_type& operator-=(const value_type& v) { 159 | *this = (value_type) (*this) - v; 160 | return *this; 161 | } 162 | 163 | /// multiply with the value stored at ptr 164 | my_type& operator*=(const value_type& v) { 165 | *this = (value_type) (*this) * v; 166 | return *this; 167 | } 168 | 169 | /// divide by the value stored at ptr 170 | my_type& operator/=(const value_type& v) { 171 | *this = (value_type) (*this) / v; 172 | return *this; 173 | } 174 | 175 | /// increment value at ptr 176 | value_type operator++(int) { 177 | value_type v = *this; 178 | *this = v + 1; 179 | return v; 180 | } 181 | 182 | /// decrement value at ptr 183 | value_type operator--(int) { 184 | value_type v = *this; 185 | *this = v - 1; 186 | return v; 187 | } 188 | 189 | /// increment value at ptr 190 | value_type operator++() { 191 | value_type v = *this; 192 | *this = v + 1; 193 | return v + 1; 194 | } 195 | 196 | /// decrement value at ptr 197 | value_type operator--() { 198 | value_type v = *this; 199 | *this = v - 1; 200 | return v - 1; 201 | } 202 | 203 | /// compare value at ptr with another 204 | bool operator==(const value_type& v) { 205 | return ((value_type) *this) == v; 206 | } 207 | 208 | /// compare value at ptr with another 209 | bool operator<=(const value_type& v) { 210 | return ((value_type) *this) <= v; 211 | } 212 | 213 | /// compare value at ptr with another 
214 | bool operator<(const value_type& v) { 215 | return ((value_type) *this) < v; 216 | } 217 | 218 | /// compare value at ptr with another 219 | bool operator>=(const value_type& v) { 220 | return ((value_type) *this) >= v; 221 | } 222 | 223 | /// compare value at ptr with another 224 | bool operator>(const value_type& v) { 225 | return ((value_type) *this) > v; 226 | } 227 | }; 228 | 229 | } 230 | 231 | template 232 | std::ostream& operator<<(std::ostream& os, const cuv::reference& reference); 233 | 234 | #endif 235 | -------------------------------------------------------------------------------- /src/cuv/tags.hpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #ifndef __CUV_TAGS_HPP__ 28 | #define __CUV_TAGS_HPP__ 29 | 30 | namespace cuv { 31 | /** 32 | * @addtogroup tags 33 | * @{ 34 | */ 35 | 36 | /** Tag for host memory 37 | * @ingroup basics 38 | */ 39 | struct host_memory_space { 40 | }; 41 | 42 | /** Tag for device memory 43 | * @ingroup basics 44 | */ 45 | struct dev_memory_space { 46 | }; 47 | 48 | /** 49 | * @} 50 | */ 51 | } 52 | 53 | #endif 54 | -------------------------------------------------------------------------------- /src/tests/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ####################################################################################### 2 | # The MIT License 3 | 4 | # Copyright (c) 2014 Hannes Schulz, University of Bonn 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2008-2009 Sebastian Nowozin 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | INCLUDE_DIRECTORIES(${Boost_INCLUDE_DIR}) 27 | INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/src/cuv) 28 | INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/src/) 29 | 30 | FIND_PACKAGE(TBB REQUIRED) 31 | 32 | CUDA_INCLUDE_DIRECTORIES( ${TBB_INCLUDE_DIRS} ) 33 | INCLUDE_DIRECTORIES( ${TBB_INCLUDE_DIRS} ) 34 | 35 | SET (TEST_LINK_LIBS ${Boost_LIBRARIES} ${TBB_LIBRARIES} ndarray${LIB_SUFFIX}) 36 | 37 | CUDA_ADD_EXECUTABLE(allocators_test allocators_test.cpp) 38 | TARGET_LINK_LIBRARIES(allocators_test ${TEST_LINK_LIBS}) 39 | 40 | CUDA_ADD_EXECUTABLE(ndarray_test ndarray_test.cpp) 41 | TARGET_LINK_LIBRARIES(ndarray_test ${TEST_LINK_LIBS}) 42 | 43 | ADD_TEST(allocators_test "${CMAKE_BINARY_DIR}/src/tests/allocators_test") 44 | ADD_TEST(ndarray_test "${CMAKE_BINARY_DIR}/src/tests/ndarray_test") 45 | -------------------------------------------------------------------------------- /src/tests/allocators_test.cpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 
25 | ####################################################################################### 26 | #endif 27 | #define BOOST_TEST_MODULE example 28 | 29 | #include 30 | #include 31 | #include 32 | #include 33 | #include 34 | #include 35 | 36 | using namespace cuv; 37 | 38 | BOOST_AUTO_TEST_SUITE(allocators_test) 39 | 40 | template 41 | static void test_pooled_allocator() { 42 | memory_space m; 43 | pooled_cuda_allocator allocator; 44 | int* ptr1 = 0; 45 | int* ptr2 = 0; 46 | 47 | const int NUM_ELEMENTS = 10000; 48 | 49 | allocator.alloc(reinterpret_cast(&ptr1), NUM_ELEMENTS, sizeof(int), m); 50 | allocator.alloc(reinterpret_cast(&ptr2), NUM_ELEMENTS, sizeof(int), m); 51 | BOOST_CHECK(ptr1); 52 | BOOST_CHECK(ptr2); 53 | BOOST_CHECK_NE(ptr1, ptr2); 54 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 55 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 0); 56 | BOOST_CHECK_EQUAL(allocator.pool_size(m), 2 * NUM_ELEMENTS * sizeof(int)); 57 | 58 | for (size_t i = 0; i < 10000; i++) { 59 | reference ref(ptr1 + i); 60 | ref = i; 61 | BOOST_CHECK_EQUAL(static_cast(ref), i); 62 | } 63 | 64 | allocator.dealloc(reinterpret_cast(&ptr1), m); 65 | BOOST_CHECK(ptr1 == 0); 66 | 67 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 68 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 1); 69 | BOOST_CHECK_EQUAL(allocator.pool_size(m), 2 * NUM_ELEMENTS * sizeof(int)); 70 | 71 | for (size_t i = 0; i < 10000; i++) { 72 | reference ref(ptr2 + i); 73 | ref = i + 100; 74 | BOOST_CHECK_EQUAL(static_cast(ref), i + 100); 75 | } 76 | 77 | allocator.dealloc(reinterpret_cast(&ptr2), m); 78 | 79 | BOOST_CHECK_EQUAL(allocator.pool_free_count(), allocator.pool_count()); 80 | } 81 | 82 | template 83 | class allocate { 84 | private: 85 | pooled_cuda_allocator& allocator; 86 | int allocSize; 87 | boost::mutex& mutex; 88 | 89 | 90 | public: 91 | allocate(pooled_cuda_allocator& allocator, int allocSize, boost::mutex& mutex) 92 | : allocator(allocator), allocSize(allocSize), mutex(mutex) {} 93 | 94 | void operator()(void*& ptr) const { 95 | memory_space m; 96 | size_t pool_size = allocator.pool_size(m); 97 | void* ptr1 = NULL; 98 | void* ptr2 = NULL; 99 | allocator.alloc(&ptr1, allocSize, 1, m); 100 | allocator.alloc(&ptr2, 1, 1, m); 101 | allocator.alloc(&ptr, allocSize, 1, m); 102 | 103 | { 104 | boost::mutex::scoped_lock lock(mutex); 105 | BOOST_REQUIRE(ptr1); 106 | BOOST_REQUIRE(ptr2); 107 | BOOST_REQUIRE(ptr); 108 | 109 | BOOST_REQUIRE_NE(ptr1, ptr2); 110 | BOOST_REQUIRE_NE(ptr2, ptr); 111 | BOOST_REQUIRE_NE(ptr1, ptr); 112 | 113 | BOOST_REQUIRE_GE(allocator.pool_count(m), 2lu); 114 | } 115 | 116 | allocator.dealloc(&ptr1, m); 117 | allocator.dealloc(&ptr2, m); 118 | 119 | { 120 | boost::mutex::scoped_lock lock(mutex); 121 | BOOST_REQUIRE_GE(allocator.pool_size(m), pool_size); 122 | BOOST_REQUIRE_GE(allocator.pool_free_count(m), 0lu); 123 | } 124 | } 125 | }; 126 | 127 | template 128 | class deallocate { 129 | private: 130 | pooled_cuda_allocator& allocator; 131 | boost::mutex& mutex; 132 | 133 | public: 134 | deallocate(pooled_cuda_allocator &allocator, boost::mutex &mutex) 135 | : allocator(allocator), mutex(mutex) {} 136 | 137 | void operator()(void*& ptr) const { 138 | allocator.dealloc(&ptr, memory_space()); 139 | 140 | { 141 | boost::mutex::scoped_lock lock(mutex); 142 | BOOST_CHECK(!ptr); 143 | } 144 | } 145 | }; 146 | 147 | template 148 | static void test_pooled_allocator_multi_threaded() { 149 | memory_space m; 150 | pooled_cuda_allocator allocator("allocator_multi_threaded"); 151 | 152 | const int allocSize = 
pooled_cuda_allocator::MIN_SIZE_HOST; 153 | 154 | // boost-test is not thread-safe 155 | boost::mutex boost_mutex; 156 | 157 | std::vector pointers(1000, NULL); 158 | tbb::parallel_for_each(pointers.begin(), pointers.end(), 159 | allocate(allocator, allocSize, boost_mutex)); 160 | 161 | for (size_t i = 0; i < pointers.size(); i++) { 162 | BOOST_REQUIRE(pointers[i]); 163 | } 164 | 165 | BOOST_CHECK_GE(allocator.pool_size(m), pointers.size() * allocSize); 166 | BOOST_CHECK_LE(allocator.pool_count(m), 10 * pointers.size()); 167 | 168 | size_t count = allocator.pool_count(m); 169 | BOOST_CHECK_GE(count, pointers.size()); 170 | 171 | tbb::parallel_for_each(pointers.begin(), pointers.end(), 172 | deallocate(allocator, boost_mutex)); 173 | 174 | BOOST_CHECK_EQUAL(allocator.pool_free_count(), allocator.pool_count()); 175 | } 176 | 177 | template 178 | static void test_pooled_allocator_garbage_collection() { 179 | memory_space m; 180 | pooled_cuda_allocator allocator; 181 | int* ptr1 = 0; 182 | int* ptr2 = 0; 183 | allocator.alloc(reinterpret_cast(&ptr1), 10000, sizeof(int), m); 184 | allocator.alloc(reinterpret_cast(&ptr2), 10000, sizeof(int), m); 185 | 186 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 187 | 188 | allocator.dealloc(reinterpret_cast(&ptr1), m); 189 | 190 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 2); 191 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 1); 192 | 193 | allocator.garbage_collection(); 194 | 195 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 1); 196 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 0); 197 | 198 | allocator.dealloc(reinterpret_cast(&ptr2), m); 199 | 200 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 1); 201 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 1); 202 | 203 | allocator.garbage_collection(); 204 | 205 | BOOST_CHECK_EQUAL(allocator.pool_count(m), 0); 206 | BOOST_CHECK_EQUAL(allocator.pool_free_count(m), 0); 207 | } 208 | 209 | BOOST_AUTO_TEST_CASE( pooled_cuda_allocator_test_simple ) { 210 | test_pooled_allocator(); 211 | test_pooled_allocator(); 212 | } 213 | 214 | BOOST_AUTO_TEST_CASE( pooled_cuda_allocator_test_multithreaded ) { 215 | test_pooled_allocator_multi_threaded(); 216 | test_pooled_allocator_multi_threaded(); 217 | } 218 | 219 | BOOST_AUTO_TEST_CASE( pooled_cuda_allocator_test_garbage_collection ) { 220 | test_pooled_allocator_garbage_collection(); 221 | test_pooled_allocator_garbage_collection(); 222 | } 223 | BOOST_AUTO_TEST_SUITE_END() 224 | -------------------------------------------------------------------------------- /src/tests/ndarray_test.cpp: -------------------------------------------------------------------------------- 1 | #if 0 2 | ####################################################################################### 3 | # The MIT License 4 | 5 | # Copyright (c) 2013 Benedikt Waldvogel, University of Bonn 6 | # Copyright (c) 2012-2014 Hannes Schulz, University of Bonn 7 | 8 | # Permission is hereby granted, free of charge, to any person obtaining a copy 9 | # of this software and associated documentation files (the "Software"), to deal 10 | # in the Software without restriction, including without limitation the rights 11 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 12 | # copies of the Software, and to permit persons to whom the Software is 13 | # furnished to do so, subject to the following conditions: 14 | # 15 | # The above copyright notice and this permission notice shall be included in all 16 | # copies or substantial portions of the Software. 
17 | # 18 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 19 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 20 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 21 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 22 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 23 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 24 | # SOFTWARE. 25 | ####################################################################################### 26 | #endif 27 | #define BOOST_TEST_MODULE example 28 | #include 29 | #include 30 | 31 | using namespace cuv; 32 | 33 | BOOST_AUTO_TEST_SUITE(ndarray_test) 34 | 35 | /** 36 | * @test 37 | * @brief create ndarray 38 | */ 39 | BOOST_AUTO_TEST_CASE( create_ndarray ) { 40 | // column_major 41 | ndarray m(extents[2][3][4]); 42 | BOOST_CHECK_EQUAL(24, m.size()); 43 | BOOST_CHECK_EQUAL(2ul, m.shape(0)); 44 | BOOST_CHECK_EQUAL(3ul, m.shape(1)); 45 | BOOST_CHECK_EQUAL(4ul, m.shape(2)); 46 | 47 | BOOST_CHECK_EQUAL(0ul, m.index_of(extents[0][0][0])); 48 | // column major test 49 | BOOST_CHECK_EQUAL(1ul, m.index_of(extents[1][0][0])); 50 | BOOST_CHECK_EQUAL(2ul, m.index_of(extents[0][1][0])); 51 | 52 | // row_major 53 | ndarray n(extents[2][3][4]); 54 | BOOST_CHECK_EQUAL(24, m.size()); 55 | BOOST_CHECK_EQUAL(2ul, n.shape(0)); 56 | BOOST_CHECK_EQUAL(3ul, n.shape(1)); 57 | BOOST_CHECK_EQUAL(4ul, n.shape(2)); 58 | 59 | BOOST_CHECK_EQUAL(0ul, n.index_of(extents[0][0][0])); 60 | // row major test 61 | BOOST_CHECK_EQUAL(1ul, n.index_of(extents[0][0][1])); 62 | BOOST_CHECK_EQUAL(2ul, n.index_of(extents[0][0][2])); 63 | BOOST_CHECK_EQUAL(4ul, n.index_of(extents[0][1][0])); 64 | } 65 | 66 | BOOST_AUTO_TEST_CASE( ndarray_data_access ) { 67 | ndarray m(extents[2][3][4]); 68 | ndarray n(extents[2][3][4]); 69 | 70 | ndarray o(extents[2][3][4]); 71 | ndarray p(extents[2][3][4]); 72 | for (int i = 0; i < 2; ++i) { 73 | for (int j = 0; j < 3; ++j) { 74 | for (int k = 0; k < 4; ++k) { 75 | m(i, j, k) = i * j + k; 76 | n(i, j, k) = i * j + k; 77 | 78 | o(i, j, k) = i * j + k; 79 | p(i, j, k) = i * j + k; 80 | } 81 | } 82 | } 83 | BOOST_CHECK_EQUAL(1*2+3, m(1,2,3)); 84 | BOOST_CHECK_EQUAL(1*2+3, n(1,2,3)); 85 | BOOST_CHECK_EQUAL(1*2+3, o(1,2,3)); 86 | BOOST_CHECK_EQUAL(1*2+3, p(1,2,3)); 87 | 88 | BOOST_CHECK_EQUAL(1*2+3-1, --p(1,2,3)); 89 | BOOST_CHECK_EQUAL(1*2+3, p(1,2,3)+=1); 90 | } 91 | 92 | BOOST_AUTO_TEST_CASE( ndarray_assignment ) { 93 | ndarray m(extents[2][3][4]); 94 | ndarray n(extents[2][3][4]); 95 | 96 | ndarray o(extents[2][3][4]); 97 | 98 | for (int i = 0; i < 2 * 3 * 4; ++i) 99 | m[i] = i; 100 | n = m; 101 | o = m; 102 | 103 | ndarray s(n); 104 | ndarray t(n); 105 | 106 | for (int i = 0; i < 2 * 3 * 4; ++i) { 107 | BOOST_CHECK_EQUAL(m[i], i); 108 | BOOST_CHECK_EQUAL(n[i], i); 109 | BOOST_CHECK_EQUAL(o[i], i); 110 | BOOST_CHECK_EQUAL(s[i], i); 111 | BOOST_CHECK_EQUAL(t[i], i); 112 | } 113 | 114 | } 115 | 116 | BOOST_AUTO_TEST_CASE( ndarray_zero_copy_assignment ) { 117 | ndarray x(extents[4][5][6]); 118 | for (int i = 0; i < 4 * 5 * 6; i++) { 119 | x[i] = i; 120 | } 121 | 122 | ndarray y = x; 123 | 124 | for (int i = 0; i < 4 * 5 * 6; i++) { 125 | BOOST_CHECK_EQUAL(x[i], y[i]); 126 | y[i] = i + 1; // change the copy results in change of original! 
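            // note: ndarray::operator= for the same template type only copies the
            // shared memory pointer plus shape/stride info (see ndarray.hpp), so x
            // and y alias one buffer and the write above is visible through x too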
127 | BOOST_CHECK_EQUAL(x[i], y[i]); 128 | } 129 | } 130 | 131 | BOOST_AUTO_TEST_CASE( ndarray_copy ) { 132 | boost::shared_ptr allocator(new pooled_cuda_allocator("ndarray_copy")); 133 | ndarray x(extents[4][5][6], allocator); 134 | for (int i = 0; i < 4 * 5 * 6; i++) { 135 | x[i] = i; 136 | } 137 | 138 | ndarray y = x.copy(); 139 | BOOST_CHECK_NE(x.ptr(), y.ptr()); 140 | 141 | for (int i = 0; i < 4; i++) { 142 | BOOST_CHECK_NE(x[indices[i][index_range()][index_range()]].ptr(), 143 | y[indices[i][index_range()][index_range()]].ptr()); 144 | } 145 | 146 | ndarray y2(x.copy()); 147 | BOOST_CHECK_NE(x.ptr(), y2.ptr()); 148 | 149 | for (int i = 0; i < 4; i++) { 150 | BOOST_CHECK_NE(x[indices[i][index_range()][index_range()]].ptr(), 151 | y2[indices[i][index_range()][index_range()]].ptr()); 152 | } 153 | 154 | for (int i = 0; i < 4 * 5 * 6; i++) { 155 | BOOST_CHECK_EQUAL(x[i], y[i]); 156 | y[i]++; // change must not change original! 157 | BOOST_CHECK_NE(x[i], y[i]); 158 | } 159 | } 160 | 161 | BOOST_AUTO_TEST_CASE( ndarray_copy_assignment ) 162 | { 163 | ndarray x(extents[10][10][10]); 164 | ndarray y(extents[1][10][10]); 165 | 166 | // avoid fill() dependency 167 | for (int i = 0; i < x.size(); i++) { 168 | x.ptr()[i] = 0; 169 | } 170 | for (int i = 0; i < y.size(); i++) { 171 | y.ptr()[i] = 1; 172 | } 173 | 174 | // assign to a view should copy the array 175 | x[indices[1][index_range(0, 10)][index_range(0, 10)]] = y[indices[0][index_range()][index_range()]]; 176 | 177 | // x[0, ...] must remain unchanged 178 | for (int i = 0; i < 10 * 10; i++) { 179 | BOOST_REQUIRE_EQUAL(x.ptr()[i], 0); 180 | } 181 | 182 | // x[1, ...] must have changed to 1 183 | for (int i = 0; i < 10 * 10; i++) { 184 | BOOST_REQUIRE_EQUAL(x.ptr()[100 + i], 1); 185 | } 186 | 187 | // changing y must not influence x 188 | for (int i = 0; i < y.size(); i++) { 189 | y.ptr()[i] = 2; 190 | } 191 | 192 | for (int i = 0; i < 10 * 10; i++) { 193 | BOOST_REQUIRE_EQUAL(x.ptr()[100 + i], 1); 194 | } 195 | } 196 | 197 | BOOST_AUTO_TEST_CASE( ndarray_out_of_scope_view ) { 198 | // sub-ndarray views should persist when original ndarray falls out of scope 199 | ndarray y; 200 | { 201 | ndarray x(extents[4][5][6]); 202 | for (int i = 0; i < 4; ++i) 203 | for (int j = 0; j < 5; ++j) 204 | for (int k = 0; k < 6; ++k) 205 | x(i, j, k) = i + j + k; 206 | y = x[indices[index_range(1, 3)][index_range()][index_range()]]; 207 | } 208 | for (int i = 1; i < 3; ++i) 209 | for (int j = 0; j < 5; ++j) 210 | for (int k = 0; k < 6; ++k) { 211 | BOOST_CHECK_EQUAL(y(i-1,j,k), i+j+k); 212 | } 213 | } 214 | 215 | BOOST_AUTO_TEST_CASE( ndarray_slice1col ) { 216 | ndarray y; 217 | ndarray x(extents[4][5][6]); 218 | 219 | for (int i = 0; i < 4; ++i) { 220 | for (int j = 0; j < 5; ++j) { 221 | for (int k = 0; k < 6; ++k) { 222 | x(i, j, k) = i + j + k; 223 | } 224 | } 225 | } 226 | 227 | // accessing strided memory 228 | y = x[indices[index_range(0,1)][index_range()][index_range()]]; 229 | for (int i = 0; i < 1; ++i) { 230 | for (int j = 0; j < 5; ++j) { 231 | for (int k = 0; k < 6; ++k) { 232 | BOOST_CHECK_EQUAL(y(i,j,k), i+j+k); 233 | } 234 | } 235 | } 236 | x[indices[index_range(0,1)][index_range()][index_range()]] = y.copy(); 237 | } 238 | 239 | BOOST_AUTO_TEST_CASE( ndarray_slice1row ) { 240 | ndarray y; 241 | ndarray x(extents[4][5][6]); 242 | 243 | for (int i = 0; i < 4; ++i) { 244 | for (int j = 0; j < 5; ++j) { 245 | for (int k = 0; k < 6; ++k) { 246 | x(i, j, k) = i + j + k; 247 | } 248 | } 249 | } 250 | 251 | // accessing strided memory 252 | y = 
x[indices[index_range()][index_range()][index_range(0,1)]]; 253 | for (int i = 0; i < 4; ++i) { 254 | for (int j = 0; j < 5; ++j) { 255 | for (int k = 0; k < 1; ++k) { 256 | BOOST_CHECK_EQUAL(y(i,j,k), i+j+k); 257 | } 258 | } 259 | } 260 | x[indices[index_range()][index_range()][index_range(0,1)]] = y.copy(); 261 | } 262 | 263 | BOOST_AUTO_TEST_CASE( ndarray_memcpy2d ) { 264 | ndarray y; 265 | ndarray x(extents[4][5][6]); 266 | 267 | for (int i = 0; i < 4; ++i) { 268 | for (int j = 0; j < 5; ++j) { 269 | for (int k = 0; k < 6; ++k) { 270 | x(i, j, k) = i + j + k; 271 | } 272 | } 273 | } 274 | 275 | // accessing strided memory 276 | y = x[indices[index_range()][index_range()][index_range(0, 1)]]; 277 | for (int i = 0; i < 4; ++i) { 278 | for (int j = 0; j < 5; ++j) { 279 | for (int k = 0; k < 1; ++k) { 280 | BOOST_CHECK_EQUAL(y(i,j,k), i+j+k); 281 | } 282 | } 283 | } 284 | 285 | // copying strided memory 286 | y = y.copy(); // y in R^(4,5,1) 287 | for (size_t k = 0; k < y.size(); k++) { // avoid fill() dependency in this file (speed up compiling...) 288 | y[k] = 0.f; 289 | } 290 | 291 | ndarray_view m(x, indices[index_range()][index_range()][index_range(0, 1)]); 292 | m = y; 293 | for (int i = 0; i < 4; ++i) { 294 | for (int j = 0; j < 5; ++j) { 295 | for (int k = 0; k < 1; ++k) { 296 | if (k != 0) { 297 | BOOST_CHECK_EQUAL(x(i,j,k), i+j+k); 298 | } else { 299 | BOOST_CHECK_EQUAL(x(i,j,k), 0.f); 300 | } 301 | } 302 | } 303 | } 304 | } 305 | 306 | template 307 | void test_resize() { 308 | 309 | // resize with default allocator 310 | ndarray a(100, 100); 311 | V* p0 = a.ptr(); 312 | a.resize(100, 100); 313 | BOOST_CHECK_EQUAL(p0, a.ptr()); 314 | // no size change. pointer must not change 315 | 316 | boost::shared_ptr allocator(new pooled_cuda_allocator("test_resize")); 317 | { 318 | ndarray a(200, 300, allocator); 319 | 320 | BOOST_CHECK_EQUAL(a.shape(0), 200); 321 | BOOST_CHECK_EQUAL(a.shape(1), 300); 322 | 323 | BOOST_CHECK_EQUAL(allocator->pool_count(M()), 1); 324 | BOOST_CHECK_EQUAL(allocator->pool_free_count(M()), 0); 325 | BOOST_CHECK_EQUAL(allocator->pool_size(M()), 200 * 300 * sizeof(V)); 326 | 327 | a.resize(100, 100); 328 | 329 | // make sure the memory is freed before new memory is allocated 330 | 331 | BOOST_CHECK_EQUAL(allocator->pool_count(M()), 1); 332 | BOOST_CHECK_EQUAL(allocator->pool_free_count(M()), 0); 333 | BOOST_CHECK_EQUAL(allocator->pool_size(M()), 200 * 300 * sizeof(V)); 334 | 335 | BOOST_CHECK_EQUAL(a.shape(0), 100); 336 | BOOST_CHECK_EQUAL(a.shape(1), 100); 337 | } 338 | 339 | BOOST_CHECK_EQUAL(allocator->pool_count(M()), 1); 340 | BOOST_CHECK_EQUAL(allocator->pool_free_count(M()), 1); 341 | } 342 | 343 | template 344 | void test_pushpull_2d() { 345 | static const int h = 123, w = 247; 346 | ndarray t1; 347 | ndarray t2(extents[h][w]); 348 | 349 | for (int i = 0; i < h; i++) 350 | for (int j = 0; j < w; j++) { 351 | t2(i, j) = (float) drand48(); 352 | } 353 | t1 = t2; 354 | BOOST_CHECK(equal_shape(t1,t2)); 355 | for (int i = 0; i < h; i++) { 356 | for (int j = 0; j < w; j++) { 357 | BOOST_CHECK_EQUAL( (V) t1(i,j), (V) t2(i,j)); 358 | } 359 | } 360 | } 361 | 362 | template 363 | void test_pushpull_3d() { 364 | static const int d = 3, h = 123, w = 247; 365 | ndarray t1; 366 | ndarray t2(extents[d][h][w]); 367 | 368 | // *************************************** 369 | // assignment 2D --> 1D 370 | // *************************************** 371 | for (int k = 0; k < d; k++) 372 | for (int i = 0; i < h; i++) 373 | for (int j = 0; j < w; j++) { 374 | t2(k, i, j) = (float) 
template
void test_pushpull_3d() {
    static const int d = 3, h = 123, w = 247;
    ndarray t1;
    ndarray t2(extents[d][h][w]);

    // ***************************************
    // assignment 3D --> 3D
    // ***************************************
    for (int k = 0; k < d; k++)
        for (int i = 0; i < h; i++)
            for (int j = 0; j < w; j++) {
                t2(k, i, j) = (float) drand48();
            }
    t1 = t2;
    BOOST_CHECK(equal_shape(t1,t2));
    for (int k = 0; k < d; ++k) {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                BOOST_CHECK_EQUAL( (V) t1(k,i,j), (V) t2(k,i,j));
            }
        }
    }
}

template
void test_lowdim_views() {
    static const int d = 3, h = 123, w = 247;
    ndarray t1d(extents[d][h][w]);
    ndarray t2d(extents[d][h][w]);

    for (int k = 0; k < d; k++) {
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                t2d(k, i, j) = (float) drand48();
            }
        }
    }

    // ***************************************
    // 2D view on 3D ndarray
    // ***************************************
    for (int k = 0; k < d; ++k) {
        ndarray_view view(indices[k][index_range(0, h)][index_range(0, w)], t2d);
        BOOST_CHECK_EQUAL( view.ndim(), 2);
        BOOST_CHECK_EQUAL( view.shape(0), h);
        BOOST_CHECK_EQUAL( view.shape(1), w);
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                BOOST_CHECK_EQUAL( (V) view(i,j), (V) t2d(k,i,j));
            }
        }

        // alternative specification of the same ranges
        ndarray_view view_(indices[k][index_range()][index_range() < cuv::index(w)], t2d);
        BOOST_CHECK_EQUAL( view_.ndim(), 2);
        BOOST_CHECK_EQUAL( view_.shape(0), h);
        BOOST_CHECK_EQUAL( view_.shape(1), w);
        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                BOOST_CHECK_EQUAL( (V) view_(i,j), (V) t2d(k,i,j));
            }
        }
    }

    // ***************************************
    // 1D view on 3D ndarray
    // ***************************************
    for (int k = 0; k < d; ++k) {
        for (int i = 0; i < h; ++i) {
            ndarray_view view(indices[k][i][index_range(0, w)], t2d);
            for (int j = 0; j < w; j++) {
                BOOST_REQUIRE_EQUAL( (V) view(j), (V) t2d(k,i,j));
            }
        }
    }
}

BOOST_AUTO_TEST_CASE( lowdim_views ) {
    test_lowdim_views();
    test_lowdim_views();
}

BOOST_AUTO_TEST_CASE( ndarray_wrapping ) {
    {
        std::vector v_orig(10, 0.f);
        ndarray v(extents[10], &v_orig[0]);
        ndarray w(extents[10]);
        for (unsigned int i = 0; i < 10; i++)
            w[i] = 1.f;

        // overwrite the wrapped memory (needs copying)
        v = w;
    }
    {
        std::vector v_orig(10, 0.f);
        ndarray v(extents[10], &v_orig[0]);
        ndarray w(extents[10]);
        for (unsigned int i = 0; i < 10; i++)
            w[i] = 1.f;

        // overwrite the wrapped memory (needs copying)
        v = w;
    }
}

BOOST_AUTO_TEST_CASE( pushpull_nd ) {
    // same memory space, linear container
    test_pushpull_2d();
    test_pushpull_2d();

    // same memory space, 2d container
    test_pushpull_2d();
    test_pushpull_2d();

    // same memory space, 2d vs. 1d
    test_pushpull_2d();
    test_pushpull_2d();
    test_pushpull_2d();
    test_pushpull_2d();
}
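// A minimal sketch of the slicing pattern used in test_lowdim_views above:
// fixing the leading index of a 3D array yields a 2D view that aliases the
// original memory. It assumes an ndarray/ndarray_view<value_type, memory_space>
// signature and host_memory_space from cuv/tags.hpp; the real template
// parameters may differ.
static void lowdim_view_sketch() {
    ndarray<float, host_memory_space> vol(extents[3][4][5]);
    // 2D view on the slice k == 1; index_range() keeps a whole dimension
    ndarray_view<float, host_memory_space> plane(indices[1][index_range()][index_range()], vol);
    // plane.ndim() == 2, plane.shape(0) == 4, plane.shape(1) == 5
    plane(0, 0) = 42.f;                     // writes through to vol(1, 0, 0)
}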
BOOST_AUTO_TEST_CASE( ndarray_resize ) {
    test_resize();
    test_resize();
}

BOOST_AUTO_TEST_CASE( create_lm )
{
    unsigned int N = 54;
    {
        linear_memory v(N);
        BOOST_CHECK_EQUAL(v.size(), N);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
    {
        linear_memory v(N);
        BOOST_CHECK_EQUAL(v.size(), N);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
}

BOOST_AUTO_TEST_CASE( readwrite_lm )
{
    unsigned int N = 54;
    {
        linear_memory v(N);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }
    {
        linear_memory v(N);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }
}

BOOST_AUTO_TEST_CASE( create_pm )
{
    unsigned int N = 54, M = 97;
    {
        pitched_memory v(N, M);
        BOOST_CHECK_EQUAL(v.size(), N*M);
        BOOST_CHECK_EQUAL(v.rows(), N);
        BOOST_CHECK_EQUAL(v.cols(), M);
        BOOST_CHECK_GE(v.pitch(), M);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
    {
        pitched_memory v(N, M);
        BOOST_CHECK_GE(v.size(), N*M);
        BOOST_CHECK_EQUAL(v.rows(), N);
        BOOST_CHECK_EQUAL(v.cols(), M);
        BOOST_CHECK_GE(v.pitch(), M);
        BOOST_CHECK_NE(v.ptr(), (float*)NULL);
        v.dealloc();
        BOOST_CHECK_EQUAL(v.ptr(), (float*)NULL);
    }
}

BOOST_AUTO_TEST_CASE( readwrite_pm )
{
    unsigned int N = 54, M = 97;
    {
        pitched_memory v(N, M);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }
    {
        pitched_memory v(N, M);
        v[1] = 0;
        BOOST_CHECK_EQUAL(v[1], 0);
        v[1] = 1;
        BOOST_CHECK_EQUAL(v[1], 1);
    }

    {
        pitched_memory v(N, M);
        v(3, 4) = 0;
        BOOST_CHECK_EQUAL(v(3,4), 0);
        v(3, 4) = 1;
        BOOST_CHECK_EQUAL(v(3,4), 1);
    }
    {
        pitched_memory v(N, M);
        v(3, 4) = 0;
        BOOST_CHECK_EQUAL(v(3,4), 0);
        v(3, 4) = 1;
        BOOST_CHECK_EQUAL(v(3,4), 1);
    }
}
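// The pitched-memory checks above rely on each row being padded to the pitch:
// pitch() >= cols(), and element (r, c) lives at linear offset r * pitch + c.
// The helper below only illustrates that arithmetic; it is not part of the
// pitched_memory interface.
static size_t pitched_offset_sketch(size_t r, size_t c, size_t pitch) {
    return r * pitch + c;   // row-padded layout, as checked by BOOST_CHECK_GE(v.pitch(), M)
}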
/**
 * @test
 * @brief create dense matrix.
 */
BOOST_AUTO_TEST_CASE( create_linear )
{
    unsigned int N = 16, M = 32;
    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_EQUAL(m.stride(1), N);
    }

    {
        ndarray m(extents[N][M]);
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_EQUAL(m.stride(1), N);
    }
}

/**
 * @test
 * @brief create pitched matrix.
 */
BOOST_AUTO_TEST_CASE( create_pitched )
{
    unsigned int N = 16, M = 32;
    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_GE(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_GE(m.stride(0), M);
        BOOST_CHECK_EQUAL(m.stride(1), 1);
    }

    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_GE(m.stride(1), N);
    }

    {
        ndarray m(extents[N][M], pitched_memory_tag());
        BOOST_CHECK_EQUAL(m.size(), N*M);
        BOOST_CHECK_EQUAL(m.shape(0), N);
        BOOST_CHECK_EQUAL(m.shape(1), M);
        BOOST_CHECK_EQUAL(m.stride(0), 1);
        BOOST_CHECK_GE(m.stride(1), N);
    }
}
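// The two tests above pin down the stride convention: a dense row-major N x M
// array has stride(0) == M and stride(1) == 1, a column-major one has
// stride(0) == 1 and stride(1) == N, and with pitched_memory_tag the larger
// stride is only required to be >= the corresponding extent. In all cases the
// linear offset of element (i, j) is i * stride(0) + j * stride(1). The helper
// below is illustrative only and not part of the ndarray interface.
static size_t strided_offset_sketch(size_t i, size_t j, size_t stride0, size_t stride1) {
    return i * stride0 + j * stride1;
}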
/**
 * @test
 * @brief setting and getting for device and host vectors.
 */
BOOST_AUTO_TEST_CASE( set_vector_elements )
{
    static const unsigned int N = 145;
    static const unsigned int M = 97;
    ndarray v(extents[N][M]); // linear memory
    ndarray w(extents[N][M], pitched_memory_tag()); // pitched memory
    for (unsigned int i = 0; i < N; i++) {
        v[i] = (float) i / N;
        w[i] = (float) i / N;
    }
    //convert(w,v);
    for (unsigned int i = 0; i < N; i++) {
        BOOST_CHECK_EQUAL(v[i], (float) i/N);
        BOOST_CHECK_EQUAL(w[i], (float) i/N);
    }
}

BOOST_AUTO_TEST_CASE( assign_func )
{
    static const unsigned int N = 145;
    static const unsigned int M = 97;
    ndarray v(extents[N][M]);
    ndarray w(extents[N][M]);
    v[5] = 5;
    w[5] = 0;
    w.assign(v);
    // assign() copies the contents; the two arrays keep separate storage
    BOOST_CHECK_NE(w.ptr(), v.ptr());
    BOOST_CHECK_EQUAL(v[5], 5);
    BOOST_CHECK_EQUAL(w[5], 5);
}

BOOST_AUTO_TEST_CASE( stream_values )
{
    ndarray v(3, 2);
    for (size_t i = 0; i < v.size(); i++) {
        v[i] = i;
    }
    std::ostringstream o;
    for (size_t i = 0; i < v.size(); i++) {
        o << v[i];
    }
    BOOST_CHECK_EQUAL(o.str(), "012345");
}

BOOST_AUTO_TEST_SUITE_END()
--------------------------------------------------------------------------------