├── .gitignore
├── LICENSE
├── README.md
├── CMakeLists.txt
├── tests
│   ├── cnmem_kernel_test.cu
│   └── cnmem_tests.cpp
├── include
│   └── cnmem.h
└── src
    └── cnmem.cpp

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore the build directory created to build the code
2 | build
3 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
2 | 
3 | Redistribution and use in source and binary forms, with or without
4 | modification, are permitted provided that the following conditions
5 | are met:
6 |  * Redistributions of source code must retain the above copyright
7 |    notice, this list of conditions and the following disclaimer.
8 |  * Redistributions in binary form must reproduce the above copyright
9 |    notice, this list of conditions and the following disclaimer in the
10 |    documentation and/or other materials provided with the distribution.
11 |  * Neither the name of NVIDIA CORPORATION nor the names of its
12 |    contributors may be used to endorse or promote products derived
13 |    from this software without specific prior written permission.
14 | 
15 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
16 | EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
18 | PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
19 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
20 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
21 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
22 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
23 | OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 
27 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CNMeM Library
2 | 
3 | Simple library to help Deep Learning frameworks manage CUDA memory.
4 | 
5 | CNMeM is not intended to be a general purpose memory management library. It was designed as a simple
6 | tool for applications that work on a limited number of large memory buffers.
7 | 
8 | CNMeM is mostly developed on Ubuntu Linux. It should support other operating systems as well. If you
9 | encounter an issue with the library on another operating system, please submit a bug report (or a fix).
10 | 
11 | # Prerequisites
12 | 
13 | CNMeM relies on the CUDA toolkit. It uses the C++ STL and the Pthread library on Linux. On Windows, it uses
14 | the native Win32 threading library. The build system uses CMake. The unit tests are written using
15 | Google Test (building them is optional).
16 | 
17 | ## CUDA
18 | 
19 | The CUDA toolkit is required. We recommend using CUDA >= 7.0, though earlier versions should work.
20 | * Download from the [CUDA website](https://developer.nvidia.com/cuda-downloads)
21 | * Follow the installation instructions
22 | * Don't forget to set your path. For example:
23 |   * `CUDA_HOME=/usr/local/cuda`
24 |   * `LD_LIBRARY_PATH=$CUDA_HOME/lib64:$LD_LIBRARY_PATH`
25 | 
26 | # Build CNMeM
27 | 
28 | ## Grab the source
29 | 
30 |     % cd $HOME
31 |     % git clone https://github.com/NVIDIA/cnmem.git cnmem
32 | 
33 | ## Build CNMeM without the unit tests
34 | 
35 |     % cd cnmem
36 |     % mkdir build
37 |     % cd build
38 |     % cmake ..
39 |     % make
40 | 
41 | ## Build CNMeM with the unit tests
42 | 
43 | To build the tests, you need to add an extra option to the cmake command.
44 | 
45 |     % cd cnmem
46 |     % mkdir build
47 |     % cd build
48 |     % cmake -DWITH_TESTS=True ..
49 |     % make
50 | 
51 | ## Link with CNMeM
52 | 
53 | The source folder contains the header file 'include/cnmem.h' and the build directory contains the
54 | library 'libcnmem.so', 'cnmem.lib/cnmem.dll' or 'libcnmem.dylib', depending on your operating
55 | system.
56 | 
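As a quick sanity check that everything links, a minimal program along these lines
exercises the library (a sketch only: error handling is omitted, and the file name and
the 1 MB pool size are arbitrary example values):

    #include <cnmem.h>
    #include <cstring>

    int main() {
        cnmemDevice_t device;
        std::memset(&device, 0, sizeof(device)); // Zero all fields, including the stream lists.
        device.device = 0;                       // Use GPU 0.
        device.size = 1024 * 1024;               // Reserve 1 MB up front.
        cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);

        void *ptr;
        cnmemMalloc(&ptr, 1024, NULL);           // Allocate 1 KB on the NULL stream.
        cnmemFree(ptr, NULL);
        return cnmemFinalize() == CNMEM_STATUS_SUCCESS ? 0 : 1;
    }

Compile it with something like `g++ example.cpp -I$HOME/cnmem/include -L$HOME/cnmem/build -lcnmem -L$CUDA_HOME/lib64 -lcudart`.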
--------------------------------------------------------------------------------
/CMakeLists.txt:
--------------------------------------------------------------------------------
1 | # CMakeLists to build the cnmem library.
2 | cmake_minimum_required(VERSION 2.8.8)
3 | project(cnmem)
4 | 
5 | # We need CUDA to build this library.
6 | find_package(CUDA QUIET REQUIRED)
7 | include_directories(${CUDA_INCLUDE_DIRS})
8 | 
9 | # Rules to build the cnmem library.
10 | include_directories(include)
11 | add_definitions(-DCNMEM_DLLEXPORT)
12 | add_library(cnmem SHARED src/cnmem.cpp)
13 | set_target_properties(cnmem PROPERTIES VERSION 1.0.0 SOVERSION 1)
14 | target_link_libraries(cnmem LINK_PUBLIC ${CUDA_LIBRARIES})
15 | install(TARGETS cnmem RUNTIME DESTINATION bin ARCHIVE DESTINATION lib LIBRARY DESTINATION lib)
16 | install(FILES include/cnmem.h DESTINATION include)
17 | 
18 | # Add the tests.
19 | if(WITH_TESTS)
20 | 
21 |     # Get Google tests.
22 |     find_package(GTest QUIET REQUIRED)
23 |     include_directories(${GTEST_INCLUDE_DIRS})
24 | 
25 |     # Build the executable.
26 |     add_executable(cnmem_tests tests/cnmem_tests.cpp)
27 |     if(MSVC)
28 |         if(MSVC_VERSION GREATER 1700) # Visual Studio 11 or newer.
29 |             add_definitions(-DUSE_CPP_11)
30 |         endif(MSVC_VERSION GREATER 1700)
31 |     endif(MSVC)
32 |     if(CMAKE_COMPILER_IS_GNUCC)
33 |         add_definitions(-std=c++11 -DUSE_CPP_11)
34 |     endif(CMAKE_COMPILER_IS_GNUCC)
35 |     target_link_libraries(cnmem_tests LINK_PUBLIC cnmem ${CUDA_LIBRARIES} ${GTEST_LIBRARIES} -lpthread)
36 |     install(TARGETS cnmem_tests RUNTIME DESTINATION bin)
37 | 
38 |     # Tests that launch kernels to force reading and writing to memory.
39 |     cuda_add_executable(cnmem_kernel_tests tests/cnmem_kernel_test.cu)
40 |     target_link_libraries(cnmem_kernel_tests cnmem ${CUDA_LIBRARIES} ${GTEST_LIBRARIES} -lpthread)
41 |     install(TARGETS cnmem_kernel_tests RUNTIME DESTINATION bin)
42 | 
43 |     # On Windows, we copy the Google test DLL to the bin folder.
44 |     if(MSVC)
45 |         get_filename_component(gtest_dll_path ${GTEST_LIBRARIES} DIRECTORY)
46 |         install(FILES ${gtest_dll_path}/gtest.dll DESTINATION bin)
47 |     endif(MSVC)
48 | 
49 | endif(WITH_TESTS)
50 | 
51 | 
--------------------------------------------------------------------------------
/tests/cnmem_kernel_test.cu:
--------------------------------------------------------------------------------
1 | #include <cnmem.h>
2 | #include <cuda_runtime.h>
3 | #include <gtest/gtest.h>
4 | #include <cstring>
5 | 
6 | static std::size_t getFreeMemory() {
7 |     cudaFree(0); // Make sure a CUDA context exists before querying the memory info.
8 |     std::size_t freeMem, totalMem;
9 |     cudaMemGetInfo(&freeMem, &totalMem);
10 |     return freeMem;
11 | }
12 | 
13 | class CnmemTest : public ::testing::Test {
14 |     /// We determine the amount of free memory.
15 |     std::size_t mFreeMem;
16 | 
17 | protected:
18 |     /// Do we test memory leaks.
19 |     bool mTestLeaks;
20 |     /// Do we skip finalization.
21 |     bool mFinalize;
22 | 
23 | public:
24 |     /// Ctor.
25 |     CnmemTest() : mFreeMem(getFreeMemory()), mTestLeaks(true), mFinalize(true) {}
26 |     /// Tear down the test.
27 |     void TearDown();
28 | };
29 | 
30 | void CnmemTest::TearDown() {
31 |     if( mFinalize ) {
32 |         ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFinalize());
33 |     }
34 |     if( mTestLeaks ) {
35 |         ASSERT_EQ(mFreeMem, getFreeMemory());
36 |     }
37 |     cudaDeviceReset();
38 | }
39 | 
40 | ///////////////////////////////////////////////////////////////////////////////////////////////////
41 | 
42 | template <typename T>
43 | __global__ void tinyKernel(T* d_a, int numElem)
44 | {
45 |     int ind = (blockIdx.x * blockDim.x) + threadIdx.x;
46 |     if(ind >= numElem)
47 |         return;
48 |     d_a[ind] += 1;
49 | }
50 | 
51 | 
52 | struct _24ByteStruct
53 | {
54 |     double a;
55 |     double c;
56 |     double b;
57 | 
58 |     __host__ __device__
59 |     void operator +=(int other)
60 |     {
61 |         a += other;
62 |         b += other;
63 |         c += other;
64 |     }
65 | 
66 |     __host__ __device__
67 |     void operator =(int other)
68 |     {
69 |         a = other;
70 |         b = other;
71 |         c = other;
72 |     }
73 | };
74 | 
75 | template <typename T, int expectedSize>
76 | void testAlign()
77 | {
78 |     const int numElem = 200;
79 |     const int size = numElem*sizeof(T);
80 |     T* cpuData = new T[numElem];
81 |     for(int i = 0; i < numElem; i++)
82 |         cpuData[i] = i;
83 | 
84 |     ASSERT_EQ(expectedSize, sizeof(T));
85 | 
86 |     cudaStream_t streams[2];
87 |     ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0]));
88 |     ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1]));
89 | 
90 |     cnmemDevice_t device;
91 |     memset(&device, 0, sizeof(device));
92 |     device.numStreams = 2;
93 |     device.streams = streams;
94 |     // Intentionally misalign; the sizes could equally come from a calculation based on the GPU size.
95 |     size_t streamSizes[] = { size*2 + sizeof(T) - 1, size*2 + sizeof(T) - 1 };
96 |     device.streamSizes = streamSizes;
97 | 
98 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT));
99 |     T *ptr0, *ptr1;
100 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc((void**)&ptr0, size, streams[0]));
101 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc((void**)&ptr1, size, streams[1]));
102 | 
103 |     ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(ptr0, cpuData, size, cudaMemcpyHostToDevice, streams[0]));
104 |     ASSERT_EQ(cudaSuccess, cudaMemcpyAsync(ptr1, cpuData, size, cudaMemcpyHostToDevice, streams[1]));
105 | 
106 |     // Force reads and writes from ptr0 and ptr1.
107 |     tinyKernel<<<(numElem + 255)/256, 256, 0, streams[0]>>>(ptr0, numElem);
108 |     tinyKernel<<<(numElem + 255)/256, 256, 0, streams[1]>>>(ptr1, numElem);
109 | 
110 |     ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(streams[0]));
111 |     ASSERT_EQ(cudaSuccess, cudaStreamSynchronize(streams[1]));
112 | 
113 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[1]));
114 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0]));
115 |     delete[] cpuData;
116 |     ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[0]));
117 |     ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[1]));
118 | 
119 |     ASSERT_EQ(cudaSuccess, cudaDeviceSynchronize());
120 | }
121 | 
122 | TEST_F(CnmemTest, alignment8) {
123 |     testAlign<char, 1>();
124 | }
125 | 
126 | TEST_F(CnmemTest, alignment16) {
127 |     testAlign<short, 2>();
128 | }
129 | 
130 | TEST_F(CnmemTest, alignment32) {
131 |     testAlign<int, 4>();
132 | }
133 | 
134 | TEST_F(CnmemTest, alignment64) {
135 |     testAlign<double, 8>();
136 | }
137 | 
138 | TEST_F(CnmemTest, alignment192) {
139 |     testAlign<_24ByteStruct, 24>();
140 | }
141 | 
142 | ///////////////////////////////////////////////////////////////////////////////////////////////////
143 | 
144 | 
int main(int argc, char **argv) { 145 | ::testing::InitGoogleTest(&argc, argv); 146 | return RUN_ALL_TESTS(); 147 | } 148 | 149 | /////////////////////////////////////////////////////////////////////////////////////////////////// 150 | 151 | -------------------------------------------------------------------------------- /include/cnmem.h: -------------------------------------------------------------------------------- 1 | /* ********************************************************************** 2 | * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved. 3 | * 4 | * Redistribution and use in source and binary forms, with or without 5 | * modification, are permitted provided that the following conditions 6 | * are met: 7 | * * Redistributions of source code must retain the above copyright 8 | * notice, this list of conditions and the following disclaimer. 9 | * * Redistributions in binary form must reproduce the above copyright 10 | * notice, this list of conditions and the following disclaimer in the 11 | * documentation and/or other materials provided with the distribution. 12 | * * Neither the name of NVIDIA CORPORATION nor the names of its 13 | * contributors may be used to endorse or promote products derived 14 | * from this software without specific prior written permission. 15 | * 16 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 17 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 19 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 20 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 21 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 22 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 23 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 24 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | * ********************************************************************** */ 28 | #pragma once 29 | 30 | #ifdef __cplusplus 31 | #include "cstdio" 32 | #else 33 | #include "stdio.h" 34 | #endif 35 | #include "cuda_runtime_api.h" 36 | 37 | #if defined(_MSC_VER) || defined(WIN32) 38 | #ifdef CNMEM_DLLEXPORT 39 | #define CNMEM_API __declspec(dllexport) 40 | #else 41 | #define CNMEM_API __declspec(dllimport) 42 | #endif 43 | #else 44 | #ifdef CNMEM_DLLEXPORT 45 | #define CNMEM_API __attribute__((visibility ("default"))) 46 | #else 47 | #define CNMEM_API 48 | #endif 49 | #endif 50 | 51 | #define CNMEM_VERSION 100 // It corresponds to 1.0.0 52 | 53 | #ifdef __cplusplus 54 | extern "C" { 55 | #endif 56 | 57 | /* ********************************************************************************************* */ 58 | 59 | typedef enum 60 | { 61 | CNMEM_STATUS_SUCCESS = 0, 62 | CNMEM_STATUS_CUDA_ERROR, 63 | CNMEM_STATUS_INVALID_ARGUMENT, 64 | CNMEM_STATUS_NOT_INITIALIZED, 65 | CNMEM_STATUS_OUT_OF_MEMORY, 66 | CNMEM_STATUS_UNKNOWN_ERROR 67 | } cnmemStatus_t; 68 | 69 | /* ********************************************************************************************* */ 70 | 71 | typedef enum 72 | { 73 | CNMEM_FLAGS_DEFAULT = 0, /// Default flags. 74 | CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption. 
75 |     CNMEM_FLAGS_CANNOT_STEAL = 2,  /// Prevent the manager from stealing memory.
76 |     CNMEM_FLAGS_MANAGED = 4,       /// Use cudaMallocManaged for the allocator.
77 | } cnmemManagerFlags_t;
78 | 
79 | /* ********************************************************************************************* */
80 | 
81 | typedef struct cnmemDevice_t_
82 | {
83 |     /** The device number. */
84 |     int device;
85 |     /** The size to allocate for that device. If 0, the implementation chooses the size. */
86 |     size_t size;
87 |     /** The number of named streams associated with the device. The NULL stream is not counted. */
88 |     int numStreams;
89 |     /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
90 |     cudaStream_t *streams;
91 |     /** The size reserved for each stream. It can be 0. */
92 |     size_t *streamSizes;
93 | 
94 | } cnmemDevice_t;
95 | 
96 | /**
97 |  * \brief Initialize the library and allocate memory on the listed devices.
98 |  *
99 |  * For each device, an internal memory manager is created and the specified amount of memory is
100 |  * allocated (it is the size defined in device[i].size). For each named stream, an additional
101 |  * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
102 |  * manager for the device and a list of children, one for each named stream.
103 |  *
104 |  * This function must be called before any other function in the library. It has to be called
105 |  * by a single thread since it is not thread-safe.
106 |  *
107 |  * \return
108 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
109 |  * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
110 |  * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
111 |  * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
112 |  */
113 | cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
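/**
 * A minimal initialization sketch (the values below are illustrative, not
 * requirements; error handling is omitted):
 *
 * \code
 * cnmemDevice_t device;
 * memset(&device, 0, sizeof(device)); // Zero all fields, including the stream lists.
 * device.device = 0;                  // Manage GPU 0.
 * device.size = 0;                    // 0 lets the implementation choose the size.
 * cnmemStatus_t status = cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT);
 * \endcode
 */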
114 | 
115 | /**
116 |  * \brief Release all the allocated memory.
117 |  *
118 |  * This function must be called by a single thread and after all threads that called
119 |  * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
120 |  *
121 |  * \return
122 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
123 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
124 |  * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
125 |  */
126 | cnmemStatus_t CNMEM_API cnmemFinalize();
127 | 
128 | /**
129 |  * \brief Increase the internal reference counter of the context object.
130 |  *
131 |  * This function increases the internal reference counter of the library. The purpose of that
132 |  * reference counting mechanism is to give more control to the user over the lifetime of the
133 |  * library. It is useful with scoped memory allocation which may be destroyed in a final
134 |  * memory collection after the end of main(). That function is thread-safe.
135 |  *
136 |  * \return
137 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
138 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called.
139 |  */
140 | cnmemStatus_t CNMEM_API cnmemRetain();
141 | 
142 | /**
143 |  * \brief Decrease the internal reference counter of the context object.
144 |  *
145 |  * This function decreases the internal reference counter of the library. The purpose of that
146 |  * reference counting mechanism is to give more control to the user over the lifetime of the
147 |  * library. It is useful with scoped memory allocation which may be destroyed in a final
148 |  * memory collection after the end of main(). That function is thread-safe.
149 |  *
150 |  * You can use \c cnmemRelease to explicitly finalize the library.
151 |  *
152 |  * \return
153 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
154 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called.
155 |  */
156 | cnmemStatus_t CNMEM_API cnmemRelease();
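/**
 * A sketch of the scoped-allocation pattern this reference counting enables
 * (illustrative only, mirroring the testSharedPtr unit test; requires C++11):
 *
 * \code
 * cnmemRetain();                          // One reference per scoped owner.
 * float *raw;
 * cnmemMalloc((void**) &raw, size, cudaStreamDefault);
 * std::shared_ptr<float> ptr(raw, [](float *p) {
 *     cnmemFree(p, cudaStreamDefault);    // Runs when the last owner goes away,
 *     cnmemRelease();                     // possibly after cnmemFinalize was called.
 * });
 * \endcode
 */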
157 | 
158 | /**
159 |  * \brief Add a new stream to the pool of managed streams on a device.
160 |  *
161 |  * This function registers a new stream into a device memory manager. It is thread-safe.
162 |  *
163 |  * \return
164 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
165 |  * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid.
166 |  */
167 | cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
168 | 
169 | /**
170 |  * \brief Allocate memory.
171 |  *
172 |  * This function allocates memory and initializes a pointer to device memory. If no memory
173 |  * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
174 |  *
175 |  * The behavior of that function is the following:
176 |  *
177 |  * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
178 |  *   memory. If there's a buffer of size larger than or equal to the requested size in the list
179 |  *   of free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
180 |  *   its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
181 |  *   cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
182 |  *   allowed to grow, the manager attempts to steal memory from one of its children (unless
183 |  *   CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
184 |  *   CNMEM_STATUS_OUT_OF_MEMORY.
185 |  *
186 |  * - If the stream is a named stream, the initial request goes to the memory manager associated
187 |  *   with that stream. If a free node is available in the lists of that manager, it is returned.
188 |  *   Otherwise, the request is passed to the root node and works as if the request were made on
189 |  *   the NULL stream.
190 |  *
191 |  * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also, the
192 |  * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
193 |  * make sure no kernel uses a given buffer before stealing it), and the execution is
194 |  * sequential (in a multi-threaded context, the code is executed in a critical section inside
195 |  * the cnmem library - no need for the user to wrap cnmemMalloc with locks).
196 |  *
197 |  * \return
198 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
199 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
200 |  * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
201 |  * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
202 |  * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
203 |  */
204 | cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
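/**
 * A per-stream usage sketch (assumes `stream` was passed to ::cnmemInit or
 * registered with ::cnmemRegisterStream; `myKernel`, `grid` and `block` stand in
 * for the caller's kernel and launch configuration):
 *
 * \code
 * void *buffer;
 * if( cnmemMalloc(&buffer, numBytes, stream) == CNMEM_STATUS_SUCCESS ) {
 *     myKernel<<<grid, block, 0, stream>>>(buffer); // Work on the same stream.
 *     cnmemFree(buffer, stream);                    // Recycle into that stream's manager.
 * }
 * \endcode
 */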
205 | 
206 | /**
207 |  * \brief Release memory.
208 |  *
209 |  * This function releases memory and recycles a memory block in the manager. This function is
210 |  * thread safe.
211 |  *
212 |  * \return
213 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
214 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
215 |  * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, a stream unknown to the manager,
216 |  * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
217 |  */
218 | cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
219 | 
220 | /* ********************************************************************************************* */
221 | /* Utility functions. */
222 | /* ********************************************************************************************* */
223 | 
224 | /**
225 |  * \brief Returns the amount of memory managed by the memory manager associated with a stream.
226 |  *
227 |  * The pointers totalMem and freeMem must be valid. At the moment, this function has a
228 |  * complexity linear in the number of allocated blocks, so do not call it in performance
229 |  * critical sections.
230 |  *
231 |  * \return
232 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
233 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
234 |  * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
235 |  * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
236 |  */
237 | cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
238 | 
239 | /**
240 |  * \brief Print a list of nodes to a file.
241 |  *
242 |  * This function is intended to be used in case of complex scenarios to help understand the
243 |  * behaviour of the memory managers/application. It is thread safe.
244 |  *
245 |  * \return
246 |  * CNMEM_STATUS_SUCCESS, if everything goes fine,
247 |  * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
248 |  * CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example,
249 |  * file == 0,
250 |  * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
251 |  */
252 | cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
253 | 
254 | /**
255 |  * \brief Converts a cnmemStatus_t value to a string.
256 |  */
257 | const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
258 | 
259 | /* ********************************************************************************************* */
260 | 
261 | #ifdef __cplusplus
262 | } // extern "C"
263 | #endif
264 | 
265 | 
--------------------------------------------------------------------------------
/tests/cnmem_tests.cpp:
--------------------------------------------------------------------------------
1 | ///////////////////////////////////////////////////////////////////////////////////////////////////
2 | // Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
3 | //
4 | // Redistribution and use in source and binary forms, with or without
5 | // modification, are permitted provided that the following conditions
6 | // are met:
7 | //  * Redistributions of source code must retain the above copyright
8 | //    notice, this list of conditions and the following disclaimer.
9 | //  * Redistributions in binary form must reproduce the above copyright
10 | //    notice, this list of conditions and the following disclaimer in the
11 | //    documentation and/or other materials provided with the distribution.
12 | //  * Neither the name of NVIDIA CORPORATION nor the names of its
13 | //    contributors may be used to endorse or promote products derived
14 | //    from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | ///////////////////////////////////////////////////////////////////////////////////////////////////
28 | 
29 | #include <cnmem.h>
30 | #include <cuda_runtime_api.h>
31 | #include <gtest/gtest.h>
32 | #ifdef USE_CPP_11
33 | #include <thread>
34 | #endif
35 | 
36 | ///////////////////////////////////////////////////////////////////////////////////////////////////
37 | 
38 | static std::size_t getFreeMemory() {
39 |     cudaFree(0); // Make sure a CUDA context exists before querying the memory info.
40 |     std::size_t freeMem, totalMem;
41 |     cudaMemGetInfo(&freeMem, &totalMem);
42 |     return freeMem;
43 | }
44 | 
45 | class CnmemTest : public testing::TestWithParam<unsigned> {
46 |     /// We determine the amount of free memory.
47 |     std::size_t mFreeMem;
48 | 
49 | protected:
50 |     /// Do we test memory leaks.
51 |     bool mTestLeaks;
52 |     /// Do we skip finalization.
53 |     bool mFinalize;
54 |     /// Do we use managed memory.
55 |     unsigned pool_flags;
56 | 
57 | public:
58 |     /// Ctor.
59 |     CnmemTest() :
60 |         mFreeMem(getFreeMemory()),
61 |         mTestLeaks(true),
62 |         mFinalize(true),
63 |         pool_flags(GetParam()) {}
64 |     /// Tear down the test.
65 | void TearDown(); 66 | }; 67 | 68 | void CnmemTest::TearDown() { 69 | if( mFinalize ) { 70 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFinalize()); 71 | } 72 | if( mTestLeaks ) { 73 | ASSERT_EQ(mFreeMem, getFreeMemory()); 74 | } 75 | cudaDeviceReset(); 76 | } 77 | 78 | INSTANTIATE_TEST_CASE_P(DefaultOrManagedPool, 79 | CnmemTest, 80 | ::testing::Values(CNMEM_FLAGS_DEFAULT, 81 | CNMEM_FLAGS_MANAGED)); 82 | 83 | /////////////////////////////////////////////////////////////////////////////////////////////////// 84 | 85 | TEST_P(CnmemTest, notInitializedFinalize) { 86 | ASSERT_EQ(CNMEM_STATUS_NOT_INITIALIZED, cnmemFinalize()); 87 | cnmemDevice_t device; 88 | memset(&device, 0, sizeof(device)); 89 | device.size = 2048; 90 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); // For TearDown to be happy 91 | } 92 | 93 | /////////////////////////////////////////////////////////////////////////////////////////////////// 94 | 95 | TEST_P(CnmemTest, notInitializedMalloc) { 96 | ASSERT_EQ(CNMEM_STATUS_NOT_INITIALIZED, cnmemMalloc(NULL, 0, 0)); 97 | cnmemDevice_t device; 98 | memset(&device, 0, sizeof(device)); 99 | device.size = 2048; 100 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); // For TearDown to be happy 101 | } 102 | 103 | /////////////////////////////////////////////////////////////////////////////////////////////////// 104 | 105 | TEST_P(CnmemTest, notInitializedFree) { 106 | ASSERT_EQ(CNMEM_STATUS_NOT_INITIALIZED, cnmemFree(NULL, 0)); 107 | cnmemDevice_t device; 108 | memset(&device, 0, sizeof(device)); 109 | device.size = 2048; 110 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); // For TearDown to be happy 111 | } 112 | 113 | /////////////////////////////////////////////////////////////////////////////////////////////////// 114 | 115 | TEST_P(CnmemTest, initInvalidSize) { 116 | cudaDeviceProp props; 117 | ASSERT_EQ(cudaSuccess, cudaGetDeviceProperties(&props, 0)); 118 | 119 | cnmemDevice_t device; 120 | memset(&device, 0, sizeof(device)); 121 | device.size = props.totalGlobalMem * 2; 122 | ASSERT_EQ(CNMEM_STATUS_INVALID_ARGUMENT, cnmemInit(1, &device, pool_flags)); 123 | 124 | device.size = 2048; 125 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); // For TearDown to be happy 126 | } 127 | 128 | /////////////////////////////////////////////////////////////////////////////////////////////////// 129 | 130 | TEST_P(CnmemTest, initNinetyFivePrct) { 131 | cudaDeviceProp props; 132 | ASSERT_EQ(cudaSuccess, cudaGetDeviceProperties(&props, 0)); 133 | 134 | cnmemDevice_t device; 135 | memset(&device, 0, sizeof(device)); 136 | device.size = (size_t) (0.95*props.totalGlobalMem); 137 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 138 | mTestLeaks = false; 139 | } 140 | 141 | /////////////////////////////////////////////////////////////////////////////////////////////////// 142 | 143 | TEST_P(CnmemTest, initDevice1) { 144 | int numDevices; 145 | ASSERT_EQ(cudaSuccess, cudaGetDeviceCount(&numDevices)); 146 | cnmemDevice_t device; 147 | memset(&device, 0, sizeof(device)); 148 | device.device = numDevices < 2 ? 0 : 1; // Skip device 0 if we have more than 1 device. 
149 | device.size = 2048; 150 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 151 | } 152 | 153 | /////////////////////////////////////////////////////////////////////////////////////////////////// 154 | 155 | TEST_P(CnmemTest, freeNULL) { 156 | cnmemDevice_t device; 157 | memset(&device, 0, sizeof(device)); 158 | device.size = 2048; 159 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 160 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(NULL, NULL)); 161 | } 162 | 163 | /////////////////////////////////////////////////////////////////////////////////////////////////// 164 | 165 | TEST_P(CnmemTest, freeTwoStreams) { 166 | cudaStream_t streams[2]; 167 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 168 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 169 | 170 | cnmemDevice_t device; 171 | memset(&device, 0, sizeof(device)); 172 | device.size = 3*1024; 173 | device.numStreams = 2; 174 | device.streams = streams; 175 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 176 | void *ptr0, *ptr1; 177 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 1024, streams[0])); 178 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 1024, streams[1])); 179 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[1])); 180 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0])); 181 | 182 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[0])); 183 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[1])); 184 | } 185 | 186 | /////////////////////////////////////////////////////////////////////////////////////////////////// 187 | 188 | TEST_P(CnmemTest, addStream) { 189 | cudaStream_t streams[2]; 190 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 191 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 192 | 193 | // Register only 1 stream (on purpose). 194 | cnmemDevice_t device; 195 | memset(&device, 0, sizeof(device)); 196 | device.size = 3*1024; 197 | device.numStreams = 1; 198 | device.streams = streams; 199 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 200 | 201 | // Allocate a pointer with a valid stream. 202 | void *ptr0, *ptr1; 203 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 1024, streams[0])); 204 | 205 | // Try to allocate with an invalid stream. 206 | ASSERT_EQ(CNMEM_STATUS_INVALID_ARGUMENT, cnmemMalloc(&ptr1, 1024, streams[1])); 207 | 208 | // Register the stream and try again. 209 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemRegisterStream(streams[1])); 210 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 1024, streams[1])); 211 | 212 | // Clean up. 
213 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[1])); 214 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0])); 215 | 216 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[0])); 217 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[1])); 218 | } 219 | 220 | /////////////////////////////////////////////////////////////////////////////////////////////////// 221 | 222 | TEST_P(CnmemTest, freeWrongStream) { 223 | cudaStream_t streams[2]; 224 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 225 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 226 | 227 | cnmemDevice_t device; 228 | memset(&device, 0, sizeof(device)); 229 | device.size = 3*1024; 230 | device.numStreams = 2; 231 | device.streams = streams; 232 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 233 | void *ptr; 234 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr, 1024, streams[0])); 235 | ASSERT_EQ(CNMEM_STATUS_INVALID_ARGUMENT, cnmemFree(ptr, streams[1])); 236 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr, streams[0])); 237 | 238 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[0])); 239 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(streams[1])); 240 | } 241 | 242 | /////////////////////////////////////////////////////////////////////////////////////////////////// 243 | 244 | TEST_P(CnmemTest, freeNULLRatherThanNamed) { 245 | cudaStream_t stream; 246 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&stream)); 247 | 248 | cnmemDevice_t device; 249 | memset(&device, 0, sizeof(device)); 250 | device.size = 2048; 251 | device.numStreams = 1; 252 | device.streams = &stream; 253 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 254 | void *ptr; 255 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr, 1024, stream)); 256 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr, NULL)); // We expect this async free to work. 
257 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFinalize()); 258 | 259 | ASSERT_EQ(cudaSuccess, cudaStreamDestroy(stream)); 260 | 261 | device.numStreams = 0; 262 | device.streams = NULL; 263 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); // For TearDown to be happy 264 | } 265 | 266 | /////////////////////////////////////////////////////////////////////////////////////////////////// 267 | 268 | TEST_P(CnmemTest, allocateNULL) { 269 | cnmemDevice_t device; 270 | memset(&device, 0, sizeof(device)); 271 | device.size = 2048; 272 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 273 | 274 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(NULL, 0, NULL)); 275 | } 276 | 277 | /////////////////////////////////////////////////////////////////////////////////////////////////// 278 | 279 | TEST_P(CnmemTest, allocateZeroSize) { 280 | cnmemDevice_t device; 281 | memset(&device, 0, sizeof(device)); 282 | device.size = 2048; 283 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 284 | 285 | void *ptr; 286 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr, 0, NULL)); 287 | ASSERT_EQ((void*) NULL, ptr); 288 | } 289 | 290 | /////////////////////////////////////////////////////////////////////////////////////////////////// 291 | 292 | TEST_P(CnmemTest, allocateNoFree) { 293 | cnmemDevice_t device; 294 | memset(&device, 0, sizeof(device)); 295 | device.size = 2048; 296 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 297 | 298 | void *ptr; 299 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr, 512, NULL)); 300 | ASSERT_NE((void*) NULL, ptr); 301 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFinalize()); 302 | 303 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); // For TearDown to be happy 304 | } 305 | 306 | /////////////////////////////////////////////////////////////////////////////////////////////////// 307 | 308 | TEST_P(CnmemTest, allocateAndFreeOne) { 309 | cnmemDevice_t device; 310 | memset(&device, 0, sizeof(device)); 311 | device.size = 2048; 312 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 313 | 314 | void *ptr; 315 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr, 512, NULL)); 316 | ASSERT_NE((void*) NULL, ptr); 317 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr, NULL)); 318 | } 319 | 320 | /////////////////////////////////////////////////////////////////////////////////////////////////// 321 | 322 | TEST_P(CnmemTest, allocateAndFreeTwo) { 323 | cnmemDevice_t device; 324 | memset(&device, 0, sizeof(device)); 325 | device.size = 2048; 326 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 327 | 328 | void *ptr0; 329 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 512, NULL)); 330 | ASSERT_NE((void*) NULL, ptr0); 331 | void *ptr1; 332 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, NULL)); 333 | ASSERT_NE((void*) NULL, ptr1); 334 | 335 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, NULL)); 336 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, NULL)); 337 | } 338 | 339 | /////////////////////////////////////////////////////////////////////////////////////////////////// 340 | 341 | TEST_P(CnmemTest, allocateAndFreeAll) { 342 | cnmemDevice_t device; 343 | memset(&device, 0, sizeof(device)); 344 | device.size = 2048; 345 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 346 | 347 | void *ptr0; 348 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 512, NULL)); 349 | ASSERT_NE((void*) NULL, ptr0); 350 | void *ptr1; 351 | 
ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, NULL)); 352 | ASSERT_NE((void*) NULL, ptr1); 353 | void *ptr2; 354 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 512, NULL)); 355 | ASSERT_NE((void*) NULL, ptr2); 356 | void *ptr3; 357 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr3, 512, NULL)); 358 | ASSERT_NE((void*) NULL, ptr3); 359 | 360 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr3, NULL)); 361 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, NULL)); 362 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, NULL)); 363 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, NULL)); 364 | } 365 | 366 | /////////////////////////////////////////////////////////////////////////////////////////////////// 367 | 368 | TEST_P(CnmemTest, allocateAndFreeAnyOrder) { 369 | cnmemDevice_t device; 370 | memset(&device, 0, sizeof(device)); 371 | device.size = 2048; 372 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 373 | 374 | void *ptr0; 375 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 512, NULL)); 376 | ASSERT_NE((void*) NULL, ptr0); 377 | void *ptr1; 378 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, NULL)); 379 | ASSERT_NE((void*) NULL, ptr1); 380 | 381 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, NULL)); 382 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, NULL)); 383 | } 384 | 385 | /////////////////////////////////////////////////////////////////////////////////////////////////// 386 | 387 | TEST_P(CnmemTest, allocateTooMuchAndGrow) { 388 | cnmemDevice_t device; 389 | memset(&device, 0, sizeof(device)); 390 | device.size = 2048; 391 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags)); 392 | 393 | void *ptr0; 394 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 512, NULL)); 395 | ASSERT_NE((void*) NULL, ptr0); 396 | void *ptr1; 397 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, NULL)); 398 | ASSERT_NE((void*) NULL, ptr1); 399 | void *ptr2; 400 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 512, NULL)); 401 | ASSERT_NE((void*) NULL, ptr2); 402 | void *ptr3; 403 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr3, 512, NULL)); 404 | ASSERT_NE((void*) NULL, ptr3); 405 | void *ptr4; 406 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr4, 512, NULL)); 407 | ASSERT_NE((void*) NULL, ptr4); 408 | 409 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr4, NULL)); 410 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr3, NULL)); 411 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, NULL)); 412 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, NULL)); 413 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, NULL)); 414 | } 415 | 416 | /////////////////////////////////////////////////////////////////////////////////////////////////// 417 | 418 | TEST_P(CnmemTest, allocateTooMuchNoGrow) { 419 | cnmemDevice_t device; 420 | memset(&device, 0, sizeof(device)); 421 | device.size = 2048; 422 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 423 | 424 | void *ptr0; 425 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 512, NULL)); 426 | ASSERT_NE((void*) NULL, ptr0); 427 | void *ptr1; 428 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, NULL)); 429 | ASSERT_NE((void*) NULL, ptr1); 430 | void *ptr2; 431 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 512, NULL)); 432 | ASSERT_NE((void*) NULL, ptr2); 433 | void *ptr3; 434 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr3, 512, NULL)); 435 | ASSERT_NE((void*) NULL, ptr3); 436 | void *ptr4; 437 | 
ASSERT_EQ(CNMEM_STATUS_OUT_OF_MEMORY, cnmemMalloc(&ptr4, 512, NULL)); 438 | 439 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr3, NULL)); 440 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, NULL)); 441 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, NULL)); 442 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, NULL)); 443 | } 444 | 445 | /////////////////////////////////////////////////////////////////////////////////////////////////// 446 | 447 | TEST_P(CnmemTest, allocateAndSteal) { 448 | cudaStream_t streams[2]; 449 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 450 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 451 | 452 | cnmemDevice_t device; 453 | memset(&device, 0, sizeof(device)); 454 | device.size = 3*1024; 455 | device.numStreams = 2; 456 | device.streams = streams; 457 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 458 | 459 | // Take the 1024B from streams[0]. 460 | void *ptr0; 461 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 1024, streams[0])); 462 | ASSERT_NE((void*) NULL, ptr0); 463 | // Take the 1024B from NULL. 464 | void *ptr1; 465 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 1024, streams[0])); 466 | ASSERT_NE((void*) NULL, ptr1); 467 | // Steal the 1024B from streams[1]. 468 | void *ptr2; 469 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 1024, streams[0])); 470 | ASSERT_NE((void*) NULL, ptr2); 471 | 472 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, streams[0])); 473 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[0])); 474 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0])); 475 | 476 | cudaStreamDestroy(streams[1]); 477 | cudaStreamDestroy(streams[0]); 478 | } 479 | 480 | /////////////////////////////////////////////////////////////////////////////////////////////////// 481 | 482 | TEST_P(CnmemTest, allocateAndSteal2) { 483 | cudaStream_t streams[2]; 484 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 485 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 486 | 487 | cnmemDevice_t device; 488 | memset(&device, 0, sizeof(device)); 489 | device.size = 3*1024; 490 | device.numStreams = 2; 491 | device.streams = streams; 492 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 493 | 494 | void *ptr0; 495 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 1024, streams[0])); 496 | ASSERT_NE((void*) NULL, ptr0); 497 | void *ptr1; 498 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, streams[1])); 499 | ASSERT_NE((void*) NULL, ptr1); 500 | void *ptr2; 501 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 512, streams[0])); 502 | ASSERT_NE((void*) NULL, ptr2); 503 | 504 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, streams[0])); 505 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[1])); 506 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0])); 507 | 508 | cudaStreamDestroy(streams[1]); 509 | cudaStreamDestroy(streams[0]); 510 | } 511 | 512 | /////////////////////////////////////////////////////////////////////////////////////////////////// 513 | 514 | TEST_P(CnmemTest, allocateAndSteal3) { 515 | cudaStream_t streams[2]; 516 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 517 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 518 | 519 | cnmemDevice_t device; 520 | memset(&device, 0, sizeof(device)); 521 | device.size = 3*2048; 522 | device.numStreams = 2; 523 | device.streams = streams; 524 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, 
pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 525 | 526 | void *ptr0; 527 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 2048, streams[0])); 528 | ASSERT_NE((void*) NULL, ptr0); 529 | void *ptr1; 530 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 512, streams[1])); 531 | ASSERT_NE((void*) NULL, ptr1); 532 | void *ptr2; 533 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 1024, streams[0])); 534 | ASSERT_NE((void*) NULL, ptr2); 535 | void *ptr3; 536 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr3, 512, streams[1])); 537 | ASSERT_NE((void*) NULL, ptr3); 538 | 539 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr3, streams[1])); 540 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, streams[0])); 541 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[1])); 542 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0])); 543 | 544 | cudaStreamDestroy(streams[1]); 545 | cudaStreamDestroy(streams[0]); 546 | } 547 | 548 | /////////////////////////////////////////////////////////////////////////////////////////////////// 549 | 550 | TEST_P(CnmemTest, allocateAndSteal4) { 551 | cudaStream_t streams[2]; 552 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 553 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 554 | 555 | cnmemDevice_t device; 556 | memset(&device, 0, sizeof(device)); 557 | device.size = 6*1024; 558 | device.numStreams = 2; 559 | device.streams = streams; 560 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 561 | 562 | void *ptr0; 563 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr0, 1024, streams[0])); 564 | void *ptr1; 565 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr1, 1024, streams[0])); 566 | void *ptr2; 567 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr2, 1024, streams[0])); 568 | void *ptr3; 569 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr3, 1024, streams[0])); 570 | void *ptr4; 571 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr4, 1024, streams[0])); 572 | void *ptr5; 573 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr5, 1024, streams[1])); 574 | 575 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr0, streams[0])); 576 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr1, streams[0])); 577 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr2, streams[0])); 578 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr3, streams[0])); 579 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr4, streams[0])); 580 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr5, streams[1])); 581 | } 582 | 583 | /////////////////////////////////////////////////////////////////////////////////////////////////// 584 | 585 | TEST_P(CnmemTest, allocateAndReserveStream) { 586 | cudaStream_t streams[2]; 587 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 588 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 589 | 590 | cnmemDevice_t device; 591 | memset(&device, 0, sizeof(device)); 592 | device.size = 4096; 593 | device.numStreams = 2; 594 | device.streams = streams; 595 | size_t streamSizes[] = { 2048, 2048 }; 596 | device.streamSizes = streamSizes; 597 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 598 | 599 | size_t totalMem, freeMem; 600 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault)); 601 | ASSERT_EQ(4096, totalMem); 602 | ASSERT_EQ(0, freeMem); 603 | 604 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, streams[0])); 605 | ASSERT_EQ(2048, totalMem); 606 | ASSERT_EQ(2048, freeMem); 607 | 608 
| ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, streams[1])); 609 | ASSERT_EQ(2048, totalMem); 610 | ASSERT_EQ(2048, freeMem); 611 | } 612 | 613 | /////////////////////////////////////////////////////////////////////////////////////////////////// 614 | 615 | TEST_P(CnmemTest, allocateAndReserveStreamDifferentSizes) { 616 | cudaStream_t streams[2]; 617 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 618 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 619 | 620 | cnmemDevice_t device; 621 | memset(&device, 0, sizeof(device)); 622 | device.size = 8192; 623 | device.numStreams = 2; 624 | device.streams = streams; 625 | size_t streamSizes[] = { 2048, 4096 }; 626 | device.streamSizes = streamSizes; 627 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 628 | 629 | size_t totalMem, freeMem; 630 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault)); 631 | ASSERT_EQ(8192, totalMem); 632 | ASSERT_EQ(2048, freeMem); 633 | 634 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, streams[0])); 635 | ASSERT_EQ(2048, totalMem); 636 | ASSERT_EQ(2048, freeMem); 637 | 638 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, streams[1])); 639 | ASSERT_EQ(4096, totalMem); 640 | ASSERT_EQ(4096, freeMem); 641 | 642 | FILE *file = fopen("reserveStream.log", "w"); 643 | ASSERT_NE((FILE*) NULL, file); 644 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemPrintMemoryState(file, streams[0])); 645 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemPrintMemoryState(file, streams[1])); 646 | fclose(file); 647 | } 648 | 649 | /////////////////////////////////////////////////////////////////////////////////////////////////// 650 | 651 | #ifdef USE_CPP_11 652 | 653 | template< int N > 654 | static void allocate(cudaStream_t stream) { 655 | void *ptr[N]; 656 | for( int i = 0 ; i < N ; ++i ) 657 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr[i], 1024, stream)); 658 | for( int i = 0 ; i < N ; ++i ) 659 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr[i], stream)); 660 | } 661 | 662 | /////////////////////////////////////////////////////////////////////////////////////////////////// 663 | 664 | TEST_P(CnmemTest, allocateConcurrentNoCompete) { 665 | cudaStream_t streams[2]; 666 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0])); 667 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1])); 668 | 669 | cnmemDevice_t device; 670 | memset(&device, 0, sizeof(device)); 671 | device.size = 6*1024; 672 | device.numStreams = 2; 673 | device.streams = streams; 674 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW)); 675 | 676 | // In this test, each manager has enough memory to accommodate the threads. 
677 |     std::vector<std::thread*> threads(2);
678 |     for( int i = 0 ; i < 2 ; ++i )
679 |         threads[i] = new std::thread(allocate<2>, streams[i]);
680 |     for( int i = 0 ; i < 2 ; ++i )
681 |         threads[i]->join();
682 |     for( unsigned i = 0 ; i < 2 ; ++i )
683 |         delete threads[i];
684 |     threads.clear();
685 | }
686 | 
687 | ///////////////////////////////////////////////////////////////////////////////////////////////////
688 | 
689 | TEST_P(CnmemTest, allocateConcurrentCompete) {
690 |     cudaStream_t streams[2];
691 |     ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0]));
692 |     ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1]));
693 | 
694 |     cnmemDevice_t device;
695 |     memset(&device, 0, sizeof(device));
696 |     device.size = 6*1024;
697 |     device.numStreams = 2;
698 |     device.streams = streams;
699 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW));
700 | 
701 |     // In this test, the threads compete for the memory of the root manager.
702 |     std::vector<std::thread*> threads(2);
703 |     for( int i = 0; i < 2; ++i )
704 |         threads[i] = new std::thread(allocate<3>, streams[i]);
705 |     for( int i = 0; i < 2; ++i )
706 |         threads[i]->join();
707 |     for( unsigned i = 0; i < 2; ++i )
708 |         delete threads[i];
709 |     threads.clear();
710 | 
711 |     mTestLeaks = false; // For some reason, it reports a leak. It's likely to be in the driver/runtime.
712 | }
713 | 
714 | ///////////////////////////////////////////////////////////////////////////////////////////////////
715 | 
716 | TEST_P(CnmemTest, allocateConcurrentSteal) {
717 |     const int N = 4;
718 |     cudaStream_t streams[N];
719 |     for( int i = 0 ; i < N ; ++i ) {
720 |         ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[i]));
721 |     }
722 | 
723 |     cnmemDevice_t device;
724 |     memset(&device, 0, sizeof(device));
725 |     device.size = 4*N*1024;
726 |     device.numStreams = N;
727 |     device.streams = streams;
728 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW));
729 | 
730 |     // In this test, thread 0 has to steal memory from thread 1.
731 |     std::vector<std::thread*> threads(N);
732 |     for( int i = 0 ; i < N ; ++i ) {
733 |         threads[i] = new std::thread(allocate<4>, streams[i]);
734 |     }
735 |     for( int i = 0; i < N; ++i )
736 |         threads[i]->join();
737 |     for( int i = 0; i < N; ++i )
738 |         delete threads[i];
739 |     threads.clear();
740 | 
741 |     mTestLeaks = false; // For some reason, it reports a leak.
742 | }
743 | 
744 | ///////////////////////////////////////////////////////////////////////////////////////////////////
745 | 
746 | TEST_P(CnmemTest, allocateConcurrentMultiStreamsPerThreadNoGrow) {
747 |     const int NUM_STREAMS = 8, NUM_THREADS = 32;
748 |     cudaStream_t streams[NUM_STREAMS];
749 |     for( int i = 0 ; i < NUM_STREAMS ; ++i ) {
750 |         ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[i]));
751 |     }
752 | 
753 |     cnmemDevice_t device;
754 |     memset(&device, 0, sizeof(device));
755 |     device.size = 4*NUM_THREADS*1024;
756 |     device.numStreams = NUM_STREAMS;
757 |     device.streams = streams;
758 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW));
759 | 
760 |     std::vector<std::thread*> threads(NUM_THREADS);
761 |     for( int i = 0 ; i < NUM_THREADS ; ++i ) {
762 |         threads[i] = new std::thread(allocate<4>, streams[i%NUM_STREAMS]);
763 |     }
764 |     for( int i = 0; i < NUM_THREADS; ++i )
765 |         threads[i]->join();
766 |     for( int i = 0; i < NUM_THREADS; ++i )
767 |         delete threads[i];
768 |     threads.clear();
769 | 
770 |     mTestLeaks = false; // For some reason, it reports a leak.
771 | }
772 | 
773 | ///////////////////////////////////////////////////////////////////////////////////////////////////
774 | 
775 | TEST_P(CnmemTest, allocateConcurrentMultiStreamsPerThreadGrow) {
776 |     const int NUM_STREAMS = 8, NUM_THREADS = 32;
777 |     cudaStream_t streams[NUM_STREAMS];
778 |     for( int i = 0 ; i < NUM_STREAMS ; ++i ) {
779 |         ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[i]));
780 |     }
781 | 
782 |     cnmemDevice_t device;
783 |     memset(&device, 0, sizeof(device));
784 |     device.size = NUM_THREADS*1024;
785 |     device.numStreams = NUM_STREAMS;
786 |     device.streams = streams;
787 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags));
788 | 
789 |     std::vector<std::thread*> threads(NUM_THREADS);
790 |     for( int i = 0 ; i < NUM_THREADS ; ++i ) {
791 |         threads[i] = new std::thread(allocate<4>, streams[i%NUM_STREAMS]);
792 |     }
793 |     for( int i = 0; i < NUM_THREADS; ++i )
794 |         threads[i]->join();
795 |     for( int i = 0; i < NUM_THREADS; ++i )
796 |         delete threads[i];
797 |     threads.clear();
798 | 
799 |     mTestLeaks = false; // For some reason, it reports a leak.
800 | }
801 | 
802 | ///////////////////////////////////////////////////////////////////////////////////////////////////
803 | 
804 | template< int N >
805 | static void registerAndAllocate(cudaStream_t stream) {
806 |     void *ptr[N];
807 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemRegisterStream(stream));
808 |     for( int i = 0 ; i < N ; ++i )
809 |         ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr[i], 1024, stream));
810 |     for( int i = 0 ; i < N ; ++i )
811 |         ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr[i], stream));
812 | }
813 | 
814 | TEST_P(CnmemTest, registerAndAllocateConcurrentStreamsGrow) {
815 |     const int N = 32;
816 |     cudaStream_t streams[N];
817 |     for( int i = 0 ; i < N ; ++i ) {
818 |         ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[i]));
819 |     }
820 | 
821 |     // Declare no stream.
822 |     cnmemDevice_t device;
823 |     memset(&device, 0, sizeof(device));
824 |     device.size = 1024;
825 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags));
826 | 
827 |     // In this test, each thread registers its own stream and then allocates from it.
828 |     std::vector<std::thread*> threads(N);
829 |     for( int i = 0 ; i < N ; ++i ) {
830 |         threads[i] = new std::thread(registerAndAllocate<2>, streams[i]);
831 |     }
832 |     for( int i = 0; i < N; ++i )
833 |         threads[i]->join();
834 |     for( int i = 0; i < N; ++i )
835 |         delete threads[i];
836 |     threads.clear();
837 | 
838 |     mTestLeaks = false; // For some reason, it reports a leak.
839 | }
840 | 
841 | TEST_P(CnmemTest, registerAndAllocateConcurrentMultiStreamsPerThread) {
842 |     const int NUM_STREAMS = 8, NUM_THREADS = 32;
843 |     cudaStream_t streams[NUM_STREAMS];
844 |     for( int i = 0 ; i < NUM_STREAMS ; ++i ) {
845 |         ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[i]));
846 |     }
847 | 
848 |     // Declare no stream.
849 |     cnmemDevice_t device;
850 |     memset(&device, 0, sizeof(device));
851 |     device.size = 1024;
852 |     ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags));
853 | 
854 |     // In this test, several threads register and allocate from each stream concurrently.
855 |     std::vector<std::thread*> threads(NUM_THREADS);
856 |     for( int i = 0 ; i < NUM_THREADS ; ++i ) {
857 |         threads[i] = new std::thread(registerAndAllocate<4>, streams[i%NUM_STREAMS]);
858 |     }
859 |     for( int i = 0; i < NUM_THREADS; ++i )
860 |         threads[i]->join();
861 |     for( int i = 0; i < NUM_THREADS; ++i )
862 |         delete threads[i];
863 |     threads.clear();
864 | 
865 |     mTestLeaks = false; // For some reason, it reports a leak.
866 | }
867 |
868 | ///////////////////////////////////////////////////////////////////////////////////////////////////
869 |
870 | template< int N >
871 | static void allocateAndPrint(int id, cudaStream_t stream) {
872 | void *ptr[N];
873 | for( int i = 0 ; i < N ; ++i )
874 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr[i], 1024, stream));
875 | char buffer[64];
876 | sprintf(buffer, "memoryState.%d.log", id);
877 | FILE *file = fopen(buffer, "w");
878 | ASSERT_NE((FILE*) NULL, file);
879 | cnmemPrintMemoryState(file, stream);
880 | fclose(file);
881 | for( int i = 0 ; i < N ; ++i )
882 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr[i], stream));
883 | }
884 |
885 | TEST_P(CnmemTest, testPrintMemoryState) {
886 | cudaStream_t streams[2];
887 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[0]));
888 | ASSERT_EQ(cudaSuccess, cudaStreamCreate(&streams[1]));
889 |
890 | cnmemDevice_t device;
891 | memset(&device, 0, sizeof(device));
892 | device.size = 4096;
893 | device.numStreams = 2;
894 | device.streams = streams;
895 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags | CNMEM_FLAGS_CANNOT_GROW));
896 |
897 | // In this test, each manager has enough memory to accommodate the threads.
898 | std::vector<std::thread*> threads(2);
899 | for( int i = 0 ; i < 2 ; ++i )
900 | threads[i] = new std::thread(allocateAndPrint<2>, i, streams[i]);
901 | for( int i = 0 ; i < 2 ; ++i )
902 | threads[i]->join();
903 | for( unsigned i = 0 ; i < 2 ; ++i )
904 | delete threads[i];
905 | threads.clear();
906 |
907 | mTestLeaks = false; // For some reason, it reports a leak.
908 | }
909 |
910 | #endif // defined USE_CPP_11
911 |
912 | ///////////////////////////////////////////////////////////////////////////////////////////////////
913 |
914 | TEST_P(CnmemTest, memoryUsage) {
915 | cnmemDevice_t device;
916 | memset(&device, 0, sizeof(device));
917 | device.size = 4096;
918 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags));
919 |
920 | std::size_t totalMem, freeMem;
921 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault));
922 | ASSERT_EQ(4096, totalMem);
923 | ASSERT_EQ(4096, freeMem);
924 |
925 | void *ptr;
926 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc(&ptr, 1024, NULL));
927 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault));
928 | ASSERT_EQ(4096, totalMem);
929 | ASSERT_EQ(3072, freeMem);
930 |
931 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr, NULL));
932 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault));
933 | ASSERT_EQ(4096, totalMem);
934 | ASSERT_EQ(4096, freeMem);
935 | }
936 |
937 | ///////////////////////////////////////////////////////////////////////////////////////////////////
938 |
939 | TEST_P(CnmemTest, testDeviceDoesNotChange) {
940 |
941 | int numDevices;
942 | ASSERT_EQ(cudaSuccess, cudaGetDeviceCount(&numDevices));
943 | if( numDevices < 2 ) {
944 | ASSERT_TRUE(true);
945 | mFinalize = false;
946 | return;
947 | }
948 |
949 | cnmemDevice_t devices[2];
950 | memset(devices, 0, sizeof(devices));
951 | devices[0].device = 0;
952 | devices[0].size = 4096;
953 | devices[1].device = 1;
954 | devices[1].size = 2048;
955 |
956 | int currentDevice;
957 | ASSERT_EQ(cudaSuccess, cudaSetDevice(0));
958 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(2, devices, pool_flags));
959 | ASSERT_EQ(cudaSuccess, cudaGetDevice(&currentDevice));
960 | ASSERT_EQ(0, currentDevice);
961 |
962 | size_t totalMem, freeMem;
963 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault));
964 | ASSERT_EQ(4096, totalMem);
965 | ASSERT_EQ(4096, freeMem);
966 |
967 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFinalize());
968 |
969 | ASSERT_EQ(cudaSuccess, cudaSetDevice(1));
970 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(2, devices, pool_flags));
971 | ASSERT_EQ(cudaSuccess, cudaGetDevice(&currentDevice));
972 | ASSERT_EQ(1, currentDevice);
973 |
974 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMemGetInfo(&freeMem, &totalMem, cudaStreamDefault));
975 | ASSERT_EQ(2048, totalMem);
976 | ASSERT_EQ(2048, freeMem);
977 |
978 | ASSERT_EQ(cudaSuccess, cudaSetDevice(0)); // Make sure we are on dev 0 for final mem checks.
979 | }
980 |
981 | ///////////////////////////////////////////////////////////////////////////////////////////////////
982 |
983 | #ifdef USE_CPP_11
984 | #include <memory>
985 |
986 | template< typename T >
987 | class DeviceDeleter {
988 | public:
989 | DeviceDeleter() {}
990 | void operator()(T *ptr) {
991 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFree(ptr, cudaStreamDefault));
992 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemRelease());
993 | }
994 | };
995 |
996 | TEST_P(CnmemTest, testSharedPtr) {
997 |
998 | cnmemDevice_t device;
999 | memset(&device, 0, sizeof(device));
1000 | device.size = 2048;
1001 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemInit(1, &device, pool_flags));
1002 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemRetain());
1003 |
1004 | float *ptr;
1005 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemMalloc((void**) &ptr, 1024, cudaStreamDefault));
1006 | std::shared_ptr<float> p(ptr, DeviceDeleter<float>());
1007 |
1008 | ASSERT_EQ(CNMEM_STATUS_SUCCESS, cnmemFinalize()); // We still have a pointer in the scope...
1009 | mFinalize = false; // Make sure TearDown does not call finalize again.
1010 | }
1011 |
1012 | #endif
1013 |
1014 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1015 |
1016 | int main(int argc, char **argv) {
1017 | ::testing::InitGoogleTest(&argc, argv);
1018 | return RUN_ALL_TESTS();
1019 | }
1020 |
1021 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1022 |
1023 |
--------------------------------------------------------------------------------
/src/cnmem.cpp:
--------------------------------------------------------------------------------
1 | ///////////////////////////////////////////////////////////////////////////////////////////////////
2 | // Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
3 | //
4 | // Redistribution and use in source and binary forms, with or without
5 | // modification, are permitted provided that the following conditions
6 | // are met:
7 | // * Redistributions of source code must retain the above copyright
8 | // notice, this list of conditions and the following disclaimer.
9 | // * Redistributions in binary form must reproduce the above copyright
10 | // notice, this list of conditions and the following disclaimer in the
11 | // documentation and/or other materials provided with the distribution.
12 | // * Neither the name of NVIDIA CORPORATION nor the names of its
13 | // contributors may be used to endorse or promote products derived
14 | // from this software without specific prior written permission.
15 | //
16 | // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
17 | // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 | // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
20 | // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21 | // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22 | // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23 | // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
24 | // OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 | ///////////////////////////////////////////////////////////////////////////////////////////////////
28 |
29 | #include "cnmem.h"
30 | #include <cstddef>
31 | #include <vector>
32 | #include <cuda_runtime_api.h>
33 |
34 | #if !defined(WIN32) && defined(_MSC_VER)
35 | #define WIN32
36 | #endif
37 |
38 | #ifdef WIN32
39 | #include <windows.h>
40 | #else
41 | #include <pthread.h>
42 | #endif
43 |
44 | #if defined(__SIZEOF_POINTER__) && __SIZEOF_POINTER__ == 4 // ARMv7 is the only 32-bit target that we support.
45 | #define CNMEM_BUILD_WITH_32_BIT_POINTERS
46 | #endif
47 |
48 | #define CNMEM_GRANULARITY 512
49 |
50 | ///////////////////////////////////////////////////////////////////////////////////////////////////
51 |
52 | extern "C" const char* cnmemGetErrorString(cnmemStatus_t status) {
53 | switch(status) {
54 | case CNMEM_STATUS_SUCCESS: return "CNMEM_STATUS_SUCCESS";
55 | case CNMEM_STATUS_CUDA_ERROR: return "CNMEM_STATUS_CUDA_ERROR";
56 | case CNMEM_STATUS_INVALID_ARGUMENT: return "CNMEM_STATUS_INVALID_ARGUMENT";
57 | case CNMEM_STATUS_NOT_INITIALIZED: return "CNMEM_STATUS_NOT_INITIALIZED";
58 | case CNMEM_STATUS_OUT_OF_MEMORY: return "CNMEM_STATUS_OUT_OF_MEMORY";
59 | default: return "CNMEM_STATUS_UNKNOWN_ERROR";
60 | }
61 | }
62 |
63 | ///////////////////////////////////////////////////////////////////////////////////////////////////
64 |
65 | #if 0
66 | #ifdef WIN32
67 | #define CNMEM_DEBUG_ERROR(...) do { \
68 | fprintf(stderr, "Error at line: %d\n", __LINE__); \
69 | fprintf(stderr, __VA_ARGS__); \
70 | } while(0)
71 | #else
72 | #include <execinfo.h>
73 | static inline void printBacktrace() {
74 | void *stackBuffer[64];
75 | int numAddresses = backtrace((void**) &stackBuffer, 64);
76 | char **addresses = backtrace_symbols(stackBuffer, numAddresses);
77 | for( int i = 0 ; i < numAddresses ; ++i ) {
78 | fprintf(stderr, "[%2d]: %s\n", i, addresses[i]);
79 | }
80 | free(addresses);
81 | }
82 | #define CNMEM_DEBUG_ERROR(...) do { \
83 | fprintf(stderr, "Error at line: %d\n", __LINE__); \
84 | fprintf(stderr, __VA_ARGS__); \
85 | fprintf(stderr, "Backtrace:\n"); \
86 | printBacktrace(); \
87 | } while(0)
88 | #endif
89 | #else
90 | #define CNMEM_DEBUG_ERROR(...)
91 | #endif
92 |
93 | #if 0
94 | #define CNMEM_DEBUG_INFO printf
95 | #else
96 | #define CNMEM_DEBUG_INFO(...)
97 | #endif
98 |
99 | #if 0 // Enable/disable assertions
100 | #include <assert.h>
101 | #define CNMEM_ASSERT assert
102 | #else
103 | #define CNMEM_ASSERT(...)
104 | #endif 105 | 106 | #define CNMEM_CHECK_TRUE(cond, error) do { \ 107 | if( !(cond) ) { \ 108 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_TRUE evaluates to false\n"); \ 109 | return error; \ 110 | } \ 111 | } while(0) 112 | 113 | #define CNMEM_CHECK(call) do { \ 114 | cnmemStatus_t status = (call); \ 115 | if( status != CNMEM_STATUS_SUCCESS ) { \ 116 | CNMEM_DEBUG_ERROR("CNMEM_CHECK failed with status \"%s\"\n", \ 117 | cnmemGetErrorString(status)); \ 118 | return status; \ 119 | } \ 120 | } while(0) 121 | 122 | #define CNMEM_CHECK_OR_UNLOCK(call, mutex) do { \ 123 | cnmemStatus_t status = (call); \ 124 | if( status != CNMEM_STATUS_SUCCESS ) { \ 125 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_OR_UNLOCK failed with status \"%s\"\n", \ 126 | cnmemGetErrorString(status)); \ 127 | (mutex).unlock(); \ 128 | return status; \ 129 | } \ 130 | } while(0) 131 | 132 | #define CNMEM_CHECK_CUDA(call) do { \ 133 | cudaError_t cudaError = (call); \ 134 | if( cudaError == cudaErrorMemoryAllocation ) { \ 135 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \ 136 | cudaGetErrorString(cudaError)); \ 137 | return CNMEM_STATUS_OUT_OF_MEMORY; \ 138 | } \ 139 | else if( cudaError != cudaSuccess ) { \ 140 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA failed with CUDA error \"%s\"\n", \ 141 | cudaGetErrorString(cudaError)); \ 142 | return CNMEM_STATUS_CUDA_ERROR; \ 143 | } \ 144 | } while(0) 145 | 146 | #define CNMEM_CHECK_CUDA_OR_UNLOCK(call, mutex) do { \ 147 | cudaError_t cudaError = (call); \ 148 | if( cudaError == cudaErrorMemoryAllocation ) { \ 149 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \ 150 | cudaGetErrorString(cudaError)); \ 151 | (mutex).unlock(); \ 152 | return CNMEM_STATUS_OUT_OF_MEMORY; \ 153 | } \ 154 | else if( cudaError != cudaSuccess ) { \ 155 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_CUDA_OR_UNLOCK failed with CUDA error \"%s\"\n", \ 156 | cudaGetErrorString(cudaError)); \ 157 | (mutex).unlock(); \ 158 | return CNMEM_STATUS_CUDA_ERROR; \ 159 | } \ 160 | } while(0) 161 | 162 | #ifdef WIN32 163 | #define CNMEM_CHECK_WIN32(call, error_code) do { \ 164 | SetLastError(0); /* Clean the flag. */ \ 165 | call; \ 166 | DWORD status = GetLastError(); \ 167 | if( status ) \ 168 | return error_code; \ 169 | } while(0) 170 | #else 171 | #define CNMEM_CHECK_PTHREAD(call, error_code) do { \ 172 | int status = call; \ 173 | if( status ) { \ 174 | CNMEM_DEBUG_ERROR("CNMEM_CHECK_PTHREAD failed with status %d\n", status); \ 175 | return error_code; \ 176 | } \ 177 | } while(0) 178 | #endif 179 | 180 | /////////////////////////////////////////////////////////////////////////////////////////////////// 181 | 182 | namespace cnmem { 183 | 184 | static inline std::size_t ceilInt(std::size_t m, std::size_t n) { 185 | CNMEM_ASSERT(n > 0); 186 | return (m + n-1) / n * n; 187 | } 188 | 189 | /////////////////////////////////////////////////////////////////////////////////////////////////// 190 | 191 | class Mutex { 192 | #ifdef WIN32 193 | mutable CRITICAL_SECTION mCriticalSection; 194 | #else 195 | pthread_mutex_t mMutex; 196 | #endif 197 | 198 | public: 199 | /// Initialize the mutex. 200 | cnmemStatus_t initialize(); 201 | /// Finalize the mutex. 202 | cnmemStatus_t finalize(); 203 | /// Lock the mutex. 204 | cnmemStatus_t lock() const; 205 | /// Unlock the mutex. 
206 | cnmemStatus_t unlock() const; 207 | }; 208 | 209 | /////////////////////////////////////////////////////////////////////////////////////////////////// 210 | 211 | cnmemStatus_t Mutex::initialize() { 212 | #ifdef WIN32 213 | CNMEM_CHECK_WIN32(InitializeCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); 214 | #else 215 | #if 0 216 | pthread_mutexattr_t attr; 217 | CNMEM_CHECK_PTHREAD(pthread_mutexattr_init(&attr), CNMEM_STATUS_UNKNOWN_ERROR); 218 | CNMEM_CHECK_PTHREAD(pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE), CNMEM_STATUS_UNKNOWN_ERROR); 219 | CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, &attr), CNMEM_STATUS_UNKNOWN_ERROR); 220 | #else 221 | CNMEM_CHECK_PTHREAD(pthread_mutex_init(&mMutex, NULL), CNMEM_STATUS_UNKNOWN_ERROR); 222 | #endif 223 | #endif 224 | return CNMEM_STATUS_SUCCESS; 225 | } 226 | 227 | /////////////////////////////////////////////////////////////////////////////////////////////////// 228 | 229 | cnmemStatus_t Mutex::finalize() { 230 | #ifdef WIN32 231 | CNMEM_CHECK_WIN32(DeleteCriticalSection((CRITICAL_SECTION*) &mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); 232 | #else 233 | CNMEM_CHECK_PTHREAD(pthread_mutex_destroy(&mMutex), CNMEM_STATUS_UNKNOWN_ERROR); 234 | #endif 235 | return CNMEM_STATUS_SUCCESS; 236 | } 237 | 238 | /////////////////////////////////////////////////////////////////////////////////////////////////// 239 | 240 | cnmemStatus_t Mutex::lock() const { 241 | #ifdef WIN32 242 | CNMEM_CHECK_WIN32(EnterCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); 243 | #else 244 | CNMEM_CHECK_PTHREAD(pthread_mutex_lock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR); 245 | #endif 246 | return CNMEM_STATUS_SUCCESS; 247 | } 248 | 249 | /////////////////////////////////////////////////////////////////////////////////////////////////// 250 | 251 | cnmemStatus_t Mutex::unlock() const { 252 | #ifdef WIN32 253 | CNMEM_CHECK_WIN32(LeaveCriticalSection(&mCriticalSection), CNMEM_STATUS_UNKNOWN_ERROR); 254 | #else 255 | CNMEM_CHECK_PTHREAD(pthread_mutex_unlock((pthread_mutex_t*) &mMutex), CNMEM_STATUS_UNKNOWN_ERROR); 256 | #endif 257 | return CNMEM_STATUS_SUCCESS; 258 | } 259 | 260 | /////////////////////////////////////////////////////////////////////////////////////////////////// 261 | 262 | class Block { 263 | /// The pointer to the memory region on the device. 264 | char *mData; 265 | /// The size of the memory buffer. 266 | std::size_t mSize; 267 | /// The prev/next blocks in the linked list of blocks. 268 | Block *mNext; 269 | /// Is it a head node (i.e. a node obtained from parent->allocate or cudaMalloc). 270 | bool mIsHead; 271 | 272 | public: 273 | /// Create a block. 274 | Block(char *data, std::size_t size, Block *next, bool isHead) 275 | : mData(data) 276 | , mSize(size) 277 | , mNext(next) 278 | , mIsHead(isHead) { 279 | } 280 | 281 | /// The data. 282 | inline const char* getData() const { return mData; } 283 | /// The data (mutable). 284 | inline char* getData() { return mData; } 285 | 286 | /// The size of the block. 287 | inline std::size_t getSize() const { return mSize; } 288 | 289 | /// The next block in the linked list. 290 | inline const Block* getNext() const { return mNext; } 291 | /// The next block in the linked list (mutable). 292 | inline Block* getNext() { return mNext; } 293 | 294 | /// Is it a head block. 295 | inline bool isHead() const { return mIsHead; } 296 | 297 | /// Change the next block. 
298 | inline void setNext(Block *next) { mNext = next; }
299 | /// Change the size of the block.
300 | inline void setSize(std::size_t size) { mSize = size; }
301 | /// Set the head flag.
302 | inline void setHeadFlag(bool isHead) { mIsHead = isHead; }
303 | };
304 |
305 | ///////////////////////////////////////////////////////////////////////////////////////////////////
306 |
307 | class Manager {
308 |
309 | /// The parent manager.
310 | Manager *mParent;
311 | /// The children managers.
312 | std::vector<Manager*> mChildren;
313 | /// The GPU device where the memory is allocated.
314 | int mDevice;
315 | /// The stream this manager is associated with. It could be NULL.
316 | cudaStream_t mStream;
317 | /// Is the stream blocking?
318 | bool mIsStreamBlocking;
319 | /// The list of used blocks.
320 | Block *mUsedBlocks;
321 | /// The list of free blocks.
322 | Block *mFreeBlocks;
323 | /// The managed memory size.
324 | std::size_t mSize;
325 | /// The flags.
326 | unsigned mFlags;
327 | /// To support multi-threading. Each manager has its own mutex.
328 | Mutex mMutex;
329 |
330 | public:
331 | /// Create an uninitialized manager.
332 | Manager();
333 | /// Dtor.
334 | ~Manager();
335 |
336 | /// Allocate a block of memory.
337 | cnmemStatus_t allocate(void *&ptr, std::size_t size, bool isBlocking = true);
338 | /// Release a block of memory.
339 | cnmemStatus_t release(void *ptr);
340 | /// Release memory. It returns true if we have no memory leak.
341 | cnmemStatus_t releaseAllUnsafe();
342 | /// Reserve memory for a manager.
343 | cnmemStatus_t reserve(std::size_t size);
344 | /// Steal memory from another manager.
345 | cnmemStatus_t stealUnsafe(void *&ptr, std::size_t size);
346 |
347 | /// Print the full memory state.
348 | cnmemStatus_t printMemoryState(FILE *file) const;
349 |
350 | /// The amount of used memory.
351 | inline cnmemStatus_t getUsedMemoryUnsafe(std::size_t &usedMemory) const {
352 | return getMemoryUnsafe(usedMemory, mUsedBlocks);
353 | }
354 | /// The amount of free memory.
355 | inline cnmemStatus_t getFreeMemoryUnsafe(std::size_t &freeMemory) const {
356 | return getMemoryUnsafe(freeMemory, mFreeBlocks);
357 | }
358 |
359 | /// Get a specific child based on the stream id.
360 | cnmemStatus_t getChildFromStream(Manager *&manager, cudaStream_t stream) const;
361 | /// Get a specific child based on its index.
362 | cnmemStatus_t getChild(Manager *&manager, std::size_t i) const;
363 | /// Add a new child.
364 | cnmemStatus_t addChild(Manager *manager);
365 | /// The number of children.
366 | cnmemStatus_t getNumChildren(std::size_t &numChildren) const;
367 |
368 | /// The associated device.
369 | inline int getDevice() const { return mDevice; }
370 | /// The flags.
371 | inline unsigned getFlags() const { return mFlags; }
372 | /// Get the mutex.
373 | inline const Mutex* getMutex() const { return &mMutex; }
374 | /// The size allocated to that manager.
375 | inline std::size_t getSize() const { return mSize; }
376 | /// The CUDA stream.
377 | inline cudaStream_t getStream() const { return mStream; }
378 |
379 | /// Define the parent.
380 | inline void setParent(Manager *parent) { mParent = parent; }
381 | /// Define the device.
382 | inline void setDevice(int device) { mDevice = device; }
383 | /// Define the stream.
384 | inline cnmemStatus_t setStream(cudaStream_t stream) {
385 | mStream = stream;
386 | #ifdef CUDA_API_PER_THREAD_DEFAULT_STREAM
387 | mIsStreamBlocking = false;
388 | #elif CUDART_VERSION < 5050
389 | mIsStreamBlocking = true;
390 | #else
391 | unsigned flags = 0;
392 | CNMEM_CHECK_CUDA(cudaStreamGetFlags(mStream, &flags));
393 | mIsStreamBlocking = !mStream || !(flags & cudaStreamNonBlocking);
394 | #endif
395 | return CNMEM_STATUS_SUCCESS;
396 | }
397 | /// Define the flags.
398 | inline void setFlags(unsigned flags) { mFlags = flags; }
399 |
400 | private:
401 | /// The member functions below marked "Unsafe" are not thread-safe when called on the
402 | /// same Manager object. Make sure they are called by a single thread in that case.
403 |
404 | /// Allocate a new block and add it to the free list.
405 | cnmemStatus_t allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
406 | /// Release a block from the active list.
407 | cnmemStatus_t releaseBlockUnsafe(Block *curr, Block *prev);
408 | /// Find the best free node based on the size.
409 | cnmemStatus_t findBestBlockUnsafe(Block *&curr, Block *&prev, std::size_t size);
410 | /// Extract a node from the list of free blocks.
411 | cnmemStatus_t extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen);
412 |
413 | /// Give a free block from that manager.
414 | cnmemStatus_t giveBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
415 | /// Steal a block from another manager.
416 | cnmemStatus_t stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size);
417 |
418 | /// The memory consumption of a list.
419 | cnmemStatus_t getMemoryUnsafe(std::size_t &memSize, const Block *head) const;
420 | /// Print an internal linked list.
421 | cnmemStatus_t printListUnsafe(FILE *file, const char *name, const Block *head) const;
422 | };
423 |
424 | ///////////////////////////////////////////////////////////////////////////////////////////////////
425 |
426 | Manager::Manager()
427 | : mParent(NULL)
428 | , mChildren()
429 | , mDevice(-1)
430 | , mStream(NULL)
431 | , mIsStreamBlocking(false)
432 | , mUsedBlocks(NULL)
433 | , mFreeBlocks(NULL)
434 | , mSize(0)
435 | , mFlags(CNMEM_FLAGS_DEFAULT)
436 | , mMutex() {
437 |
438 | mMutex.initialize();
439 | }
440 |
441 | ///////////////////////////////////////////////////////////////////////////////////////////////////
442 |
443 | Manager::~Manager() {
444 | if( mDevice == -1 || cudaSetDevice(mDevice) != cudaSuccess ) { // Invalid device, skip it.
445 | return;
446 | }
447 | releaseAllUnsafe();
448 | mMutex.finalize();
449 | }
450 |
451 | ///////////////////////////////////////////////////////////////////////////////////////////////////
452 |
453 | cnmemStatus_t Manager::addChild(Manager *manager) {
454 | CNMEM_CHECK(mMutex.lock());
455 | mChildren.push_back(manager);
456 | CNMEM_CHECK(mMutex.unlock());
457 | return CNMEM_STATUS_SUCCESS;
458 | }
459 |
460 | ///////////////////////////////////////////////////////////////////////////////////////////////////
461 |
462 | cnmemStatus_t Manager::allocate(void *&ptr, std::size_t size, bool isBlocking) {
463 | CNMEM_CHECK(mMutex.lock());
464 |
465 | // If the client is not blocking, we have to synchronize explicitly before handing out a buffer.
466 | if( !isBlocking ) {
467 | CNMEM_CHECK_CUDA_OR_UNLOCK(cudaStreamSynchronize(mStream), mMutex);
468 | }
469 |
470 | // Find the best fit.
471 | Block *best = NULL, *prev = NULL;
472 | CNMEM_CHECK_OR_UNLOCK(findBestBlockUnsafe(best, prev, size), mMutex);
473 |
474 | // If there's no free block of sufficient size left, request a new block.
475 | if( best == NULL && !(mFlags & CNMEM_FLAGS_CANNOT_GROW) ) {
476 | CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(best, prev, size), mMutex);
477 | }
478 |
479 | // Make sure we do have a block or quit.
480 | if( !best ) {
481 | ptr = NULL;
482 | CNMEM_CHECK(mMutex.unlock());
483 | return CNMEM_STATUS_OUT_OF_MEMORY;
484 | }
485 |
486 | // Split the free block if needed.
487 | CNMEM_CHECK_OR_UNLOCK(extractBlockUnsafe(best, prev, size, false), mMutex);
488 |
489 | // Push the node to the list of used nodes.
490 | best->setNext(mUsedBlocks);
491 | mUsedBlocks = best;
492 |
493 | // Return the new pointer into memory.
494 | ptr = mUsedBlocks->getData();
495 | CNMEM_CHECK(mMutex.unlock());
496 | return CNMEM_STATUS_SUCCESS;
497 | }
498 |
499 | ///////////////////////////////////////////////////////////////////////////////////////////////////
500 |
501 | cnmemStatus_t Manager::allocateBlockUnsafe(Block *&curr, Block *&prev, std::size_t size) {
502 | // Reset the outputs.
503 | curr = prev = NULL;
504 |
505 | // Try to allocate data from the parent or the device.
506 | void *data = NULL;
507 | if( mParent ) {
508 | CNMEM_CHECK(mParent->allocate(data, size, mIsStreamBlocking));
509 | }
510 | else {
511 | if (mFlags & CNMEM_FLAGS_MANAGED) {
512 | CNMEM_DEBUG_INFO("cudaMallocManaged(%lu)\n", size);
513 | CNMEM_CHECK_CUDA(cudaMallocManaged(&data, size));
514 | CNMEM_CHECK_CUDA(cudaMemPrefetchAsync(data, size, mDevice));
515 | }
516 | else {
517 | CNMEM_DEBUG_INFO("cudaMalloc(%lu)\n", size);
518 | CNMEM_CHECK_CUDA(cudaMalloc(&data, size));
519 | }
520 | CNMEM_DEBUG_INFO(">> returned address=0x%016lx\n", (size_t) data);
521 | }
522 |
523 | // If it failed, there's an unexpected issue.
524 | CNMEM_ASSERT(data);
525 |
526 | // We have data, we now need to add it to the list of free nodes. We keep the list sorted.
527 | Block *next = mFreeBlocks;
528 | for( ; next && next->getData() < data ; next = next->getNext() ) {
529 | prev = next;
530 | }
531 | curr = new Block((char*) data, size, next, true);
532 | if( !curr ) {
533 | return CNMEM_STATUS_OUT_OF_MEMORY;
534 | }
535 | if( prev ) {
536 | prev->setNext(curr);
537 | }
538 | else {
539 | mFreeBlocks = curr;
540 | }
541 |
542 | return CNMEM_STATUS_SUCCESS;
543 | }
544 |
545 | ///////////////////////////////////////////////////////////////////////////////////////////////////
546 |
547 | cnmemStatus_t Manager::extractBlockUnsafe(Block *curr, Block *prev, std::size_t size, bool stolen) {
548 | // Two cases: 1/ the block is exactly the right size so we keep it, or 2/ it is too large and we split it.
549 | Block *next;
550 | if( curr->getSize() == size ) {
551 | next = curr->getNext();
552 | }
553 | else {
554 | std::size_t remaining = curr->getSize()-size;
555 | Block *newBlock = new Block(curr->getData() + size, remaining, curr->getNext(), stolen);
556 | if( !newBlock ) {
557 | return CNMEM_STATUS_OUT_OF_MEMORY;
558 | }
559 | next = newBlock;
560 | curr->setSize(size);
561 | }
562 |
563 | // Redo the "branching" in the nodes.
564 | if( prev ) {
565 | prev->setNext(next);
566 | }
567 | else {
568 | mFreeBlocks = next;
569 | }
570 | return CNMEM_STATUS_SUCCESS;
571 | }
572 |
573 | ///////////////////////////////////////////////////////////////////////////////////////////////////
574 |
575 | cnmemStatus_t Manager::findBestBlockUnsafe(Block *&best, Block *&prev, std::size_t size) {
576 | best = NULL, prev = NULL;
577 | for( Block *temp = mFreeBlocks, *tempPrev = NULL ; temp ; temp = temp->getNext() ) {
578 | if( temp->getSize() >= size && (!best || temp->getSize() < best->getSize()) ) {
579 | best = temp;
580 | prev = tempPrev;
581 | }
582 | tempPrev = temp;
583 | }
584 | return CNMEM_STATUS_SUCCESS;
585 | }
586 |
587 | ///////////////////////////////////////////////////////////////////////////////////////////////////
588 |
589 | cnmemStatus_t Manager::getChildFromStream(Manager *&manager, cudaStream_t stream) const {
590 | CNMEM_CHECK(mMutex.lock());
591 | std::size_t i = 0, numChildren = mChildren.size();
592 | for( ; i < numChildren ; ++i ) {
593 | if( mChildren[i]->mStream == stream ) {
594 | manager = mChildren[i];
595 | break;
596 | }
597 | }
598 | CNMEM_CHECK(mMutex.unlock());
599 | return i < numChildren ? CNMEM_STATUS_SUCCESS : CNMEM_STATUS_INVALID_ARGUMENT;
600 | }
601 |
602 | ///////////////////////////////////////////////////////////////////////////////////////////////////
603 |
604 | cnmemStatus_t Manager::getChild(Manager *&manager, std::size_t i) const {
605 | CNMEM_CHECK(mMutex.lock());
606 | if( i >= mChildren.size() ) {
607 | CNMEM_CHECK(mMutex.unlock());
608 | return CNMEM_STATUS_INVALID_ARGUMENT;
609 | }
610 | manager = mChildren[i];
611 |
612 | CNMEM_CHECK(mMutex.unlock());
613 | return CNMEM_STATUS_SUCCESS;
614 | }
615 |
616 | ///////////////////////////////////////////////////////////////////////////////////////////////////
617 |
618 | cnmemStatus_t Manager::getMemoryUnsafe(std::size_t &size, const Block *head) const {
619 | size = 0;
620 | for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) {
621 | size += curr->getSize();
622 | }
623 | return CNMEM_STATUS_SUCCESS;
624 | }
625 |
626 | ///////////////////////////////////////////////////////////////////////////////////////////////////
627 |
628 | #if 0
629 | cnmemStatus_t Manager::getMemory(std::size_t &size, const Block *head) const {
630 | CNMEM_CHECK(mMutex.lock());
631 | CNMEM_CHECK_OR_UNLOCK(getMemoryUnsafe(size, head), mMutex);
632 | CNMEM_CHECK(mMutex.unlock());
633 | return CNMEM_STATUS_SUCCESS;
634 | }
635 | #endif
636 |
637 | ///////////////////////////////////////////////////////////////////////////////////////////////////
638 |
639 | cnmemStatus_t Manager::getNumChildren(std::size_t &numChildren) const {
640 | CNMEM_CHECK(mMutex.lock());
641 | numChildren = mChildren.size();
642 | CNMEM_CHECK(mMutex.unlock());
643 | return CNMEM_STATUS_SUCCESS;
644 | }
645 |
646 | ///////////////////////////////////////////////////////////////////////////////////////////////////
647 |
648 | cnmemStatus_t Manager::giveBlockUnsafe(void *&blockData, std::size_t &blockSize, std::size_t size) {
649 | // Make sure the block is not in use any more. It could be too coarse-grained and we may change
650 | // it in the future.
651 | CNMEM_CHECK_CUDA(cudaStreamSynchronize(mStream));
652 |
653 | // Init the returned values to 0.
654 | blockData = NULL;
655 | blockSize = 0;
656 |
657 | // Find the best node to steal and reserve it.
658 | Block *best = NULL, *prev = NULL;
659 | CNMEM_CHECK(findBestBlockUnsafe(best, prev, size));
660 | if( !best ) {
661 | return CNMEM_STATUS_OUT_OF_MEMORY;
662 | }
663 | CNMEM_CHECK(extractBlockUnsafe(best, prev, size, true));
664 | blockData = best->getData();
665 | blockSize = best->getSize();
666 |
667 | // Release the memory used by that block.
668 | delete best;
669 | return CNMEM_STATUS_SUCCESS;
670 | }
671 |
672 | ///////////////////////////////////////////////////////////////////////////////////////////////////
673 |
674 | cnmemStatus_t Manager::printListUnsafe(FILE *file, const char *name, const Block *head) const {
675 | std::size_t size = 0;
676 | for( Block *curr = (Block*) head; curr; curr = curr->getNext() ) {
677 | size += curr->getSize();
678 | }
679 | #ifdef CNMEM_BUILD_WITH_32_BIT_POINTERS
680 | fprintf(file, "| list=\"%s\", size=%u\n", name, size);
681 | for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) {
682 | fprintf(file, "| | node=0x%08x, data=0x%08x, size=%u, next=0x%08x, head=%2u\n",
683 | #else
684 | fprintf(file, "| list=\"%s\", size=%lu\n", name, size);
685 | for( Block *curr = (Block*) head ; curr ; curr = curr->getNext() ) {
686 | fprintf(file, "| | node=0x%016lx, data=0x%016lx, size=%lu, next=0x%016lx, head=%2lu\n",
687 | #endif
688 | (std::size_t) curr,
689 | (std::size_t) curr->getData(),
690 | (std::size_t) curr->getSize(),
691 | (std::size_t) curr->getNext(),
692 | (std::size_t) curr->isHead());
693 | }
694 | fprintf(file, "|\n");
695 | return CNMEM_STATUS_SUCCESS;
696 | }
697 |
698 | ///////////////////////////////////////////////////////////////////////////////////////////////////
699 |
700 | cnmemStatus_t Manager::printMemoryState(FILE *file) const {
701 | CNMEM_CHECK(mMutex.lock());
702 | std::size_t streamCode = (std::size_t) mStream;
703 | std::size_t usedMemory, freeMemory;
704 | CNMEM_CHECK_OR_UNLOCK(getUsedMemoryUnsafe(usedMemory), mMutex);
705 | CNMEM_CHECK_OR_UNLOCK(getFreeMemoryUnsafe(freeMemory), mMutex);
706 |
707 | #ifdef CNMEM_BUILD_WITH_32_BIT_POINTERS
708 | fprintf(file, ">> [%s] device=%d, stream=0x%08x, used=%uB, free=%uB\n",
709 | #else
710 | fprintf(file, ">> [%s] device=%d, stream=0x%016lx, used=%luB, free=%luB\n",
711 | #endif
712 | mParent ? "child" : "root",
713 | mDevice,
714 | streamCode,
715 | usedMemory,
716 | freeMemory);
717 | CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "used", mUsedBlocks), mMutex);
718 | CNMEM_CHECK_OR_UNLOCK(printListUnsafe(file, "free", mFreeBlocks), mMutex);
719 | fprintf(file, "\n");
720 | CNMEM_CHECK(mMutex.unlock());
721 |
722 | if( mParent ) {
723 | CNMEM_CHECK(mParent->printMemoryState(file));
724 | }
725 | return CNMEM_STATUS_SUCCESS;
726 | }
727 |
728 | ///////////////////////////////////////////////////////////////////////////////////////////////////
729 |
730 | cnmemStatus_t Manager::release(void *ptr) {
731 | // Skip if ptr is NULL.
732 | if( ptr == NULL ) {
733 | return CNMEM_STATUS_SUCCESS;
734 | }
735 |
736 | // Lock to make sure only one thread executes that fragment of code.
737 | CNMEM_CHECK(mMutex.lock());
738 |
739 | // Find the node in the list of used blocks.
740 | Block *curr = mUsedBlocks, *prev = NULL;
741 | for( ; curr && curr->getData() != ptr ; curr = curr->getNext() ) {
742 | prev = curr;
743 | }
744 |
745 | // Make sure we have found a node.
746 | if( curr == NULL ) {
747 | CNMEM_CHECK(mMutex.unlock());
748 | return CNMEM_STATUS_INVALID_ARGUMENT;
749 | }
750 |
751 | // We have the node so release it.
752 | cnmemStatus_t result = releaseBlockUnsafe(curr, prev);
753 | CNMEM_CHECK(mMutex.unlock());
754 | return result;
755 | }
756 |
757 | ///////////////////////////////////////////////////////////////////////////////////////////////////
758 |
759 | cnmemStatus_t Manager::releaseAllUnsafe() {
760 | // Destroy the children if any.
761 | for( std::size_t i = 0; i < mChildren.size(); ++i ) {
762 | Manager *child = mChildren[i];
763 | CNMEM_CHECK(child->releaseAllUnsafe());
764 | delete child;
765 | }
766 | mChildren.clear();
767 |
768 | // Destroy used blocks. It's a kind of panic mode to avoid leaks. NOTE: Do that only with roots!!!
769 | if( !mParent ) {
770 | while( mUsedBlocks ) {
771 | CNMEM_CHECK(releaseBlockUnsafe(mUsedBlocks, NULL));
772 | }
773 | }
774 |
775 | // Only free blocks that are head blocks should remain. Release those blocks.
776 | while( mFreeBlocks ) {
777 | if( mParent ) {
778 | CNMEM_CHECK(mParent->release(mFreeBlocks->getData()));
779 | }
780 | else if( mFreeBlocks->isHead() ) {
781 | void *data = mFreeBlocks->getData();
782 | CNMEM_DEBUG_INFO("cudaFree(%lu, 0x%016lx)\n", mFreeBlocks->getSize(), (size_t) data);
783 | CNMEM_CHECK_CUDA(cudaFree(data));
784 | CNMEM_DEBUG_INFO(">> success\n");
785 | }
786 | Block *block = mFreeBlocks;
787 | mFreeBlocks = mFreeBlocks->getNext();
788 | delete block;
789 | }
790 |
791 | // We shouldn't have any used block left; otherwise, the user is causing memory leaks!
792 | return CNMEM_STATUS_SUCCESS;
793 | }
794 |
795 | ///////////////////////////////////////////////////////////////////////////////////////////////////
796 |
797 | cnmemStatus_t Manager::releaseBlockUnsafe(Block *curr, Block *prev) {
798 | // The current node cannot be NULL!
799 | CNMEM_ASSERT(curr != NULL);
800 |
801 | // Change the connection of the node.
802 | if( prev ) {
803 | prev->setNext(curr->getNext());
804 | }
805 | else {
806 | mUsedBlocks = curr->getNext();
807 | }
808 |
809 | // Find the location where this block should be added to the free list.
810 | prev = NULL;
811 | Block *iter = mFreeBlocks;
812 | for( ; iter && iter->getData() < curr->getData() ; iter = iter->getNext() ) {
813 | prev = iter;
814 | }
815 |
816 | // Keep track of the successor of pred. We may lose track of it in the following "else".
817 | Block *next = prev ? prev->getNext() : mFreeBlocks;
818 |
819 | // First check if we can merge the block with its predecessor in the list (curr must not be a head block).
820 | if( prev && prev->getData() + prev->getSize() == curr->getData() && !curr->isHead() ) {
821 | prev->setSize(prev->getSize() + curr->getSize());
822 | delete curr;
823 | curr = prev;
824 | }
825 | else if( prev ) {
826 | prev->setNext(curr);
827 | }
828 | else {
829 | mFreeBlocks = curr;
830 | }
831 |
832 | // Check if we can merge curr and next. We can't merge over "cudaMalloc" boundaries.
833 | if( next && curr->getData() + curr->getSize() == next->getData() && !next->isHead() ) {
834 | curr->setSize(curr->getSize() + next->getSize());
835 | curr->setNext(next->getNext());
836 | delete next;
837 | }
838 | else {
839 | curr->setNext(next);
840 | }
841 | return CNMEM_STATUS_SUCCESS;
842 | }
843 |
844 | ///////////////////////////////////////////////////////////////////////////////////////////////////
845 |
846 | cnmemStatus_t Manager::reserve(std::size_t size) {
847 | CNMEM_CHECK(mMutex.lock());
848 | Block *curr, *prev;
849 | CNMEM_CHECK_OR_UNLOCK(allocateBlockUnsafe(curr, prev, size), mMutex);
850 | mSize = size;
851 | CNMEM_CHECK(mMutex.unlock());
852 | return CNMEM_STATUS_SUCCESS;
853 | }
854 |
855 | ///////////////////////////////////////////////////////////////////////////////////////////////////
856 |
857 | cnmemStatus_t Manager::stealUnsafe(void *&stolen, std::size_t size) {
858 | // If we cannot steal, don't even try.
859 | if( mFlags & CNMEM_FLAGS_CANNOT_STEAL ) {
860 | stolen = NULL;
861 | return CNMEM_STATUS_INVALID_ARGUMENT;
862 | }
863 |
864 | // The stolen block.
865 | void *data = NULL; std::size_t dataSize = 0;
866 | if( !mChildren.empty() ) {
867 | CNMEM_CHECK(stealBlockUnsafe(data, dataSize, size));
868 | }
869 | else if( mParent ) {
870 | CNMEM_CHECK(mParent->stealBlockUnsafe(data, dataSize, size));
871 | }
872 |
873 | // Make sure we do have a block of memory or quit.
874 | if( !data ) {
875 | stolen = NULL;
876 | return CNMEM_STATUS_OUT_OF_MEMORY;
877 | }
878 |
879 | // Push the block in the used list.
880 | mUsedBlocks = new Block((char*) data, dataSize, mUsedBlocks, true);
881 | if( !mUsedBlocks ) {
882 | return CNMEM_STATUS_OUT_OF_MEMORY;
883 | }
884 |
885 | // Return the new pointer into memory.
886 | stolen = data;
887 | return CNMEM_STATUS_SUCCESS;
888 | }
889 |
890 | ///////////////////////////////////////////////////////////////////////////////////////////////////
891 |
892 | cnmemStatus_t Manager::stealBlockUnsafe(void *&data, std::size_t &dataSize, std::size_t size) {
893 | // No block found and no room to grow. Try to steal from a child (if we have any).
894 | data = NULL;
895 | for( std::size_t i = 0 ; !data && i < mChildren.size() ; ++i ) {
896 | Manager *child = mChildren[i];
897 | if( child->giveBlockUnsafe(data, dataSize, size) == CNMEM_STATUS_SUCCESS ) {
898 | break;
899 | }
900 | }
901 |
902 | // If no memory space was found, we have failed to allocate; quit miserably.
903 | if( !data ) {
904 | return CNMEM_STATUS_OUT_OF_MEMORY;
905 | }
906 |
907 | // We got a block from a child. We need to update our "used" list before we can do
908 | // anything with it.
909 | Block *curr = mUsedBlocks, *prev = NULL;
910 | for( ; curr ; curr = curr->getNext() ) {
911 | if( curr->getData() <= data && data < curr->getData()+curr->getSize() ) {
912 | break;
913 | }
914 | prev = curr;
915 | }
916 |
917 | // Curr points to the node which contains that memory region.
918 | CNMEM_ASSERT(curr);
919 |
920 | // If it is exactly the same memory region, we are done!!!
921 | if( curr->getData() == data && curr->getSize() == dataSize ) {
922 | return CNMEM_STATUS_SUCCESS;
923 | }
924 |
925 | // Track the blocks before and after curr.
926 | Block *next = curr->getNext();
927 |
928 | // We may have up to 3 blocks.
929 | std::size_t sizeBefore = (std::size_t) ((char*) data - curr->getData());
930 | std::size_t sizeAfter = (curr->getSize() - sizeBefore - dataSize);
931 |
932 | // The resulting block.
933 | Block *result = curr;
934 |
935 | // If we have no space between curr->getData and block->getData.
936 | if( sizeBefore == 0 ) {
937 | curr->setSize(dataSize);
938 | }
939 | else {
940 | curr->setSize(sizeBefore);
941 | Block *block = new Block((char*) data, dataSize, next, false);
942 | if( !block ) {
943 | return CNMEM_STATUS_OUT_OF_MEMORY;
944 | }
945 | curr->setNext(block);
946 | curr = block;
947 | data = (char*) data + dataSize;
948 | dataSize = sizeAfter;
949 | result = block;
950 | }
951 |
952 | // We have space at the end so we may need to add a new node.
953 | if( sizeAfter > 0 ) {
954 | Block *block = new Block(curr->getData() + curr->getSize(), sizeAfter, next, false);
955 | if( !block ) {
956 | return CNMEM_STATUS_OUT_OF_MEMORY;
957 | }
958 | curr->setNext(block);
959 | curr = block;
960 | }
961 | return CNMEM_STATUS_SUCCESS;
962 | }
963 |
964 | ///////////////////////////////////////////////////////////////////////////////////////////////////
965 |
966 | class Context {
967 | /// Use a magic number to specify that the context is valid.
968 | enum { CTX_VALID = 0x1f5632a3 };
969 |
970 | /// The reference counting mechanism.
971 | int mRefCount;
972 | /// The mutex to increase/decrease the reference counter. TODO: Use atomics.
973 | Mutex mMutex;
974 | /// The memory managers.
975 | std::vector<Manager> mManagers;
976 | /// The global context.
977 | static Context *sCtx;
978 | /// Use a magic number to specify that the context was created.
979 | static int sCtxCheck;
980 |
981 | public:
982 | /// Ctor.
983 | Context() : mRefCount(1) { mMutex.initialize(); }
984 | /// Dtor.
985 | ~Context();
986 | /// Get the managers.
987 | inline std::vector<Manager>& getManagers() { return mManagers; }
988 | /// Get a single manager associated with a device.
989 | inline Manager& getManager(int i) { return mManagers[i]; }
990 |
991 | /// Create the global context.
992 | static cnmemStatus_t create();
993 | /// Check that the context was created.
994 | static inline bool check() { return sCtxCheck == CTX_VALID && sCtx; }
995 | /// Get the global context.
996 | static Context* get();
997 | /// Retain.
998 | static cnmemStatus_t retain();
999 | /// Release.
1000 | static cnmemStatus_t release();
1001 | };
1002 |
1003 | Context *Context::sCtx;
1004 | int Context::sCtxCheck;
1005 |
1006 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1007 |
1008 | Context::~Context() {
1009 | int oldDevice;
1010 | cudaGetDevice(&oldDevice);
1011 | for( std::size_t i = 0 ; i < mManagers.size() ; ++i ) {
1012 | if( mManagers[i].getDevice() != -1 ) { // Skip invalid managers.
1013 | cudaSetDevice(mManagers[i].getDevice());
1014 | mManagers[i].releaseAllUnsafe();
1015 | }
1016 | }
1017 | mManagers.clear();
1018 | mMutex.finalize();
1019 | cudaSetDevice(oldDevice);
1020 | }
1021 |
1022 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1023 |
1024 | cnmemStatus_t Context::create() {
1025 | sCtx = new Context;
1026 | sCtxCheck = CTX_VALID;
1027 | return CNMEM_STATUS_SUCCESS;
1028 | }
1029 |
1030 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1031 |
1032 | Context* Context::get() {
1033 | CNMEM_ASSERT(Context::check());
1034 | return Context::sCtx;
1035 | }
1036 |
1037 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1038 |
1039 | cnmemStatus_t Context::retain() {
1040 | CNMEM_CHECK(sCtx->mMutex.lock());
1041 | sCtx->mRefCount++;
1042 | CNMEM_CHECK(sCtx->mMutex.unlock());
1043 | return CNMEM_STATUS_SUCCESS;
1044 | }
1045 |
1046 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1047 |
1048 | cnmemStatus_t Context::release() {
1049 | CNMEM_CHECK(sCtx->mMutex.lock());
1050 | int refCount = --sCtx->mRefCount;
1051 | CNMEM_CHECK(sCtx->mMutex.unlock());
1052 |
1053 | if( refCount == 0 ) { // Kill the context.
1054 | delete sCtx;
1055 | Context::sCtx = NULL;
1056 | Context::sCtxCheck = 0;
1057 | }
1058 | return CNMEM_STATUS_SUCCESS;
1059 | }
1060 |
1061 | } // namespace cnmem
1062 |
1063 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1064 |
1065 | extern "C" {
1066 |
1067 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1068 |
1069 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1070 |
1071 | cnmemStatus_t cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags) {
1072 | // Make sure we have at least one device declared.
1073 | CNMEM_CHECK_TRUE(numDevices > 0, CNMEM_STATUS_INVALID_ARGUMENT);
1074 |
1075 | // Find the largest ID of the device.
1076 | int maxDevice = 0;
1077 | for( int i = 0 ; i < numDevices ; ++i ) {
1078 | if( devices[i].device > maxDevice ) {
1079 | maxDevice = devices[i].device;
1080 | }
1081 | }
1082 |
1083 | // Create the global context.
1084 | cnmem::Context::create();
1085 | cnmem::Context *ctx = cnmem::Context::get();
1086 |
1087 | // Allocate enough managers.
1088 | CNMEM_CHECK_TRUE(maxDevice >= 0, CNMEM_STATUS_INVALID_ARGUMENT);
1089 | std::vector<cnmem::Manager> &managers = ctx->getManagers();
1090 | managers.resize(maxDevice+1);
1091 |
1092 | // Create a root manager for each device and create the children.
1093 | int oldDevice;
1094 | CNMEM_CHECK_CUDA(cudaGetDevice(&oldDevice));
1095 | for( int i = 0 ; i < numDevices ; ++i ) {
1096 | CNMEM_CHECK_CUDA(cudaSetDevice(devices[i].device));
1097 | std::size_t size = devices[i].size;
1098 | cudaDeviceProp props;
1099 | CNMEM_CHECK_CUDA(cudaGetDeviceProperties(&props, devices[i].device));
1100 | if( size == 0 ) {
1101 | size = props.totalGlobalMem / 2;
1102 | }
1103 | CNMEM_CHECK_TRUE(
1104 | size > 0 && size < props.totalGlobalMem, CNMEM_STATUS_INVALID_ARGUMENT);
1105 |
1106 | cnmem::Manager &manager = ctx->getManager(devices[i].device);
1107 | manager.setDevice(devices[i].device);
1108 | manager.setFlags(flags);
1109 |
1110 | size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
1111 | CNMEM_CHECK(manager.reserve(size));
1112 |
1113 | for( int j = 0 ; j < devices[i].numStreams ; ++j ) {
1114 | cnmem::Manager *child = new cnmem::Manager;
1115 | child->setParent(&manager);
1116 | child->setDevice(devices[i].device);
1117 | child->setStream(devices[i].streams[j]);
1118 | child->setFlags(flags & ~CNMEM_FLAGS_CANNOT_GROW);
1119 | if( devices[i].streamSizes && devices[i].streamSizes[j] > 0 ) {
1120 | // https://docs.nvidia.com/cuda/cuda-c-best-practices-guide/index.html#sequential-but-misaligned-access-pattern
1121 | // Round stream sizes up so stream base addresses are aligned to CNMEM_GRANULARITY.
1122 | devices[i].streamSizes[j] = cnmem::ceilInt(devices[i].streamSizes[j], CNMEM_GRANULARITY);
1123 | CNMEM_CHECK(child->reserve(devices[i].streamSizes[j]));
1124 | }
1125 | CNMEM_CHECK(manager.addChild(child));
1126 | }
1127 | }
1128 | CNMEM_CHECK_CUDA(cudaSetDevice(oldDevice));
1129 | return CNMEM_STATUS_SUCCESS;
1130 | }
1131 |
1132 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1133 |
1134 | cnmemStatus_t cnmemFinalize() {
1135 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
1136 | return cnmem::Context::release();
1137 | }
1138 |
1139 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1140 |
1141 | cnmemStatus_t cnmemRetain() {
1142 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
1143 | return cnmem::Context::retain();
1144 | }
1145 |
1146 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1147 |
1148 | cnmemStatus_t cnmemRelease() {
1149 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
1150 | return cnmem::Context::release();
1151 | }
1152 |
1153 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1154 |
1155 | cnmemStatus_t cnmemRegisterStream(cudaStream_t stream) {
1156 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
1157 | CNMEM_CHECK_TRUE(stream, CNMEM_STATUS_INVALID_ARGUMENT);
1158 |
1159 | int device;
1160 | CNMEM_CHECK_CUDA(cudaGetDevice(&device));
1161 |
1162 | cnmem::Manager &root = cnmem::Context::get()->getManager(device);
1163 | cnmem::Manager *child = new cnmem::Manager;
1164 | child->setParent(&root);
1165 | child->setDevice(device);
1166 | child->setStream(stream);
1167 | child->setFlags(root.getFlags() & ~CNMEM_FLAGS_CANNOT_GROW);
1168 | root.addChild(child);
1169 |
1170 | return CNMEM_STATUS_SUCCESS;
1171 | }
1172 |
1173 | ///////////////////////////////////////////////////////////////////////////////////////////////////
1174 |
1175 | cnmemStatus_t cnmemMalloc(void **ptr, std::size_t size, cudaStream_t stream) {
1176 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED);
1177 | if( !ptr && !size ) {
1178 | return CNMEM_STATUS_SUCCESS;
1179 | }
1180 | else if( !size ) {
1181 | ptr[0] = NULL;
1182 | return CNMEM_STATUS_SUCCESS;
1183 | }
1184 | CNMEM_CHECK_TRUE(ptr, CNMEM_STATUS_INVALID_ARGUMENT);
1185 |
1186 | int device;
1187 | CNMEM_CHECK_CUDA(cudaGetDevice(&device));
1188 |
1189 | cnmem::Manager &root = cnmem::Context::get()->getManager(device);
1190 | cnmem::Manager *manager = &root;
1191 | if( stream ) {
1192 | CNMEM_CHECK(root.getChildFromStream(manager, stream));
1193 | }
1194 | CNMEM_ASSERT(manager);
1195 |
1196 | size = cnmem::ceilInt(size, CNMEM_GRANULARITY);
1197 | cnmemStatus_t result = manager->allocate(ptr[0], size);
1198 |
1199 | // We failed to allocate but there might still be a buffer available in another manager. Try to
1200 | // steal it.
1201 | if( result == CNMEM_STATUS_OUT_OF_MEMORY ) {
1202 |
1203 | // Try to acquire locks on all the children.
1204 | std::size_t numChildren;
1205 | CNMEM_CHECK(root.getNumChildren(numChildren));
1206 | std::vector<const cnmem::Mutex*> mutexes(numChildren);
1207 |
1208 | std::size_t numLocked = 0;
1209 | for( size_t i = 0 ; i < numChildren ; ++i, ++numLocked ) {
1210 | cnmem::Manager *child;
1211 | CNMEM_CHECK(root.getChild(child, i));
1212 | mutexes[numLocked] = child->getMutex();
1213 | if( mutexes[numLocked]->lock() != CNMEM_STATUS_SUCCESS ) {
1214 | break;
1215 | }
1216 | }
1217 |
1218 | // One lock failed, quit. Reduce the damage as much as possible, though.
1219 | if( numLocked != numChildren ) {
1220 | for( std::size_t i = 0 ; i < numLocked ; ++i ) {
1221 | cnmemStatus_t lockStatus = mutexes[i]->unlock();
1222 | }
1223 | return CNMEM_STATUS_UNKNOWN_ERROR;
1224 | }
1225 |
1226 | // Grab the lock on the root, first.
1227 | const cnmem::Mutex *rootMutex = root.getMutex();
1228 | CNMEM_CHECK(rootMutex->lock());
1229 |
1230 | // We acquired all the locks so we try to steal a node from another child.
1231 | if( numLocked == mutexes.size() ) {
1232 | result = manager->stealUnsafe(ptr[0], size);
1233 | }
1234 | for( std::size_t i = 0 ; i < numLocked ; ++i ) {
1235 | cnmemStatus_t lockStatus = mutexes[i]->unlock();
1236 | if( lockStatus != CNMEM_STATUS_SUCCESS ) {
1237 | // Starting from now we are panicking!!! One lock failed to be released; we try
1238 | // the others. We could also give up because we are already screwed. I don't know
1239 | // what's best! Comments are welcome.
1240 | result = lockStatus; 1241 | } 1242 | } 1243 | CNMEM_CHECK(rootMutex->unlock()); 1244 | } 1245 | return result; 1246 | } 1247 | 1248 | /////////////////////////////////////////////////////////////////////////////////////////////////// 1249 | 1250 | cnmemStatus_t cnmemFree(void *ptr, cudaStream_t stream) { 1251 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); 1252 | if( ptr == NULL ) { 1253 | return CNMEM_STATUS_SUCCESS; 1254 | } 1255 | 1256 | int device; 1257 | CNMEM_CHECK_CUDA(cudaGetDevice(&device)); 1258 | 1259 | cnmem::Manager &root = cnmem::Context::get()->getManager(device); 1260 | cnmem::Manager *manager = &root; 1261 | if( stream ) { 1262 | CNMEM_CHECK(root.getChildFromStream(manager, stream)); 1263 | } 1264 | CNMEM_ASSERT(manager); 1265 | return manager->release(ptr); 1266 | } 1267 | 1268 | /////////////////////////////////////////////////////////////////////////////////////////////////// 1269 | 1270 | cnmemStatus_t cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream) { 1271 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); 1272 | CNMEM_CHECK_TRUE(totalMem && freeMem, CNMEM_STATUS_INVALID_ARGUMENT); 1273 | 1274 | int device; 1275 | CNMEM_CHECK_CUDA(cudaGetDevice(&device)); 1276 | cnmem::Manager &root = cnmem::Context::get()->getManager(device); 1277 | cnmem::Manager *manager = &root; 1278 | if( stream ) { 1279 | CNMEM_CHECK(root.getChildFromStream(manager, stream)); 1280 | } 1281 | CNMEM_ASSERT(manager); 1282 | 1283 | const cnmem::Mutex *mutex = manager->getMutex(); 1284 | CNMEM_CHECK(mutex->lock()); 1285 | CNMEM_CHECK_OR_UNLOCK(manager->getFreeMemoryUnsafe(*freeMem), *mutex); 1286 | size_t usedMem; 1287 | CNMEM_CHECK_OR_UNLOCK(manager->getUsedMemoryUnsafe(usedMem), *mutex); 1288 | CNMEM_CHECK(mutex->unlock()); 1289 | totalMem[0] = usedMem + freeMem[0]; 1290 | return CNMEM_STATUS_SUCCESS; 1291 | } 1292 | 1293 | /////////////////////////////////////////////////////////////////////////////////////////////////// 1294 | 1295 | cnmemStatus_t cnmemPrintMemoryState(FILE *file, cudaStream_t stream) { 1296 | CNMEM_CHECK_TRUE(cnmem::Context::check(), CNMEM_STATUS_NOT_INITIALIZED); 1297 | 1298 | int device; 1299 | CNMEM_CHECK_CUDA(cudaGetDevice(&device)); 1300 | cnmem::Manager &root = cnmem::Context::get()->getManager(device); 1301 | cnmem::Manager *manager = &root; 1302 | if( stream ) { 1303 | CNMEM_CHECK(root.getChildFromStream(manager, stream)); 1304 | } 1305 | CNMEM_ASSERT(manager); 1306 | return manager->printMemoryState(file); 1307 | } 1308 | 1309 | /////////////////////////////////////////////////////////////////////////////////////////////////// 1310 | 1311 | } // extern "C" 1312 | 1313 | --------------------------------------------------------------------------------
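
A minimal usage sketch (not part of the repository), assuming only the public API declared in include/cnmem.h and implemented above: cnmemInit() reserves a pool on a device, cnmemMalloc()/cnmemFree() carve blocks out of it, cnmemMemGetInfo() inspects it, and cnmemFinalize() releases everything. The file name and the sizes are illustrative.

// usage_sketch.cpp -- illustrative only, not part of the CNMeM sources.
#include <cstdio>
#include <cstring>
#include <cuda_runtime_api.h>
#include "cnmem.h"

int main() {
    // Describe the pool: device 0, a 1 MB reservation. A size of 0 would make
    // cnmemInit() reserve half of the device memory instead (see cnmemInit above).
    cnmemDevice_t device;
    std::memset(&device, 0, sizeof(device));
    device.device = 0;
    device.size = 1024 * 1024;
    if( cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT) != CNMEM_STATUS_SUCCESS ) {
        return 1;
    }

    // Allocate from the pool. A NULL stream routes the request to the root manager.
    // Sizes are rounded up to CNMEM_GRANULARITY (512 bytes) internally.
    void *ptr = NULL;
    cnmemStatus_t status = cnmemMalloc(&ptr, 256 * 1024, NULL);
    if( status != CNMEM_STATUS_SUCCESS ) {
        std::printf("cnmemMalloc failed: %s\n", cnmemGetErrorString(status));
    }

    // Inspect the pool: free + used = total managed memory.
    std::size_t freeMem, totalMem;
    cnmemMemGetInfo(&freeMem, &totalMem, NULL);
    std::printf("pool: %zu bytes free of %zu\n", freeMem, totalMem);

    cnmemFree(ptr, NULL);
    return cnmemFinalize() == CNMEM_STATUS_SUCCESS ? 0 : 1;
}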
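
A second sketch (again illustrative, not part of the repository) of the per-stream path exercised by the tests above: a stream registered with cnmemRegisterStream() gets its own child manager, and allocations tagged with that stream are served from the child's pool, which may grow or steal from sibling streams unless CNMEM_FLAGS_CANNOT_GROW / CNMEM_FLAGS_CANNOT_STEAL were passed to cnmemInit().

// stream_sketch.cpp -- illustrative only, not part of the CNMeM sources.
#include <cstring>
#include <cuda_runtime_api.h>
#include "cnmem.h"

int main() {
    cnmemDevice_t device;
    std::memset(&device, 0, sizeof(device));
    device.size = 8 * 1024 * 1024; // 8 MB pool on device 0.
    if( cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT) != CNMEM_STATUS_SUCCESS ) {
        return 1;
    }

    cudaStream_t stream;
    if( cudaStreamCreate(&stream) != cudaSuccess ) {
        return 1;
    }
    cnmemRegisterStream(stream); // Creates a child manager bound to 'stream'.

    void *buf = NULL;
    cnmemMalloc(&buf, 4096, stream); // Served by the child manager for 'stream'.
    cnmemFree(buf, stream);          // Release with the same stream it was allocated on.

    cudaStreamDestroy(stream);
    return cnmemFinalize() == CNMEM_STATUS_SUCCESS ? 0 : 1;
}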