├── managed_allocator.hpp
├── demo.cu
└── README.md

-------------------------------------------------------------------------------- /managed_allocator.hpp: --------------------------------------------------------------------------------
#pragma once

#include <thrust/system_error.h>
#include <thrust/system/cuda/error.h>

// An Allocator-conforming class that hands out CUDA managed (unified) memory,
// so a std::vector using it is addressable from both host code and kernels.
template<class T>
class managed_allocator
{
  public:
    using value_type = T;
    using reference = T&;
    using const_reference = const T&;

    managed_allocator() {}

    // Converting constructor required for allocator rebinding; the allocator
    // is stateless, so there is nothing to copy.
    template<class U>
    managed_allocator(const managed_allocator<U>&) {}

    // Allocates storage for n objects of T in managed memory.
    // Throws thrust::system_error wrapping the CUDA error on failure.
    value_type* allocate(size_t n)
    {
      value_type* result = nullptr;

      cudaError_t error = cudaMallocManaged(&result, n*sizeof(T), cudaMemAttachGlobal);

      if(error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::allocate(): cudaMallocManaged");
      }

      return result;
    }

    // Returns storage obtained from allocate() to the CUDA runtime.
    // Throws thrust::system_error wrapping the CUDA error on failure.
    void deallocate(value_type* ptr, size_t)
    {
      cudaError_t error = cudaFree(ptr);

      if(error != cudaSuccess)
      {
        throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::deallocate(): cudaFree");
      }
    }
};

// All managed_allocators compare equal: they are stateless, so memory
// allocated by one instance may be deallocated by any other.
template<class T1, class T2>
bool operator==(const managed_allocator<T1>&, const managed_allocator<T2>&)
{
  return true;
}

template<class T1, class T2>
bool operator!=(const managed_allocator<T1>& lhs, const managed_allocator<T2>& rhs)
{
  return !(lhs == rhs);
}

-------------------------------------------------------------------------------- /demo.cu: --------------------------------------------------------------------------------
#include "managed_allocator.hpp"
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
#include <thrust/logical.h>
#include <vector>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <iostream>

// create a nickname for vectors which use a managed_allocator
template<class T>
using managed_vector = std::vector<T, managed_allocator<T>>;

// Adds 1 to each of the n elements of data.
// Launched with a 1D grid; threads past the end of the array do nothing.
__global__ void increment_kernel(int *data, size_t n)
{
  size_t i = blockDim.x * blockIdx.x + threadIdx.x;

  if(i < n)
  {
    data[i] += 1;
  }
}

int main()
{
  size_t n = 1 << 20;

  managed_vector<int> vec(n);

  // we can use the vector from the host
  std::iota(vec.begin(), vec.end(), 0);

  std::vector<int> ref(n);
  std::iota(ref.begin(), ref.end(), 0);
  assert(std::equal(ref.begin(), ref.end(), vec.begin()));

  // we can also use it in a CUDA kernel
  size_t block_size = 256;
  size_t num_blocks = (n + (block_size - 1)) / block_size;  // ceil-div so the tail is covered

  increment_kernel<<<num_blocks, block_size>>>(vec.data(), vec.size());

  // the kernel launch is asynchronous; synchronize before touching vec on the host
  cudaDeviceSynchronize();

  std::for_each(ref.begin(), ref.end(), [](int& x)
  {
    x += 1;
  });

  assert(std::equal(ref.begin(), ref.end(), vec.begin()));

  // we can also use it with Thrust algorithms

  // by default, the Thrust algorithm will execute on the host with the managed_vector
  thrust::fill(vec.begin(), vec.end(), 7);
  assert(std::all_of(vec.begin(), vec.end(), [](int x)
  {
    return x == 7;
  }));

  // to execute on the device, use the thrust::device execution policy
  thrust::fill(thrust::device, vec.begin(), vec.end(), 13);

  // we need to synchronize before attempting to use the vector on the host
  cudaDeviceSynchronize();

  // to execute on the host, use the thrust::host execution policy
  assert(thrust::all_of(thrust::host, vec.begin(), vec.end(), [](int x)
  {
    return x == 13;
  }));

  std::cout << "OK" << std::endl;

  return 0;
}

-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# `managed_allocator`

A C++ allocator based on `cudaMallocManaged()`. 

To create a custom C++ allocator which allocates storage using `cudaMallocManaged`, we need to make a class and give it `.allocate()` and `.deallocate()` functions.

The `.allocate()` function calls `cudaMallocManaged` and throws an exception if it fails:

```cpp
value_type* allocate(size_t n)
{
  value_type* result = nullptr;

  cudaError_t error = cudaMallocManaged(&result, n*sizeof(T), cudaMemAttachGlobal);

  if(error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::allocate(): cudaMallocManaged");
  }

  return result;
}
```

The `.deallocate()` function can just call `cudaFree()`:

```cpp
void deallocate(value_type* ptr, size_t)
{
  cudaError_t error = cudaFree(ptr);

  if(error != cudaSuccess)
  {
    throw thrust::system_error(error, thrust::cuda_category(), "managed_allocator::deallocate(): cudaFree");
  }
}
```

Here's a program which demonstrates some of the different things you can do with it:

```cpp
#include "managed_allocator.hpp"
#include <thrust/fill.h>
#include <thrust/execution_policy.h>
#include <thrust/logical.h>
#include <vector>
#include <algorithm>
#include <numeric>
#include <cassert>
#include <iostream>

// create a nickname for vectors which use a managed_allocator
template<class T>
using managed_vector = std::vector<T, managed_allocator<T>>;

__global__ void increment_kernel(int *data, size_t n)
{
  size_t i = blockDim.x * blockIdx.x + threadIdx.x;

  if(i < n)
  {
    data[i] += 1;
  }
}

int main()
{
  size_t n = 1 << 20;

  managed_vector<int> vec(n);

  // we can use the vector from the host
  std::iota(vec.begin(), vec.end(), 0);

  std::vector<int> ref(n);
  std::iota(ref.begin(), ref.end(), 0);
  assert(std::equal(ref.begin(), ref.end(), vec.begin()));

  // we can also use it in a CUDA kernel
  size_t block_size = 256;
  size_t num_blocks = 
(n + (block_size - 1)) / block_size;

  increment_kernel<<<num_blocks, block_size>>>(vec.data(), vec.size());

  cudaDeviceSynchronize();

  std::for_each(ref.begin(), ref.end(), [](int& x)
  {
    x += 1;
  });

  assert(std::equal(ref.begin(), ref.end(), vec.begin()));

  // we can also use it with Thrust algorithms

  // by default, the Thrust algorithm will execute on the host with the managed_vector
  thrust::fill(vec.begin(), vec.end(), 7);
  assert(std::all_of(vec.begin(), vec.end(), [](int x)
  {
    return x == 7;
  }));

  // to execute on the device, use the thrust::device execution policy
  thrust::fill(thrust::device, vec.begin(), vec.end(), 13);

  // we need to synchronize before attempting to use the vector on the host
  cudaDeviceSynchronize();

  // to execute on the host, use the thrust::host execution policy
  assert(thrust::all_of(thrust::host, vec.begin(), vec.end(), [](int x)
  {
    return x == 13;
  }));

  std::cout << "OK" << std::endl;

  return 0;
}
```
--------------------------------------------------------------------------------