├── Makefile
├── README.md
├── common.cc
├── common.hh
├── gemmManaged.cu
├── gemmManagedOutOfCore.cu
├── gemmManagedPrefetch.cu
├── gemmMemcpy.cu
├── gemmXtOutOfCore.cu
├── simpleDMA.cu
├── simpleManaged.cu
├── simpleManagedPrefetch.cu
├── simpleMemcpy.cu
└── stridedManaged.cu

/Makefile:
--------------------------------------------------------------------------------

NVCC = nvcc --compiler-options="-Wall -Wextra -O3" -std=c++11 -arch=compute_61 -code=sm_61 -lcublas

default: common.o simpleMemcpy simpleManaged simpleManagedPrefetch simpleDMA stridedManaged gemmMemcpy gemmManaged gemmManagedPrefetch gemmXtOutOfCore gemmManagedOutOfCore

simpleMemcpy: Makefile simpleMemcpy.cu common.o
	$(NVCC) -o simpleMemcpy simpleMemcpy.cu common.o

simpleManaged: Makefile simpleManaged.cu common.o
	$(NVCC) -o simpleManaged simpleManaged.cu common.o

simpleManagedPrefetch: Makefile simpleManagedPrefetch.cu common.o
	$(NVCC) -o simpleManagedPrefetch simpleManagedPrefetch.cu common.o

simpleDMA: Makefile simpleDMA.cu common.o
	$(NVCC) -o simpleDMA simpleDMA.cu common.o

stridedManaged: Makefile stridedManaged.cu common.o
	$(NVCC) -o stridedManaged stridedManaged.cu common.o

gemmMemcpy: Makefile gemmMemcpy.cu common.o
	$(NVCC) -o gemmMemcpy gemmMemcpy.cu common.o

gemmManaged: Makefile gemmManaged.cu common.o
	$(NVCC) -o gemmManaged gemmManaged.cu common.o

gemmManagedPrefetch: Makefile gemmManagedPrefetch.cu common.o
	$(NVCC) -o gemmManagedPrefetch gemmManagedPrefetch.cu common.o

gemmXtOutOfCore: Makefile gemmXtOutOfCore.cu common.o
	$(NVCC) -o gemmXtOutOfCore gemmXtOutOfCore.cu common.o

gemmManagedOutOfCore: Makefile gemmManagedOutOfCore.cu common.o
	$(NVCC) -o gemmManagedOutOfCore gemmManagedOutOfCore.cu common.o

common.o: Makefile common.cc common.hh
	$(NVCC) -c common.cc

clean:
	rm -f common.o simpleMemcpy simpleManaged simpleManagedPrefetch simpleDMA stridedManaged gemmMemcpy gemmManaged gemmManagedPrefetch gemmXtOutOfCore gemmManagedOutOfCore

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# CUDA benchmarks for unified vs. explicit memory

## Unified memory

Unified memory has been a feature of game consoles for many years. It simplifies game development because it frees the programmer from having to track whether a block of memory currently resides in CPU or GPU memory.

Starting with the Pascal architecture, Nvidia also offers [advanced unified memory support](https://devblogs.nvidia.com/unified-memory-cuda-beginners/).

Game consoles can rely on tighter hardware control, but Nvidia also has the [Jetson product line](https://en.wikipedia.org/wiki/Nvidia_Jetson) with physically unified memory, which has been [reported](https://devtalk.nvidia.com/default/topic/1029853/does-unified-memory-and-zero-copy-always-better-than-cudamemcpy-/) to perform better than explicit memory management.

## Unified memory reputation

There have been several reports, many of them from around 2014, that unified memory performs worse in many scenarios. These reports usually do not come with links to actual code.
Recent reports appear to address only specialized scenarios like [8 NVLink GPUs](https://devtalk.nvidia.com/default/topic/1029706/cuda-programming-and-performance/partial-fail-of-peer-access-in-8-volta-gpu-instance-p3-16xlarge-on-aws-gt-huge-slowdown-/); that report, too, comes without a code link.

## Unified memory reality

This benchmark suite attempts to provide actual code so that people can check for themselves. It is incomplete and does not yet address scenarios like IPC or multiple GPUs. Benchmarks that show the superiority of explicit memory management are welcome.

## Examples

The examples were run on Linux with CUDA release 9.2, V9.2.148, using 16GB of host memory and a GeForce 1060 with 6GB of GPU memory.

### simpleManaged vs. simpleMemcpy vs. simpleDMA

This benchmark initializes three arrays, runs a kernel that multiplies two of them elementwise into the third, and then accesses all arrays on the host. Most of the time is spent on memory transfers; the kernel runtime is negligible.

With N=200000000, explicit memory management performs slightly better than managed memory, but DMA (the kernel reading and writing pinned host memory directly) is faster than both:

```
$ ./simpleManaged 200000000
host: MallocManaged: 0.000040
host: init arrays: 0.662895
device: uvm+compute+synchronize: 0.892010
host: access all arrays: 0.929058
host: access all arrays a second time: 0.245681
host: free: 0.176788
total: 2.906544

$ ./simpleMemcpy 200000000
host: MallocHost: 0.706024
host: init arrays: 0.259399
device: malloc+copy+compute: 0.420570
host: access all arrays: 0.239900
host: access all arrays a second time: 0.239795
host: free: 0.350564
total: 2.216320

$ ./simpleDMA 200000000
host: MallocHost: 0.700510
host: init arrays: 0.260276
device: DMA+compute+synchronize: 0.266353
host: access all arrays: 0.241061
host: access all arrays a second time: 0.240792
host: free: 0.349305
total: 2.058358
```

With N=500000000, managed memory has no issues, but explicit memory management does not run at all: the three device buffers would require 12GB, which exceeds the 6GB of GPU memory, so cudaMalloc() fails. DMA again performs very well:

```
$ ./simpleManaged 500000000
host: MallocManaged: 0.000043
host: init arrays: 1.632873
device: uvm+compute+synchronize: 2.235518
host: access all arrays: 1.640106
host: access all arrays a second time: 0.607754
host: free: 0.382087
total: 6.498456

$ ./simpleMemcpy 500000000
host: MallocHost: 1.751784
host: init arrays: 0.674096
cudaErrorMemoryAllocation

$ ./simpleDMA 500000000
host: MallocHost: 1.750448
host: init arrays: 0.673640
device: DMA+compute+synchronize: 0.665088
host: access all arrays: 0.607256
host: access all arrays a second time: 0.607619
host: free: 0.882589
total: 5.186704
```

### cuBLAS: gemmManaged vs. gemmMemcpy

Both programs call the cublasSgemm() function: gemmManaged passes buffers from cudaMallocManaged() straight to cuBLAS, while gemmMemcpy stages the operands in device memory with cudaMalloc() and cudaMemcpy(). The difference is sketched below.
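The sketch condenses gemmManaged.cu and gemmMemcpy.cu (both shown in full below); error checking, initialization and cleanup are abbreviated, and the two wrapper functions are illustrative only:

```
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* gemmManaged.cu, condensed: cuBLAS consumes the managed buffers directly
   and the result is readable on the host after a synchronize. */
static void sgemm_managed(cublasHandle_t h, int N, size_t count,
                          const float *alpha, const float *beta)
{
    float *a, *b, *c;
    cudaMallocManaged(&a, count);
    cudaMallocManaged(&b, count);
    cudaMallocManaged(&c, count);
    /* ... initialize a and b on the host ... */
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                alpha, a, N, b, N, beta, c, N);
    cudaDeviceSynchronize();
}

/* gemmMemcpy.cu, condensed: pinned host buffers plus explicit device
   staging buffers and copies in both directions. */
static void sgemm_memcpy(cublasHandle_t h, int N, size_t count,
                         const float *alpha, const float *beta)
{
    float *a, *b, *c, *da, *db, *dc;
    cudaMallocHost(&a, count);  cudaMallocHost(&b, count);  cudaMallocHost(&c, count);
    cudaMalloc(&da, count);     cudaMalloc(&db, count);     cudaMalloc(&dc, count);
    /* ... initialize a and b on the host ... */
    cudaMemcpy(da, a, count, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, count, cudaMemcpyHostToDevice);
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                alpha, da, N, db, N, beta, dc, N);
    cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost);
}
```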
With N=8000, managed memory is considerably faster:

```
$ ./gemmManaged 8000
host: MallocManaged+init: 0.191981
cublasSgemm: 1.430046
host: access all arrays: 0.000080
host: access all arrays a second time: 0.000008
host: free: 0.030062
total: 1.967801

$ ./gemmMemcpy 8000
host: MallocHost+init: 0.236840
cublasSgemm: 3.316726
host: access all arrays: 0.000030
host: access all arrays a second time: 0.000008
host: free: 0.061765
total: 3.928581
```

With N=16000, managed memory is not only considerably faster, but the performance of explicit memory management is catastrophic:

```
$ ./gemmManaged 16000
host: MallocManaged+init: 0.761249
cublasSgemm: 3.317761
host: access all arrays: 0.000105
host: access all arrays a second time: 0.000045
host: free: 0.084146
total: 4.477609

$ ./gemmMemcpy 16000
host: MallocHost+init: 0.940572
cublasSgemm: 35.439908
host: access all arrays: 0.000038
host: access all arrays a second time: 0.000017
host: free: 0.232385
total: 36.929403
```

### cuBLAS+Managed vs. cuBLASXt+HostMemory

cuBLASXt handles out-of-core computations on memory that is allocated with cudaMallocHost(). This benchmark compares the cublasSgemm() function running on managed memory with the cublasXtSgemm() function running on host-allocated memory.

Note that cublasXtSgemm() is designed to run on host-allocated memory and performs optimized tiled memory transfers. Also note that cuBLASXt has more functionality (it can spread the work across multiple cards), so its slightly worse performance is not surprising.

The point of this comparison, however, is that managed memory performs very well with the standard cuBLAS function.
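For reference, the cuBLASXt call path needs only a handle, a device list and pinned host buffers. The sketch condenses gemmXtOutOfCore.cu (shown in full below); error checking and cleanup are omitted and the wrapper function is illustrative only:

```
#include <cuda_runtime.h>
#include <cublasXt.h>

/* gemmXtOutOfCore.cu, condensed: cuBLASXt tiles the host-resident matrices
   and streams the tiles through the selected device(s) itself. */
static void sgemm_xt(int N, size_t count, const float *alpha, const float *beta)
{
    cublasXtHandle_t handle;
    int devices[1] = {0};
    float *a, *b, *c;

    cublasXtCreate(&handle);
    cublasXtDeviceSelect(handle, 1, devices);   /* run on a single GPU */

    cudaMallocHost(&a, count);                  /* plain pinned host memory; no */
    cudaMallocHost(&b, count);                  /* hand-written cudaMalloc or   */
    cudaMallocHost(&c, count);                  /* cudaMemcpy staging is needed */
    /* ... initialize a and b on the host ... */

    cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                  alpha, a, N, b, N, beta, c, N);
}
```

The run below uses N=32000, i.e. three float matrices of roughly 4GB each, well beyond the 6GB of GPU memory: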
```
$ ./gemmManagedOutOfCore 32000
host: MallocManaged+init: 3.059273
cublasSgemm: 20.510228

$ ./gemmXtOutOfCore 32000
host: MallocHost+init: 3.766991
cublasXtSgemm: 25.617316
```

--------------------------------------------------------------------------------
/common.cc:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "common.hh"

const char *
cublasStatusAsString(cublasStatus_t status)
{
    switch (status) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    default:
        return "unknown cublas status";
    }
}

void
check(cudaError_t err)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s\n", cudaGetErrorName(err));
        exit(1);
    }
}

void
check(cublasStatus_t status)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "%s\n", cublasStatusAsString(status));
        exit(1);
    }
}

size_t
checked_strtosize(const char *v)
{
    char *endptr;
    long long lld;

    errno = 0;
    lld = strtoll(v, &endptr, 10);
    if (*v == '\0' || *endptr != '\0') {
        fprintf(stderr, "N: invalid integer: '%s'\n", v);
        exit(1);
    }

    if (errno == ERANGE || lld < 1 || (uint64_t)lld > SIZE_MAX) {
        fprintf(stderr, "N: out of range: '%s'\n", v);
        exit(1);
    }

    return (size_t)lld;
}

size_t
checked_mul(size_t a, size_t b)
{
    if (a > SIZE_MAX / b) {
        fprintf(stderr, "overflow error\n");
        exit(1);
    }

    return a * b;
}

void
log(const char *prefix, clock_t start, clock_t end)
{
    printf("%s: %f\n", prefix, (double)(end-start)/(double)CLOCKS_PER_SEC);
}

--------------------------------------------------------------------------------
/common.hh:
--------------------------------------------------------------------------------
#ifndef COMMON_H
#define COMMON_H

#include <time.h>
#include <cublas_v2.h>

const char *cublasStatusAsString(cublasStatus_t status);
void check(cudaError_t err);
void check(cublasStatus_t status);
size_t checked_strtosize(const char *v);
size_t checked_mul(size_t a, size_t b);
void log(const char *prefix, clock_t start, clock_t end);


#endif

--------------------------------------------------------------------------------
/gemmManaged.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 1000;
    clock_t start_program, end_program;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    start_program = clock();

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocManaged(&a, count));
    check(cudaMallocManaged(&b, count));
    check(cudaMallocManaged(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocManaged+init", start, end);

    start = clock();
    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      a, N,
                      b, N,
                      &beta,
                      c, N));
    check(cudaDeviceSynchronize());
    end = clock();
    log("cublasSgemm", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/gemmManagedOutOfCore.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 16000;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocManaged(&a, count));
    check(cudaMallocManaged(&b, count));
    check(cudaMallocManaged(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocManaged+init", start, end);

    start = clock();
    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      a, N,
                      b, N,
                      &beta,
                      c, N));
    check(cudaDeviceSynchronize());
    end = clock();
    log("cublasSgemm", start, end);

    return 0;
}

--------------------------------------------------------------------------------
/gemmManagedPrefetch.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 1000;
    clock_t start_program, end_program;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    start_program = clock();

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocManaged(&a, count, cudaMemAttachHost));
    check(cudaMallocManaged(&b, count, cudaMemAttachHost));
    check(cudaMallocManaged(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocManaged+init", start, end);

    start = clock();
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachGlobal));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachGlobal));
    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      a, N,
                      b, N,
                      &beta,
                      c, N));
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, c, 0, cudaMemAttachHost));
    check(cudaStreamSynchronize(NULL));
    end = clock();
    log("cublasSgemm", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/gemmMemcpy.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 1000;
    clock_t start_program, end_program;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    float *da, *db, *dc;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    start_program = clock();

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocHost+init", start, end);

    start = clock();
    check(cudaMalloc(&da, count));
    check(cudaMalloc(&db, count));
    check(cudaMalloc(&dc, count));

    check(cudaMemcpy(da, a, count, cudaMemcpyHostToDevice));
    check(cudaMemcpy(db, b, count, cudaMemcpyHostToDevice));

    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      da, N,
                      db, N,
                      &beta,
                      dc, N));
    check(cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost));
    check(cudaFree(da));
    check(cudaFree(db));
    check(cudaFree(dc));
    end = clock();
    log("cublasSgemm", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFreeHost(a));
    check(cudaFreeHost(b));
    check(cudaFreeHost(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/gemmXtOutOfCore.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublasXt.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 16000;
    clock_t start, end;
    cublasXtHandle_t handle;
    int devices[1] = {0};
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    check(cublasXtCreate(&handle));
    check(cublasXtDeviceSelect(handle, 1, devices));

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocHost+init", start, end);

    start = clock();
    check(cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                        N, N, N,
                        &alpha,
                        a, N,
                        b, N,
                        &beta,
                        c, N));
    end = clock();
    log("cublasXtSgemm", start, end);

    return 0;
}

--------------------------------------------------------------------------------
/simpleDMA.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocHost(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFreeHost(a));

    start_program = clock();

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));
    end = clock();
    log("host: MallocHost", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    doit(a, b, c, N);
    check(cudaDeviceSynchronize());
    end = clock();
    log("device: DMA+compute+synchronize", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFreeHost(a));
    check(cudaFreeHost(b));
    check(cudaFreeHost(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/simpleManaged.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocManaged(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFree(a));

    start_program = clock();

    start = clock();
    check(cudaMallocManaged(&a, count));
    check(cudaMallocManaged(&b, count));
    check(cudaMallocManaged(&c, count));
    end = clock();
    log("host: MallocManaged", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    doit(a, b, c, N);
    check(cudaDeviceSynchronize());
    end = clock();
    log("device: uvm+compute+synchronize", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/simpleManagedPrefetch.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocManaged(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFree(a));

    start_program = clock();

    start = clock();
    check(cudaMallocManaged(&a, count, cudaMemAttachHost));
    check(cudaMallocManaged(&b, count, cudaMemAttachHost));
    check(cudaMallocManaged(&c, count));
    end = clock();
    log("host: MallocManaged", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachGlobal));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachGlobal));
    doit(a, b, c, N);
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, c, 0, cudaMemAttachHost));
    check(cudaStreamSynchronize(NULL));
    end = clock();
    log("device: uvm+compute+synchronize", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/simpleMemcpy.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    uint64_t *da, *db, *dc;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocHost(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFreeHost(a));

    start_program = clock();

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));
    end = clock();
    log("host: MallocHost", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    check(cudaMalloc(&da, count));
    check(cudaMalloc(&db, count));
    check(cudaMalloc(&dc, count));

    check(cudaMemcpy(da, a, count, cudaMemcpyHostToDevice));
    check(cudaMemcpy(db, b, count, cudaMemcpyHostToDevice));

    doit(da, db, dc, N);

    check(cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost));

    check(cudaFree(da));
    check(cudaFree(db));
    check(cudaFree(dc));
    end = clock();
    log("device: malloc+copy+compute", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFreeHost(a));
    check(cudaFreeHost(b));
    check(cudaFreeHost(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/stridedManaged.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include "common.hh"


static __global__ void
f(const uint64_t x0[], const uint64_t x1[], uint64_t x2[],
  const int64_t s0, const int64_t s1, const int64_t s2,
  int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        const int64_t i0 = i * s0;
        const int64_t i1 = i * s1;
        const int64_t i2 = i * s2;
        x2[i2] = x0[i0] * x1[i1];
    }
}

static void
doit(const uint64_t a0[], const uint64_t a1[], uint64_t a2[],
     const int64_t s0, const int64_t s1, const int64_t s2,
     int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a0, a1, a2, s0, s1, s2, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 1000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *x0, *x1, *x2;
    size_t count;
    const int64_t s0 = 37;
    const int64_t s1 = 101;
    const int64_t s2 = 311;
    size_t i, k0, k1, k2;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocManaged(&x0, 128));
    check(cudaDeviceSynchronize());
    check(cudaFree(x0));

    start_program = clock();

    start = clock();
    check(cudaMallocManaged(&x0, count*s0));
    check(cudaMallocManaged(&x1, count*s1));
    check(cudaMallocManaged(&x2, count*s2));
    end = clock();
    log("host: MallocManaged", start, end);

    for (size_t i = 0; i < N*s0; i++) {
        x0[i] = UINT64_MAX;
    }
    for (size_t i = 0; i < N*s1; i++) {
        x1[i] = UINT64_MAX;
    }

    start = clock();
    for (i=0, k0=0, k1=0; i