├── Makefile
├── README.md
├── common.cc
├── common.hh
├── gemmManaged.cu
├── gemmManagedOutOfCore.cu
├── gemmManagedPrefetch.cu
├── gemmMemcpy.cu
├── gemmXtOutOfCore.cu
├── simpleDMA.cu
├── simpleManaged.cu
├── simpleManagedPrefetch.cu
├── simpleMemcpy.cu
└── stridedManaged.cu

/Makefile:
--------------------------------------------------------------------------------

NVCC = nvcc --compiler-options="-Wall -Wextra -O3" -std=c++11 -arch=compute_61 -code=sm_61 -lcublas

default: common.o simpleMemcpy simpleManaged simpleManagedPrefetch simpleDMA stridedManaged gemmMemcpy gemmManaged gemmManagedPrefetch gemmXtOutOfCore gemmManagedOutOfCore

simpleMemcpy: Makefile simpleMemcpy.cu common.o
	$(NVCC) -o simpleMemcpy simpleMemcpy.cu common.o

simpleManaged: Makefile simpleManaged.cu common.o
	$(NVCC) -o simpleManaged simpleManaged.cu common.o

simpleManagedPrefetch: Makefile simpleManagedPrefetch.cu common.o
	$(NVCC) -o simpleManagedPrefetch simpleManagedPrefetch.cu common.o

simpleDMA: Makefile simpleDMA.cu common.o
	$(NVCC) -o simpleDMA simpleDMA.cu common.o

stridedManaged: Makefile stridedManaged.cu common.o
	$(NVCC) -o stridedManaged stridedManaged.cu common.o

gemmMemcpy: Makefile gemmMemcpy.cu common.o
	$(NVCC) -o gemmMemcpy gemmMemcpy.cu common.o

gemmManaged: Makefile gemmManaged.cu common.o
	$(NVCC) -o gemmManaged gemmManaged.cu common.o

gemmManagedPrefetch: Makefile gemmManagedPrefetch.cu common.o
	$(NVCC) -o gemmManagedPrefetch gemmManagedPrefetch.cu common.o

gemmXtOutOfCore: Makefile gemmXtOutOfCore.cu common.o
	$(NVCC) -o gemmXtOutOfCore gemmXtOutOfCore.cu common.o

gemmManagedOutOfCore: Makefile gemmManagedOutOfCore.cu common.o
	$(NVCC) -o gemmManagedOutOfCore gemmManagedOutOfCore.cu common.o

common.o: Makefile common.cc common.hh
	$(NVCC) -c common.cc

clean:
	rm -f common.o simpleMemcpy simpleManaged simpleManagedPrefetch simpleDMA stridedManaged gemmMemcpy gemmManaged gemmManagedPrefetch gemmXtOutOfCore gemmManagedOutOfCore

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------

# CUDA benchmarks for unified vs. explicit memory

## Unified memory

Unified memory has been a feature of game consoles for many years. It simplifies game development because it frees the programmer from having to track whether a block of memory currently resides in CPU or GPU memory.

Starting with the Pascal architecture, Nvidia also offers [advanced unified memory support](https://devblogs.nvidia.com/unified-memory-cuda-beginners/).

Game consoles can rely on tighter hardware control, but Nvidia also has the [Jetson product line](https://en.wikipedia.org/wiki/Nvidia_Jetson) with physically unified memory, which has been [reported](https://devtalk.nvidia.com/default/topic/1029853/does-unified-memory-and-zero-copy-always-better-than-cudamemcpy-/) to perform better than explicit memory management.

## Unified memory reputation

There have been several reports, many of them from around 2014, that unified memory performs worse in many scenarios. These reports usually do not come with links to actual code.
Recent reports appear to address only specialized scenarios like [8 NVLink GPUs](https://devtalk.nvidia.com/default/topic/1029706/cuda-programming-and-performance/partial-fail-of-peer-access-in-8-volta-gpu-instance-p3-16xlarge-on-aws-gt-huge-slowdown-/); that report, too, comes without a code link.

## Unified memory reality

This benchmark suite attempts to provide actual code so that people can check for themselves. It is incomplete and does not yet address scenarios like IPC or multiple GPUs. Benchmarks that show the superiority of explicit memory management are welcome.

## Examples

The examples were run on Linux with CUDA release 9.2, V9.2.148, using 16GB of host memory and a GeForce 1060 with 6GB of GPU memory.

### simpleManaged vs. simpleMemcpy vs. simpleDMA

This benchmark initializes three arrays, runs a kernel that multiplies two of them elementwise into the third, and then accesses all arrays on the host. Most of the time is spent on memory transfers; the kernel runtime is negligible.

With N=200000000, explicit memory management performs slightly better than managed memory, but DMA (the kernel reading and writing pinned host memory directly) is faster than both:

```
$ ./simpleManaged 200000000
host: MallocManaged: 0.000040
host: init arrays: 0.662895
device: uvm+compute+synchronize: 0.892010
host: access all arrays: 0.929058
host: access all arrays a second time: 0.245681
host: free: 0.176788
total: 2.906544

$ ./simpleMemcpy 200000000
host: MallocHost: 0.706024
host: init arrays: 0.259399
device: malloc+copy+compute: 0.420570
host: access all arrays: 0.239900
host: access all arrays a second time: 0.239795
host: free: 0.350564
total: 2.216320

$ ./simpleDMA 200000000
host: MallocHost: 0.700510
host: init arrays: 0.260276
device: DMA+compute+synchronize: 0.266353
host: access all arrays: 0.241061
host: access all arrays a second time: 0.240792
host: free: 0.349305
total: 2.058358
```

With N=500000000, managed memory has no issues, but explicit memory management does not run at all: the three device buffers would require 12GB, which exceeds the 6GB of GPU memory, so cudaMalloc() fails. DMA again performs very well:

```
$ ./simpleManaged 500000000
host: MallocManaged: 0.000043
host: init arrays: 1.632873
device: uvm+compute+synchronize: 2.235518
host: access all arrays: 1.640106
host: access all arrays a second time: 0.607754
host: free: 0.382087
total: 6.498456

$ ./simpleMemcpy 500000000
host: MallocHost: 1.751784
host: init arrays: 0.674096
cudaErrorMemoryAllocation

$ ./simpleDMA 500000000
host: MallocHost: 1.750448
host: init arrays: 0.673640
device: DMA+compute+synchronize: 0.665088
host: access all arrays: 0.607256
host: access all arrays a second time: 0.607619
host: free: 0.882589
total: 5.186704
```

### cuBLAS: gemmManaged vs. gemmMemcpy

Both programs call the cublasSgemm() function: gemmManaged passes buffers from cudaMallocManaged() straight to cuBLAS, while gemmMemcpy stages the operands in device memory with cudaMalloc() and cudaMemcpy(). The difference is sketched below.
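The sketch condenses gemmManaged.cu and gemmMemcpy.cu (both shown in full below); error checking, initialization and cleanup are abbreviated, and the two wrapper functions are illustrative only:

```
#include <cuda_runtime.h>
#include <cublas_v2.h>

/* gemmManaged.cu, condensed: cuBLAS consumes the managed buffers directly
   and the result is readable on the host after a synchronize. */
static void sgemm_managed(cublasHandle_t h, int N, size_t count,
                          const float *alpha, const float *beta)
{
    float *a, *b, *c;
    cudaMallocManaged(&a, count);
    cudaMallocManaged(&b, count);
    cudaMallocManaged(&c, count);
    /* ... initialize a and b on the host ... */
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                alpha, a, N, b, N, beta, c, N);
    cudaDeviceSynchronize();
}

/* gemmMemcpy.cu, condensed: pinned host buffers plus explicit device
   staging buffers and copies in both directions. */
static void sgemm_memcpy(cublasHandle_t h, int N, size_t count,
                         const float *alpha, const float *beta)
{
    float *a, *b, *c, *da, *db, *dc;
    cudaMallocHost(&a, count);  cudaMallocHost(&b, count);  cudaMallocHost(&c, count);
    cudaMalloc(&da, count);     cudaMalloc(&db, count);     cudaMalloc(&dc, count);
    /* ... initialize a and b on the host ... */
    cudaMemcpy(da, a, count, cudaMemcpyHostToDevice);
    cudaMemcpy(db, b, count, cudaMemcpyHostToDevice);
    cublasSgemm(h, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                alpha, da, N, db, N, beta, dc, N);
    cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost);
}
```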
With N=8000, managed memory is considerably faster:

```
$ ./gemmManaged 8000
host: MallocManaged+init: 0.191981
cublasSgemm: 1.430046
host: access all arrays: 0.000080
host: access all arrays a second time: 0.000008
host: free: 0.030062
total: 1.967801

$ ./gemmMemcpy 8000
host: MallocHost+init: 0.236840
cublasSgemm: 3.316726
host: access all arrays: 0.000030
host: access all arrays a second time: 0.000008
host: free: 0.061765
total: 3.928581
```

With N=16000, managed memory is not only considerably faster, but the performance of explicit memory management is catastrophic:

```
$ ./gemmManaged 16000
host: MallocManaged+init: 0.761249
cublasSgemm: 3.317761
host: access all arrays: 0.000105
host: access all arrays a second time: 0.000045
host: free: 0.084146
total: 4.477609

$ ./gemmMemcpy 16000
host: MallocHost+init: 0.940572
cublasSgemm: 35.439908
host: access all arrays: 0.000038
host: access all arrays a second time: 0.000017
host: free: 0.232385
total: 36.929403
```

### cuBLAS+Managed vs. cuBLASXt+HostMemory

cuBLASXt handles out-of-core computations on memory that is allocated with cudaMallocHost(). This benchmark compares the cublasSgemm() function running on managed memory with the cublasXtSgemm() function running on host-allocated memory.

Note that cublasXtSgemm() is designed to run on host-allocated memory and performs optimized tiled memory transfers. Also note that cuBLASXt has more functionality (it can spread the work across multiple cards), so its slightly worse performance is not surprising.

The point of this comparison, however, is that managed memory performs very well with the standard cuBLAS function.
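For reference, the cuBLASXt call path needs only a handle, a device list and pinned host buffers. The sketch condenses gemmXtOutOfCore.cu (shown in full below); error checking and cleanup are omitted and the wrapper function is illustrative only:

```
#include <cuda_runtime.h>
#include <cublasXt.h>

/* gemmXtOutOfCore.cu, condensed: cuBLASXt tiles the host-resident matrices
   and streams the tiles through the selected device(s) itself. */
static void sgemm_xt(int N, size_t count, const float *alpha, const float *beta)
{
    cublasXtHandle_t handle;
    int devices[1] = {0};
    float *a, *b, *c;

    cublasXtCreate(&handle);
    cublasXtDeviceSelect(handle, 1, devices);   /* run on a single GPU */

    cudaMallocHost(&a, count);                  /* plain pinned host memory; no */
    cudaMallocHost(&b, count);                  /* hand-written cudaMalloc or   */
    cudaMallocHost(&c, count);                  /* cudaMemcpy staging is needed */
    /* ... initialize a and b on the host ... */

    cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N,
                  alpha, a, N, b, N, beta, c, N);
}
```

The run below uses N=32000, i.e. three float matrices of roughly 4GB each, well beyond the 6GB of GPU memory: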
```
$ ./gemmManagedOutOfCore 32000
host: MallocManaged+init: 3.059273
cublasSgemm: 20.510228

$ ./gemmXtOutOfCore 32000
host: MallocHost+init: 3.766991
cublasXtSgemm: 25.617316
```

--------------------------------------------------------------------------------
/common.cc:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <errno.h>
#include <time.h>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include "common.hh"

const char *
cublasStatusAsString(cublasStatus_t status)
{
    switch (status) {
    case CUBLAS_STATUS_SUCCESS:
        return "CUBLAS_STATUS_SUCCESS";
    case CUBLAS_STATUS_NOT_INITIALIZED:
        return "CUBLAS_STATUS_NOT_INITIALIZED";
    case CUBLAS_STATUS_ALLOC_FAILED:
        return "CUBLAS_STATUS_ALLOC_FAILED";
    case CUBLAS_STATUS_INVALID_VALUE:
        return "CUBLAS_STATUS_INVALID_VALUE";
    case CUBLAS_STATUS_ARCH_MISMATCH:
        return "CUBLAS_STATUS_ARCH_MISMATCH";
    case CUBLAS_STATUS_MAPPING_ERROR:
        return "CUBLAS_STATUS_MAPPING_ERROR";
    case CUBLAS_STATUS_EXECUTION_FAILED:
        return "CUBLAS_STATUS_EXECUTION_FAILED";
    case CUBLAS_STATUS_INTERNAL_ERROR:
        return "CUBLAS_STATUS_INTERNAL_ERROR";
    default:
        return "unknown cublas status";
    }
}

void
check(cudaError_t err)
{
    if (err != cudaSuccess) {
        fprintf(stderr, "%s\n", cudaGetErrorName(err));
        exit(1);
    }
}

void
check(cublasStatus_t status)
{
    if (status != CUBLAS_STATUS_SUCCESS) {
        fprintf(stderr, "%s\n", cublasStatusAsString(status));
        exit(1);
    }
}

size_t
checked_strtosize(const char *v)
{
    char *endptr;
    long long lld;

    errno = 0;
    lld = strtoll(v, &endptr, 10);
    if (*v == '\0' || *endptr != '\0') {
        fprintf(stderr, "N: invalid integer: '%s'\n", v);
        exit(1);
    }

    if (errno == ERANGE || lld < 1 || (uint64_t)lld > SIZE_MAX) {
        fprintf(stderr, "N: out of range: '%s'\n", v);
        exit(1);
    }

    return (size_t)lld;
}

size_t
checked_mul(size_t a, size_t b)
{
    if (a > SIZE_MAX / b) {
        fprintf(stderr, "overflow error\n");
        exit(1);
    }

    return a * b;
}

void
log(const char *prefix, clock_t start, clock_t end)
{
    printf("%s: %f\n", prefix, (double)(end-start)/(double)CLOCKS_PER_SEC);
}

--------------------------------------------------------------------------------
/common.hh:
--------------------------------------------------------------------------------
#ifndef COMMON_H
#define COMMON_H

#include <time.h>
#include <cublas_v2.h>

const char *cublasStatusAsString(cublasStatus_t status);
void check(cudaError_t err);
void check(cublasStatus_t status);
size_t checked_strtosize(const char *v);
size_t checked_mul(size_t a, size_t b);
void log(const char *prefix, clock_t start, clock_t end);


#endif

--------------------------------------------------------------------------------
/gemmManaged.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 1000;
    clock_t start_program, end_program;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    start_program = clock();

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocManaged(&a, count));
    check(cudaMallocManaged(&b, count));
    check(cudaMallocManaged(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocManaged+init", start, end);

    start = clock();
    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      a, N,
                      b, N,
                      &beta,
                      c, N));
    check(cudaDeviceSynchronize());
    end = clock();
    log("cublasSgemm", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/gemmManagedOutOfCore.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 16000;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocManaged(&a, count));
    check(cudaMallocManaged(&b, count));
    check(cudaMallocManaged(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocManaged+init", start, end);

    start = clock();
    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      a, N,
                      b, N,
                      &beta,
                      c, N));
    check(cudaDeviceSynchronize());
    end = clock();
    log("cublasSgemm", start, end);

    return 0;
}

--------------------------------------------------------------------------------
/gemmManagedPrefetch.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 1000;
    clock_t start_program, end_program;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    start_program = clock();

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocManaged(&a, count, cudaMemAttachHost));
    check(cudaMallocManaged(&b, count, cudaMemAttachHost));
    check(cudaMallocManaged(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocManaged+init", start, end);

    start = clock();
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachGlobal));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachGlobal));
    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      a, N,
                      b, N,
                      &beta,
                      c, N));
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, c, 0, cudaMemAttachHost));
    check(cudaStreamSynchronize(NULL));
    end = clock();
    log("cublasSgemm", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/gemmMemcpy.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublas_v2.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 1000;
    clock_t start_program, end_program;
    clock_t start, end;
    cublasHandle_t handle;
    float *a, *b, *c;
    float *da, *db, *dc;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    start_program = clock();

    check(cublasCreate(&handle));

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocHost+init", start, end);

    start = clock();
    check(cudaMalloc(&da, count));
    check(cudaMalloc(&db, count));
    check(cudaMalloc(&dc, count));

    check(cudaMemcpy(da, a, count, cudaMemcpyHostToDevice));
    check(cudaMemcpy(db, b, count, cudaMemcpyHostToDevice));

    check(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                      N, N, N,
                      &alpha,
                      da, N,
                      db, N,
                      &beta,
                      dc, N));
    check(cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost));
    check(cudaFree(da));
    check(cudaFree(db));
    check(cudaFree(dc));
    end = clock();
    log("cublasSgemm", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] < 0 || b[i] < 0 || c[i] < 0) {
            fprintf(stderr, "unexpected result a: %f b: %f c: %f\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFreeHost(a));
    check(cudaFreeHost(b));
    check(cudaFreeHost(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/gemmXtOutOfCore.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cublasXt.h>
#include "common.hh"


int
main(int argc, char *argv[])
{
    size_t N = 16000;
    clock_t start, end;
    cublasXtHandle_t handle;
    int devices[1] = {0};
    float *a, *b, *c;
    const float alpha = 1;
    const float beta = 0;
    size_t count, nn;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    nn = checked_mul(N, N);
    count = checked_mul(nn, sizeof(float));

    check(cublasXtCreate(&handle));
    check(cublasXtDeviceSelect(handle, 1, devices));

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));

    for (size_t i = 0; i < N*N; i++) {
        a[i] = i / 37.0;
        b[i] = i / 101.0;
    }
    end = clock();
    log("host: MallocHost+init", start, end);

    start = clock();
    check(cublasXtSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
                        N, N, N,
                        &alpha,
                        a, N,
                        b, N,
                        &beta,
                        c, N));
    end = clock();
    log("cublasXtSgemm", start, end);

    return 0;
}

--------------------------------------------------------------------------------
/simpleDMA.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocHost(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFreeHost(a));

    start_program = clock();

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));
    end = clock();
    log("host: MallocHost", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    doit(a, b, c, N);
    check(cudaDeviceSynchronize());
    end = clock();
    log("device: DMA+compute+synchronize", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFreeHost(a));
    check(cudaFreeHost(b));
    check(cudaFreeHost(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/simpleManaged.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocManaged(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFree(a));

    start_program = clock();

    start = clock();
    check(cudaMallocManaged(&a, count));
    check(cudaMallocManaged(&b, count));
    check(cudaMallocManaged(&c, count));
    end = clock();
    log("host: MallocManaged", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    doit(a, b, c, N);
    check(cudaDeviceSynchronize());
    end = clock();
    log("device: uvm+compute+synchronize", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/simpleManagedPrefetch.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocManaged(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFree(a));

    start_program = clock();

    start = clock();
    check(cudaMallocManaged(&a, count, cudaMemAttachHost));
    check(cudaMallocManaged(&b, count, cudaMemAttachHost));
    check(cudaMallocManaged(&c, count));
    end = clock();
    log("host: MallocManaged", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachGlobal));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachGlobal));
    doit(a, b, c, N);
    check(cudaStreamAttachMemAsync(NULL, a, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, b, 0, cudaMemAttachHost));
    check(cudaStreamAttachMemAsync(NULL, c, 0, cudaMemAttachHost));
    check(cudaStreamSynchronize(NULL));
    end = clock();
    log("device: uvm+compute+synchronize", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFree(a));
    check(cudaFree(b));
    check(cudaFree(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/simpleMemcpy.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include "common.hh"


static __global__ void
f(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        c[i] = a[i] * b[i];
    }
}

static void
doit(const uint64_t a[], const uint64_t b[], uint64_t c[], int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a, b, c, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 10000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *a, *b, *c;
    uint64_t *da, *db, *dc;
    size_t count;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocHost(&a, 128));
    check(cudaDeviceSynchronize());
    check(cudaFreeHost(a));

    start_program = clock();

    start = clock();
    check(cudaMallocHost(&a, count));
    check(cudaMallocHost(&b, count));
    check(cudaMallocHost(&c, count));
    end = clock();
    log("host: MallocHost", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        a[i] = 3;
        b[i] = 5;
    }
    end = clock();
    log("host: init arrays", start, end);

    start = clock();
    check(cudaMalloc(&da, count));
    check(cudaMalloc(&db, count));
    check(cudaMalloc(&dc, count));

    check(cudaMemcpy(da, a, count, cudaMemcpyHostToDevice));
    check(cudaMemcpy(db, b, count, cudaMemcpyHostToDevice));

    doit(da, db, dc, N);

    check(cudaMemcpy(c, dc, count, cudaMemcpyDeviceToHost));

    check(cudaFree(da));
    check(cudaFree(db));
    check(cudaFree(dc));
    end = clock();
    log("device: malloc+copy+compute", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays", start, end);

    start = clock();
    for (size_t i = 0; i < N; i++) {
        if (a[i] != 3 || b[i] != 5 || c[i] != 15) {
            fprintf(stderr, "unexpected result a: %lu b: %lu c: %lu\n",
                    a[i], b[i], c[i]);
            exit(1);
        }
    }
    end = clock();
    log("host: access all arrays a second time", start, end);

    start = clock();
    check(cudaFreeHost(a));
    check(cudaFreeHost(b));
    check(cudaFreeHost(c));
    end = clock();
    log("host: free", start, end);

    end_program = clock();
    log("total", start_program, end_program);

    return 0;
}

--------------------------------------------------------------------------------
/stridedManaged.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include <stdint.h>
#include <time.h>
#include "common.hh"


static __global__ void
f(const uint64_t x0[], const uint64_t x1[], uint64_t x2[],
  const int64_t s0, const int64_t s1, const int64_t s2,
  int64_t N)
{
    int64_t index = threadIdx.x + blockIdx.x * blockDim.x;
    int64_t stride = blockDim.x * gridDim.x;

    for (int64_t i = index; i < N; i += stride) {
        const int64_t i0 = i * s0;
        const int64_t i1 = i * s1;
        const int64_t i2 = i * s2;
        x2[i2] = x0[i0] * x1[i1];
    }
}

static void
doit(const uint64_t a0[], const uint64_t a1[], uint64_t a2[],
     const int64_t s0, const int64_t s1, const int64_t s2,
     int64_t N)
{
    int blockSize = 256;
    int64_t numBlocks = (N + blockSize - 1) / blockSize;

    f<<<numBlocks, blockSize>>>(a0, a1, a2, s0, s1, s2, N);
}

int
main(int argc, char *argv[])
{
    size_t N = 1000000;
    clock_t start_program, end_program;
    clock_t start, end;
    uint64_t *x0, *x1, *x2;
    size_t count;
    const int64_t s0 = 37;
    const int64_t s1 = 101;
    const int64_t s2 = 311;
    size_t i, k0, k1, k2;

    if (argc == 2) {
        N = checked_strtosize(argv[1]);
    }
    count = checked_mul(N, sizeof(uint64_t));

    /* Initialize context */
    check(cudaMallocManaged(&x0, 128));
    check(cudaDeviceSynchronize());
    check(cudaFree(x0));

    start_program = clock();

    start = clock();
    check(cudaMallocManaged(&x0, count*s0));
    check(cudaMallocManaged(&x1, count*s1));
    check(cudaMallocManaged(&x2, count*s2));
    end = clock();
    log("host: MallocManaged", start, end);

    for (size_t i = 0; i < N*s0; i++) {
        x0[i] = UINT64_MAX;
    }
    for (size_t i = 0; i < N*s1; i++) {
        x1[i] = UINT64_MAX;
    }

    start = clock();
    for (i=0, k0=0, k1=0; i