├── BankRedux ├── Makefile ├── sum.h ├── sum_cuda.c ├── sum_cuda.output.carina.txt ├── sum_cudakernel.cu └── test.sh ├── CoMem_AXPY ├── Makefile ├── axpy.h ├── axpy_cuda.c ├── axpy_cuda.output.carina.txt ├── axpy_cudakernel.cu └── test.sh ├── CoMem_SpMM ├── Makefile ├── SpMM.h ├── SpMM_cuda.c ├── SpMM_cuda.output.carina.txt ├── SpMM_cuda.output.fornax.txt └── SpMM_cudakernel.cu ├── Common ├── FreeImage │ ├── freeimage-license.txt │ └── include │ │ └── FreeImage.h ├── README.md ├── UtilNPP │ ├── Exceptions.h │ ├── Image.h │ ├── ImageAllocatorsCPU.h │ ├── ImageAllocatorsNPP.h │ ├── ImageIO.h │ ├── ImagePacked.h │ ├── ImagesCPU.h │ ├── ImagesNPP.h │ ├── Pixel.h │ ├── Signal.h │ ├── SignalAllocatorsCPU.h │ ├── SignalAllocatorsNPP.h │ ├── SignalsCPU.h │ └── SignalsNPP.h ├── drvapi_error_string.h ├── dynlink_d3d11.h ├── exception.h ├── helper_cuda.h ├── helper_cuda_drvapi.h ├── helper_cusolver.h ├── helper_functions.h ├── helper_image.h ├── helper_math.h ├── helper_multiprocess.cpp ├── helper_multiprocess.h ├── helper_nvJPEG.hxx ├── helper_string.h ├── helper_timer.h ├── nvrtc_helper.h ├── rendercheck_d3d11.cpp └── rendercheck_d3d11.h ├── Conkernels ├── Makefile ├── Makefile_serialized ├── NsightEclipse.xml ├── README.md ├── concurrentKernels.cu ├── concurrentKernels_vs2015.sln ├── concurrentKernels_vs2015.vcxproj ├── concurrentKernels_vs2017.sln ├── concurrentKernels_vs2017.vcxproj ├── concurrentKernels_vs2019.sln └── concurrentKernels_vs2019.vcxproj ├── DynParallel ├── .gitignore ├── Dynamic_Parallelism.cu ├── Makefile ├── Non_Dynamic_Parallelism.cu ├── include │ ├── png.h │ ├── pngconf.h │ ├── pnglibconf.h │ ├── zconf.h │ └── zlib.h └── lib │ ├── libpng.lib │ ├── libpngd.lib │ ├── zlibstat.lib │ └── zlibstatd.lib ├── GSOverlap ├── Makefile ├── NsightEclipse.xml ├── README.md ├── globalToShmemAsyncCopy.cu ├── globalToShmemAsyncCopy_vs2015.sln ├── globalToShmemAsyncCopy_vs2015.vcxproj ├── globalToShmemAsyncCopy_vs2017.sln ├── globalToShmemAsyncCopy_vs2017.vcxproj ├── globalToShmemAsyncCopy_vs2019.sln └── globalToShmemAsyncCopy_vs2019.vcxproj ├── HDOverlap ├── Makefile ├── axpy_cudakernel.cu ├── results.txt └── test.sh ├── LICENSE_BSD.txt ├── MemAlign ├── Makefile ├── axpy.h ├── axpy_cuda.c ├── axpy_cuda.output.carina.txt ├── axpy_cudakernel.cu └── test.sh ├── MiniTransfer_SpMV ├── Makefile ├── SpMV.h ├── SpMV_cuda.c ├── SpMV_cuda.output.carina.txt ├── SpMV_cudakernel.cu └── test.sh ├── NOTICE ├── README.md ├── ReadOnlyMem_1D_Texture ├── Makefile ├── axpy.h ├── axpy_cuda.c ├── axpy_cuda.output.carina.txt ├── axpy_cuda.output.fornax.txt ├── axpy_cudakernel.cu └── test.sh ├── ReadOnlyMem_2D_Texture ├── Makefile ├── matadd.output.carina.txt ├── matadd.output.fornax.txt ├── matadd_2D.h ├── matadd_2D_cuda.c ├── matadd_2D_cudakernel.cu └── test.sh ├── Shmem ├── Makefile ├── mm_kernel.cu ├── mm_omp_cuda.c ├── mm_omp_cuda.h └── testResults.txt ├── Shuffle ├── cuda_global │ ├── Makefile │ ├── README.md │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel.cu │ ├── result.txt │ └── test.sh └── cuda_shuffle │ ├── Makefile │ ├── README.md │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel.cu │ ├── result.txt │ └── test.sh ├── TaskGraph ├── Makefile ├── NsightEclipse.xml ├── README.md ├── conjugateGradientCudaGraphs.cu ├── conjugateGradientCudaGraphs_vs2015.sln ├── conjugateGradientCudaGraphs_vs2015.vcxproj ├── conjugateGradientCudaGraphs_vs2017.sln ├── conjugateGradientCudaGraphs_vs2017.vcxproj ├── conjugateGradientCudaGraphs_vs2019.sln └── conjugateGradientCudaGraphs_vs2019.vcxproj ├── 
UniMem ├── LowAccessDensityTest.h ├── LowAccessDensityTest_cuda.cu ├── LowAccessDensityTest_cuda.output.carina.txt ├── LowAccessDensityTest_cuda_fixed_access_time.output.carina.txt ├── LowAccessDensityTest_omp.c ├── Makefile ├── test.sh └── test2.sh └── WarpDivRedux ├── Makefile ├── test.sh ├── warpDivergenceTest.h ├── warpDivergenceTest_cuda.c ├── warpDivergenceTest_cuda.output.carina.txt └── warpDivergenceTest_cudakernel.cu /BankRedux/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o sum_cuda sum_cuda.c sum_cudakernel.cu 3 | -------------------------------------------------------------------------------- /BankRedux/sum.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | #define ThreadsPerBlock 256 8 | #define VEC_LEN 1024000 //use a fixed number for now 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | extern void sum_cuda(int n, REAL *x, REAL *result); 14 | #ifdef __cplusplus 15 | } 16 | #endif 17 | -------------------------------------------------------------------------------- /BankRedux/sum_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | // Experimental test for bank conflict
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <math.h>
11 | #include <sys/timeb.h>
12 | #include "sum.h"
13 |
14 | double read_timer_ms() {
15 |     struct timeb tm;
16 |     ftime(&tm);
17 |     return (double) tm.time * 1000.0 + (double) tm.millitm;
18 | }
19 |
20 | /* change this to do saxpy or daxpy : single precision or double precision */
21 | #define REAL float
22 |
23 | //#define ThreadsPerBlock 256
24 |
25 | /* zero out the entire vector */
26 | void zero(REAL *A, int n)
27 | {
28 |     int i;
29 |     for (i = 0; i < n; i++) {
30 |         A[i] = 0.0;
31 |     }
32 | }
33 |
34 | /* initialize a vector with random floating point numbers */
35 | void init(REAL *A, int n)
36 | {
37 |     int i;
38 |     for (i = 0; i < n; i++) {
39 |         A[i] = (float)drand48();
40 |     }
41 | }
42 |
43 | /* serial version */
44 | float sum(int N, float *numbers) {
45 |     float sum = 0;
46 |
47 |     for (int i = 0; i < N; i++) {
48 |         sum += numbers[i];
49 |     }
50 |     return sum;
51 | }
52 |
53 | int main(int argc, char *argv[])
54 | {
55 |     int n;
56 |     REAL *x, *result_cuda;
57 |
58 |     n = VEC_LEN;
59 |     fprintf(stderr, "Usage: sum <n>\n");
60 |     if (argc >= 2) {
61 |         n = atoi(argv[1]);
62 |     }
63 |
64 |     x = (REAL *) malloc(n * sizeof(REAL));
65 |     result_cuda = (REAL*)malloc(((VEC_LEN + ThreadsPerBlock - 1) / ThreadsPerBlock) * sizeof(REAL));
66 |
67 |     srand48(1<<12);
68 |     init(x, n);
69 |
70 |     volatile float answer = 0;
71 |     answer = sum(n, x);
72 |
73 |     int i;
74 |     int num_runs = 10;
75 |     /* cuda version */
76 |     double elapsed = read_timer_ms();
77 |     for (i = 0; i < num_runs; i++)
78 |         sum_cuda(n, x, result_cuda);
79 |     elapsed = (read_timer_ms() - elapsed) / num_runs;
80 |
81 |     /* add up the per-block partial sums produced on the GPU */
82 |     REAL result = 0.0;
83 |     for (i = 0; i < (n + ThreadsPerBlock - 1) / ThreadsPerBlock; i++) {
84 |         result += result_cuda[i];
85 |     }
86 |
87 |     printf("sum(%d): serial result: %g, cuda result: %g, avg time: %0.2f ms\n", n, answer, result, elapsed);
88 |
89 |     free(x);
90 |     free(result_cuda);
91 |     return 0;
92 | }
--------------------------------------------------------------------------------
/BankRedux/sum_cudakernel.cu:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #include "sum.h"
7 |
8 | __global__ void sum_warmingup(const REAL *x, REAL *result) {
9 |     __shared__ REAL cache[ThreadsPerBlock];
10 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
11 |     int cacheIndex = threadIdx.x;
12 |     cache[cacheIndex] = x[tid];
13 |     __syncthreads();
14 |     for (int i = blockDim.x / 2; i > 0; i /= 2) {
15 |         if (cacheIndex < i) {
16 |             cache[cacheIndex] += cache[cacheIndex + i];
17 |         }
18 |         __syncthreads();
19 |     }
20 |     if (cacheIndex == 0)
21 |         result[blockIdx.x] = cache[cacheIndex];
22 | }
23 |
24 | __global__ void sum_cudakernel(const REAL *x, REAL *result) {
25 |     __shared__ REAL cache[ThreadsPerBlock];
26 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
27 |     int cacheIndex = threadIdx.x;
28 |     cache[cacheIndex] = x[tid];
29 |     __syncthreads();
30 |     for (int i = blockDim.x / 2; i > 0; i /= 2) {
31 |         if (cacheIndex < i) {
32 |             cache[cacheIndex] += cache[cacheIndex + i];
33 |         }
34 |         __syncthreads();
35 |     }
36 |     if (cacheIndex == 0)
37 |         result[blockIdx.x] = cache[cacheIndex];
38 | }
39 |
40 | __global__ void sum_cudakernel_bc(const REAL *x, REAL *result) {
41 |     __shared__ REAL cache[ThreadsPerBlock];
42 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
43 |     int cacheIndex = threadIdx.x;
44 |     cache[cacheIndex] = x[tid];
45 |     __syncthreads();
46 |     for (int i = 1; i < blockDim.x; i *= 2) {
47 |         int index = 2 * i * cacheIndex;
48 |         if (index < blockDim.x) {
49 |             cache[index] += cache[index + i];
50 |         }
51 |         __syncthreads();
52 |     }
53 |     if (cacheIndex == 0)
54 |         result[blockIdx.x] = cache[cacheIndex];
55 | }
56 |
57 | void sum_cuda(int n, REAL *x, REAL *result) {
58 |     REAL *d_x;
59 |     REAL *d_result;
60 |     cudaMalloc(&d_x, n*sizeof(REAL));
61 |     cudaMalloc(&d_result, ((n+255)/256) * sizeof(REAL));
62 |
63 |     cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice);
64 |
65 |     sum_warmingup<<<(n+255)/256, 256>>>(d_x, d_result);
66 |     cudaDeviceSynchronize();
67 |     sum_cudakernel<<<(n+255)/256, 256>>>(d_x, d_result);
68 |     cudaDeviceSynchronize();
69 |     sum_cudakernel_bc<<<(n+255)/256, 256>>>(d_x, d_result);
70 |     cudaDeviceSynchronize();
71 |
72 |     cudaMemcpy(result, d_result, ((n+255)/256) * sizeof(REAL), cudaMemcpyDeviceToHost);
73 |     cudaFree(d_x);
74 |     cudaFree(d_result);
75 | }
76 |
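The two reduction kernels above differ only in addressing. In sum_cudakernel, active threads read cache[cacheIndex + i], so a warp touches consecutive shared-memory words that map to distinct banks. In sum_cudakernel_bc, the stride-2*i indexing makes the active threads of a warp hit only a few banks, and those accesses serialize. A small standalone sketch (illustrative, not from this repository) of the bank mapping, assuming the usual 32 banks of 4-byte words:

    #include <stdio.h>

    /* bank of a 4-byte shared-memory word, assuming 32 banks */
    static int bank(int wordIndex) { return wordIndex % 32; }

    int main(void) {
        /* sequential addressing (sum_cudakernel), step i = 16:
           thread t does cache[t] += cache[t + 16]  -> banks distinct across threads */
        for (int t = 0; t < 4; t++)
            printf("seq  thread %2d -> banks %2d, %2d\n", t, bank(t), bank(t + 16));
        /* interleaved addressing (sum_cudakernel_bc), step i = 16:
           thread t does cache[32*t] += cache[32*t + 16] -> banks 0 and 16 only */
        for (int t = 0; t < 4; t++)
            printf("intl thread %2d -> banks %2d, %2d\n", t, bank(32 * t), bank(32 * t + 16));
        return 0;
    }

With interleaved addressing every active thread lands in bank 0 or bank 16, so the hardware replays the conflicting accesses one bank slot at a time.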
--------------------------------------------------------------------------------
/BankRedux/test.sh:
--------------------------------------------------------------------------------
1 | nvprof ./sum_cuda 102400
2 | nvprof ./sum_cuda 204800
3 | nvprof ./sum_cuda 409600
4 | nvprof ./sum_cuda 1024000
--------------------------------------------------------------------------------
/CoMem_AXPY/Makefile:
--------------------------------------------------------------------------------
1 | default:
2 | 	nvcc -o axpy_cuda axpy_cuda.c axpy_cudakernel.cu
--------------------------------------------------------------------------------
/CoMem_AXPY/axpy.h:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #define REAL double
7 |
8 | #ifdef __cplusplus
9 | extern "C" {
10 | #endif
11 | extern void axpy_cuda(REAL *x, REAL * y, int n, REAL a);
12 | #ifdef __cplusplus
13 | }
14 | #endif
--------------------------------------------------------------------------------
/CoMem_AXPY/axpy_cuda.c:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | // Experimental tests for coalesced memory access and uncoalesced memory access
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <math.h>
11 | #include <sys/timeb.h>
12 | #include "axpy.h"
13 |
14 | double read_timer_ms() {
15 |     struct timeb tm;
16 |     ftime(&tm);
17 |     return (double) tm.time * 1000.0 + (double) tm.millitm;
18 | }
19 |
20 | /* change this to do saxpy or daxpy : single precision or double precision */
21 | #define REAL double
22 | #define VEC_LEN 1024000 //use a fixed number for now
23 | /* zero out the entire vector */
24 | void zero(REAL *A, int n)
25 | {
26 |     int i;
27 |     for (i = 0; i < n; i++) {
28 |         A[i] = 0.0;
29 |     }
30 | }
31 |
32 | /* initialize a vector with random floating point numbers */
33 | void init(REAL *A, int n)
34 | {
35 |     int i;
36 |     for (i = 0; i < n; i++) {
37 |         A[i] = (double)drand48();
38 |     }
39 | }
40 |
41 | /* serial version */
42 | void axpy(REAL* x, REAL* y, long n, REAL a) {
43 |     int i;
44 |     for (i = 0; i < n; ++i)
45 |     {
46 |         y[i] += a * x[i];
47 |     }
48 | }
49 |
50 | /* compare two arrays and return percentage of difference */
51 | REAL check(REAL *A, REAL *B, int n)
52 | {
53 |     int i;
54 |     REAL diffsum = 0.0, sum = 0.0;
55 |     for (i = 0; i < n; i++) {
56 |         diffsum += fabs(A[i] - B[i]);
57 |         sum += fabs(B[i]);
58 |     }
59 |     return diffsum/sum;
60 | }
61 |
62 | int main(int argc, char *argv[])
63 | {
64 |     int n;
65 |     REAL *y_cuda, *y, *x;
66 |     REAL a = 123.456;
67 |
68 |     n = VEC_LEN;
69 |     fprintf(stderr, "Usage: axpy <n>\n");
70 |     if (argc >= 2) {
71 |         n = atoi(argv[1]);
72 |     }
73 |     y_cuda = (REAL *) malloc(n * sizeof(REAL));
74 |     y = (REAL *) malloc(n * sizeof(REAL));
75 |     x = (REAL *) malloc(n * sizeof(REAL));
76 |
77 |     srand48(1<<12);
78 |     init(x, n);
79 |     init(y_cuda, n);
80 |     memcpy(y, y_cuda, n*sizeof(REAL));
81 |
82 |     axpy(x, y, n, a);
83 |
84 |     int i;
85 |     int num_runs = 10;
86 |     /* cuda version */
87 |     double elapsed = read_timer_ms();
88 |     for (i = 0; i < num_runs; i++)
89 |         axpy_cuda(x, y_cuda, n, a);
90 |     elapsed = (read_timer_ms() - elapsed) / num_runs;
91 |
92 |     REAL checkresult = check(y_cuda, y, n);
93 |     printf("axpy(%d): check: %g, avg time: %0.2f ms\n", n, checkresult, elapsed);
94 |
95 |     free(y_cuda);
96 |     free(y);
97 |     free(x);
98 |     return 0;
99 | }
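In the kernel file that follows, the same AXPY loop is distributed over threads in two contrasting ways: a block distribution, where each thread owns a contiguous chunk of the vector, and a cyclic distribution, where thread t handles elements t, t+T, t+2T, and so on. Under the cyclic scheme the 32 threads of a warp touch 32 adjacent words in each step, which the hardware can coalesce into a few wide transactions; under the block scheme a warp's simultaneous accesses are scattered far apart. A tiny host-side sketch (illustrative values only) of the index math behind the two schemes:

    #include <stdio.h>

    #define N 1024   /* illustrative vector length */
    #define T 256    /* illustrative total thread count */

    int main(void) {
        int chunk = (N + T - 1) / T;   /* block distribution: contiguous chunk per thread */
        for (int t = 0; t < 2; t++)
            printf("block  thread %d -> elements %d..%d\n", t, t * chunk, (t + 1) * chunk - 1);
        /* cyclic distribution: consecutive threads touch consecutive elements */
        for (int t = 0; t < 2; t++)
            printf("cyclic thread %d -> elements %d, %d, %d, ...\n", t, t, t + T, t + 2 * T);
        return 0;
    }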
--------------------------------------------------------------------------------
/CoMem_AXPY/axpy_cudakernel.cu:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #include "axpy.h"
7 |
8 | /* warming-up kernel: same one-thread-per-element pattern as below */
9 | __global__ void axpy_cudakernel_warmingup(REAL* x, REAL* y, int n, REAL a)
10 | {
11 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
12 |     if (i < n) y[i] += a * x[i];
13 | }
14 |
15 | /* one thread per element */
16 | __global__ void axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a)
17 | {
18 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
19 |     if (i < n) y[i] += a * x[i];
20 | }
21 |
22 | /* block distribution: each thread works on a contiguous chunk, so the
23 |    threads of a warp touch far-apart addresses (uncoalesced) */
24 | __global__ void axpy_cudakernel_block(REAL* x, REAL* y, int n, REAL a)
25 | {
26 |     int nthreads = blockDim.x * gridDim.x;
27 |     int chunk = (n + nthreads - 1) / nthreads;
28 |     int start = (blockDim.x * blockIdx.x + threadIdx.x) * chunk;
29 |     for (int i = start; i < start + chunk && i < n; i++)
30 |         y[i] += a * x[i];
31 | }
32 |
33 | /* cyclic distribution: consecutive threads touch consecutive elements,
34 |    so a warp's accesses are contiguous (coalesced) */
35 | __global__ void axpy_cudakernel_cyclic(REAL* x, REAL* y, int n, REAL a)
36 | {
37 |     int nthreads = blockDim.x * gridDim.x;
38 |     for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += nthreads)
39 |         y[i] += a * x[i];
40 | }
41 |
42 | void axpy_cuda(REAL* x, REAL* y, int n, REAL a) {
43 |     REAL *d_x, *d_y;
44 |     cudaMalloc(&d_x, n*sizeof(REAL));
45 |     cudaMalloc(&d_y, n*sizeof(REAL));
46 |
47 |     cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice);
48 |     cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice);
49 |
50 |     axpy_cudakernel_warmingup<<<(n+255)/256, 256>>>(d_x, d_y, n, a);
51 |     cudaDeviceSynchronize();
52 |     axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a);
53 |     cudaDeviceSynchronize();
54 |     axpy_cudakernel_block<<<1024, 256>>>(d_x, d_y, n, a);
55 |     cudaDeviceSynchronize();
56 |     axpy_cudakernel_cyclic<<<1024, 256>>>(d_x, d_y, n, a);
57 |     cudaDeviceSynchronize();
58 |
59 |     cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost);
60 |     cudaFree(d_x);
61 |     cudaFree(d_y);
62 | }
63 |
--------------------------------------------------------------------------------
/CoMem_AXPY/test.sh:
--------------------------------------------------------------------------------
1 | nvprof ./axpy_cuda 1024000
2 | nvprof ./axpy_cuda 4096000
3 | nvprof ./axpy_cuda 10240000
4 | nvprof ./axpy_cuda 20480000
--------------------------------------------------------------------------------
/CoMem_SpMM/Makefile:
--------------------------------------------------------------------------------
1 | default:
2 | 	nvcc -o SpMM_cuda SpMM_cuda.c SpMM_cudakernel.cu
--------------------------------------------------------------------------------
/CoMem_SpMM/SpMM.h:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #define REAL float
7 |
8 | #ifdef __cplusplus
9 | extern "C" {
10 | #endif
11 | extern void spmm_csr_cuda(const int num_rows, const int *ptrA, const int * indicesA, const REAL *dataA, const int *ptrB, const int * indicesB, const REAL *dataB, REAL* result, int nnzA, int nnzB);
12 |
13 | extern void spmm_csc_cuda(const int num_rows, const int *ptrA, const int * indicesA, const REAL *dataA, const int *ptrB, const int * indicesB, const REAL *dataB, REAL* result, int nnzA, int nnzB);
14 |
15 | #ifdef __cplusplus
16 | }
17 | #endif
--------------------------------------------------------------------------------
/CoMem_SpMM/SpMM_cuda.output.carina.txt:
--------------------------------------------------------------------------------
1 | int num_rows = 100;
2 | int nnz = 1024;
3 |
4 | This result is tested on carina.
5 | spmm_csr_kernel: two matrix are all in csr format 6 | spmm_csc_kernel: one matrix is in csr format and the other one is in csc format 7 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ 8 | xyi2@cci-carina:~/CUDAMemBench/SpMM$ nvprof ./SpMM_cuda 9 | ==10262== NVPROF is profiling process 10262, command: ./SpMM_cuda 10 | check(serial vs serial_csr):0.000000 11 | check(serial vs serial_csc):0.000000 12 | check(serial vs cuda_csr):0.000288 13 | check(serial vs cuda_csc):0.000288 14 | ==10262== Profiling application: ./SpMM_cuda 15 | ==10262== Profiling result: 16 | Type Time(%) Time Calls Avg Min Max Name 17 | GPU activities: 49.05% 48.011ms 1 48.011ms 48.011ms 48.011ms spmm_csr_csr_warmingup(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 18 | 49.05% 48.007ms 1 48.007ms 48.007ms 48.007ms spmm_csr_csr_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 19 | 0.94% 916.76us 1 916.76us 916.76us 916.76us spmm_csc_csr_warmingup(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 20 | 0.94% 915.77us 1 915.77us 915.77us 915.77us spmm_csc_csr_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 21 | 0.02% 22.784us 12 1.8980us 1.4720us 2.3360us [CUDA memcpy HtoD] 22 | 0.01% 8.2880us 2 4.1440us 4.0320us 4.2560us [CUDA memcpy DtoH] 23 | API calls: 75.16% 305.55ms 14 21.825ms 3.9930us 305.31ms cudaMalloc 24 | 24.07% 97.865ms 4 24.466ms 920.19us 48.014ms cudaDeviceSynchronize 25 | 0.40% 1.6464ms 1 1.6464ms 1.6464ms 1.6464ms cuDeviceTotalMem 26 | 0.14% 584.79us 97 6.0280us 181ns 235.37us cuDeviceGetAttribute 27 | 0.11% 437.12us 14 31.222us 3.0890us 187.25us cudaFree 28 | 0.08% 309.85us 14 22.132us 11.453us 72.366us cudaMemcpy 29 | 0.02% 85.758us 4 21.439us 12.362us 40.483us cudaLaunchKernel 30 | 0.01% 55.244us 1 55.244us 55.244us 55.244us cuDeviceGetName 31 | 0.00% 5.0020us 1 5.0020us 5.0020us 5.0020us cuDeviceGetPCIBusId 32 | 0.00% 3.6080us 3 1.2020us 385ns 2.7630us cuDeviceGetCount 33 | 0.00% 1.3200us 2 660ns 253ns 1.0670us cuDeviceGet 34 | 0.00% 337ns 1 337ns 337ns 337ns cuDeviceGetUuid 35 | -------------------------------------------------------------------------------- /CoMem_SpMM/SpMM_cuda.output.fornax.txt: -------------------------------------------------------------------------------- 1 | int num_rows = 100; 2 | int nnz = 1024; 3 | 4 | This result is tested on fornax. 5 | spmm_csr_kernel: two matrix are all in csr format 6 | spmm_csc_kernel: one matrix is in csr format and the other one is in csc format 7 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ 8 | xyi2@fornax:~/CUDAMemBench/SpMM$ nvprof ./SpMM_cuda 9 | ==30020== NVPROF is profiling process 30020, command: ./SpMM_cuda 10 | ==30020== Warning: Auto boost enabled on device 0. Profiling results may be inconsistent. 
11 | check(serial vs serial_csr):0.000000 12 | check(serial vs serial_csc):0.000000 13 | check(serial vs cuda_csr):0.000308 14 | check(serial vs cuda_csc):0.000308 15 | ==30020== Profiling application: ./SpMM_cuda 16 | ==30020== Profiling result: 17 | Type Time(%) Time Calls Avg Min Max Name 18 | GPU activities: 98.55% 354.30ms 1 354.30ms 354.30ms 354.30ms spmm_csr_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 19 | 1.44% 5.1618ms 1 5.1618ms 5.1618ms 5.1618ms spmm_csc_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 20 | 0.01% 23.392us 12 1.9490us 1.3120us 2.2400us [CUDA memcpy HtoD] 21 | 0.01% 18.752us 2 9.3760us 8.7680us 9.9840us [CUDA memcpy DtoH] 22 | API calls: 53.41% 423.54ms 14 30.253ms 7.6370us 423.02ms cudaMalloc 23 | 45.39% 359.89ms 14 25.707ms 14.310us 354.38ms cudaMemcpy 24 | 0.58% 4.5723ms 4 1.1431ms 1.1372ms 1.1488ms cuDeviceTotalMem 25 | 0.45% 3.5295ms 388 9.0960us 396ns 330.51us cuDeviceGetAttribute 26 | 0.10% 767.61us 14 54.829us 5.6940us 316.94us cudaFree 27 | 0.04% 318.99us 2 159.50us 38.867us 280.13us cudaLaunchKernel 28 | 0.04% 287.49us 4 71.873us 68.487us 80.183us cuDeviceGetName 29 | 0.00% 17.213us 4 4.3030us 3.2400us 6.1430us cuDeviceGetPCIBusId 30 | 0.00% 6.6230us 8 827ns 523ns 1.5630us cuDeviceGet 31 | 0.00% 3.4470us 3 1.1490us 633ns 1.8740us cuDeviceGetCount 32 | 0.00% 2.5630us 4 640ns 526ns 837ns cuDeviceGetUuid 33 | -------------------------------------------------------------------------------- /Common/FreeImage/include/FreeImage.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/Common/FreeImage/include/FreeImage.h -------------------------------------------------------------------------------- /Common/README.md: -------------------------------------------------------------------------------- 1 | This folder is derived from CUDA Samples. It is used to support several benchmarks which are derived from CUDA Samples, including GSOverlap, ConKernels and Taskgraph. Some header files required by these three benchmarks are stored here. 2 | -------------------------------------------------------------------------------- /Common/UtilNPP/Image.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_H 29 | #define NV_UTIL_NPP_IMAGE_H 30 | 31 | #include 32 | 33 | namespace npp 34 | { 35 | 36 | class Image 37 | { 38 | public: 39 | struct Size 40 | { 41 | unsigned int nWidth; 42 | unsigned int nHeight; 43 | 44 | Size() : nWidth(0), nHeight(0) 45 | { }; 46 | 47 | Size(unsigned int nWidthNew, unsigned nHeightNew) : nWidth(nWidthNew), nHeight(nHeightNew) 48 | { }; 49 | 50 | Size(const Size &rSize) : nWidth(rSize.nWidth), nHeight(rSize.nHeight) 51 | { }; 52 | 53 | Size & 54 | operator= (const Size &rSize) 55 | { 56 | if (&rSize == this) 57 | { 58 | return *this; 59 | } 60 | 61 | nWidth = rSize.nWidth; 62 | nHeight = rSize.nHeight; 63 | 64 | return *this; 65 | } 66 | 67 | void 68 | swap(Size &rSize) 69 | { 70 | unsigned int nTemp; 71 | nTemp = nWidth; 72 | nWidth = rSize.nWidth; 73 | rSize.nWidth = nTemp; 74 | 75 | nTemp = nHeight; 76 | nHeight = rSize.nHeight; 77 | rSize.nHeight = nTemp; 78 | } 79 | }; 80 | 81 | Image() 82 | { }; 83 | 84 | Image(unsigned int nWidth, unsigned int nHeight) : oSize_(nWidth, nHeight) 85 | { }; 86 | 87 | Image(const Image::Size &rSize) : oSize_(rSize) 88 | { }; 89 | 90 | Image(const Image &rImage) : oSize_(rImage.oSize_) 91 | { }; 92 | 93 | virtual 94 | ~Image() 95 | { }; 96 | 97 | Image & 98 | operator= (const Image &rImage) 99 | { 100 | if (&rImage == this) 101 | { 102 | return *this; 103 | } 104 | 105 | oSize_ = rImage.oSize_; 106 | return *this; 107 | }; 108 | 109 | unsigned int 110 | width() 111 | const 112 | { 113 | return oSize_.nWidth; 114 | } 115 | 116 | unsigned int 117 | height() 118 | const 119 | { 120 | return oSize_.nHeight; 121 | } 122 | 123 | Size 124 | size() 125 | const 126 | { 127 | return oSize_; 128 | } 129 | 130 | void 131 | swap(Image &rImage) 132 | { 133 | oSize_.swap(rImage.oSize_); 134 | } 135 | 136 | private: 137 | Size oSize_; 138 | }; 139 | 140 | bool 141 | operator== (const Image::Size &rFirst, const Image::Size &rSecond) 142 | { 143 | return rFirst.nWidth == rSecond.nWidth && rFirst.nHeight == rSecond.nHeight; 144 | } 145 | 146 | bool 147 | operator!= (const Image::Size &rFirst, const Image::Size &rSecond) 148 | { 149 | return rFirst.nWidth != rSecond.nWidth || rFirst.nHeight != rSecond.nHeight; 150 | } 151 | 152 | } // npp namespace 153 | 154 | 155 | #endif // NV_UTIL_NPP_IMAGE_H 156 | -------------------------------------------------------------------------------- /Common/UtilNPP/ImageAllocatorsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 29 | #define NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 30 | 31 | #include "Exceptions.h" 32 | 33 | namespace npp 34 | { 35 | 36 | template 37 | class ImageAllocatorCPU 38 | { 39 | public: 40 | static 41 | D * 42 | Malloc2D(unsigned int nWidth, unsigned int nHeight, unsigned int *pPitch) 43 | { 44 | NPP_ASSERT(nWidth * nHeight > 0); 45 | 46 | D *pResult = new D[nWidth * N * nHeight]; 47 | *pPitch = nWidth * sizeof(D) * N; 48 | 49 | return pResult; 50 | }; 51 | 52 | static 53 | void 54 | Free2D(D *pPixels) 55 | { 56 | delete[] pPixels; 57 | }; 58 | 59 | static 60 | void 61 | Copy2D(D *pDst, size_t nDstPitch, const D *pSrc, size_t nSrcPitch, size_t nWidth, size_t nHeight) 62 | { 63 | const void *pSrcLine = pSrc; 64 | void *pDstLine = pDst; 65 | 66 | for (size_t iLine = 0; iLine < nHeight; ++iLine) 67 | { 68 | // copy one line worth of data 69 | memcpy(pDst, pSrc, nWidth * N * sizeof(D)); 70 | // move data pointers to next line 71 | pDst += nDstPitch; 72 | pSrc += nSrcPitch; 73 | } 74 | }; 75 | 76 | }; 77 | 78 | } // npp namespace 79 | 80 | #endif // NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 81 | -------------------------------------------------------------------------------- /Common/UtilNPP/ImagesCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGES_CPU_H 29 | #define NV_UTIL_NPP_IMAGES_CPU_H 30 | 31 | #include "ImagePacked.h" 32 | 33 | #include "ImageAllocatorsCPU.h" 34 | #include "Exceptions.h" 35 | 36 | #include 37 | 38 | 39 | namespace npp 40 | { 41 | 42 | template 43 | class ImageCPU: public npp::ImagePacked 44 | { 45 | public: 46 | 47 | ImageCPU() 48 | { 49 | ; 50 | } 51 | 52 | ImageCPU(unsigned int nWidth, unsigned int nHeight): ImagePacked(nWidth, nHeight) 53 | { 54 | ; 55 | } 56 | 57 | explicit 58 | ImageCPU(const npp::Image::Size &rSize): ImagePacked(rSize) 59 | { 60 | ; 61 | } 62 | 63 | ImageCPU(const ImageCPU &rImage): Image(rImage) 64 | { 65 | ; 66 | } 67 | 68 | virtual 69 | ~ImageCPU() 70 | { 71 | ; 72 | } 73 | 74 | ImageCPU & 75 | operator= (const ImageCPU &rImage) 76 | { 77 | ImagePacked::operator= (rImage); 78 | 79 | return *this; 80 | } 81 | 82 | npp::Pixel & 83 | operator()(unsigned int iX, unsigned int iY) 84 | { 85 | return *ImagePacked::pixels(iX, iY); 86 | } 87 | 88 | npp::Pixel 89 | operator()(unsigned int iX, unsigned int iY) 90 | const 91 | { 92 | return *ImagePacked::pixels(iX, iY); 93 | } 94 | 95 | }; 96 | 97 | 98 | typedef ImageCPU > ImageCPU_8u_C1; 99 | typedef ImageCPU > ImageCPU_8u_C2; 100 | typedef ImageCPU > ImageCPU_8u_C3; 101 | typedef ImageCPU > ImageCPU_8u_C4; 102 | 103 | typedef ImageCPU > ImageCPU_16u_C1; 104 | typedef ImageCPU > ImageCPU_16u_C3; 105 | typedef ImageCPU > ImageCPU_16u_C4; 106 | 107 | typedef ImageCPU > ImageCPU_16s_C1; 108 | typedef ImageCPU > ImageCPU_16s_C3; 109 | typedef ImageCPU > ImageCPU_16s_C4; 110 | 111 | typedef ImageCPU > ImageCPU_32s_C1; 112 | typedef ImageCPU > ImageCPU_32s_C3; 113 | typedef ImageCPU > ImageCPU_32s_C4; 114 | 115 | typedef ImageCPU > ImageCPU_32f_C1; 116 | typedef ImageCPU > ImageCPU_32f_C3; 117 | typedef ImageCPU > ImageCPU_32f_C4; 118 | 119 | } // npp namespace 120 | 121 | #endif // NV_IMAGE_IPP_H 122 | -------------------------------------------------------------------------------- /Common/UtilNPP/Pixel.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_PIXEL_H 30 | #define NV_UTIL_PIXEL_H 31 | 32 | #include "Exceptions.h" 33 | 34 | namespace npp 35 | { 36 | template 37 | struct Pixel 38 | { }; 39 | 40 | template 41 | struct Pixel 42 | { 43 | D x; 44 | 45 | const D & 46 | operator[](size_t iChannel) 47 | const 48 | { 49 | NPP_ASSERT(iChannel < 1); 50 | return (&x)[iChannel]; 51 | } 52 | 53 | D & 54 | operator[](size_t iChannel) 55 | { 56 | NPP_ASSERT(iChannel < 1); 57 | return (&x)[iChannel]; 58 | } 59 | }; 60 | 61 | template 62 | struct Pixel 63 | { 64 | D x,y; 65 | 66 | const D & 67 | operator[](size_t iChannel) 68 | const 69 | { 70 | NPP_ASSERT(iChannel < 2); 71 | return (&x)[iChannel]; 72 | } 73 | 74 | D & 75 | operator[](size_t iChannel) 76 | { 77 | NPP_ASSERT(iChannel < 2); 78 | return (&x)[iChannel]; 79 | } 80 | }; 81 | 82 | template 83 | struct Pixel 84 | { 85 | D x,y,z; 86 | 87 | const D & 88 | operator[](size_t iChannel) 89 | const 90 | { 91 | NPP_ASSERT(iChannel < 3); 92 | return (&x)[iChannel]; 93 | } 94 | 95 | D & 96 | operator[](size_t iChannel) 97 | { 98 | NPP_ASSERT(iChannel < 3); 99 | return (&x)[iChannel]; 100 | } 101 | }; 102 | 103 | template 104 | struct Pixel 105 | { 106 | D x, y, z, w; 107 | 108 | const D & 109 | operator[](size_t iChannel) 110 | const 111 | { 112 | NPP_ASSERT(iChannel < 4); 113 | return (&x)[iChannel]; 114 | } 115 | 116 | D & 117 | operator[](size_t iChannel) 118 | { 119 | NPP_ASSERT(iChannel < 4); 120 | return (&x)[iChannel]; 121 | } 122 | }; 123 | 124 | } // npp namespace 125 | 126 | #endif // NV_UTIL_PIXEL_H 127 | -------------------------------------------------------------------------------- /Common/UtilNPP/Signal.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNAL_H 30 | #define NV_UTIL_NPP_SIGNAL_H 31 | 32 | #include 33 | 34 | namespace npp 35 | { 36 | class Signal 37 | { 38 | public: 39 | Signal() : nSize_(0) 40 | { }; 41 | 42 | explicit 43 | Signal(size_t nSize) : nSize_(nSize) 44 | { }; 45 | 46 | Signal(const Signal &rSignal) : nSize_(rSignal.nSize_) 47 | { }; 48 | 49 | virtual 50 | ~Signal() 51 | { } 52 | 53 | Signal & 54 | operator= (const Signal &rSignal) 55 | { 56 | nSize_ = rSignal.nSize_; 57 | return *this; 58 | } 59 | 60 | size_t 61 | size() 62 | const 63 | { 64 | return nSize_; 65 | } 66 | 67 | void 68 | swap(Signal &rSignal) 69 | { 70 | size_t nTemp = nSize_; 71 | nSize_ = rSignal.nSize_; 72 | rSignal.nSize_ = nTemp; 73 | } 74 | 75 | 76 | private: 77 | size_t nSize_; 78 | }; 79 | 80 | template 81 | class SignalTemplate: public Signal 82 | { 83 | public: 84 | typedef D tData; 85 | 86 | SignalTemplate(): aValues_(0) 87 | { 88 | ; 89 | } 90 | 91 | SignalTemplate(size_t nSize): Signal(nSize) 92 | , aValues_(0) 93 | { 94 | aValues_ = A::Malloc1D(size()); 95 | } 96 | 97 | SignalTemplate(const SignalTemplate &rSignal): Signal(rSignal) 98 | , aValues_(0) 99 | { 100 | aValues_ = A::Malloc1D(size()); 101 | A::Copy1D(aValues_, rSignal.values(), size()); 102 | } 103 | 104 | virtual 105 | ~SignalTemplate() 106 | { 107 | A::Free1D(aValues_); 108 | } 109 | 110 | SignalTemplate & 111 | operator= (const SignalTemplate &rSignal) 112 | { 113 | // in case of self-assignment 114 | if (&rSignal == this) 115 | { 116 | return *this; 117 | } 118 | 119 | A::Free1D(aValues_); 120 | this->aPixels_ = 0; 121 | 122 | // assign parent class's data fields (width, height) 123 | Signal::operator =(rSignal); 124 | 125 | aValues_ = A::Malloc1D(size()); 126 | A::Copy1D(aValues_, rSignal.value(), size()); 127 | 128 | return *this; 129 | } 130 | 131 | /// Get a pointer to the pixel array. 132 | /// The result pointer can be offset to pixel at position (x, y) and 133 | /// even negative offsets are allowed. 134 | /// \param nX Horizontal pointer/array offset. 135 | /// \param nY Vertical pointer/array offset. 136 | /// \return Pointer to the pixel array (or first pixel in array with coordinates (nX, nY). 
137 | tData * 138 | values(int i = 0) 139 | { 140 | return aValues_ + i; 141 | } 142 | 143 | const 144 | tData * 145 | values(int i = 0) 146 | const 147 | { 148 | return aValues_ + i; 149 | } 150 | 151 | void 152 | swap(SignalTemplate &rSignal) 153 | { 154 | Signal::swap(rSignal); 155 | 156 | tData *aTemp = this->aValues_; 157 | this->aValues_ = rSignal.aValues_; 158 | rSignal.aValues_ = aTemp; 159 | } 160 | 161 | private: 162 | D *aValues_; 163 | }; 164 | 165 | } // npp namespace 166 | 167 | 168 | #endif // NV_UTIL_NPP_SIGNAL_H 169 | -------------------------------------------------------------------------------- /Common/UtilNPP/SignalAllocatorsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 30 | #define NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 31 | 32 | #include "Exceptions.h" 33 | 34 | namespace npp 35 | { 36 | 37 | template 38 | class SignalAllocatorCPU 39 | { 40 | public: 41 | static 42 | D * 43 | Malloc1D(unsigned int nSize) 44 | { 45 | return new D[nSize];; 46 | }; 47 | 48 | static 49 | void 50 | Free1D(D *pPixels) 51 | { 52 | delete[] pPixels; 53 | }; 54 | 55 | static 56 | void 57 | Copy1D(D *pDst, const D *pSrc, size_t nSize) 58 | { 59 | memcpy(pDst, pSrc, nSize * sizeof(D)); 60 | }; 61 | 62 | }; 63 | 64 | } // npp namespace 65 | 66 | #endif // NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 67 | -------------------------------------------------------------------------------- /Common/UtilNPP/SignalsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNALS_CPU_H 30 | #define NV_UTIL_NPP_SIGNALS_CPU_H 31 | 32 | #include "Signal.h" 33 | 34 | #include "SignalAllocatorsCPU.h" 35 | #include "Exceptions.h" 36 | 37 | #include 38 | 39 | 40 | namespace npp 41 | { 42 | 43 | template 44 | class SignalCPU: public npp::SignalTemplate 45 | { 46 | public: 47 | typedef typename npp::SignalTemplate::tData tData; 48 | 49 | SignalCPU() 50 | { 51 | ; 52 | } 53 | 54 | SignalCPU(size_t nSize): SignalTemplate(nSize) 55 | { 56 | ; 57 | } 58 | 59 | SignalCPU(const SignalCPU &rSignal): SignalTemplate(rSignal) 60 | { 61 | ; 62 | } 63 | 64 | virtual 65 | ~SignalCPU() 66 | { 67 | ; 68 | } 69 | 70 | SignalCPU & 71 | operator= (const SignalCPU &rSignal) 72 | { 73 | SignalTemplate::operator= (rSignal); 74 | 75 | return *this; 76 | } 77 | 78 | tData & 79 | operator [](unsigned int i) 80 | { 81 | return *SignalTemplate::values(i); 82 | } 83 | 84 | tData 85 | operator [](unsigned int i) 86 | const 87 | { 88 | return *SignalTemplate::values(i); 89 | } 90 | 91 | }; 92 | 93 | typedef SignalCPU > SignalCPU_8u; 94 | typedef SignalCPU > SignalCPU_32s; 95 | typedef SignalCPU > SignalCPU_16s; 96 | typedef SignalCPU > SignalCPU_16sc; 97 | typedef SignalCPU > SignalCPU_32sc; 98 | typedef SignalCPU > SignalCPU_32f; 99 | typedef SignalCPU > SignalCPU_32fc; 100 | typedef SignalCPU > SignalCPU_64s; 101 | typedef SignalCPU > SignalCPU_64sc; 102 | typedef SignalCPU > SignalCPU_64f; 103 | typedef SignalCPU > SignalCPU_64fc; 104 | 105 | } // npp namespace 106 | 107 | #endif // NV_UTIL_NPP_SIGNALS_CPU_H 108 | -------------------------------------------------------------------------------- /Common/UtilNPP/SignalsNPP.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNALS_NPP_H 30 | #define NV_UTIL_NPP_SIGNALS_NPP_H 31 | 32 | #include "Exceptions.h" 33 | #include "Signal.h" 34 | 35 | #include "SignalAllocatorsNPP.h" 36 | #include 37 | 38 | namespace npp 39 | { 40 | // forward declaration 41 | template class SignalCPU; 42 | 43 | template 44 | class SignalNPP: public npp::SignalTemplate > 45 | { 46 | public: 47 | SignalNPP() 48 | { 49 | ; 50 | } 51 | 52 | explicit 53 | SignalNPP(size_t nSize): SignalTemplate >(nSize) 54 | { 55 | ; 56 | } 57 | 58 | SignalNPP(const SignalNPP &rSignal): SignalTemplate >(rSignal) 59 | { 60 | ; 61 | } 62 | 63 | template 64 | explicit 65 | SignalNPP(const SignalCPU &rSignal): SignalTemplate >(rSignal.size()) 66 | { 67 | npp::SignalAllocator::HostToDeviceCopy1D(SignalTemplate >::values(), 68 | rSignal.values(), SignalTemplate >::size()); 69 | } 70 | 71 | virtual 72 | ~SignalNPP() 73 | { 74 | ; 75 | } 76 | 77 | SignalNPP & 78 | operator= (const SignalNPP &rSignal) 79 | { 80 | SignalTemplate >::operator= (rSignal); 81 | 82 | return *this; 83 | } 84 | 85 | void 86 | copyTo(D *pValues) 87 | const 88 | { 89 | npp::SignalAllocator::DeviceToHostCopy1D(pValues, SignalTemplate >::values(), SignalTemplate >::size()); 90 | } 91 | 92 | void 93 | copyFrom(D *pValues) 94 | { 95 | npp::SignalAllocator::HostToDeviceCopy1D(SignalTemplate >::values(), pValues, SignalTemplate >::size()); 96 | } 97 | }; 98 | 99 | typedef SignalNPP SignalNPP_8u; 100 | typedef SignalNPP SignalNPP_16s; 101 | typedef SignalNPP SignalNPP_16sc; 102 | typedef SignalNPP SignalNPP_32s; 103 | typedef SignalNPP SignalNPP_32sc; 104 | typedef SignalNPP SignalNPP_32f; 105 | typedef SignalNPP SignalNPP_32fc; 106 | typedef SignalNPP SignalNPP_64s; 107 | typedef SignalNPP SignalNPP_64sc; 108 | typedef SignalNPP SignalNPP_64f; 109 | typedef SignalNPP SignalNPP_64fc; 110 | 111 | } // npp namespace 112 | 113 | #endif // NV_UTIL_NPP_SIGNALS_NPP_H 114 | 
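The signal containers above come in host/device pairs: SignalCPU keeps its values in new[]-allocated host memory, SignalNPP allocates through NPP on the device, and the SignalNPP(const SignalCPU &) constructor plus copyTo/copyFrom move data between the two. A minimal usage sketch, assuming CUDA, NPP, and the headers above are available on the include path:

    #include "SignalsCPU.h"   // host-side container
    #include "SignalsNPP.h"   // device-side container

    int main()
    {
        npp::SignalCPU_32f hostSignal(1024);            // 1024 floats on the host
        for (size_t i = 0; i < hostSignal.size(); ++i)
            hostSignal[i] = static_cast<Npp32f>(i);     // fill through operator[]

        npp::SignalNPP_32f devSignal(hostSignal);       // host -> device copy in the ctor

        npp::SignalCPU_32f roundTrip(devSignal.size());
        devSignal.copyTo(roundTrip.values());           // device -> host copy
        return 0;
    }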
-------------------------------------------------------------------------------- /Common/helper_functions.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | // These are helper functions for the SDK samples (string parsing, 29 | // timers, image helpers, etc) 30 | #ifndef COMMON_HELPER_FUNCTIONS_H_ 31 | #define COMMON_HELPER_FUNCTIONS_H_ 32 | 33 | #ifdef WIN32 34 | #pragma warning(disable : 4996) 35 | #endif 36 | 37 | // includes, project 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | // includes, timer, string parsing, image helpers 51 | #include // helper functions for image compare, dump, data comparisons 52 | #include // helper functions for string parsing 53 | #include // helper functions for timers 54 | 55 | #ifndef EXIT_WAIVED 56 | #define EXIT_WAIVED 2 57 | #endif 58 | 59 | #endif // COMMON_HELPER_FUNCTIONS_H_ 60 | -------------------------------------------------------------------------------- /Common/helper_multiprocess.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef HELPER_MULTIPROCESS_H 29 | #define HELPER_MULTIPROCESS_H 30 | 31 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 32 | #ifndef WIN32_LEAN_AND_MEAN 33 | #define WIN32_LEAN_AND_MEAN 34 | #endif 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #else 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #endif 55 | #include 56 | 57 | typedef struct sharedMemoryInfo_st { 58 | void *addr; 59 | size_t size; 60 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 61 | HANDLE shmHandle; 62 | #else 63 | int shmFd; 64 | #endif 65 | } sharedMemoryInfo; 66 | 67 | int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info); 68 | 69 | int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info); 70 | 71 | void sharedMemoryClose(sharedMemoryInfo *info); 72 | 73 | 74 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 75 | typedef PROCESS_INFORMATION Process; 76 | #else 77 | typedef pid_t Process; 78 | #endif 79 | 80 | int spawnProcess(Process *process, const char *app, char * const *args); 81 | 82 | int waitProcess(Process *process); 83 | 84 | #define checkIpcErrors(ipcFuncResult) \ 85 | if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); } 86 | 87 | #if defined(__linux__) 88 | struct ipcHandle_st { 89 | int socket; 90 | char *socketName; 91 | }; 92 | typedef int ShareableHandle; 93 | #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 94 | struct ipcHandle_st { 95 | std::vector hMailslot; // 1 Handle in case of child and `num children` Handles for parent. 
96 | }; 97 | typedef HANDLE ShareableHandle; 98 | #endif 99 | 100 | typedef struct ipcHandle_st ipcHandle; 101 | 102 | int 103 | ipcCreateSocket(ipcHandle *&handle, const char *name, const std::vector& processes); 104 | 105 | int 106 | ipcOpenSocket(ipcHandle *&handle); 107 | 108 | int 109 | ipcCloseSocket(ipcHandle *handle); 110 | 111 | int 112 | ipcRecvShareableHandles(ipcHandle *handle, std::vector& shareableHandles); 113 | 114 | int 115 | ipcSendShareableHandles(ipcHandle *handle, const std::vector& shareableHandles, const std::vector& processes); 116 | 117 | int 118 | ipcCloseShareableHandle(ShareableHandle shHandle); 119 | 120 | #endif // HELPER_MULTIPROCESS_H 121 | -------------------------------------------------------------------------------- /Common/rendercheck_d3d11.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | //////////////////////////////////////////////////////////////////////////////// 29 | // 30 | // Utility funcs to wrap up saving a surface or the back buffer as a PPM file 31 | // In addition, wraps up a threshold comparision of two PPMs. 32 | // 33 | // These functions are designed to be used to implement an automated QA testing for SDK samples. 34 | // 35 | // Author: Bryan Dudash 36 | // Email: sdkfeedback@nvidia.com 37 | // 38 | // Copyright (c) NVIDIA Corporation. All rights reserved. 
39 | //////////////////////////////////////////////////////////////////////////////// 40 | 41 | #include <rendercheck_d3d11.h> 42 | #include <helper_functions.h> 43 | 44 | HRESULT CheckRenderD3D11::ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName) 45 | { 46 | ID3D11DeviceContext *pDeviceCtxt; 47 | pDevice->GetImmediateContext(&pDeviceCtxt); 48 | ID3D11RenderTargetView *pRTV = NULL; 49 | pDeviceCtxt->OMGetRenderTargets(1,&pRTV,NULL); 50 | 51 | ID3D11Resource *pSourceResource = NULL; 52 | pRTV->GetResource(&pSourceResource); 53 | 54 | return ResourceToPPM(pDevice,pSourceResource,zFileName); 55 | } 56 | 57 | HRESULT CheckRenderD3D11::ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName) 58 | { 59 | ID3D11DeviceContext *pDeviceCtxt; 60 | pDevice->GetImmediateContext(&pDeviceCtxt); 61 | D3D11_RESOURCE_DIMENSION rType; 62 | pResource->GetType(&rType); 63 | 64 | if (rType != D3D11_RESOURCE_DIMENSION_TEXTURE2D) 65 | { 66 | printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n"); 67 | return E_FAIL; 68 | } 69 | 70 | ID3D11Texture2D *pSourceTexture = (ID3D11Texture2D *)pResource; 71 | ID3D11Texture2D *pTargetTexture = NULL; 72 | 73 | D3D11_TEXTURE2D_DESC desc; 74 | pSourceTexture->GetDesc(&desc); 75 | desc.BindFlags = 0; 76 | desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; 77 | desc.Usage = D3D11_USAGE_STAGING; 78 | 79 | if (FAILED(pDevice->CreateTexture2D(&desc,NULL,&pTargetTexture))) 80 | { 81 | printf("SurfaceToPPM: Unable to create target Texture resource! Aborting... \n"); 82 | return E_FAIL; 83 | } 84 | 85 | pDeviceCtxt->CopyResource(pTargetTexture,pSourceTexture); 86 | 87 | D3D11_MAPPED_SUBRESOURCE mappedTex2D; 88 | pDeviceCtxt->Map(pTargetTexture, 0, D3D11_MAP_READ,0,&mappedTex2D); 89 | 90 | // Need to convert from dx pitch to pitch=width 91 | unsigned char *pPPMData = new unsigned char[desc.Width*desc.Height*4]; 92 | 93 | for (unsigned int iHeight = 0; iHeight < desc.Height; iHeight++) 94 | { 95 | memcpy(&(pPPMData[iHeight*desc.Width*4]), (unsigned char *)(mappedTex2D.pData) + iHeight*mappedTex2D.RowPitch, desc.Width*4); 96 | } 97 | 98 | pDeviceCtxt->Unmap(pTargetTexture, 0); 99 | 100 | // Prepends the PPM header info and bumps byte data afterwards 101 | sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height); 102 | 103 | delete [] pPPMData; 104 | pTargetTexture->Release(); 105 | 106 | return S_OK; 107 | } 108 | 109 | bool CheckRenderD3D11::PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, 110 | const float epsilon, const float threshold) 111 | { 112 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path); 113 | 114 | if (ref_file_path == NULL) 115 | { 116 | printf("CheckRenderD3D11::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, exec_path); 117 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); 118 | printf("Aborting comparison!\n"); 119 | printf(" FAILURE!\n"); 120 | return false; 121 | } 122 | 123 | return sdkComparePPM(src_file,ref_file_path,epsilon,threshold,true) == true; 124 | } -------------------------------------------------------------------------------- /Common/rendercheck_d3d11.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer.
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #pragma once 29 | 30 | #ifndef _RENDERCHECK_D3D11_H_ 31 | #define _RENDERCHECK_D3D11_H_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | class CheckRenderD3D11 40 | { 41 | public: 42 | 43 | CheckRenderD3D11() {} 44 | 45 | static HRESULT ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName); 46 | static HRESULT ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName); 47 | 48 | static bool PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, 49 | const float epsilon, const float threshold = 0.0f); 50 | }; 51 | 52 | #endif -------------------------------------------------------------------------------- /Conkernels/NsightEclipse.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | concurrentKernels 5 | 6 | whole 7 | 8 | ./ 9 | ../ 10 | ../../common/inc 11 | 12 | 13 | Performance Strategies 14 | 15 | 16 | CUDA 17 | Concurrent Kernels 18 | 19 | 20 | 21 | 22 | 23 | true 24 | concurrentKernels.cu 25 | 26 | 1:CUDA Advanced Topics 27 | 1:Performance Strategies 28 | 29 | sm35 30 | sm37 31 | sm50 32 | sm52 33 | sm60 34 | sm61 35 | sm70 36 | sm72 37 | sm75 38 | sm80 39 | sm86 40 | 41 | 42 | x86_64 43 | linux 44 | 45 | 46 | windows7 47 | 48 | 49 | x86_64 50 | macosx 51 | 52 | 53 | arm 54 | 55 | 56 | ppc64le 57 | linux 58 | 59 | 60 | 61 | all 62 | 63 | Concurrent Kernels 64 | 65 | -------------------------------------------------------------------------------- /Conkernels/README.md: -------------------------------------------------------------------------------- 1 | # concurrentKernels - Concurrent Kernels 2 | 3 | ## Description 4 | 5 | This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function. 
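To make the pattern concrete, here is a minimal, self-contained sketch of streams plus `cudaStreamWaitEvent` (illustrative code, not taken from the sample; `kernelA` and `kernelB` are placeholder kernels):

```
#include <cuda_runtime.h>

__global__ void kernelA(float *d, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) d[i] += 1.0f; }
__global__ void kernelB(float *d, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) d[i] *= 2.0f; }

int main() {
  const int n = 1 << 20;
  float *d; cudaMalloc(&d, n * sizeof(float)); cudaMemset(d, 0, n * sizeof(float));
  cudaStream_t s1, s2; cudaStreamCreate(&s1); cudaStreamCreate(&s2);
  cudaEvent_t done; cudaEventCreate(&done);

  kernelA<<<(n + 255) / 256, 256, 0, s1>>>(d, n);  // launched into stream s1
  cudaEventRecord(done, s1);                       // event fires once kernelA completes
  cudaStreamWaitEvent(s2, done, 0);                // make stream s2 wait on that event
  kernelB<<<(n + 255) / 256, 256, 0, s2>>>(d, n);  // ordered after kernelA despite the separate stream

  cudaDeviceSynchronize();
  cudaFree(d); cudaStreamDestroy(s1); cudaStreamDestroy(s2); cudaEventDestroy(done);
  return 0;
}
```

Kernels launched into different streams with no such dependency between them are free to run concurrently, which is the behavior this sample measures.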
6 | 7 | ## Key Concepts 8 | 9 | Performance Strategies 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ## Prerequisites 26 | 27 | Download and install the [CUDA Toolkit 11.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 28 | 29 | ## Build and Run 30 | 31 | ### Windows 32 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 33 | ``` 34 | *_vs<version>.sln - for Visual Studio <version> 35 | ``` 36 | Each individual sample has its own set of solution files in its directory: 37 | 38 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 39 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 40 | 41 | ### Linux 42 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 43 | ``` 44 | $ cd <sample_dir> 45 | $ make 46 | ``` 47 | The sample makefiles can take advantage of certain options: 48 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 49 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
50 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
51 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 52 | * **dbg=1** - build with debug symbols 53 | ``` 54 | $ make dbg=1 55 | ``` 56 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 57 | ``` 58 | $ make SMS="50 60" 59 | ``` 60 | 61 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 62 | ``` 63 | $ make HOST_COMPILER=g++ 64 | ``` 65 | 66 | ## References (for more details) 67 | 68 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2015.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 14.00 3 | # Visual Studio 2015 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2015.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | concurrentKernels_vs2015 19 | concurrentKernels 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v140 27 | 28 | 29 | true 30 | 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(Platform)/$(Configuration)/ 44 | $(IncludePath) 45 | AllRules.ruleset 46 | 47 | 48 | 49 | 50 | ../../bin/win64/$(Configuration)/ 51 | 52 | 53 | 54 | Level3 55 | WIN32;_MBCS;%(PreprocessorDefinitions) 56 | ./;$(CudaToolkitDir)/include;../../Common; 57 | 58 | 59 | Console 60 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 61 | $(CudaToolkitLibDir); 62 | $(OutDir)/concurrentKernels.exe 63 | 64 | 65 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 66 | -Xcompiler "/wd 4819" %(AdditionalOptions) 67 | ./;../../Common 68 | WIN32 69 | 70 | 71 | 72 | 73 | Disabled 74 | MultiThreadedDebug 75 | 76 | 77 | true 78 | Default 79 | 80 | 81 | MTd 82 | 64 83 | 84 | 85 | 86 | 87 | MaxSpeed 88 | MultiThreaded 89 | 90 | 91 | false 92 | UseLinkTimeCodeGeneration 93 | 94 | 95 | 
MT 96 | 64 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2017.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2017 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2017.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | concurrentKernels_vs2017 19 | concurrentKernels 20 | 21 | 22 | 23 | $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) 24 | $(LatestTargetPlatformVersion) 25 | $(WindowsTargetPlatformVersion) 26 | 27 | 28 | 29 | Application 30 | MultiByte 31 | v141 32 | 33 | 34 | true 35 | 36 | 37 | true 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | $(Platform)/$(Configuration)/ 49 | $(IncludePath) 50 | AllRules.ruleset 51 | 52 | 53 | 54 | 55 | ../../bin/win64/$(Configuration)/ 56 | 57 | 58 | 59 | Level3 60 | WIN32;_MBCS;%(PreprocessorDefinitions) 61 | ./;$(CudaToolkitDir)/include;../../Common; 62 | 63 | 64 | Console 65 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 66 | $(CudaToolkitLibDir); 67 | $(OutDir)/concurrentKernels.exe 68 | 69 | 70 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 71 | -Xcompiler "/wd 4819" %(AdditionalOptions) 72 | ./;../../Common 73 | WIN32 74 | 75 | 76 | 77 | 78 | Disabled 79 | MultiThreadedDebug 80 | 81 | 82 | true 83 | Default 84 | 85 | 86 | MTd 87 | 64 88 | 89 | 90 | 91 | 92 | MaxSpeed 93 | MultiThreaded 94 | 95 | 96 | false 97 | UseLinkTimeCodeGeneration 98 | 99 | 100 | MT 101 | 64 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2019.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2019 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2019.vcxproj", 
"{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2019.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | concurrentKernels_vs2019 19 | concurrentKernels 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v142 27 | 10.0 28 | 29 | 30 | true 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | $(Platform)/$(Configuration)/ 45 | $(IncludePath) 46 | AllRules.ruleset 47 | 48 | 49 | 50 | 51 | ../../bin/win64/$(Configuration)/ 52 | 53 | 54 | 55 | Level3 56 | WIN32;_MBCS;%(PreprocessorDefinitions) 57 | ./;$(CudaToolkitDir)/include;../../Common; 58 | 59 | 60 | Console 61 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 62 | $(CudaToolkitLibDir); 63 | $(OutDir)/concurrentKernels.exe 64 | 65 | 66 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 67 | -Xcompiler "/wd 4819" %(AdditionalOptions) 68 | ./;../../Common 69 | WIN32 70 | 71 | 72 | 73 | 74 | Disabled 75 | MultiThreadedDebug 76 | 77 | 78 | true 79 | Default 80 | 81 | 82 | MTd 83 | 64 84 | 85 | 86 | 87 | 88 | MaxSpeed 89 | MultiThreaded 90 | 91 | 92 | false 93 | UseLinkTimeCodeGeneration 94 | 95 | 96 | MT 97 | 64 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /DynParallel/.gitignore: -------------------------------------------------------------------------------- 1 | *.png -------------------------------------------------------------------------------- /DynParallel/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | CUDAFLAGS= -arch=sm_86 --cudart=shared -rdc=true -Xcompiler -fopenmp -lpng 4 | 5 | OPT= -g -G 6 | 7 | RM=/bin/rm -f 8 | 9 | all: Dynamic_Parallelism Non_Dynamic_Parallelism 10 | 11 | 12 | main: Dynamic_Parallelism.o Non_Dynamic_Parallelism.o 13 | 14 | ${NVCC} ${OPT} -o main Dynamic_Parallelism.o 15 | ${NVCC} ${OPT} -o main Non_Dynamic_Parallelism.o 16 | 17 | 18 | Dynamic_Parallelism.o: Dynamic_Parallelism.cu Non_Dynamic_Parallelism.cu 19 | 20 | $(NVCC) ${OPT} $(CUDAFLAGS) -c Dynamic_Parallelism.cu 21 | $(NVCC) ${OPT} $(CUDAFLAGS) -c Non_Dynamic_Parallelism.cu 22 | 23 | Dynamic_Parallelism: Dynamic_Parallelism.o Non_Dynamic_Parallelism.o 24 | 25 | ${NVCC} ${CUDAFLAGS} -o Dynamic_Parallelism 
Dynamic_Parallelism.o 26 | ${NVCC} ${CUDAFLAGS} -o Non_Dynamic_Parallelism Non_Dynamic_Parallelism.o 27 | 28 | clean: 29 | 30 | ${RM} *.o Dynamic_Parallelism 31 | ${RM} *.o Non_Dynamic_Parallelism -------------------------------------------------------------------------------- /DynParallel/lib/libpng.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/libpng.lib -------------------------------------------------------------------------------- /DynParallel/lib/libpngd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/libpngd.lib -------------------------------------------------------------------------------- /DynParallel/lib/zlibstat.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/zlibstat.lib -------------------------------------------------------------------------------- /DynParallel/lib/zlibstatd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/zlibstatd.lib -------------------------------------------------------------------------------- /GSOverlap/NsightEclipse.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | globalToShmemAsyncCopy 5 | 6 | --std=c++11 7 | 8 | 9 | cudaEventCreate 10 | cudaEventRecord 11 | cudaEventQuery 12 | cudaEventDestroy 13 | cudaEventElapsedTime 14 | cudaEventSynchronize 15 | cudaMalloc 16 | cudaFree 17 | cudaMemcpy 18 | 19 | 20 | whole 21 | 22 | ./ 23 | ../ 24 | ../../common/inc 25 | 26 | 27 | CUDA Runtime API 28 | Linear Algebra 29 | CPP11 CUDA 30 | 31 | 32 | CUDA 33 | matrix multiply 34 | Async copy 35 | CPP11 36 | GCC 5.0.0 37 | 38 | 39 | 40 | 41 | 42 | true 43 | globalToShmemAsyncCopy.cu 44 | 45 | CPP11 46 | 47 | 48 | 1:CUDA Basic Topics 49 | 3:Linear Algebra 50 | 51 | sm35 52 | sm37 53 | sm50 54 | sm52 55 | sm60 56 | sm61 57 | sm70 58 | sm72 59 | sm75 60 | sm80 61 | sm86 62 | 63 | 64 | x86_64 65 | linux 66 | 67 | 68 | x86_64 69 | macosx 70 | 71 | 72 | arm 73 | 74 | 75 | ppc64le 76 | linux 77 | 78 | 79 | aarch64 80 | linux 81 | 82 | 83 | aarch64 84 | qnx 85 | 86 | 87 | windows7 88 | 89 | 90 | 91 | all 92 | 93 | Global Memory to Shared Memory Async Copy 94 | 95 | -------------------------------------------------------------------------------- /GSOverlap/README.md: -------------------------------------------------------------------------------- 1 | # globalToShmemAsyncCopy - Global Memory to Shared Memory Async Copy 2 | 3 | ## Description 4 | 5 | This sample implements matrix multiplication which uses asynchronous copy of data from global to shared memory when on compute capability 8.0 or higher. Also demonstrates arrive-wait barrier for synchronization. 
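For background, the async-copy pattern at the heart of the sample looks roughly like the sketch below (illustrative only, using the cooperative-groups `memcpy_async`/`wait` API available since CUDA 11; `scale` is a placeholder kernel, and on devices below compute capability 8.0 the copy falls back to a synchronous path):

```
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>

namespace cg = cooperative_groups;

__global__ void scale(const float *in, float *out, int n, float a) {
  __shared__ float tile[256];                        // staging tile in shared memory
  cg::thread_block block = cg::this_thread_block();
  int base = blockIdx.x * blockDim.x;
  if (base + (int)blockDim.x <= n) {
    // All threads in the block cooperatively issue the global->shared copy;
    // on SM 8.0+ this maps to the hardware async-copy path and bypasses registers.
    cg::memcpy_async(block, tile, in + base, sizeof(float) * blockDim.x);
    cg::wait(block);                                 // wait for the copy to land before using the tile
    out[base + threadIdx.x] = a * tile[threadIdx.x];
  }
}
```

Launching with a block size of 256 (e.g. `scale<<<n / 256, 256>>>(in, out, n, 2.0f)` for n a multiple of 256) keeps the shared-memory tile and the thread block in step.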
6 | 7 | ## Key Concepts 8 | 9 | CUDA Runtime API, Linear Algebra, CPP11 CUDA 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows, QNX 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l, aarch64 22 | 23 | ## CUDA APIs involved 24 | 25 | ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) 26 | cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventElapsedTime, cudaEventSynchronize, cudaMalloc, cudaFree, cudaMemcpy 27 | 28 | ## Dependencies needed to build/run 29 | [CPP11](../../README.md#cpp11) 30 | 31 | ## Prerequisites 32 | 33 | Download and install the [CUDA Toolkit 11.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 34 | Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. 35 | 36 | ## Build and Run 37 | 38 | ### Windows 39 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 40 | ``` 41 | *_vs<version>.sln - for Visual Studio <version> 42 | ``` 43 | Each individual sample has its own set of solution files in its directory: 44 | 45 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 46 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 47 | 48 | ### Linux 49 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 50 | ``` 51 | $ cd <sample_dir> 52 | $ make 53 | ``` 54 | The sample makefiles can take advantage of certain options: 55 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. 56 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
57 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
58 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 59 | * **dbg=1** - build with debug symbols 60 | ``` 61 | $ make dbg=1 62 | ``` 63 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 64 | ``` 65 | $ make SMS="50 60" 66 | ``` 67 | 68 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 69 | ``` 70 | $ make HOST_COMPILER=g++ 71 | ``` 72 | 73 | ## References (for more details) 74 | 75 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2015.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 14.00 3 | # Visual Studio 2015 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2015.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | globalToShmemAsyncCopy_vs2015 19 | globalToShmemAsyncCopy 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v140 27 | 28 | 29 | true 30 | 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(Platform)/$(Configuration)/ 44 | $(IncludePath) 45 | AllRules.ruleset 46 | 47 | 48 | 49 | 50 | ../../bin/win64/$(Configuration)/ 51 | 52 | 53 | 54 | Level3 55 | WIN32;_MBCS;%(PreprocessorDefinitions) 56 | ./;$(CudaToolkitDir)/include;../../Common; 57 | 58 | 59 | Console 60 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 61 | $(CudaToolkitLibDir); 62 | $(OutDir)/globalToShmemAsyncCopy.exe 63 | 64 | 65 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 66 | -Xcompiler "/wd 4819" %(AdditionalOptions) 67 | ./;../../Common 68 | WIN32 69 | 70 | 71 | 72 | 73 | Disabled 74 | MultiThreadedDebug 75 | 76 | 77 | true 78 | Default 79 | 80 | 81 | MTd 82 | 64 83 | 84 | 85 | 86 | 87 | MaxSpeed 88 | MultiThreaded 89 | 90 | 91 | false 92 | 
UseLinkTimeCodeGeneration 93 | 94 | 95 | MT 96 | 64 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2017.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2017 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2017.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | globalToShmemAsyncCopy_vs2017 19 | globalToShmemAsyncCopy 20 | 21 | 22 | 23 | $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) 24 | $(LatestTargetPlatformVersion) 25 | $(WindowsTargetPlatformVersion) 26 | 27 | 28 | 29 | Application 30 | MultiByte 31 | v141 32 | 33 | 34 | true 35 | 36 | 37 | true 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | $(Platform)/$(Configuration)/ 49 | $(IncludePath) 50 | AllRules.ruleset 51 | 52 | 53 | 54 | 55 | ../../bin/win64/$(Configuration)/ 56 | 57 | 58 | 59 | Level3 60 | WIN32;_MBCS;%(PreprocessorDefinitions) 61 | ./;$(CudaToolkitDir)/include;../../Common; 62 | 63 | 64 | Console 65 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 66 | $(CudaToolkitLibDir); 67 | $(OutDir)/globalToShmemAsyncCopy.exe 68 | 69 | 70 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 71 | -Xcompiler "/wd 4819" %(AdditionalOptions) 72 | ./;../../Common 73 | WIN32 74 | 75 | 76 | 77 | 78 | Disabled 79 | MultiThreadedDebug 80 | 81 | 82 | true 83 | Default 84 | 85 | 86 | MTd 87 | 64 88 | 89 | 90 | 91 | 92 | MaxSpeed 93 | MultiThreaded 94 | 95 | 96 | false 97 | UseLinkTimeCodeGeneration 98 | 99 | 100 | MT 101 | 64 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2019.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2019 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = 
"globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2019.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | globalToShmemAsyncCopy_vs2019 19 | globalToShmemAsyncCopy 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v142 27 | 10.0 28 | 29 | 30 | true 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | $(Platform)/$(Configuration)/ 45 | $(IncludePath) 46 | AllRules.ruleset 47 | 48 | 49 | 50 | 51 | ../../bin/win64/$(Configuration)/ 52 | 53 | 54 | 55 | Level3 56 | WIN32;_MBCS;%(PreprocessorDefinitions) 57 | ./;$(CudaToolkitDir)/include;../../Common; 58 | 59 | 60 | Console 61 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 62 | $(CudaToolkitLibDir); 63 | $(OutDir)/globalToShmemAsyncCopy.exe 64 | 65 | 66 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 67 | -Xcompiler "/wd 4819" %(AdditionalOptions) 68 | ./;../../Common 69 | WIN32 70 | 71 | 72 | 73 | 74 | Disabled 75 | MultiThreadedDebug 76 | 77 | 78 | true 79 | Default 80 | 81 | 82 | MTd 83 | 64 84 | 85 | 86 | 87 | 88 | MaxSpeed 89 | MultiThreaded 90 | 91 | 92 | false 93 | UseLinkTimeCodeGeneration 94 | 95 | 96 | MT 97 | 64 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /HDOverlap/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o axpy_cuda axpy_cudakernel.cu 3 | -------------------------------------------------------------------------------- /HDOverlap/axpy_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test for new function memcpy_async in CUDA11 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "cuda_runtime.h" 16 | #include "device_launch_parameters.h" 17 | 18 | double read_timer_ms() { 19 | struct timeb tm; 20 | ftime(&tm); 21 | return (double) tm.time * 1000.0 + (double) tm.millitm; 22 | } 23 | 24 | /* change this to do saxpy or daxpy : single precision or double precision*/ 25 | #define REAL double 26 | #define VEC_LEN 1024000 //use a fixed number for now 27 | /* zero out the entire vector */ 28 | void zero(REAL *A, int n) 29 | { 30 | int i; 31 | for (i = 0; i < n; i++) { 32 | A[i] = 0.0; 33 | } 34 | } 35 | 36 | __global__ 37 | void 38 | axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a) 39 | { 40 | int i = blockDim.x * blockIdx.x + threadIdx.x; 41 | if (i > 0 &&i < n) y[i] += a*x[i]; 42 | } 43 | 44 | double axpy_cuda_normal(REAL* x, REAL* y, int n, REAL a) { 45 | REAL *d_x, *d_y; 46 | cudaMalloc(&d_x, n*sizeof(REAL)); 47 | cudaMalloc(&d_y, n*sizeof(REAL)); 48 | double time = read_timer_ms(); 49 | 50 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 51 | cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 52 | time = read_timer_ms() - time; 53 | 54 | // Perform axpy elements 55 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 56 | cudaDeviceSynchronize(); 57 | 58 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 59 | cudaFree(d_x); 60 | cudaFree(d_y); 61 | return time; 62 | } 63 | 64 | double axpy_cuda_async(REAL* x, REAL* y, int n, REAL a) { 65 | cudaStream_t stream1; 66 | cudaError_t result; 67 | result = cudaStreamCreate(&stream1); 68 | 69 | REAL *d_x, *d_y; 70 | cudaMalloc(&d_x, n*sizeof(REAL)); 71 | cudaMalloc(&d_y, n*sizeof(REAL)); 72 | double time2 = read_timer_ms(); 73 | 74 | cudaMemcpyAsync(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice, stream1); 75 | cudaMemcpyAsync(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice, stream1); 76 | time2 = read_timer_ms() - time2; 77 | 78 | 79 | //cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 80 | //cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 81 | // Perform axpy elements 82 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 83 | cudaDeviceSynchronize(); 84 | 85 | 86 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 87 | cudaFree(d_x); 88 | cudaFree(d_y); 89 | return time2; 90 | } 91 | 92 | 93 | 94 | /* initialize a vector with random floating point numbers */ 95 | void init(REAL *A, int n) 96 | { 97 | int i; 98 | for (i = 0; i < n; i++) { 99 | A[i] = (double)drand48(); 100 | } 101 | } 102 | 103 | /*serial version */ 104 | void axpy(REAL* x, REAL* y, long n, REAL a) { 105 | int i; 106 | for (i = 1; i < n; ++i) 107 | { 108 | y[i] += a * x[i]; 109 | } 110 | } 111 | 112 | /* compare two arrays and return percentage of difference */ 113 | REAL check(REAL*A, REAL*B, int n) 114 | { 115 | int i; 116 | REAL diffsum =0.0, sum = 0.0; 117 | for (i = 0; i < n; i++) { 118 | diffsum += fabs(A[i] - B[i]); 119 | sum += fabs(B[i]); 120 | } 121 | return diffsum/sum; 122 | } 123 | 124 | int main(int argc, char *argv[]) 125 | { 126 | int n; 127 | REAL *y_cuda, *y, *x, *y_cuda_async; 128 | REAL a = 123.456; 129 | 130 | n = VEC_LEN; 131 | fprintf(stderr, "Usage: axpy \n"); 132 | if (argc >= 
2) { 133 | n = atoi(argv[1]); 134 | } 135 | y_cuda = (REAL *) malloc(n * sizeof(REAL)); 136 | y_cuda_async = (REAL *) malloc(n * sizeof(REAL)); 137 | y = (REAL *) malloc(n * sizeof(REAL)); 138 | x = (REAL *) malloc(n * sizeof(REAL)); 139 | 140 | srand48(1<<12); 141 | init(x, n); 142 | init(y_cuda, n); 143 | memcpy(y, y_cuda, n*sizeof(REAL)); 144 | memcpy(y_cuda_async, y_cuda, n*sizeof(REAL)); 145 | 146 | int i; 147 | int num_runs = 10; 148 | for (i=0; i 5 | axpy(1024000): checksum: 0.99919, time: 13.30ms 6 | axpy_async(1024000): checksum: 1.19903, time: 13.30ms 7 | Usage: axpy 8 | axpy(4096000): checksum: 0.99919, time: 94.20ms 9 | axpy_async(4096000): checksum: 1.19903, time: 98.20ms 10 | Usage: axpy 11 | axpy(10240000): checksum: 0.999191, time: 246.70ms 12 | axpy_async(10240000): checksum: 1.19903, time: 243.60ms 13 | Usage: axpy 14 | axpy(20480000): checksum: 0.999191, time: 518.00ms 15 | axpy_async(20480000): checksum: 1.19903, time: 500.00ms 16 | Usage: axpy 17 | axpy(40960000): checksum: 0.999191, time: 1021.20ms 18 | axpy_async(40960000): checksum: 1.19903, time: 989.60ms 19 | Usage: axpy 20 | axpy(102400000): checksum: 0.999191, time: 2395.00ms 21 | axpy_async(102400000): checksum: 1.19903, time: 2370.10ms 22 | -------------------------------------------------------------------------------- /HDOverlap/test.sh: -------------------------------------------------------------------------------- 1 | ./axpy_cuda 1024000 2 | ./axpy_cuda 4096000 3 | ./axpy_cuda 10240000 4 | ./axpy_cuda 20480000 5 | ./axpy_cuda 40960000 6 | ./axpy_cuda 102400000 -------------------------------------------------------------------------------- /LICENSE_BSD.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 - 2021 HPCAS Lab (https://passlab.github.io) 2 | from University of North Carolina at Charlotte, and 3 | Lawrence Livermore National Laboratory, LLC. 4 | 5 | LLNL-CODE-825202 6 | 7 | All rights reserved. 8 | 9 | Funding for this research and development was provided by the National Science Foundation 10 | under award number CISE SHF-1551182 and CISE SHF-2015254. 11 | The development is also funded by LLNL under Contract DE-AC52-07NA27344 and LLNL-LDRD Program 12 | under project 18-ERD-006. 13 | 14 | Redistribution and use in source and binary forms, with or without modification, 15 | are permitted provided that the following conditions are met: 16 | 17 | 1. Redistributions of source code must retain the above copyright notice, 18 | this list of conditions and the following disclaimer. 19 | 2. Redistributions in binary form must reproduce the above copyright notice, 20 | this list of conditions and the following disclaimer in the documentation 21 | and/or other materials provided with the distribution. 22 | 3. Neither the name of the copyright holder nor the names of its contributors may 23 | be used to endorse or promote products derived from this software without specific prior written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 26 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 27 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 31 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | -------------------------------------------------------------------------------- /MemAlign/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -g -G -arch=sm_30 -o axpy_cuda axpy_cuda.c axpy_cudakernel.cu 3 | -------------------------------------------------------------------------------- /MemAlign/axpy.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL double 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern void axpy_cuda(REAL *x, REAL * y, int n, REAL a); 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /MemAlign/axpy_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental tests for aligned memory access and unaligned memory access 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <math.h> 11 | #include <sys/timeb.h> 12 | #include "axpy.h" 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define REAL double 22 | #define VEC_LEN 1024000 //use a fixed number for now 23 | /* zero out the entire vector */ 24 | void zero(REAL *A, int n) 25 | { 26 | int i; 27 | for (i = 0; i < n; i++) { 28 | A[i] = 0.0; 29 | } 30 | } 31 | 32 | /* initialize a vector with random floating point numbers */ 33 | void init(REAL *A, int n) 34 | { 35 | int i; 36 | for (i = 0; i < n; i++) { 37 | A[i] = (double)drand48(); 38 | } 39 | } 40 | 41 | /*serial version */ 42 | void axpy(REAL* x, REAL* y, long n, REAL a) { 43 | int i; 44 | for (i = 1; i < n; ++i) 45 | { 46 | y[i] += a * x[i]; 47 | } 48 | } 49 | 50 | /* compare two arrays and return percentage of difference */ 51 | REAL check(REAL*A, REAL*B, int n) 52 | { 53 | int i; 54 | REAL diffsum =0.0, sum = 0.0; 55 | for (i = 0; i < n; i++) { 56 | diffsum += fabs(A[i] - B[i]); 57 | sum += fabs(B[i]); 58 | } 59 | return diffsum/sum; 60 | } 61 | 62 | int main(int argc, char *argv[]) 63 | { 64 | int n; 65 | REAL *y_cuda, *y, *x; 66 | REAL a = 123.456; 67 | 68 | n = VEC_LEN; 69 | fprintf(stderr, "Usage: axpy \n"); 70 | if (argc >= 2) { 71 | n = atoi(argv[1]); 72 | } 73 | y_cuda = (REAL *) malloc(n * sizeof(REAL)); 74 | y = (REAL *) malloc(n * sizeof(REAL)); 75 | x = (REAL *) malloc(n * sizeof(REAL)); 76 | 77 | srand48(1<<12); 78 | init(x, n); 79 | init(y_cuda, n); 80 | memcpy(y, y_cuda, n*sizeof(REAL)); 81 | 82 | int i; 83 | int num_runs = 10; 84 | for (i=0; i<num_runs; i++) axpy_cuda(x, y_cuda, n, a); -------------------------------------------------------------------------------- /MemAlign/axpy_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #include "axpy.h" 7 | 8 | __global__ 9 | void 10 | axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a) 11 | { 12 | int i = blockDim.x * blockIdx.x + threadIdx.x; 13 | if (i > 0 && i < n) y[i] += a*x[i]; 14 | } 15 | 16 | __global__ 17 | void 18 | axpy_cudakernel_1perThread_misaligned(REAL* x, REAL* y, int n, REAL a) 19 | { 20 | int i = blockDim.x * blockIdx.x + threadIdx.x + 1; 21 | if (i < n) y[i] += a*x[i]; 22 | } 23 | 24 | __global__ 25 | void 26 | axpy_cudakernel_1perThread_warmup(REAL* x, REAL* y, int n, REAL a) 27 | { 28 | int i = blockDim.x * blockIdx.x + threadIdx.x; 29 | if (i > 1 && i < n) y[i] += a*x[i]; 30 | } 31 | 32 | 33 | void axpy_cuda(REAL* x, REAL* y, int n, REAL a) { 34 | REAL *d_x, *d_y; 35 | cudaMalloc(&d_x, n*sizeof(REAL)); 36 | cudaMalloc(&d_y, n*sizeof(REAL)); 37 | 38 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 39 | cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 40 | 41 | //warm up 42 | axpy_cudakernel_1perThread_warmup<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 43 | cudaDeviceSynchronize(); 44 | // Perform axpy elements 45 | axpy_cudakernel_1perThread_misaligned<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 46 | cudaDeviceSynchronize(); 47 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 48 | cudaDeviceSynchronize(); 49 | 50 | 51 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 52 | cudaFree(d_x); 53 | cudaFree(d_y); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /MemAlign/test.sh: -------------------------------------------------------------------------------- 1 | nvprof ./axpy_cuda 1024000 2 | nvprof ./axpy_cuda 4096000 3 | nvprof ./axpy_cuda 10240000 4 | nvprof ./axpy_cuda 20480000 5 |
nvprof ./axpy_cuda 40960000 6 | nvprof ./axpy_cuda 102400000 -------------------------------------------------------------------------------- /MiniTransfer_SpMV/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o SpMV_cuda SpMV_cuda.c SpMV_cudakernel.cu 3 | -------------------------------------------------------------------------------- /MiniTransfer_SpMV/SpMV.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern double spmv_cuda_csr_discrete(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y_normal); 12 | extern double spmv_cuda_dense_discrete(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y_normal); 13 | extern double warmingup(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y_normal); 14 | extern void init_csr(int *ptr, REAL *data, int *indices, REAL *matrix, int num_rows, int nnz); 15 | extern double warmingup_dense(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 16 | extern double warmingup_csr(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 17 | extern void init_index(int * row, int * column, REAL *matrix, int num_rows); 18 | extern double spmv_cuda_unified(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 19 | extern double spmv_cuda_unified_count(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 20 | extern void init_index_count(int * row_nnz, int * row, int * column, REAL *matrix, int num_rows); 21 | //extern void init_ptr(int *ptr, REAL * matrix, int num_rows, int nnz); 22 | extern double read_timer_ms(); 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | -------------------------------------------------------------------------------- /MiniTransfer_SpMV/SpMV_cuda.output.carina.txt: -------------------------------------------------------------------------------- 1 | xyi2@cci-carina:~/CUDAMemBench/SpMV_new$ sh test.sh 2 | Usage: SpMV 3 | Spmv (dense) (524288): time: 2.30ms 4 | Spmv (csr) (524288): time: 14.88ms 5 | Spmv (unified) (524288): time: 68.68ms 6 | Spmv (unified_count) (524288): time: 68.44ms 7 | Usage: SpMV 8 | Spmv (dense) (524288): time: 1.60ms 9 | Spmv (csr) (524288): time: 12.98ms 10 | Spmv (unified) (524288): time: 68.92ms 11 | Spmv (unified_count) (524288): time: 69.24ms 12 | Usage: SpMV 13 | Spmv (dense) (262144): time: 1.62ms 14 | Spmv (csr) (262144): time: 8.38ms 15 | Spmv (unified) (262144): time: 37.80ms 16 | Spmv (unified_count) (262144): time: 37.92ms 17 | Usage: SpMV 18 | Spmv (dense) (131072): time: 1.62ms 19 | Spmv (csr) (131072): time: 5.28ms 20 | Spmv (unified) (131072): time: 20.70ms 21 | Spmv (unified_count) (131072): time: 20.80ms 22 | Usage: SpMV 23 | Spmv (dense) (65535): time: 1.60ms 24 | Spmv (csr) (65535): time: 4.04ms 25 | Spmv (unified) (65535): time: 12.58ms 26 | Spmv (unified_count) (65535): time: 12.58ms 27 | Usage: SpMV 28 | Spmv (dense) (32768): time: 1.58ms 29 | Spmv (csr) (32768): time: 3.24ms 30 | Spmv (unified) (32768): 
time: 8.32ms 31 | Spmv (unified_count) (32768): time: 8.30ms 32 | Usage: SpMV 33 | Spmv (dense) (16384): time: 1.58ms 34 | Spmv (csr) (16384): time: 3.08ms 35 | Spmv (unified) (16384): time: 6.40ms 36 | Spmv (unified_count) (16384): time: 6.32ms 37 | Usage: SpMV 38 | Spmv (dense) (8192): time: 1.58ms 39 | Spmv (csr) (8192): time: 2.80ms 40 | Spmv (unified) (8192): time: 5.82ms 41 | Spmv (unified_count) (8192): time: 5.68ms 42 | Usage: SpMV 43 | Spmv (dense) (4096): time: 1.58ms 44 | Spmv (csr) (4096): time: 2.74ms 45 | Spmv (unified) (4096): time: 5.44ms 46 | Spmv (unified_count) (4096): time: 5.40ms 47 | Usage: SpMV 48 | Spmv (dense) (2048): time: 1.60ms 49 | Spmv (csr) (2048): time: 2.70ms 50 | Spmv (unified) (2048): time: 5.64ms 51 | Spmv (unified_count) (2048): time: 5.42ms 52 | Usage: SpMV 53 | Spmv (dense) (1024): time: 1.62ms 54 | Spmv (csr) (1024): time: 2.68ms 55 | Spmv (unified) (1024): time: 5.22ms 56 | Spmv (unified_count) (1024): time: 5.22ms 57 | Usage: SpMV 58 | Spmv (dense) (512): time: 1.62ms 59 | Spmv (csr) (512): time: 2.66ms 60 | Spmv (unified) (512): time: 5.20ms 61 | Spmv (unified_count) (512): time: 5.14ms 62 | Usage: SpMV 63 | Spmv (dense) (256): time: 1.60ms 64 | Spmv (csr) (256): time: 2.64ms 65 | Spmv (unified) (256): time: 5.14ms 66 | Spmv (unified_count) (256): time: 5.04ms 67 | Usage: SpMV 68 | Spmv (dense) (128): time: 1.60ms 69 | Spmv (csr) (128): time: 2.64ms 70 | Spmv (unified) (128): time: 5.18ms 71 | Spmv (unified_count) (128): time: 5.22ms 72 | Usage: SpMV 73 | Spmv (dense) (64): time: 1.64ms 74 | Spmv (csr) (64): time: 2.64ms 75 | Spmv (unified) (64): time: 4.98ms 76 | Spmv (unified_count) (64): time: 4.94ms 77 | Usage: SpMV 78 | Spmv (dense) (32): time: 1.62ms 79 | Spmv (csr) (32): time: 2.66ms 80 | Spmv (unified) (32): time: 4.84ms 81 | Spmv (unified_count) (32): time: 4.86ms 82 | Usage: SpMV 83 | Spmv (dense) (16): time: 1.64ms 84 | Spmv (csr) (16): time: 2.66ms 85 | Spmv (unified) (16): time: 4.54ms 86 | Spmv (unified_count) (16): time: 4.58ms 87 | Usage: SpMV 88 | Spmv (dense) (8): time: 1.64ms 89 | Spmv (csr) (8): time: 2.66ms 90 | Spmv (unified) (8): time: 4.40ms 91 | Spmv (unified_count) (8): time: 4.38ms 92 | -------------------------------------------------------------------------------- /MiniTransfer_SpMV/test.sh: -------------------------------------------------------------------------------- 1 | ./SpMV_cuda 67108864 10240 2 | ./SpMV_cuda 33554432 10240 3 | ./SpMV_cuda 16777216 10240 4 | ./SpMV_cuda 8388608 10240 5 | ./SpMV_cuda 4194304 10240 6 | ./SpMV_cuda 2097152 10240 7 | ./SpMV_cuda 1048576 10240 8 | ./SpMV_cuda 524288 10240 9 | ./SpMV_cuda 262144 10240 10 | ./SpMV_cuda 131072 10240 11 | ./SpMV_cuda 65536 10240 12 | ./SpMV_cuda 32768 10240 13 | ./SpMV_cuda 16384 10240 14 | ./SpMV_cuda 8192 10240 15 | ./SpMV_cuda 4096 10240 16 | ./SpMV_cuda 2048 10240 17 | ./SpMV_cuda 1024 10240 18 | ./SpMV_cuda 512 10240 19 | ./SpMV_cuda 256 10240 20 | ./SpMV_cuda 128 10240 21 | ./SpMV_cuda 64 10240 22 | ./SpMV_cuda 32 10240 23 | ./SpMV_cuda 16 10240 24 | ./SpMV_cuda 8 10240 25 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This work was produced under the auspices of the U.S. Department of 2 | Energy by Lawrence Livermore National Laboratory under Contract 3 | DE-AC52-07NA27344. 4 | 5 | This work was prepared as an account of work sponsored by an agency of 6 | the United States Government. 
Neither the United States Government nor 7 | Lawrence Livermore National Security, LLC, nor any of their employees 8 | makes any warranty, expressed or implied, or assumes any legal liability 9 | or responsibility for the accuracy, completeness, or usefulness of any 10 | information, apparatus, product, or process disclosed, or represents that 11 | its use would not infringe privately owned rights. 12 | 13 | Reference herein to any specific commercial product, process, or service 14 | by trade name, trademark, manufacturer, or otherwise does not necessarily 15 | constitute or imply its endorsement, recommendation, or favoring by the 16 | United States Government or Lawrence Livermore National Security, LLC. 17 | 18 | The views and opinions of authors expressed herein do not necessarily 19 | state or reflect those of the United States Government or Lawrence 20 | Livermore National Security, LLC, and shall not be used for advertising 21 | or product endorsement purposes. 22 | -------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o axpy_cuda axpy_cuda.c axpy_cudakernel.cu 3 | -------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/axpy.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern void axpy_cuda(REAL *x, REAL * y, int n, REAL a); 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/axpy_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test for texture memory using 1-D array 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <math.h> 11 | #include <sys/timeb.h> 12 | #include "axpy.h" 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define VEC_LEN 1024000//use a fixed number for now 22 | /* zero out the entire vector */ 23 | void zero(REAL *A, int n) 24 | { 25 | int i; 26 | for (i = 0; i < n; i++) { 27 | A[i] = 0.0; 28 | } 29 | } 30 | 31 | /* initialize a vector with random floating point numbers */ 32 | void init(REAL *A, int n) 33 | { 34 | int i; 35 | for (i = 0; i < n; i++) { 36 | A[i] = (float)drand48(); 37 | } 38 | } 39 | 40 | /*serial version */ 41 | void axpy(REAL* x, REAL* y, long n, REAL a) { 42 | int i; 43 | for (i = 0; i < n; ++i) 44 | { 45 | y[i] += a * x[i]; 46 | } 47 | } 48 | 49 | /* compare two arrays and return the total absolute difference */ 50 | REAL check(REAL*A, REAL*B, int n) 51 | { 52 | int i; 53 | REAL diffsum =0.0, sum = 0.0; 54 | for (i = 0; i < n; i++) { 55 | diffsum += fabs(A[i] - B[i]); 56 | sum += fabs(B[i]); 57 | } 58 | return diffsum; 59 | } 60 | 61 | int main(int argc, char *argv[]) 62 | { 63 | int n; 64 | REAL *y_cuda, *y, *x; 65 | REAL a = 123.456; 66 | 67 | n = VEC_LEN; 68 | fprintf(stderr, "Usage: axpy <n>\n"); 69 | if (argc >= 2) { 70 | n = atoi(argv[1]); 71 | } 72 | y_cuda = (REAL *) malloc(n * sizeof(REAL)); 73 | y = (REAL *) malloc(n * sizeof(REAL)); 74 | x = (REAL *) malloc(n * sizeof(REAL)); 75 | 76 | srand48(1<<12); 77 | init(x, n); 78 | init(y_cuda, n); 79 | memcpy(y, y_cuda, n*sizeof(REAL)); 80 | 81 | axpy(x, y, n, a); 82 | 83 | int i; 84 | int num_runs = 10; 85 | /* cuda version */ 86 | double elapsed = read_timer_ms(); 87 | for (i=0; i<num_runs; i++) axpy_cuda(x, y_cuda, n, a); 88 | elapsed = (read_timer_ms() - elapsed)/num_runs; 89 | 90 | /* the tail of main() was lost in extraction; minimally reconstructed below: 91 | verify against the serial result, report the average time, then clean up */ 92 | printf("axpy(%d): check (total abs diff): %g, avg time: %g ms\n", n, check(y_cuda, y, n), elapsed); 93 | 94 | free(y_cuda); 95 | free(y); 96 | free(x); 97 | return 0; 98 | }
-------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/axpy_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #include "axpy.h" 7 | 8 | 9 | #include <stdio.h> 10 | 11 | // 1-D float texture reference; bound to the read-only input vector x in axpy_cuda() 12 | texture<float, 1, cudaReadModeElementType> rT1; 13 | 14 | __global__ 15 | void 16 | axpy_cudakernel_warmingup(REAL* x, REAL* y, int n, REAL a) 17 | { 18 | int i = blockDim.x * blockIdx.x + threadIdx.x; 19 | if (i < n) y[i] += a*x[i]; 20 | } 21 | 22 | 23 | __global__ 24 | void 25 | axpy_cudakernel_1perThread_texture(REAL* y, int n, REAL a) 26 | { 27 | int i = blockDim.x * blockIdx.x + threadIdx.x; 28 | if (i < n) y[i] += a * tex1Dfetch(rT1, i); 29 | } 30 | 31 | __global__ 32 | void 33 | axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a) 34 | { 35 | int i = blockDim.x * blockIdx.x + threadIdx.x; 36 | if (i < n) y[i] += a*x[i]; 37 | } 38 | 39 | void axpy_cuda(REAL* x, REAL* y, int n, REAL a) { 40 | REAL *d_x, *d_y; 41 | cudaMalloc(&d_x, n*sizeof(REAL)); 42 | cudaMalloc(&d_y, n*sizeof(REAL)); 43 | 44 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 45 | cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 46 | 47 | cudaBindTexture(0, rT1, d_x); 48 | 49 | // Perform axpy elements 50 | axpy_cudakernel_warmingup<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 51 | cudaDeviceSynchronize(); 52 | axpy_cudakernel_1perThread_texture<<<(n+255)/256, 256>>>(d_y, n, a); 53 | cudaDeviceSynchronize(); 54 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 55 | cudaDeviceSynchronize(); 56 | 57 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 58 | cudaUnbindTexture(rT1); 59 | 60 | cudaFree(d_x); 61 | cudaFree(d_y); 62 | } 63 | 
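The kernel file above uses the legacy texture-reference API (`texture<...>`, `cudaBindTexture`, `tex1Dfetch(ref, i)`), which is deprecated and was removed in CUDA 12; on current toolkits the same read-only-fetch experiment has to go through texture objects. A minimal sketch follows; the kernel and helper names here are illustrative, not part of the benchmark:
```
// Sketch: 1-D read-only fetch via the texture-object API (CUDA 5.0+).
// Names (axpy_1perThread_texobj, axpy_texobj) are illustrative only.
#include <cuda_runtime.h>

__global__ void axpy_1perThread_texobj(cudaTextureObject_t xTex, float* y, int n, float a)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) y[i] += a * tex1Dfetch<float>(xTex, i);   // read x through the texture path
}

void axpy_texobj(const float* x, float* y, int n, float a)
{
    float *d_x, *d_y;
    cudaMalloc(&d_x, n * sizeof(float));
    cudaMalloc(&d_y, n * sizeof(float));
    cudaMemcpy(d_x, x, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, n * sizeof(float), cudaMemcpyHostToDevice);

    // Describe the linear device buffer backing the texture.
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = d_x;
    resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = n * sizeof(float);

    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t xTex = 0;
    cudaCreateTextureObject(&xTex, &resDesc, &texDesc, NULL);

    axpy_1perThread_texobj<<<(n + 255) / 256, 256>>>(xTex, d_y, n, a);
    cudaDeviceSynchronize();

    cudaMemcpy(y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDestroyTextureObject(xTex);
    cudaFree(d_x);
    cudaFree(d_y);
}
```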
-------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/test.sh: -------------------------------------------------------------------------------- 1 | nvprof ./axpy_cuda 1024000 2 | nvprof ./axpy_cuda 4096000 3 | nvprof ./axpy_cuda 10240000 4 | nvprof ./axpy_cuda 20480000 5 | nvprof ./axpy_cuda 102400000 -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o matadd_2D_cuda matadd_2D_cuda.c matadd_2D_cudakernel.cu 3 | -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/matadd_2D.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | extern void matadd(float * h_flMat1, float * h_flMat2, int iMatSizeM, int iMatSizeN, float * h_flMatSum); 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/matadd_2D_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test for texture memory using 2-D array 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <math.h> 11 | #include <sys/timeb.h> 12 | #include "matadd_2D.h" 13 | #include /* the names of these four headers were lost in extraction */ 14 | #include 15 | #include 16 | #include 17 | 18 | double read_timer_ms() { 19 | struct timeb tm; 20 | ftime(&tm); 21 | return (double) tm.time * 1000.0 + (double) tm.millitm; 22 | } 23 | 24 | #define VEC_LEN 1024//use a fixed number for now 25 | 26 | 27 | /* zero out the entire vector */ 28 | void zero(REAL *A, int n) 29 | { 30 | int i; 31 | for (i = 0; i < n; i++) { 32 | A[i] = 0.0; 33 | } 34 | } 35 | 36 | /* initialize a matrix with random floating point numbers */ 37 | void init_matrix(REAL *matrix, int m, int n) { 38 | for (int i = 0; i < m; i++) { 39 | for (int j = 0; j < n; j++) { 40 | matrix[i*n + j] = (REAL) drand48(); 41 | } 42 | } 43 | } 44 | 45 | /* NOTE: the bodies of the serial reference and the check routine, and the head of main(), 46 | were lost in extraction; the lines below are a minimal reconstruction consistent 47 | with how the surviving code calls them */ 48 | void mat_add_serial(REAL *A, REAL *B, int m, int n, REAL *C) { 49 | for (int i = 0; i < m; i++) { 50 | for (int j = 0; j < n; j++) { 51 | C[i*n + j] = A[i*n + j] + B[i*n + j]; 52 | } 53 | } 54 | } 55 | 56 | /* compare two arrays and return the relative difference */ 57 | REAL check(REAL *A, REAL *B, int n) 58 | { 59 | REAL diffsum = 0.0, sum = 0.0; 60 | for (int i = 0; i < n; i++) { 61 | diffsum += fabs(A[i] - B[i]); 62 | sum += fabs(B[i]); 63 | } 64 | return diffsum/sum; 65 | } 66 | 67 | int main(int argc, char *argv[]) 68 | { 69 | int N = VEC_LEN; 70 | fprintf(stderr, "Usage: matadd_2D <N>\n"); 71 | if (argc >= 2) { 72 | N = atoi(argv[1]); 73 | } 74 | 75 | int M=N; 76 | 77 | REAL *h_matrixA = (REAL*)malloc(M * N * sizeof(REAL)); 78 | REAL *h_matrixB = (REAL*)malloc(M * N * sizeof(REAL)); 79 | REAL *h_result = (REAL*)malloc(M * N * sizeof(REAL)); 80 | REAL *result_serial = (REAL*)malloc(M * N * sizeof(REAL)); 81 | 82 | init_matrix(h_matrixA, M, N); 83 | init_matrix(h_matrixB, M, N); 84 | 85 | int i; 86 | int num_runs = 5; 87 | mat_add_serial(h_matrixA, h_matrixB, M, N, result_serial); 88 | for (i=0; i<num_runs; i++) matadd(h_matrixA, h_matrixB, M, N, h_result); 89 | 90 | /* tail of main() reconstructed: verify against the serial result, then clean up */ 91 | printf("matadd_2D(%dx%d): check (relative diff): %g\n", M, N, check(h_result, result_serial, M * N)); 92 | 93 | free(h_matrixA); 94 | free(h_matrixB); 95 | free(h_result); 96 | free(result_serial); 97 | return 0; 98 | }
-------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/matadd_2D_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #include "matadd_2D.h" 7 | 8 | 9 | #include <stdio.h> 10 | 11 | #define BLOCK_SIZE 16 12 | 13 | texture<float, 2, cudaReadModeElementType> texMatrixA; 14 | texture<float, 2, cudaReadModeElementType> texMatrixB; 15 | 16 | //constant memory 17 | __constant__ int cons_M; 18 | __constant__ int cons_N; 19 | 20 | __global__ void add_warmingup(float * d_matrixA, float * d_matrixB, float *d_Result, int d_M, int d_N) 21 | { 22 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 23 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 24 | if(tidx < d_N && tidy < d_M) 25 | d_Result[tidy * d_N + tidx] = d_matrixA[tidy * d_N + tidx] + d_matrixB[tidy * d_N + tidx]; 26 | } 27 | 28 | /* NOTE: the kernel bodies from here through the head of matadd() were lost in extraction; 29 | the four kernels below are minimal reconstructions matching the launches in matadd() */ 30 | __global__ void add(float * d_matrixA, float * d_matrixB, float *d_Result, int d_M, int d_N) 31 | { 32 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 33 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 34 | if(tidx < d_N && tidy < d_M) 35 | d_Result[tidy * d_N + tidx] = d_matrixA[tidy * d_N + tidx] + d_matrixB[tidy * d_N + tidx]; 36 | } 37 | 38 | __global__ void add_const(float * d_matrixA, float * d_matrixB, float *d_Result) 39 | { 40 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 41 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 42 | if(tidx < cons_N && tidy < cons_M) 43 | d_Result[tidy * cons_N + tidx] = d_matrixA[tidy * cons_N + tidx] + d_matrixB[tidy * cons_N + tidx]; 44 | } 45 | 46 | __global__ void add_texture(float *d_Result, int d_M, int d_N) 47 | { 48 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 49 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 50 | if(tidx < d_N && tidy < d_M) 51 | d_Result[tidy * d_N + tidx] = tex2D(texMatrixA, tidx, tidy) + tex2D(texMatrixB, tidx, tidy); 52 | } 53 | 54 | __global__ void add_texture_constant(float *d_Result) 55 | { 56 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 57 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 58 | if(tidx < cons_N && tidy < cons_M) 59 | d_Result[tidy * cons_N + tidx] = tex2D(texMatrixA, tidx, tidy) + tex2D(texMatrixB, tidx, tidy); 60 | } 61 | 62 | void matadd(float * h_matrixA, float * h_matrixB, int M, int N, float * h_result) 63 | { 64 | cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); 65 | float *d_matrixA = NULL, *d_matrixB = NULL, *d_result = NULL; 66 | cudaMalloc(&d_matrixA, M * N * sizeof(float)); 67 | cudaMalloc(&d_matrixB, M * N * sizeof(float)); 68 | cudaMalloc(&d_result, M * N * sizeof(float)); 69 | 70 | cudaMemcpy(d_matrixA, h_matrixA, M * N * sizeof(float), cudaMemcpyHostToDevice); 71 | cudaMemcpy(d_matrixB, h_matrixB, M * N * sizeof(float), cudaMemcpyHostToDevice); 72 | cudaBindTexture2D(0, texMatrixA, d_matrixA, channelDesc, N, M, M * sizeof(float)); 73 | cudaBindTexture2D(0, texMatrixB, d_matrixB, channelDesc, N, M, M * sizeof(float)); 74 | 75 | cudaMemcpyToSymbol(cons_M,&M,sizeof(int),0); 76 | cudaMemcpyToSymbol(cons_N,&N,sizeof(int),0); 77 | 78 | dim3 blocks(1,1,1); 79 | dim3 threadsperblock(BLOCK_SIZE,BLOCK_SIZE,1); 80 | blocks.x=((M/BLOCK_SIZE) + (((M)%BLOCK_SIZE)==0?0:1)); 81 | blocks.y=((N/BLOCK_SIZE) + (((N)%BLOCK_SIZE)==0?0:1)); 82 | 83 | add_warmingup<<<blocks, threadsperblock>>>(d_matrixA,d_matrixB,d_result,M,N); 84 | cudaDeviceSynchronize(); 85 | add<<<blocks, threadsperblock>>>(d_matrixA,d_matrixB,d_result,M,N); 86 | cudaDeviceSynchronize(); 87 | add_const<<<blocks, threadsperblock>>>(d_matrixA,d_matrixB,d_result); 88 | cudaDeviceSynchronize(); 89 | add_texture<<<blocks, threadsperblock>>>(d_result,M,N); 90 | cudaDeviceSynchronize(); 91 | add_texture_constant<<<blocks, threadsperblock>>>(d_result); 92 | cudaDeviceSynchronize(); 93 | 94 | cudaMemcpy(h_result,d_result,M * N * sizeof(float), cudaMemcpyDeviceToHost); 95 | cudaUnbindTexture(texMatrixA); 96 | cudaUnbindTexture(texMatrixB); 97 | 98 | cudaFree(d_matrixA); 99 | cudaFree(d_matrixB); 100 | cudaFree(d_result); 101 | } 102 | -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/test.sh: 
-------------------------------------------------------------------------------- 1 | nvprof ./matadd_2D_cuda 1024 2 | nvprof ./matadd_2D_cuda 10240 3 | nvprof ./matadd_2D_cuda 20480 4 | nvprof ./matadd_2D_cuda 40960 -------------------------------------------------------------------------------- /Shmem/Makefile: -------------------------------------------------------------------------------- 1 | default: mm_omp_cuda 2 | 3 | clean: 4 | rm -rf ${OBJS} *.log *.out 5 | 6 | mm_omp_cuda: mm_omp_cuda.c mm_omp_cuda.h mm_kernel.cu 7 | nvcc mm_omp_cuda.c mm_kernel.cu -o mm_omp_cuda.out 8 | 9 | -------------------------------------------------------------------------------- /Shmem/mm_omp_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | /* 7 | * Square matrix multiplication 8 | * A[N][N] * B[N][N] = C[N][N] 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "mm_omp_cuda.h" 17 | 18 | #define ALLOWED_DIFF 0.0001 19 | 20 | /* read timer in second */ 21 | double read_timer() { 22 | struct timeb tm; 23 | ftime(&tm); 24 | return (double) tm.time + (double) tm.millitm / 1000.0; 25 | } 26 | 27 | /* read timer in ms */ 28 | double read_timer_ms() { 29 | struct timeb tm; 30 | ftime(&tm); 31 | return (double) tm.time * 1000.0 + (double) tm.millitm; 32 | } 33 | 34 | #define REAL double 35 | 36 | void init(int N, REAL *A) { 37 | int i, j; 38 | 39 | for (i = 0; i < N; i++) { 40 | for (j = 0; j < N; j++) { 41 | A[i*N+j] = (REAL) drand48(); 42 | } 43 | } 44 | } 45 | 46 | 47 | void matmul_serial(int N, REAL *A, REAL *B, REAL *C) { 48 | int i,j,k; 49 | REAL temp; 50 | for (i = 0; i < N; i++) { 51 | for (j = 0; j < N; j++) { 52 | temp = 0; 53 | for (k = 0; k < N; k++) { 54 | temp += (A[i * N + k] * B[k * N + j]); 55 | } 56 | C[i * N + j] = temp; 57 | } 58 | } 59 | } 60 | 61 | int main(int argc, char *argv[]) { 62 | int N; 63 | 64 | int num_threads = 4; /* 4 is default number of threads */ 65 | if (argc < 2) { 66 | fprintf(stderr, "Usage: mm (default %d) [] (default %d)\n", N, num_threads); 67 | exit(1); 68 | } 69 | N = atoi(argv[1]); 70 | 71 | double elapsed_shmem; 72 | double elapsed_cuda; 73 | 74 | REAL *A = malloc(sizeof(REAL)*N*N); 75 | REAL *B = malloc(sizeof(REAL)*N*N); 76 | REAL *C_shmem = malloc(sizeof(REAL)*N*N); 77 | REAL *C = malloc(sizeof(REAL)*N*N); 78 | REAL *C_serial = malloc(sizeof(REAL)*N*N); 79 | 80 | srand48((1 << 12)); 81 | init(N, A); 82 | init(N, B); 83 | 84 | int i, j; 85 | int num_runs = 10; 86 | 87 | matmul_serial(N, A, B, C_serial); 88 | mm_kernel_shmem(A, B, C_shmem,N); 89 | 90 | elapsed_cuda = read_timer(); 91 | for (i=0; i ALLOWED_DIFF) { 106 | printf("C[%d][%d]: %g, C_omp[%d][%d]: %g\n", i, j, C[i * N + j], i, j, C_serial[i * N + j]); 107 | break; 108 | } 109 | } 110 | }; 111 | 112 | printf("======================================================================================================\n"); 113 | printf("\tMatrix Multiplication: A[N][N] * B[N][N] = C[N][N], N=%d\n", N); 114 | 
printf("------------------------------------------------------------------------------------------------------\n"); 115 | printf("Performance:\t\tRuntime (ms)\t MFLOPS\n"); 116 | printf("------------------------------------------------------------------------------------------------------\n"); 117 | printf("matmul_cuda:\t\t%4f\t%4f\n", elapsed_cuda * 1.0e3, ((((2.0 * N) * N) * N) / (1.0e6 * elapsed_cuda))); 118 | printf("------------------------------------------------------------------------------------------------------\n"); 119 | printf("matmul_shmem:\t\t%4f\t%4f\n", elapsed_shmem * 1.0e3, ((((2.0 * N) * N) * N) / (1.0e6 * elapsed_shmem))); 120 | 121 | return 0; 122 | } 123 | 124 | 125 | -------------------------------------------------------------------------------- /Shmem/mm_omp_cuda.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL double 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern void mm_kernel(REAL*, REAL*, REAL*, int); 12 | extern void mm_kernel_shmem(REAL*, REAL*, REAL*, int); 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/README.md: -------------------------------------------------------------------------------- 1 | # reduction - CUDA Parallel Reduction 2 | 3 | ## Description 4 | 5 | A parallel sum reduction that computes the sum of a large arrays of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction. 6 | 7 | ## Key Concepts 8 | 9 | Data-Parallel Algorithms, Performance Strategies 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows, MacOSX 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ## Prerequisites 26 | 27 | Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 28 | 29 | ## Build and Run 30 | 31 | ### Windows 32 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 33 | ``` 34 | *_vs.sln - for Visual Studio 35 | ``` 36 | Each individual sample has its own set of solution files in its directory: 37 | 38 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 
39 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 40 | 41 | ### Linux 42 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 43 | ``` 44 | $ cd <sample_dir> 45 | $ make 46 | ``` 47 | The samples' makefiles can take advantage of certain options: 48 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 49 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
50 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
51 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 52 | * **dbg=1** - build with debug symbols 53 | ``` 54 | $ make dbg=1 55 | ``` 56 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 57 | ``` 58 | $ make SMS="50 60" 59 | ``` 60 | 61 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 62 | ``` 63 | $ make HOST_COMPILER=g++ 64 | ``` 65 | 66 | ### Mac 67 | The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: 68 | ``` 69 | $ cd 70 | $ make 71 | ``` 72 | 73 | The samples makefiles can take advantage of certain options: 74 | 75 | * **dbg=1** - build with debug symbols 76 | ``` 77 | $ make dbg=1 78 | ``` 79 | 80 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". 81 | ``` 82 | $ make SMS="A B ..." 83 | ``` 84 | 85 | * **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 86 | ``` 87 | $ make HOST_COMPILER=clang 88 | ``` 89 | 90 | ## References (for more details) 91 | 92 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/reduction.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | 29 | #ifndef __REDUCTION_H__ 30 | #define __REDUCTION_H__ 31 | 32 | template 33 | void reduce(int size, int threads, int blocks, 34 | int whichKernel, T *d_idata, T *d_odata); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/result.txt: -------------------------------------------------------------------------------- 1 | ./reduction.out Starting... 2 | 3 | GPU Device 0: "Volta" with compute capability 7.0 4 | 5 | Using Device 0: Tesla V100-PCIE-32GB 6 | 7 | Reducing array of type int 8 | 9 | 16777216 elements 10 | 256 threads (max) 11 | 32768 blocks 12 | 13 | Reduction, Throughput = 67.8752 GB/s, Time = 0.00099 s, Size = 16777216 Elements, NumDevsUsed = 1, Workgroup = 256 14 | 15 | GPU result = 2139353471 16 | CPU result = 2139353471 17 | 18 | Test passed 19 | ./reduction.out Starting... 20 | 21 | GPU Device 0: "Volta" with compute capability 7.0 22 | 23 | Using Device 0: Tesla V100-PCIE-32GB 24 | 25 | Reducing array of type int 26 | 27 | 33554432 elements 28 | 256 threads (max) 29 | 65536 blocks 30 | 31 | Reduction, Throughput = 161.0948 GB/s, Time = 0.00097 s, Size = 33554432 Elements, NumDevsUsed = 1, Workgroup = 256 32 | 33 | GPU result = -16317892 34 | CPU result = -16317892 35 | 36 | Test passed 37 | ./reduction.out Starting... 38 | 39 | GPU Device 0: "Volta" with compute capability 7.0 40 | 41 | Using Device 0: Tesla V100-PCIE-32GB 42 | 43 | Reducing array of type int 44 | 45 | 67108864 elements 46 | 256 threads (max) 47 | 131072 blocks 48 | 49 | Reduction, Throughput = 292.4071 GB/s, Time = 0.00103 s, Size = 67108864 Elements, NumDevsUsed = 1, Workgroup = 256 50 | 51 | GPU result = -32918757 52 | CPU result = -32918757 53 | 54 | Test passed 55 | ./reduction.out Starting... 56 | 57 | GPU Device 0: "Volta" with compute capability 7.0 58 | 59 | Using Device 0: Tesla V100-PCIE-32GB 60 | 61 | Reducing array of type int 62 | 63 | 134217728 elements 64 | 256 threads (max) 65 | 262144 blocks 66 | 67 | Reduction, Throughput = 459.6851 GB/s, Time = 0.00121 s, Size = 134217728 Elements, NumDevsUsed = 1, Workgroup = 256 68 | 69 | GPU result = -66248749 70 | CPU result = -66248749 71 | 72 | Test passed 73 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/test.sh: -------------------------------------------------------------------------------- 1 | ./reduction.out n=16777216 2 | ./reduction.out n=33554432 3 | ./reduction.out n=67108864 4 | ./reduction.out n=134217728 -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/README.md: -------------------------------------------------------------------------------- 1 | # reduction - CUDA Parallel Reduction 2 | 3 | ## Description 4 | 5 | A parallel sum reduction that computes the sum of a large arrays of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction. 
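The difference from the cuda_global variant is where the last levels of the reduction tree happen: instead of bouncing partial sums through shared or global memory, each warp folds its 32 values register-to-register with `__shfl_down_sync`. A minimal sketch of that inner step, assuming an int reduction as in result.txt below (illustrative, not the sample's exact kernel):
```
// Warp-level sum via shuffles: lane 0 ends up holding the warp's total.
__inline__ __device__ int warpReduceSum(int val)
{
    // Fold the warp in half log2(32) = 5 times; 0xffffffff means all lanes participate.
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

__global__ void reduceWarpShuffle(const int* in, int* out, int n)
{
    int sum = 0;
    // Grid-stride loop so any grid size covers all n elements.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        sum += in[i];
    sum = warpReduceSum(sum);
    if ((threadIdx.x & 31) == 0)   // one atomic per warp
        atomicAdd(out, sum);
}
```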
6 | 7 | ## Key Concepts 8 | 9 | Data-Parallel Algorithms, Performance Strategies 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows, MacOSX 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ## Prerequisites 26 | 27 | Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 28 | 29 | ## Build and Run 30 | 31 | ### Windows 32 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 33 | ``` 34 | *_vs<version>.sln - for Visual Studio <version> 35 | ``` 36 | Each individual sample has its own set of solution files in its directory: 37 | 38 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 39 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 40 | 41 | ### Linux 42 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 43 | ``` 44 | $ cd <sample_dir> 45 | $ make 46 | ``` 47 | The samples' makefiles can take advantage of certain options: 48 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 49 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
50 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
51 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 52 | * **dbg=1** - build with debug symbols 53 | ``` 54 | $ make dbg=1 55 | ``` 56 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 57 | ``` 58 | $ make SMS="50 60" 59 | ``` 60 | 61 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 62 | ``` 63 | $ make HOST_COMPILER=g++ 64 | ``` 65 | 66 | ### Mac 67 | The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: 68 | ``` 69 | $ cd 70 | $ make 71 | ``` 72 | 73 | The samples makefiles can take advantage of certain options: 74 | 75 | * **dbg=1** - build with debug symbols 76 | ``` 77 | $ make dbg=1 78 | ``` 79 | 80 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". 81 | ``` 82 | $ make SMS="A B ..." 83 | ``` 84 | 85 | * **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 86 | ``` 87 | $ make HOST_COMPILER=clang 88 | ``` 89 | 90 | ## References (for more details) 91 | 92 | -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/reduction.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | 29 | #ifndef __REDUCTION_H__ 30 | #define __REDUCTION_H__ 31 | 32 | template 33 | void reduce(int size, int threads, int blocks, 34 | int whichKernel, T *d_idata, T *d_odata); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/result.txt: -------------------------------------------------------------------------------- 1 | ./reduction.out Starting... 2 | 3 | GPU Device 0: "Volta" with compute capability 7.0 4 | 5 | Using Device 0: Tesla V100-PCIE-32GB 6 | 7 | Reducing array of type int 8 | 9 | 16777216 elements 10 | 256 threads (max) 11 | 32768 blocks 12 | 13 | Reduction, Throughput = 73.3799 GB/s, Time = 0.00091 s, Size = 16777216 Elements, NumDevsUsed = 1, Workgroup = 256 14 | 15 | GPU result = 2139353471 16 | CPU result = 2139353471 17 | 18 | Test passed 19 | ./reduction.out Starting... 20 | 21 | GPU Device 0: "Volta" with compute capability 7.0 22 | 23 | Using Device 0: Tesla V100-PCIE-32GB 24 | 25 | Reducing array of type int 26 | 27 | 33554432 elements 28 | 256 threads (max) 29 | 65536 blocks 30 | 31 | Reduction, Throughput = 161.9071 GB/s, Time = 0.00083 s, Size = 33554432 Elements, NumDevsUsed = 1, Workgroup = 256 32 | 33 | GPU result = -16317892 34 | CPU result = -16317892 35 | 36 | Test passed 37 | ./reduction.out Starting... 38 | 39 | GPU Device 0: "Volta" with compute capability 7.0 40 | 41 | Using Device 0: Tesla V100-PCIE-32GB 42 | 43 | Reducing array of type int 44 | 45 | 67108864 elements 46 | 256 threads (max) 47 | 131072 blocks 48 | 49 | Reduction, Throughput = 323.6463 GB/s, Time = 0.00083 s, Size = 67108864 Elements, NumDevsUsed = 1, Workgroup = 256 50 | 51 | GPU result = -32918757 52 | CPU result = -32918757 53 | 54 | Test passed 55 | ./reduction.out Starting... 
56 | 57 | GPU Device 0: "Volta" with compute capability 7.0 58 | 59 | Using Device 0: Tesla V100-PCIE-32GB 60 | 61 | Reducing array of type int 62 | 63 | 134217728 elements 64 | 256 threads (max) 65 | 262144 blocks 66 | 67 | Reduction, Throughput = 590.8185 GB/s, Time = 0.00091 s, Size = 134217728 Elements, NumDevsUsed = 1, Workgroup = 256 68 | 69 | GPU result = -66248749 70 | CPU result = -66248749 71 | 72 | Test passed 73 | -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/test.sh: -------------------------------------------------------------------------------- 1 | ./reduction.out n=16777216 2 | ./reduction.out n=33554432 3 | ./reduction.out n=67108864 4 | ./reduction.out n=134217728 -------------------------------------------------------------------------------- /TaskGraph/NsightEclipse.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | conjugateGradientCudaGraphs 5 | 6 | cudaStreamBeginCapture 7 | cudaStreamEndCapture 8 | cudaGraphCreate 9 | cudaGraphLaunch 10 | cudaGraphInstantiate 11 | cudaGraphExecDestroy 12 | cudaGraphDestroy 13 | 14 | 15 | whole 16 | 17 | ./ 18 | ../ 19 | ../../common/inc 20 | 21 | 22 | Linear Algebra 23 | CUBLAS Library 24 | CUSPARSE Library 25 | 26 | 27 | CUDA 28 | CUBLAS 29 | CUSPARSE 30 | Sparse Matrix 31 | 32 | 33 | cublas_static 34 | cublasLt_static 35 | cusparse_static 36 | culibos 37 | 38 | 39 | 40 | true 41 | conjugateGradientCudaGraphs.cu 42 | 43 | CUBLAS 44 | CUSPARSE 45 | 46 | 47 | 1:CUDA Advanced Topics 48 | 3:Linear Algebra 49 | 1:CUDA Graphs 50 | 51 | sm35 52 | sm37 53 | sm50 54 | sm52 55 | sm60 56 | sm61 57 | sm70 58 | sm72 59 | sm75 60 | sm80 61 | sm86 62 | 63 | 64 | x86_64 65 | linux 66 | 67 | 68 | windows7 69 | 70 | 71 | x86_64 72 | macosx 73 | 74 | 75 | arm 76 | 77 | 78 | ppc64le 79 | linux 80 | 81 | 82 | 83 | all 84 | 85 | Conjugate Gradient using Cuda Graphs 86 | exe 87 | 88 | -------------------------------------------------------------------------------- /TaskGraph/README.md: -------------------------------------------------------------------------------- 1 | # conjugateGradientCudaGraphs - Conjugate Gradient using Cuda Graphs 2 | 3 | ## Description 4 | 5 | This sample implements a conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs. 
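In outline, the sample records one solver iteration's kernel and library calls into a stream under capture, instantiates the captured graph once, and then replays it each iteration, using the runtime APIs listed under "CUDA APIs involved" below. A minimal sketch of that capture-then-replay pattern; the placeholder kernel stands in for the CUBLAS/CUSPARSE calls of the real CG solver:
```
// Sketch: record work into a graph once, replay it many times.
// cg_iteration is a stand-in for one CG iteration, not the sample's code.
#include <cuda_runtime.h>

__global__ void cg_iteration(float* x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 0.5f;   // placeholder work
}

void run_with_graph(float* d_x, int n, int iters, cudaStream_t stream)
{
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;

    // Work issued to 'stream' between Begin/EndCapture is recorded, not executed.
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    cg_iteration<<<(n + 255) / 256, 256, 0, stream>>>(d_x, n);
    cudaStreamEndCapture(stream, &graph);

    // Instantiate once (CUDA 10/11 signature), then relaunch with low per-launch overhead.
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    for (int i = 0; i < iters; i++)
        cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
}
```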
6 | 7 | ## Key Concepts 8 | 9 | Linear Algebra, CUBLAS Library, CUSPARSE Library 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) 26 | cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, cudaGraphInstantiate, cudaGraphExecDestroy, cudaGraphDestroy 27 | 28 | ## Dependencies needed to build/run 29 | [CUBLAS](../../README.md#cublas), [CUSPARSE](../../README.md#cusparse) 30 | 31 | ## Prerequisites 32 | 33 | Download and install the [CUDA Toolkit 11.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 34 | Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. 35 | 36 | ## Build and Run 37 | 38 | ### Windows 39 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 40 | ``` 41 | *_vs<version>.sln - for Visual Studio <version> 42 | ``` 43 | Each individual sample has its own set of solution files in its directory: 44 | 45 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 46 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 47 | 48 | ### Linux 49 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 50 | ``` 51 | $ cd <sample_dir> 52 | $ make 53 | ``` 54 | The samples' makefiles can take advantage of certain options: 55 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 56 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
57 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
58 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 59 | * **dbg=1** - build with debug symbols 60 | ``` 61 | $ make dbg=1 62 | ``` 63 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 64 | ``` 65 | $ make SMS="50 60" 66 | ``` 67 | 68 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 69 | ``` 70 | $ make HOST_COMPILER=g++ 71 | ``` 72 | 73 | ## References (for more details) 74 | 75 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2015.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 14.00 3 | # Visual Studio 2015 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2015.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | conjugateGradientCudaGraphs_vs2015 19 | conjugateGradientCudaGraphs 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v140 27 | 28 | 29 | true 30 | 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(Platform)/$(Configuration)/ 44 | $(IncludePath) 45 | AllRules.ruleset 46 | 47 | 48 | 49 | 50 | ../../bin/win64/$(Configuration)/ 51 | 52 | 53 | 54 | Level3 55 | WIN32;_MBCS;%(PreprocessorDefinitions) 56 | ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); 57 | 58 | 59 | Console 60 | cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 61 | $(CudaToolkitLibDir); 62 | $(OutDir)/conjugateGradientCudaGraphs.exe 63 | 64 | 65 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 66 | -Xcompiler "/wd 4819" %(AdditionalOptions) 67 | ./;../../Common 68 | WIN32 69 | 70 | 71 | 72 | 73 | Disabled 74 | MultiThreadedDebug 75 | 76 | 77 | true 78 | Default 79 | 80 | 81 | MTd 82 | 64 83 
| 84 | 85 | 86 | 87 | MaxSpeed 88 | MultiThreaded 89 | 90 | 91 | false 92 | UseLinkTimeCodeGeneration 93 | 94 | 95 | MT 96 | 64 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2017.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2017 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2019.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2019 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2019.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | conjugateGradientCudaGraphs_vs2019 19 | conjugateGradientCudaGraphs 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v142 27 | 10.0 28 | 29 | 30 | true 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | $(Platform)/$(Configuration)/ 45 | $(IncludePath) 46 | AllRules.ruleset 47 | 48 | 49 | 50 | 51 | ../../bin/win64/$(Configuration)/ 52 | 53 | 54 | 55 | Level3 56 | WIN32;_MBCS;%(PreprocessorDefinitions) 57 | ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); 58 | 59 | 60 | Console 61 | 
cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 62 | $(CudaToolkitLibDir); 63 | $(OutDir)/conjugateGradientCudaGraphs.exe 64 | 65 | 66 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 67 | -Xcompiler "/wd 4819" %(AdditionalOptions) 68 | ./;../../Common 69 | WIN32 70 | 71 | 72 | 73 | 74 | Disabled 75 | MultiThreadedDebug 76 | 77 | 78 | true 79 | Default 80 | 81 | 82 | MTd 83 | 64 84 | 85 | 86 | 87 | 88 | MaxSpeed 89 | MultiThreaded 90 | 91 | 92 | false 93 | UseLinkTimeCodeGeneration 94 | 95 | 96 | MT 97 | 64 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /UniMem/LowAccessDensityTest.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | extern void LowAccessDensityTest_cuda(REAL* x, REAL* y, long int n, REAL a, int stride); 11 | extern void LowAccessDensityTest_cuda_unified(REAL* x, REAL* y, long int n, REAL a, int stride); 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /UniMem/LowAccessDensityTest_cuda.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test input for Accelerator directives 7 | // simplest scalar*vector operations 8 | // Liao 1/15/2013 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "LowAccessDensityTest.h" 15 | 16 | 17 | double read_timer_ms() { 18 | struct timeb tm; 19 | ftime(&tm); 20 | return (double) tm.time * 1000.0 + (double) tm.millitm; 21 | } 22 | 23 | /* change this to do saxpy or daxpy : single precision or double precision*/ 24 | #define REAL float 25 | #define VEC_LEN 102400000//use a fixed number for now 26 | #define STRIDE 1024 27 | 28 | /* zero out the entire vector */ 29 | void zero(REAL *A, long int n) 30 | { 31 | int i; 32 | for (i = 0; i < n; i++) { 33 | A[i] = 0.0; 34 | } 35 | } 36 | 37 | /* initialize a vector with random floating point numbers */ 38 | void init(REAL *A, long int n) 39 | { 40 | int i; 41 | for (i = 0; i < n; i++) { 42 | A[i] = (double)drand48(); 43 | } 44 | } 45 | 46 | __global__ 47 | void 48 | LowAccessDensityTest_cudakernel(REAL* x, REAL* y, int n, REAL a, int stride) 49 | { 50 | int i = blockDim.x * blockIdx.x + threadIdx.x; 51 | if (i < (n/stride)) y[i] = a*x[i*stride]; 52 | } 53 | 54 | void LowAccessDensityTest_cuda_discrete_memory(REAL* x, REAL* y, long int n, REAL a, int stride) { 55 | REAL *d_x, *d_y; 56 | cudaMalloc(&d_x, n*sizeof(REAL)); 57 | cudaMalloc(&d_y, (n/stride)*sizeof(REAL)); 58 | 59 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 60 | LowAccessDensityTest_cudakernel<<<(n+255)/256, 256>>>(d_x, d_y, n, a, stride); 61 | cudaDeviceSynchronize(); 62 | //cudaMemcpy(y, d_y, (n/stride)*sizeof(REAL), cudaMemcpyDeviceToHost); 63 | 64 | cudaFree(d_x); 65 | cudaFree(d_y); 66 | } 67 | 68 | /* return the measured time */ 69 | double LowAccessDensityTest_cuda_unified_memory(REAL* x, REAL* y, long int n, REAL a, int stride) { 70 | 71 | double elapsed1 = read_timer_ms(); 72 | REAL *x2; 73 | cudaMallocManaged(&x2, n*sizeof(REAL)); 74 | elapsed1 = (read_timer_ms() - elapsed1); 75 | 76 | //initial unified memory, should not count time here 77 | memcpy(x2, x, n*sizeof(REAL)); 78 | 79 | double elapsed2 = read_timer_ms(); 80 | REAL *d_y; 81 | cudaMalloc(&d_y, (n/stride)*sizeof(REAL)); 82 | 83 | LowAccessDensityTest_cudakernel<<<(n+255)/256, 256>>>(x2, d_y, n, a, stride); 84 | cudaDeviceSynchronize(); 85 | elapsed2 = (read_timer_ms() - elapsed2); 86 | //cudaMemcpy(y, d_y, (n/stride)*sizeof(REAL), cudaMemcpyDeviceToHost); 87 | 88 | cudaFree(x2); 89 | cudaFree(d_y); 90 | 91 | return elapsed1 + elapsed2; 92 | } 93 | 94 | 95 | /*serial version */ 96 | void serial(REAL* x, REAL* y, long n, REAL a, int stride) { 97 | int i; 98 | for (i = 0; i < (n/stride); i++) 99 | { 100 | y[i] = a * x[i*stride]; 101 | } 102 | } 103 | 104 | /* compare two arrays and return percentage of difference */ 105 | REAL check(REAL*A, REAL*B, long int n) 106 | { 107 | int i; 108 | REAL diffsum =0.0, sum = 0.0; 109 | for (i = 0; i < n; i++) { 110 | diffsum += fabs(A[i] - B[i]); 111 | sum += fabs(B[i]); 112 | } 113 | return diffsum/sum; 114 | } 115 | 116 | int main(int argc, char *argv[]) 117 | { 118 | long int n; 119 | int stride = STRIDE; 120 | REAL *y_cuda, *y, *x, *y_cuda_unified; 121 | REAL a = 123.456; 122 | 123 | n = VEC_LEN; 124 | fprintf(stderr, "Usage: Low Access Test \n"); 125 | if (argc >= 2) { 126 | stride = atoi(argv[1]); 127 | } 128 | if (argc >= 3) { 129 | n = 
atoi(argv[2]); 130 | } 131 | y_cuda = (REAL *) malloc((n/stride) * sizeof(REAL)); 132 | y_cuda_unified = (REAL *) malloc((n/stride) * sizeof(REAL)); 133 | y = (REAL *) malloc((n/stride) * sizeof(REAL)); 134 | x = (REAL *) malloc(n * sizeof(REAL)); 135 | 136 | srand48(1<<12); 137 | init(x, n); 138 | 139 | serial(x, y, n, a, stride); 140 | 141 | int i; 142 | int num_runs = 100; 143 | /* cuda version */ 144 | //warming up 145 | LowAccessDensityTest_cuda_discrete_memory(x, y_cuda, n, a, stride); 146 | 147 | double elapsed = read_timer_ms(); 148 | for (i=0; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define REAL double 22 | #define VEC_LEN 102400000//use a fixed number for now 23 | #define STRIDE 1024 24 | 25 | /* zero out the entire vector */ 26 | void zero(REAL *A, long int n) 27 | { 28 | int i; 29 | for (i = 0; i < n; i++) { 30 | A[i] = 0.0; 31 | } 32 | } 33 | 34 | /* initialize a vector with random floating point numbers */ 35 | void init(REAL *A, long int n) 36 | { 37 | int i; 38 | for (i = 0; i < n; i++) { 39 | A[i] = (double)drand48(); 40 | } 41 | } 42 | 43 | /*serial version */ 44 | void serial_kernel(REAL* x, REAL* y, long n, REAL a, int stride) { 45 | int i; 46 | for (i = 0; i < n; i+=stride) 47 | { 48 | y[i] += a * x[i]; 49 | } 50 | } 51 | 52 | /*omp version */ 53 | void omp_kernel(REAL* x, REAL* y, long n, REAL a, int stride) { 54 | int i; 55 | #pragma omp parallel for shared(x,y,a,n,stride) private(i) 56 | for (i = 0; i < n; i+=stride) 57 | { 58 | y[i] += a * x[i]; 59 | } 60 | } 61 | 62 | /*omp gpu version */ 63 | void omp_gpu_kernel(REAL* x, REAL* y, long n, REAL a, int stride) { 64 | int i; 65 | //#pragma omp target teams distribute parallel for map(tofrom:y) map(to:x,a,n,stride) 66 | #pragma omp target map(to:a,n,x[0:n]) map(tofrom:y[0:n]) 67 | #pragma parallel for 68 | for (i = 0; i < n; i+=stride) 69 | { 70 | y[i] += a * x[i]; 71 | } 72 | } 73 | 74 | 75 | 76 | /* compare two arrays and return percentage of difference */ 77 | REAL check(REAL*A, REAL*B, long int n) 78 | { 79 | int i; 80 | REAL diffRatioSum= 0.0; 81 | for (i = 0; i < n; i++) { 82 | REAL diff = fabs(A[i] - B[i]); 83 | if (fabs(B[i])==0.0) 84 | diffRatioSum+=0.0; 85 | else 86 | diffRatioSum += diff/fabs(B[i]); 87 | } 88 | return diffRatioSum/n; 89 | } 90 | 91 | int main(int argc, char *argv[]) 92 | { 93 | long int n; 94 | int stride = STRIDE; 95 | REAL *y_omp, *y, *x; 96 | REAL a = 123.456; 97 | 98 | n = VEC_LEN; 99 | fprintf(stderr, "Usage: %s [vec_len]\n", argv[0]); 100 | if (argc >= 2) { 101 | stride = atoi(argv[1]); 102 | } 103 | 104 | if (argc >= 3) { 105 | n = atoi(argv[2]); 106 | } 107 | printf("vec len(n_=%ld, stride=%d\n", n, stride); 108 | 109 | // same input x 110 | x = (REAL *) malloc(n * sizeof(REAL)); 111 | if (x==NULL) 112 | { 113 | fprintf(stderr, "malloc returns NULL: out of memory\n"); 114 | abort(); 115 | } 116 | srand48(time(NULL)); 117 | init(x, n); 118 | 119 | // output for serial and omp version 120 | y = (REAL *) malloc(n * sizeof(REAL)); 121 | if (y==NULL) 122 | { 123 | fprintf(stderr, "y malloc returns NULL: out of memory\n"); 124 | abort(); 125 | } 126 | 127 | y_omp = (REAL *) malloc(n * sizeof(REAL)); 128 | if (y_omp==NULL) 129 | { 130 | fprintf(stderr, "y_omp malloc returns NULL: out of memory\n"); 131 | abort(); 132 | } 133 | 134 
| REAL* y_omp_gpu = (REAL *) malloc(n * sizeof(REAL)); 135 | if (y_omp_gpu==NULL) 136 | { 137 | fprintf(stderr, "y_omp malloc returns NULL: out of memory\n"); 138 | abort(); 139 | } 140 | 141 | 142 | // serial version as a reference 143 | serial_kernel(x, y, n, a, stride); 144 | 145 | int i; 146 | int num_runs = 100; 147 | 148 | /* OMP version */ 149 | double elapsed = read_timer_ms(); 150 | for (i=0; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "warpDivergenceTest.h" 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define REAL float 22 | #define VEC_LEN 32000 //use a fixed number for now 23 | /* zero out the entire vector */ 24 | void zero(REAL *A, int n) 25 | { 26 | int i; 27 | for (i = 0; i < n; i++) { 28 | A[i] = 0.0; 29 | } 30 | } 31 | 32 | /* initialize a vector with random floating point numbers */ 33 | void init(REAL *A, int n) 34 | { 35 | int i; 36 | for (i = 0; i < n; i++) { 37 | A[i] = (float)drand48(); 38 | } 39 | } 40 | 41 | /*serial version */ 42 | void warpDivergenceSerial(REAL* x, REAL* y, REAL* z, int n) { 43 | int i; 44 | for (i = 0; i < n; ++i) 45 | { 46 | if(i%2 == 0) z[i] = 2 * x[i] + 3 * y[i]; 47 | else z[i] = 3 * x[i] + 2 * y[i]; 48 | } 49 | } 50 | 51 | void NoWarpDivergenceSerial(REAL* x, REAL* y, REAL* z, int n) { 52 | int i; 53 | for (i = 0; i < n; ++i) 54 | { 55 | if((i/32)%2 ==0 ) z[i] = 2 * x[i] + 3 * y[i]; 56 | else z[i] = 3 * x[i] + 2 * y[i]; 57 | } 58 | } 59 | 60 | /* compare two arrays and return percentage of difference */ 61 | REAL check(REAL*A, REAL*B, int n) 62 | { 63 | int i; 64 | REAL diffsum =0.0, sum = 0.0; 65 | for (i = 0; i < n; i++) { 66 | diffsum += fabs(A[i] - B[i]); 67 | sum += fabs(B[i]); 68 | } 69 | return diffsum/sum; 70 | } 71 | 72 | int main(int argc, char *argv[]) 73 | { 74 | int n; 75 | REAL *x, *y, *warp_divergence, *no_warp_divergence, *warp_divergence_serial, *no_warp_divergence_serial; 76 | 77 | n = VEC_LEN; 78 | fprintf(stderr, "Usage: warpDivergenceTest \n"); 79 | if (argc >= 2) { 80 | n = atoi(argv[1]); 81 | } 82 | x = (REAL *) malloc(n * sizeof(REAL)); 83 | y = (REAL *) malloc(n * sizeof(REAL)); 84 | warp_divergence = (REAL *) malloc(n * sizeof(REAL)); 85 | no_warp_divergence = (REAL *) malloc(n * sizeof(REAL)); 86 | 87 | warp_divergence_serial = (REAL *) malloc(n * sizeof(REAL)); 88 | no_warp_divergence_serial = (REAL *) malloc(n * sizeof(REAL)); 89 | 90 | 91 | srand48(1<<12); 92 | init(x, n); 93 | //init(y, n); 94 | memcpy(y, x, n*sizeof(REAL)); 95 | 96 | 97 | int i; 98 | int num_runs = 10; 99 | 100 | warpDivergenceSerial(x,y,warp_divergence_serial,n); 101 | NoWarpDivergenceSerial(x,y,no_warp_divergence_serial,n); 102 | /* cuda version */ 103 | double elapsed = read_timer_ms(); 104 | for (i=0; i>> (d_x, d_y, d_warp_divergence); 50 | cudaDeviceSynchronize(); 51 | 52 | warpDivergence<<<(n+255)/256, 256>>>(d_x, d_y, d_warp_divergence); 53 | cudaDeviceSynchronize(); 54 | 55 | noWarpDivergence<<<(n+255)/256, 256>>>(d_x, d_y, d_no_warp_divergence); 56 | cudaDeviceSynchronize(); 57 | 58 | cudaMemcpy(warp_divergence, d_warp_divergence, n*sizeof(REAL), cudaMemcpyDeviceToHost); 59 | cudaMemcpy(no_warp_divergence, d_no_warp_divergence, n*sizeof(REAL), cudaMemcpyDeviceToHost); 60 | 61 | 62 | cudaFree(d_x); 63 | cudaFree(d_y); 64 | 65 | cudaFree(d_warp_divergence); 66 | cudaFree(d_no_warp_divergence); 67 | 68 | 69 | } 70 | 
--------------------------------------------------------------------------------
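The definitions of the two device kernels timed by WarpDivRedux were lost in the listing above; only their launches survive. The sketch below is consistent with the surviving serial references: warpDivergenceSerial branches per thread on i % 2, so both sides of the branch execute serially within every warp, while NoWarpDivergenceSerial branches per warp on (i/32) % 2, so each warp takes a single path. No bounds check appears because the harness's default VEC_LEN of 32000 is a multiple of the 256-thread block:
```
// Sketch of the lost kernels, mirroring the serial versions above.
#define REAL float

__global__ void warpDivergence(REAL* x, REAL* y, REAL* z)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // The branch flips every thread: a warp executes both paths serially.
    if (i % 2 == 0) z[i] = 2 * x[i] + 3 * y[i];
    else            z[i] = 3 * x[i] + 2 * y[i];
}

__global__ void noWarpDivergence(REAL* x, REAL* y, REAL* z)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // The branch flips every 32 threads: each warp takes exactly one path.
    if ((i / 32) % 2 == 0) z[i] = 2 * x[i] + 3 * y[i];
    else                   z[i] = 3 * x[i] + 2 * y[i];
}
```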