├── BankRedux ├── Makefile ├── sum.h ├── sum_cuda.c ├── sum_cuda.output.carina.txt ├── sum_cudakernel.cu └── test.sh ├── CoMem_AXPY ├── Makefile ├── axpy.h ├── axpy_cuda.c ├── axpy_cuda.output.carina.txt ├── axpy_cudakernel.cu └── test.sh ├── CoMem_SpMM ├── Makefile ├── SpMM.h ├── SpMM_cuda.c ├── SpMM_cuda.output.carina.txt ├── SpMM_cuda.output.fornax.txt └── SpMM_cudakernel.cu ├── Common ├── FreeImage │ ├── freeimage-license.txt │ └── include │ │ └── FreeImage.h ├── README.md ├── UtilNPP │ ├── Exceptions.h │ ├── Image.h │ ├── ImageAllocatorsCPU.h │ ├── ImageAllocatorsNPP.h │ ├── ImageIO.h │ ├── ImagePacked.h │ ├── ImagesCPU.h │ ├── ImagesNPP.h │ ├── Pixel.h │ ├── Signal.h │ ├── SignalAllocatorsCPU.h │ ├── SignalAllocatorsNPP.h │ ├── SignalsCPU.h │ └── SignalsNPP.h ├── drvapi_error_string.h ├── dynlink_d3d11.h ├── exception.h ├── helper_cuda.h ├── helper_cuda_drvapi.h ├── helper_cusolver.h ├── helper_functions.h ├── helper_image.h ├── helper_math.h ├── helper_multiprocess.cpp ├── helper_multiprocess.h ├── helper_nvJPEG.hxx ├── helper_string.h ├── helper_timer.h ├── nvrtc_helper.h ├── rendercheck_d3d11.cpp └── rendercheck_d3d11.h ├── Conkernels ├── Makefile ├── Makefile_serialized ├── NsightEclipse.xml ├── README.md ├── concurrentKernels.cu ├── concurrentKernels_vs2015.sln ├── concurrentKernels_vs2015.vcxproj ├── concurrentKernels_vs2017.sln ├── concurrentKernels_vs2017.vcxproj ├── concurrentKernels_vs2019.sln └── concurrentKernels_vs2019.vcxproj ├── DynParallel ├── .gitignore ├── Dynamic_Parallelism.cu ├── Makefile ├── Non_Dynamic_Parallelism.cu ├── include │ ├── png.h │ ├── pngconf.h │ ├── pnglibconf.h │ ├── zconf.h │ └── zlib.h └── lib │ ├── libpng.lib │ ├── libpngd.lib │ ├── zlibstat.lib │ └── zlibstatd.lib ├── GSOverlap ├── Makefile ├── NsightEclipse.xml ├── README.md ├── globalToShmemAsyncCopy.cu ├── globalToShmemAsyncCopy_vs2015.sln ├── globalToShmemAsyncCopy_vs2015.vcxproj ├── globalToShmemAsyncCopy_vs2017.sln ├── globalToShmemAsyncCopy_vs2017.vcxproj ├── globalToShmemAsyncCopy_vs2019.sln └── globalToShmemAsyncCopy_vs2019.vcxproj ├── HDOverlap ├── Makefile ├── axpy_cudakernel.cu ├── results.txt └── test.sh ├── LICENSE_BSD.txt ├── MemAlign ├── Makefile ├── axpy.h ├── axpy_cuda.c ├── axpy_cuda.output.carina.txt ├── axpy_cudakernel.cu └── test.sh ├── MiniTransfer_SpMV ├── Makefile ├── SpMV.h ├── SpMV_cuda.c ├── SpMV_cuda.output.carina.txt ├── SpMV_cudakernel.cu └── test.sh ├── NOTICE ├── README.md ├── ReadOnlyMem_1D_Texture ├── Makefile ├── axpy.h ├── axpy_cuda.c ├── axpy_cuda.output.carina.txt ├── axpy_cuda.output.fornax.txt ├── axpy_cudakernel.cu └── test.sh ├── ReadOnlyMem_2D_Texture ├── Makefile ├── matadd.output.carina.txt ├── matadd.output.fornax.txt ├── matadd_2D.h ├── matadd_2D_cuda.c ├── matadd_2D_cudakernel.cu └── test.sh ├── Shmem ├── Makefile ├── mm_kernel.cu ├── mm_omp_cuda.c ├── mm_omp_cuda.h └── testResults.txt ├── Shuffle ├── cuda_global │ ├── Makefile │ ├── README.md │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel.cu │ ├── result.txt │ └── test.sh └── cuda_shuffle │ ├── Makefile │ ├── README.md │ ├── reduction.cpp │ ├── reduction.h │ ├── reduction_kernel.cu │ ├── result.txt │ └── test.sh ├── TaskGraph ├── Makefile ├── NsightEclipse.xml ├── README.md ├── conjugateGradientCudaGraphs.cu ├── conjugateGradientCudaGraphs_vs2015.sln ├── conjugateGradientCudaGraphs_vs2015.vcxproj ├── conjugateGradientCudaGraphs_vs2017.sln ├── conjugateGradientCudaGraphs_vs2017.vcxproj ├── conjugateGradientCudaGraphs_vs2019.sln └── conjugateGradientCudaGraphs_vs2019.vcxproj ├── 
UniMem ├── LowAccessDensityTest.h ├── LowAccessDensityTest_cuda.cu ├── LowAccessDensityTest_cuda.output.carina.txt ├── LowAccessDensityTest_cuda_fixed_access_time.output.carina.txt ├── LowAccessDensityTest_omp.c ├── Makefile ├── test.sh └── test2.sh └── WarpDivRedux ├── Makefile ├── test.sh ├── warpDivergenceTest.h ├── warpDivergenceTest_cuda.c ├── warpDivergenceTest_cuda.output.carina.txt └── warpDivergenceTest_cudakernel.cu /BankRedux/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o sum_cuda sum_cuda.c sum_cudakernel.cu 3 | -------------------------------------------------------------------------------- /BankRedux/sum.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | #define ThreadsPerBlock 256 8 | #define VEC_LEN 1024000 //use a fixed number for now 9 | 10 | #ifdef __cplusplus 11 | extern "C" { 12 | #endif 13 | extern void sum_cuda(int n, REAL *x, REAL *result); 14 | #ifdef __cplusplus 15 | } 16 | #endif 17 | -------------------------------------------------------------------------------- /BankRedux/sum_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | // Experimental test for bank conflict
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <math.h>
11 | #include <sys/timeb.h>
12 | #include "sum.h"
13 |
14 | double read_timer_ms() {
15 |     struct timeb tm;
16 |     ftime(&tm);
17 |     return (double) tm.time * 1000.0 + (double) tm.millitm;
18 | }
19 |
20 | /* change this to do saxpy or daxpy : single precision or double precision */
21 | #define REAL float
22 |
23 | //#define ThreadsPerBlock 256
24 |
25 | /* zero out the entire vector */
26 | void zero(REAL *A, int n)
27 | {
28 |     int i;
29 |     for (i = 0; i < n; i++) {
30 |         A[i] = 0.0;
31 |     }
32 | }
33 |
34 | /* initialize a vector with random floating point numbers */
35 | void init(REAL *A, int n)
36 | {
37 |     int i;
38 |     for (i = 0; i < n; i++) {
39 |         A[i] = (float)drand48();
40 |     }
41 | }
42 |
43 | /* serial version */
44 | float sum(int N, float *numbers) {
45 |     float sum = 0;
46 |
47 |     for (int i = 0; i < N; i++) {
48 |         sum += numbers[i];
49 |     }
50 |     return sum;
51 | }
52 |
53 | int main(int argc, char *argv[])
54 | {
55 |     int n;
56 |     REAL *x, *result_cuda;
57 |
58 |     n = VEC_LEN;
59 |     fprintf(stderr, "Usage: sum <n>\n");
60 |     if (argc >= 2) {
61 |         n = atoi(argv[1]);
62 |     }
63 |
64 |     x = (REAL *) malloc(n * sizeof(REAL));
65 |     result_cuda = (REAL*)malloc(((VEC_LEN + ThreadsPerBlock - 1) / ThreadsPerBlock) * sizeof(REAL));
66 |
67 |     srand48(1<<12);
68 |     init(x, n);
69 |
70 |     volatile float answer = 0;
71 |     answer = sum(n, x);
72 |
73 |     int i;
74 |     int num_runs = 10;
75 |     /* cuda version */
76 |     double elapsed = read_timer_ms();
77 |     for (i = 0; i < num_runs; i++)
78 |         sum_cuda(n, x, result_cuda);
79 |     elapsed = (read_timer_ms() - elapsed) / num_runs;
80 |
81 |     /* add up the per-block partial sums produced on the GPU */
82 |     REAL result = 0.0;
83 |     for (i = 0; i < (n + ThreadsPerBlock - 1) / ThreadsPerBlock; i++) {
84 |         result += result_cuda[i];
85 |     }
86 |
87 |     printf("sum(%d): serial result: %g, cuda result: %g, avg time: %0.2f ms\n", n, answer, result, elapsed);
88 |
89 |     free(x);
90 |     free(result_cuda);
91 |     return 0;
92 | }
--------------------------------------------------------------------------------
/BankRedux/sum_cudakernel.cu:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #include "sum.h"
7 |
8 | __global__ void sum_warmingup(const REAL *x, REAL *result) {
9 |     __shared__ REAL cache[ThreadsPerBlock];
10 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
11 |     int cacheIndex = threadIdx.x;
12 |     cache[cacheIndex] = x[tid];
13 |     __syncthreads();
14 |     for (int i = blockDim.x / 2; i > 0; i /= 2) {
15 |         if (cacheIndex < i) {
16 |             cache[cacheIndex] += cache[cacheIndex + i];
17 |         }
18 |         __syncthreads();
19 |     }
20 |     if (cacheIndex == 0)
21 |         result[blockIdx.x] = cache[cacheIndex];
22 | }
23 |
24 | __global__ void sum_cudakernel(const REAL *x, REAL *result) {
25 |     __shared__ REAL cache[ThreadsPerBlock];
26 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
27 |     int cacheIndex = threadIdx.x;
28 |     cache[cacheIndex] = x[tid];
29 |     __syncthreads();
30 |     for (int i = blockDim.x / 2; i > 0; i /= 2) {
31 |         if (cacheIndex < i) {
32 |             cache[cacheIndex] += cache[cacheIndex + i];
33 |         }
34 |         __syncthreads();
35 |     }
36 |     if (cacheIndex == 0)
37 |         result[blockIdx.x] = cache[cacheIndex];
38 | }
39 |
40 | __global__ void sum_cudakernel_bc(const REAL *x, REAL *result) {
41 |     __shared__ REAL cache[ThreadsPerBlock];
42 |     int tid = blockIdx.x * blockDim.x + threadIdx.x;
43 |     int cacheIndex = threadIdx.x;
44 |     cache[cacheIndex] = x[tid];
45 |     __syncthreads();
46 |     for (int i = 1; i < blockDim.x; i *= 2) {
47 |         int index = 2 * i * cacheIndex;
48 |         if (index < blockDim.x) {
49 |             cache[index] += cache[index + i];
50 |         }
51 |         __syncthreads();
52 |     }
53 |     if (cacheIndex == 0)
54 |         result[blockIdx.x] = cache[cacheIndex];
55 | }
56 |
57 | void sum_cuda(int n, REAL *x, REAL *result) {
58 |     REAL *d_x;
59 |     REAL *d_result;
60 |     cudaMalloc(&d_x, n*sizeof(REAL));
61 |     cudaMalloc(&d_result, ((n+255)/256) * sizeof(REAL));
62 |
63 |     cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice);
64 |
65 |     sum_warmingup<<<(n+255)/256, 256>>>(d_x, d_result);
66 |     cudaDeviceSynchronize();
67 |     sum_cudakernel<<<(n+255)/256, 256>>>(d_x, d_result);
68 |     cudaDeviceSynchronize();
69 |     sum_cudakernel_bc<<<(n+255)/256, 256>>>(d_x, d_result);
70 |     cudaDeviceSynchronize();
71 |
72 |     cudaMemcpy(result, d_result, ((n+255)/256) * sizeof(REAL), cudaMemcpyDeviceToHost);
73 |     cudaFree(d_x);
74 |     cudaFree(d_result);
75 | }
76 |
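The two reduction kernels above differ only in addressing. In sum_cudakernel, active threads read cache[cacheIndex + i], so a warp touches consecutive shared-memory words that map to distinct banks. In sum_cudakernel_bc, the stride-2*i indexing makes the active threads of a warp hit only a few banks, and those accesses serialize. A small standalone sketch (illustrative, not from this repository) of the bank mapping, assuming the usual 32 banks of 4-byte words:

    #include <stdio.h>

    /* bank of a 4-byte shared-memory word, assuming 32 banks */
    static int bank(int wordIndex) { return wordIndex % 32; }

    int main(void) {
        /* sequential addressing (sum_cudakernel), step i = 16:
           thread t does cache[t] += cache[t + 16]  -> banks distinct across threads */
        for (int t = 0; t < 4; t++)
            printf("seq  thread %2d -> banks %2d, %2d\n", t, bank(t), bank(t + 16));
        /* interleaved addressing (sum_cudakernel_bc), step i = 16:
           thread t does cache[32*t] += cache[32*t + 16] -> banks 0 and 16 only */
        for (int t = 0; t < 4; t++)
            printf("intl thread %2d -> banks %2d, %2d\n", t, bank(32 * t), bank(32 * t + 16));
        return 0;
    }

With interleaved addressing every active thread lands in bank 0 or bank 16, so the hardware replays the conflicting accesses one bank slot at a time.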
--------------------------------------------------------------------------------
/BankRedux/test.sh:
--------------------------------------------------------------------------------
1 | nvprof ./sum_cuda 102400
2 | nvprof ./sum_cuda 204800
3 | nvprof ./sum_cuda 409600
4 | nvprof ./sum_cuda 1024000
--------------------------------------------------------------------------------
/CoMem_AXPY/Makefile:
--------------------------------------------------------------------------------
1 | default:
2 | 	nvcc -o axpy_cuda axpy_cuda.c axpy_cudakernel.cu
--------------------------------------------------------------------------------
/CoMem_AXPY/axpy.h:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #define REAL double
7 |
8 | #ifdef __cplusplus
9 | extern "C" {
10 | #endif
11 | extern void axpy_cuda(REAL *x, REAL * y, int n, REAL a);
12 | #ifdef __cplusplus
13 | }
14 | #endif
--------------------------------------------------------------------------------
/CoMem_AXPY/axpy_cuda.c:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | // Experimental tests for coalesced memory access and uncoalesced memory access
7 | #include <stdio.h>
8 | #include <stdlib.h>
9 | #include <string.h>
10 | #include <math.h>
11 | #include <sys/timeb.h>
12 | #include "axpy.h"
13 |
14 | double read_timer_ms() {
15 |     struct timeb tm;
16 |     ftime(&tm);
17 |     return (double) tm.time * 1000.0 + (double) tm.millitm;
18 | }
19 |
20 | /* change this to do saxpy or daxpy : single precision or double precision */
21 | #define REAL double
22 | #define VEC_LEN 1024000 //use a fixed number for now
23 | /* zero out the entire vector */
24 | void zero(REAL *A, int n)
25 | {
26 |     int i;
27 |     for (i = 0; i < n; i++) {
28 |         A[i] = 0.0;
29 |     }
30 | }
31 |
32 | /* initialize a vector with random floating point numbers */
33 | void init(REAL *A, int n)
34 | {
35 |     int i;
36 |     for (i = 0; i < n; i++) {
37 |         A[i] = (double)drand48();
38 |     }
39 | }
40 |
41 | /* serial version */
42 | void axpy(REAL* x, REAL* y, long n, REAL a) {
43 |     int i;
44 |     for (i = 0; i < n; ++i)
45 |     {
46 |         y[i] += a * x[i];
47 |     }
48 | }
49 |
50 | /* compare two arrays and return percentage of difference */
51 | REAL check(REAL *A, REAL *B, int n)
52 | {
53 |     int i;
54 |     REAL diffsum = 0.0, sum = 0.0;
55 |     for (i = 0; i < n; i++) {
56 |         diffsum += fabs(A[i] - B[i]);
57 |         sum += fabs(B[i]);
58 |     }
59 |     return diffsum/sum;
60 | }
61 |
62 | int main(int argc, char *argv[])
63 | {
64 |     int n;
65 |     REAL *y_cuda, *y, *x;
66 |     REAL a = 123.456;
67 |
68 |     n = VEC_LEN;
69 |     fprintf(stderr, "Usage: axpy <n>\n");
70 |     if (argc >= 2) {
71 |         n = atoi(argv[1]);
72 |     }
73 |     y_cuda = (REAL *) malloc(n * sizeof(REAL));
74 |     y = (REAL *) malloc(n * sizeof(REAL));
75 |     x = (REAL *) malloc(n * sizeof(REAL));
76 |
77 |     srand48(1<<12);
78 |     init(x, n);
79 |     init(y_cuda, n);
80 |     memcpy(y, y_cuda, n*sizeof(REAL));
81 |
82 |     axpy(x, y, n, a);
83 |
84 |     int i;
85 |     int num_runs = 10;
86 |     /* cuda version */
87 |     double elapsed = read_timer_ms();
88 |     for (i = 0; i < num_runs; i++)
89 |         axpy_cuda(x, y_cuda, n, a);
90 |     elapsed = (read_timer_ms() - elapsed) / num_runs;
91 |
92 |     REAL checkresult = check(y_cuda, y, n);
93 |     printf("axpy(%d): check: %g, avg time: %0.2f ms\n", n, checkresult, elapsed);
94 |
95 |     free(y_cuda);
96 |     free(y);
97 |     free(x);
98 |     return 0;
99 | }
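In the kernel file that follows, the same AXPY loop is distributed over threads in two contrasting ways: a block distribution, where each thread owns a contiguous chunk of the vector, and a cyclic distribution, where thread t handles elements t, t+T, t+2T, and so on. Under the cyclic scheme the 32 threads of a warp touch 32 adjacent words in each step, which the hardware can coalesce into a few wide transactions; under the block scheme a warp's simultaneous accesses are scattered far apart. A tiny host-side sketch (illustrative values only) of the index math behind the two schemes:

    #include <stdio.h>

    #define N 1024   /* illustrative vector length */
    #define T 256    /* illustrative total thread count */

    int main(void) {
        int chunk = (N + T - 1) / T;   /* block distribution: contiguous chunk per thread */
        for (int t = 0; t < 2; t++)
            printf("block  thread %d -> elements %d..%d\n", t, t * chunk, (t + 1) * chunk - 1);
        /* cyclic distribution: consecutive threads touch consecutive elements */
        for (int t = 0; t < 2; t++)
            printf("cyclic thread %d -> elements %d, %d, %d, ...\n", t, t, t + T, t + 2 * T);
        return 0;
    }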
--------------------------------------------------------------------------------
/CoMem_AXPY/axpy_cudakernel.cu:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #include "axpy.h"
7 |
8 | /* warming-up kernel: same one-thread-per-element pattern as below */
9 | __global__ void axpy_cudakernel_warmingup(REAL* x, REAL* y, int n, REAL a)
10 | {
11 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
12 |     if (i < n) y[i] += a * x[i];
13 | }
14 |
15 | /* one thread per element */
16 | __global__ void axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a)
17 | {
18 |     int i = blockDim.x * blockIdx.x + threadIdx.x;
19 |     if (i < n) y[i] += a * x[i];
20 | }
21 |
22 | /* block distribution: each thread works on a contiguous chunk, so the
23 |    threads of a warp touch far-apart addresses (uncoalesced) */
24 | __global__ void axpy_cudakernel_block(REAL* x, REAL* y, int n, REAL a)
25 | {
26 |     int nthreads = blockDim.x * gridDim.x;
27 |     int chunk = (n + nthreads - 1) / nthreads;
28 |     int start = (blockDim.x * blockIdx.x + threadIdx.x) * chunk;
29 |     for (int i = start; i < start + chunk && i < n; i++)
30 |         y[i] += a * x[i];
31 | }
32 |
33 | /* cyclic distribution: consecutive threads touch consecutive elements,
34 |    so a warp's accesses are contiguous (coalesced) */
35 | __global__ void axpy_cudakernel_cyclic(REAL* x, REAL* y, int n, REAL a)
36 | {
37 |     int nthreads = blockDim.x * gridDim.x;
38 |     for (int i = blockDim.x * blockIdx.x + threadIdx.x; i < n; i += nthreads)
39 |         y[i] += a * x[i];
40 | }
41 |
42 | void axpy_cuda(REAL* x, REAL* y, int n, REAL a) {
43 |     REAL *d_x, *d_y;
44 |     cudaMalloc(&d_x, n*sizeof(REAL));
45 |     cudaMalloc(&d_y, n*sizeof(REAL));
46 |
47 |     cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice);
48 |     cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice);
49 |
50 |     axpy_cudakernel_warmingup<<<(n+255)/256, 256>>>(d_x, d_y, n, a);
51 |     cudaDeviceSynchronize();
52 |     axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a);
53 |     cudaDeviceSynchronize();
54 |     axpy_cudakernel_block<<<1024, 256>>>(d_x, d_y, n, a);
55 |     cudaDeviceSynchronize();
56 |     axpy_cudakernel_cyclic<<<1024, 256>>>(d_x, d_y, n, a);
57 |     cudaDeviceSynchronize();
58 |
59 |     cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost);
60 |     cudaFree(d_x);
61 |     cudaFree(d_y);
62 | }
63 |
--------------------------------------------------------------------------------
/CoMem_AXPY/test.sh:
--------------------------------------------------------------------------------
1 | nvprof ./axpy_cuda 1024000
2 | nvprof ./axpy_cuda 4096000
3 | nvprof ./axpy_cuda 10240000
4 | nvprof ./axpy_cuda 20480000
--------------------------------------------------------------------------------
/CoMem_SpMM/Makefile:
--------------------------------------------------------------------------------
1 | default:
2 | 	nvcc -o SpMM_cuda SpMM_cuda.c SpMM_cudakernel.cu
--------------------------------------------------------------------------------
/CoMem_SpMM/SpMM.h:
--------------------------------------------------------------------------------
1 | //******************************************************************************************************************//
2 | // Copyright (c) 2021, University of North Carolina at Charlotte
3 | // and Lawrence Livermore National Security, LLC.
4 | // SPDX-License-Identifier: (BSD-3-Clause)
5 | //*****************************************************************************************************************//
6 | #define REAL float
7 |
8 | #ifdef __cplusplus
9 | extern "C" {
10 | #endif
11 | extern void spmm_csr_cuda(const int num_rows, const int *ptrA, const int * indicesA, const REAL *dataA, const int *ptrB, const int * indicesB, const REAL *dataB, REAL* result, int nnzA, int nnzB);
12 |
13 | extern void spmm_csc_cuda(const int num_rows, const int *ptrA, const int * indicesA, const REAL *dataA, const int *ptrB, const int * indicesB, const REAL *dataB, REAL* result, int nnzA, int nnzB);
14 |
15 | #ifdef __cplusplus
16 | }
17 | #endif
--------------------------------------------------------------------------------
/CoMem_SpMM/SpMM_cuda.output.carina.txt:
--------------------------------------------------------------------------------
1 | int num_rows = 100;
2 | int nnz = 1024;
3 |
4 | This result is tested on carina.
5 | spmm_csr_kernel: two matrix are all in csr format 6 | spmm_csc_kernel: one matrix is in csr format and the other one is in csc format 7 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ 8 | xyi2@cci-carina:~/CUDAMemBench/SpMM$ nvprof ./SpMM_cuda 9 | ==10262== NVPROF is profiling process 10262, command: ./SpMM_cuda 10 | check(serial vs serial_csr):0.000000 11 | check(serial vs serial_csc):0.000000 12 | check(serial vs cuda_csr):0.000288 13 | check(serial vs cuda_csc):0.000288 14 | ==10262== Profiling application: ./SpMM_cuda 15 | ==10262== Profiling result: 16 | Type Time(%) Time Calls Avg Min Max Name 17 | GPU activities: 49.05% 48.011ms 1 48.011ms 48.011ms 48.011ms spmm_csr_csr_warmingup(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 18 | 49.05% 48.007ms 1 48.007ms 48.007ms 48.007ms spmm_csr_csr_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 19 | 0.94% 916.76us 1 916.76us 916.76us 916.76us spmm_csc_csr_warmingup(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 20 | 0.94% 915.77us 1 915.77us 915.77us 915.77us spmm_csc_csr_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 21 | 0.02% 22.784us 12 1.8980us 1.4720us 2.3360us [CUDA memcpy HtoD] 22 | 0.01% 8.2880us 2 4.1440us 4.0320us 4.2560us [CUDA memcpy DtoH] 23 | API calls: 75.16% 305.55ms 14 21.825ms 3.9930us 305.31ms cudaMalloc 24 | 24.07% 97.865ms 4 24.466ms 920.19us 48.014ms cudaDeviceSynchronize 25 | 0.40% 1.6464ms 1 1.6464ms 1.6464ms 1.6464ms cuDeviceTotalMem 26 | 0.14% 584.79us 97 6.0280us 181ns 235.37us cuDeviceGetAttribute 27 | 0.11% 437.12us 14 31.222us 3.0890us 187.25us cudaFree 28 | 0.08% 309.85us 14 22.132us 11.453us 72.366us cudaMemcpy 29 | 0.02% 85.758us 4 21.439us 12.362us 40.483us cudaLaunchKernel 30 | 0.01% 55.244us 1 55.244us 55.244us 55.244us cuDeviceGetName 31 | 0.00% 5.0020us 1 5.0020us 5.0020us 5.0020us cuDeviceGetPCIBusId 32 | 0.00% 3.6080us 3 1.2020us 385ns 2.7630us cuDeviceGetCount 33 | 0.00% 1.3200us 2 660ns 253ns 1.0670us cuDeviceGet 34 | 0.00% 337ns 1 337ns 337ns 337ns cuDeviceGetUuid 35 | -------------------------------------------------------------------------------- /CoMem_SpMM/SpMM_cuda.output.fornax.txt: -------------------------------------------------------------------------------- 1 | int num_rows = 100; 2 | int nnz = 1024; 3 | 4 | This result is tested on fornax. 5 | spmm_csr_kernel: two matrix are all in csr format 6 | spmm_csc_kernel: one matrix is in csr format and the other one is in csc format 7 | ------------------------------------------------------------------------------------------------------------------------------------------------------------------ 8 | xyi2@fornax:~/CUDAMemBench/SpMM$ nvprof ./SpMM_cuda 9 | ==30020== NVPROF is profiling process 30020, command: ./SpMM_cuda 10 | ==30020== Warning: Auto boost enabled on device 0. Profiling results may be inconsistent. 
11 | check(serial vs serial_csr):0.000000 12 | check(serial vs serial_csc):0.000000 13 | check(serial vs cuda_csr):0.000308 14 | check(serial vs cuda_csc):0.000308 15 | ==30020== Profiling application: ./SpMM_cuda 16 | ==30020== Profiling result: 17 | Type Time(%) Time Calls Avg Min Max Name 18 | GPU activities: 98.55% 354.30ms 1 354.30ms 354.30ms 354.30ms spmm_csr_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 19 | 1.44% 5.1618ms 1 5.1618ms 5.1618ms 5.1618ms spmm_csc_kernel(int, int const *, int const *, float const *, int const *, int const *, float const *, float*, int, int) 20 | 0.01% 23.392us 12 1.9490us 1.3120us 2.2400us [CUDA memcpy HtoD] 21 | 0.01% 18.752us 2 9.3760us 8.7680us 9.9840us [CUDA memcpy DtoH] 22 | API calls: 53.41% 423.54ms 14 30.253ms 7.6370us 423.02ms cudaMalloc 23 | 45.39% 359.89ms 14 25.707ms 14.310us 354.38ms cudaMemcpy 24 | 0.58% 4.5723ms 4 1.1431ms 1.1372ms 1.1488ms cuDeviceTotalMem 25 | 0.45% 3.5295ms 388 9.0960us 396ns 330.51us cuDeviceGetAttribute 26 | 0.10% 767.61us 14 54.829us 5.6940us 316.94us cudaFree 27 | 0.04% 318.99us 2 159.50us 38.867us 280.13us cudaLaunchKernel 28 | 0.04% 287.49us 4 71.873us 68.487us 80.183us cuDeviceGetName 29 | 0.00% 17.213us 4 4.3030us 3.2400us 6.1430us cuDeviceGetPCIBusId 30 | 0.00% 6.6230us 8 827ns 523ns 1.5630us cuDeviceGet 31 | 0.00% 3.4470us 3 1.1490us 633ns 1.8740us cuDeviceGetCount 32 | 0.00% 2.5630us 4 640ns 526ns 837ns cuDeviceGetUuid 33 | -------------------------------------------------------------------------------- /Common/FreeImage/include/FreeImage.h: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/Common/FreeImage/include/FreeImage.h -------------------------------------------------------------------------------- /Common/README.md: -------------------------------------------------------------------------------- 1 | This folder is derived from CUDA Samples. It is used to support several benchmarks which are derived from CUDA Samples, including GSOverlap, ConKernels and Taskgraph. Some header files required by these three benchmarks are stored here. 2 | -------------------------------------------------------------------------------- /Common/UtilNPP/Image.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_H 29 | #define NV_UTIL_NPP_IMAGE_H 30 | 31 | #include 32 | 33 | namespace npp 34 | { 35 | 36 | class Image 37 | { 38 | public: 39 | struct Size 40 | { 41 | unsigned int nWidth; 42 | unsigned int nHeight; 43 | 44 | Size() : nWidth(0), nHeight(0) 45 | { }; 46 | 47 | Size(unsigned int nWidthNew, unsigned nHeightNew) : nWidth(nWidthNew), nHeight(nHeightNew) 48 | { }; 49 | 50 | Size(const Size &rSize) : nWidth(rSize.nWidth), nHeight(rSize.nHeight) 51 | { }; 52 | 53 | Size & 54 | operator= (const Size &rSize) 55 | { 56 | if (&rSize == this) 57 | { 58 | return *this; 59 | } 60 | 61 | nWidth = rSize.nWidth; 62 | nHeight = rSize.nHeight; 63 | 64 | return *this; 65 | } 66 | 67 | void 68 | swap(Size &rSize) 69 | { 70 | unsigned int nTemp; 71 | nTemp = nWidth; 72 | nWidth = rSize.nWidth; 73 | rSize.nWidth = nTemp; 74 | 75 | nTemp = nHeight; 76 | nHeight = rSize.nHeight; 77 | rSize.nHeight = nTemp; 78 | } 79 | }; 80 | 81 | Image() 82 | { }; 83 | 84 | Image(unsigned int nWidth, unsigned int nHeight) : oSize_(nWidth, nHeight) 85 | { }; 86 | 87 | Image(const Image::Size &rSize) : oSize_(rSize) 88 | { }; 89 | 90 | Image(const Image &rImage) : oSize_(rImage.oSize_) 91 | { }; 92 | 93 | virtual 94 | ~Image() 95 | { }; 96 | 97 | Image & 98 | operator= (const Image &rImage) 99 | { 100 | if (&rImage == this) 101 | { 102 | return *this; 103 | } 104 | 105 | oSize_ = rImage.oSize_; 106 | return *this; 107 | }; 108 | 109 | unsigned int 110 | width() 111 | const 112 | { 113 | return oSize_.nWidth; 114 | } 115 | 116 | unsigned int 117 | height() 118 | const 119 | { 120 | return oSize_.nHeight; 121 | } 122 | 123 | Size 124 | size() 125 | const 126 | { 127 | return oSize_; 128 | } 129 | 130 | void 131 | swap(Image &rImage) 132 | { 133 | oSize_.swap(rImage.oSize_); 134 | } 135 | 136 | private: 137 | Size oSize_; 138 | }; 139 | 140 | bool 141 | operator== (const Image::Size &rFirst, const Image::Size &rSecond) 142 | { 143 | return rFirst.nWidth == rSecond.nWidth && rFirst.nHeight == rSecond.nHeight; 144 | } 145 | 146 | bool 147 | operator!= (const Image::Size &rFirst, const Image::Size &rSecond) 148 | { 149 | return rFirst.nWidth != rSecond.nWidth || rFirst.nHeight != rSecond.nHeight; 150 | } 151 | 152 | } // npp namespace 153 | 154 | 155 | #endif // NV_UTIL_NPP_IMAGE_H 156 | -------------------------------------------------------------------------------- /Common/UtilNPP/ImageAllocatorsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 29 | #define NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 30 | 31 | #include "Exceptions.h" 32 | 33 | namespace npp 34 | { 35 | 36 | template 37 | class ImageAllocatorCPU 38 | { 39 | public: 40 | static 41 | D * 42 | Malloc2D(unsigned int nWidth, unsigned int nHeight, unsigned int *pPitch) 43 | { 44 | NPP_ASSERT(nWidth * nHeight > 0); 45 | 46 | D *pResult = new D[nWidth * N * nHeight]; 47 | *pPitch = nWidth * sizeof(D) * N; 48 | 49 | return pResult; 50 | }; 51 | 52 | static 53 | void 54 | Free2D(D *pPixels) 55 | { 56 | delete[] pPixels; 57 | }; 58 | 59 | static 60 | void 61 | Copy2D(D *pDst, size_t nDstPitch, const D *pSrc, size_t nSrcPitch, size_t nWidth, size_t nHeight) 62 | { 63 | const void *pSrcLine = pSrc; 64 | void *pDstLine = pDst; 65 | 66 | for (size_t iLine = 0; iLine < nHeight; ++iLine) 67 | { 68 | // copy one line worth of data 69 | memcpy(pDst, pSrc, nWidth * N * sizeof(D)); 70 | // move data pointers to next line 71 | pDst += nDstPitch; 72 | pSrc += nSrcPitch; 73 | } 74 | }; 75 | 76 | }; 77 | 78 | } // npp namespace 79 | 80 | #endif // NV_UTIL_NPP_IMAGE_ALLOCATORS_CPU_H 81 | -------------------------------------------------------------------------------- /Common/UtilNPP/ImagesCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 
14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef NV_UTIL_NPP_IMAGES_CPU_H 29 | #define NV_UTIL_NPP_IMAGES_CPU_H 30 | 31 | #include "ImagePacked.h" 32 | 33 | #include "ImageAllocatorsCPU.h" 34 | #include "Exceptions.h" 35 | 36 | #include 37 | 38 | 39 | namespace npp 40 | { 41 | 42 | template 43 | class ImageCPU: public npp::ImagePacked 44 | { 45 | public: 46 | 47 | ImageCPU() 48 | { 49 | ; 50 | } 51 | 52 | ImageCPU(unsigned int nWidth, unsigned int nHeight): ImagePacked(nWidth, nHeight) 53 | { 54 | ; 55 | } 56 | 57 | explicit 58 | ImageCPU(const npp::Image::Size &rSize): ImagePacked(rSize) 59 | { 60 | ; 61 | } 62 | 63 | ImageCPU(const ImageCPU &rImage): Image(rImage) 64 | { 65 | ; 66 | } 67 | 68 | virtual 69 | ~ImageCPU() 70 | { 71 | ; 72 | } 73 | 74 | ImageCPU & 75 | operator= (const ImageCPU &rImage) 76 | { 77 | ImagePacked::operator= (rImage); 78 | 79 | return *this; 80 | } 81 | 82 | npp::Pixel & 83 | operator()(unsigned int iX, unsigned int iY) 84 | { 85 | return *ImagePacked::pixels(iX, iY); 86 | } 87 | 88 | npp::Pixel 89 | operator()(unsigned int iX, unsigned int iY) 90 | const 91 | { 92 | return *ImagePacked::pixels(iX, iY); 93 | } 94 | 95 | }; 96 | 97 | 98 | typedef ImageCPU > ImageCPU_8u_C1; 99 | typedef ImageCPU > ImageCPU_8u_C2; 100 | typedef ImageCPU > ImageCPU_8u_C3; 101 | typedef ImageCPU > ImageCPU_8u_C4; 102 | 103 | typedef ImageCPU > ImageCPU_16u_C1; 104 | typedef ImageCPU > ImageCPU_16u_C3; 105 | typedef ImageCPU > ImageCPU_16u_C4; 106 | 107 | typedef ImageCPU > ImageCPU_16s_C1; 108 | typedef ImageCPU > ImageCPU_16s_C3; 109 | typedef ImageCPU > ImageCPU_16s_C4; 110 | 111 | typedef ImageCPU > ImageCPU_32s_C1; 112 | typedef ImageCPU > ImageCPU_32s_C3; 113 | typedef ImageCPU > ImageCPU_32s_C4; 114 | 115 | typedef ImageCPU > ImageCPU_32f_C1; 116 | typedef ImageCPU > ImageCPU_32f_C3; 117 | typedef ImageCPU > ImageCPU_32f_C4; 118 | 119 | } // npp namespace 120 | 121 | #endif // NV_IMAGE_IPP_H 122 | -------------------------------------------------------------------------------- /Common/UtilNPP/Pixel.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_PIXEL_H 30 | #define NV_UTIL_PIXEL_H 31 | 32 | #include "Exceptions.h" 33 | 34 | namespace npp 35 | { 36 | template 37 | struct Pixel 38 | { }; 39 | 40 | template 41 | struct Pixel 42 | { 43 | D x; 44 | 45 | const D & 46 | operator[](size_t iChannel) 47 | const 48 | { 49 | NPP_ASSERT(iChannel < 1); 50 | return (&x)[iChannel]; 51 | } 52 | 53 | D & 54 | operator[](size_t iChannel) 55 | { 56 | NPP_ASSERT(iChannel < 1); 57 | return (&x)[iChannel]; 58 | } 59 | }; 60 | 61 | template 62 | struct Pixel 63 | { 64 | D x,y; 65 | 66 | const D & 67 | operator[](size_t iChannel) 68 | const 69 | { 70 | NPP_ASSERT(iChannel < 2); 71 | return (&x)[iChannel]; 72 | } 73 | 74 | D & 75 | operator[](size_t iChannel) 76 | { 77 | NPP_ASSERT(iChannel < 2); 78 | return (&x)[iChannel]; 79 | } 80 | }; 81 | 82 | template 83 | struct Pixel 84 | { 85 | D x,y,z; 86 | 87 | const D & 88 | operator[](size_t iChannel) 89 | const 90 | { 91 | NPP_ASSERT(iChannel < 3); 92 | return (&x)[iChannel]; 93 | } 94 | 95 | D & 96 | operator[](size_t iChannel) 97 | { 98 | NPP_ASSERT(iChannel < 3); 99 | return (&x)[iChannel]; 100 | } 101 | }; 102 | 103 | template 104 | struct Pixel 105 | { 106 | D x, y, z, w; 107 | 108 | const D & 109 | operator[](size_t iChannel) 110 | const 111 | { 112 | NPP_ASSERT(iChannel < 4); 113 | return (&x)[iChannel]; 114 | } 115 | 116 | D & 117 | operator[](size_t iChannel) 118 | { 119 | NPP_ASSERT(iChannel < 4); 120 | return (&x)[iChannel]; 121 | } 122 | }; 123 | 124 | } // npp namespace 125 | 126 | #endif // NV_UTIL_PIXEL_H 127 | -------------------------------------------------------------------------------- /Common/UtilNPP/Signal.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNAL_H 30 | #define NV_UTIL_NPP_SIGNAL_H 31 | 32 | #include 33 | 34 | namespace npp 35 | { 36 | class Signal 37 | { 38 | public: 39 | Signal() : nSize_(0) 40 | { }; 41 | 42 | explicit 43 | Signal(size_t nSize) : nSize_(nSize) 44 | { }; 45 | 46 | Signal(const Signal &rSignal) : nSize_(rSignal.nSize_) 47 | { }; 48 | 49 | virtual 50 | ~Signal() 51 | { } 52 | 53 | Signal & 54 | operator= (const Signal &rSignal) 55 | { 56 | nSize_ = rSignal.nSize_; 57 | return *this; 58 | } 59 | 60 | size_t 61 | size() 62 | const 63 | { 64 | return nSize_; 65 | } 66 | 67 | void 68 | swap(Signal &rSignal) 69 | { 70 | size_t nTemp = nSize_; 71 | nSize_ = rSignal.nSize_; 72 | rSignal.nSize_ = nTemp; 73 | } 74 | 75 | 76 | private: 77 | size_t nSize_; 78 | }; 79 | 80 | template 81 | class SignalTemplate: public Signal 82 | { 83 | public: 84 | typedef D tData; 85 | 86 | SignalTemplate(): aValues_(0) 87 | { 88 | ; 89 | } 90 | 91 | SignalTemplate(size_t nSize): Signal(nSize) 92 | , aValues_(0) 93 | { 94 | aValues_ = A::Malloc1D(size()); 95 | } 96 | 97 | SignalTemplate(const SignalTemplate &rSignal): Signal(rSignal) 98 | , aValues_(0) 99 | { 100 | aValues_ = A::Malloc1D(size()); 101 | A::Copy1D(aValues_, rSignal.values(), size()); 102 | } 103 | 104 | virtual 105 | ~SignalTemplate() 106 | { 107 | A::Free1D(aValues_); 108 | } 109 | 110 | SignalTemplate & 111 | operator= (const SignalTemplate &rSignal) 112 | { 113 | // in case of self-assignment 114 | if (&rSignal == this) 115 | { 116 | return *this; 117 | } 118 | 119 | A::Free1D(aValues_); 120 | this->aPixels_ = 0; 121 | 122 | // assign parent class's data fields (width, height) 123 | Signal::operator =(rSignal); 124 | 125 | aValues_ = A::Malloc1D(size()); 126 | A::Copy1D(aValues_, rSignal.value(), size()); 127 | 128 | return *this; 129 | } 130 | 131 | /// Get a pointer to the pixel array. 132 | /// The result pointer can be offset to pixel at position (x, y) and 133 | /// even negative offsets are allowed. 134 | /// \param nX Horizontal pointer/array offset. 135 | /// \param nY Vertical pointer/array offset. 136 | /// \return Pointer to the pixel array (or first pixel in array with coordinates (nX, nY). 
137 | tData * 138 | values(int i = 0) 139 | { 140 | return aValues_ + i; 141 | } 142 | 143 | const 144 | tData * 145 | values(int i = 0) 146 | const 147 | { 148 | return aValues_ + i; 149 | } 150 | 151 | void 152 | swap(SignalTemplate &rSignal) 153 | { 154 | Signal::swap(rSignal); 155 | 156 | tData *aTemp = this->aValues_; 157 | this->aValues_ = rSignal.aValues_; 158 | rSignal.aValues_ = aTemp; 159 | } 160 | 161 | private: 162 | D *aValues_; 163 | }; 164 | 165 | } // npp namespace 166 | 167 | 168 | #endif // NV_UTIL_NPP_SIGNAL_H 169 | -------------------------------------------------------------------------------- /Common/UtilNPP/SignalAllocatorsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 30 | #define NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 31 | 32 | #include "Exceptions.h" 33 | 34 | namespace npp 35 | { 36 | 37 | template 38 | class SignalAllocatorCPU 39 | { 40 | public: 41 | static 42 | D * 43 | Malloc1D(unsigned int nSize) 44 | { 45 | return new D[nSize];; 46 | }; 47 | 48 | static 49 | void 50 | Free1D(D *pPixels) 51 | { 52 | delete[] pPixels; 53 | }; 54 | 55 | static 56 | void 57 | Copy1D(D *pDst, const D *pSrc, size_t nSize) 58 | { 59 | memcpy(pDst, pSrc, nSize * sizeof(D)); 60 | }; 61 | 62 | }; 63 | 64 | } // npp namespace 65 | 66 | #endif // NV_UTIL_NPP_SIGNAL_ALLOCATORS_CPU_H 67 | -------------------------------------------------------------------------------- /Common/UtilNPP/SignalsCPU.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNALS_CPU_H 30 | #define NV_UTIL_NPP_SIGNALS_CPU_H 31 | 32 | #include "Signal.h" 33 | 34 | #include "SignalAllocatorsCPU.h" 35 | #include "Exceptions.h" 36 | 37 | #include 38 | 39 | 40 | namespace npp 41 | { 42 | 43 | template 44 | class SignalCPU: public npp::SignalTemplate 45 | { 46 | public: 47 | typedef typename npp::SignalTemplate::tData tData; 48 | 49 | SignalCPU() 50 | { 51 | ; 52 | } 53 | 54 | SignalCPU(size_t nSize): SignalTemplate(nSize) 55 | { 56 | ; 57 | } 58 | 59 | SignalCPU(const SignalCPU &rSignal): SignalTemplate(rSignal) 60 | { 61 | ; 62 | } 63 | 64 | virtual 65 | ~SignalCPU() 66 | { 67 | ; 68 | } 69 | 70 | SignalCPU & 71 | operator= (const SignalCPU &rSignal) 72 | { 73 | SignalTemplate::operator= (rSignal); 74 | 75 | return *this; 76 | } 77 | 78 | tData & 79 | operator [](unsigned int i) 80 | { 81 | return *SignalTemplate::values(i); 82 | } 83 | 84 | tData 85 | operator [](unsigned int i) 86 | const 87 | { 88 | return *SignalTemplate::values(i); 89 | } 90 | 91 | }; 92 | 93 | typedef SignalCPU > SignalCPU_8u; 94 | typedef SignalCPU > SignalCPU_32s; 95 | typedef SignalCPU > SignalCPU_16s; 96 | typedef SignalCPU > SignalCPU_16sc; 97 | typedef SignalCPU > SignalCPU_32sc; 98 | typedef SignalCPU > SignalCPU_32f; 99 | typedef SignalCPU > SignalCPU_32fc; 100 | typedef SignalCPU > SignalCPU_64s; 101 | typedef SignalCPU > SignalCPU_64sc; 102 | typedef SignalCPU > SignalCPU_64f; 103 | typedef SignalCPU > SignalCPU_64fc; 104 | 105 | } // npp namespace 106 | 107 | #endif // NV_UTIL_NPP_SIGNALS_CPU_H 108 | -------------------------------------------------------------------------------- /Common/UtilNPP/SignalsNPP.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 
2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | 29 | #ifndef NV_UTIL_NPP_SIGNALS_NPP_H 30 | #define NV_UTIL_NPP_SIGNALS_NPP_H 31 | 32 | #include "Exceptions.h" 33 | #include "Signal.h" 34 | 35 | #include "SignalAllocatorsNPP.h" 36 | #include 37 | 38 | namespace npp 39 | { 40 | // forward declaration 41 | template class SignalCPU; 42 | 43 | template 44 | class SignalNPP: public npp::SignalTemplate > 45 | { 46 | public: 47 | SignalNPP() 48 | { 49 | ; 50 | } 51 | 52 | explicit 53 | SignalNPP(size_t nSize): SignalTemplate >(nSize) 54 | { 55 | ; 56 | } 57 | 58 | SignalNPP(const SignalNPP &rSignal): SignalTemplate >(rSignal) 59 | { 60 | ; 61 | } 62 | 63 | template 64 | explicit 65 | SignalNPP(const SignalCPU &rSignal): SignalTemplate >(rSignal.size()) 66 | { 67 | npp::SignalAllocator::HostToDeviceCopy1D(SignalTemplate >::values(), 68 | rSignal.values(), SignalTemplate >::size()); 69 | } 70 | 71 | virtual 72 | ~SignalNPP() 73 | { 74 | ; 75 | } 76 | 77 | SignalNPP & 78 | operator= (const SignalNPP &rSignal) 79 | { 80 | SignalTemplate >::operator= (rSignal); 81 | 82 | return *this; 83 | } 84 | 85 | void 86 | copyTo(D *pValues) 87 | const 88 | { 89 | npp::SignalAllocator::DeviceToHostCopy1D(pValues, SignalTemplate >::values(), SignalTemplate >::size()); 90 | } 91 | 92 | void 93 | copyFrom(D *pValues) 94 | { 95 | npp::SignalAllocator::HostToDeviceCopy1D(SignalTemplate >::values(), pValues, SignalTemplate >::size()); 96 | } 97 | }; 98 | 99 | typedef SignalNPP SignalNPP_8u; 100 | typedef SignalNPP SignalNPP_16s; 101 | typedef SignalNPP SignalNPP_16sc; 102 | typedef SignalNPP SignalNPP_32s; 103 | typedef SignalNPP SignalNPP_32sc; 104 | typedef SignalNPP SignalNPP_32f; 105 | typedef SignalNPP SignalNPP_32fc; 106 | typedef SignalNPP SignalNPP_64s; 107 | typedef SignalNPP SignalNPP_64sc; 108 | typedef SignalNPP SignalNPP_64f; 109 | typedef SignalNPP SignalNPP_64fc; 110 | 111 | } // npp namespace 112 | 113 | #endif // NV_UTIL_NPP_SIGNALS_NPP_H 114 | 
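The signal containers above come in host/device pairs: SignalCPU keeps its values in new[]-allocated host memory, SignalNPP allocates through NPP on the device, and the SignalNPP(const SignalCPU &) constructor plus copyTo/copyFrom move data between the two. A minimal usage sketch, assuming CUDA, NPP, and the headers above are available on the include path:

    #include "SignalsCPU.h"   // host-side container
    #include "SignalsNPP.h"   // device-side container

    int main()
    {
        npp::SignalCPU_32f hostSignal(1024);            // 1024 floats on the host
        for (size_t i = 0; i < hostSignal.size(); ++i)
            hostSignal[i] = static_cast<Npp32f>(i);     // fill through operator[]

        npp::SignalNPP_32f devSignal(hostSignal);       // host -> device copy in the ctor

        npp::SignalCPU_32f roundTrip(devSignal.size());
        devSignal.copyTo(roundTrip.values());           // device -> host copy
        return 0;
    }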
-------------------------------------------------------------------------------- /Common/helper_functions.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | // These are helper functions for the SDK samples (string parsing, 29 | // timers, image helpers, etc) 30 | #ifndef COMMON_HELPER_FUNCTIONS_H_ 31 | #define COMMON_HELPER_FUNCTIONS_H_ 32 | 33 | #ifdef WIN32 34 | #pragma warning(disable : 4996) 35 | #endif 36 | 37 | // includes, project 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | // includes, timer, string parsing, image helpers 51 | #include // helper functions for image compare, dump, data comparisons 52 | #include // helper functions for string parsing 53 | #include // helper functions for timers 54 | 55 | #ifndef EXIT_WAIVED 56 | #define EXIT_WAIVED 2 57 | #endif 58 | 59 | #endif // COMMON_HELPER_FUNCTIONS_H_ 60 | -------------------------------------------------------------------------------- /Common/helper_multiprocess.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 
11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #ifndef HELPER_MULTIPROCESS_H 29 | #define HELPER_MULTIPROCESS_H 30 | 31 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 32 | #ifndef WIN32_LEAN_AND_MEAN 33 | #define WIN32_LEAN_AND_MEAN 34 | #endif 35 | #include 36 | #include 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #else 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | #include 50 | #include 51 | #include 52 | #include 53 | #include 54 | #endif 55 | #include 56 | 57 | typedef struct sharedMemoryInfo_st { 58 | void *addr; 59 | size_t size; 60 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 61 | HANDLE shmHandle; 62 | #else 63 | int shmFd; 64 | #endif 65 | } sharedMemoryInfo; 66 | 67 | int sharedMemoryCreate(const char *name, size_t sz, sharedMemoryInfo *info); 68 | 69 | int sharedMemoryOpen(const char *name, size_t sz, sharedMemoryInfo *info); 70 | 71 | void sharedMemoryClose(sharedMemoryInfo *info); 72 | 73 | 74 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 75 | typedef PROCESS_INFORMATION Process; 76 | #else 77 | typedef pid_t Process; 78 | #endif 79 | 80 | int spawnProcess(Process *process, const char *app, char * const *args); 81 | 82 | int waitProcess(Process *process); 83 | 84 | #define checkIpcErrors(ipcFuncResult) \ 85 | if (ipcFuncResult == -1) { fprintf(stderr, "Failure at %u %s\n", __LINE__, __FILE__); exit(EXIT_FAILURE); } 86 | 87 | #if defined(__linux__) 88 | struct ipcHandle_st { 89 | int socket; 90 | char *socketName; 91 | }; 92 | typedef int ShareableHandle; 93 | #elif defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64) 94 | struct ipcHandle_st { 95 | std::vector hMailslot; // 1 Handle in case of child and `num children` Handles for parent. 
96 | }; 97 | typedef HANDLE ShareableHandle; 98 | #endif 99 | 100 | typedef struct ipcHandle_st ipcHandle; 101 | 102 | int 103 | ipcCreateSocket(ipcHandle *&handle, const char *name, const std::vector& processes); 104 | 105 | int 106 | ipcOpenSocket(ipcHandle *&handle); 107 | 108 | int 109 | ipcCloseSocket(ipcHandle *handle); 110 | 111 | int 112 | ipcRecvShareableHandles(ipcHandle *handle, std::vector& shareableHandles); 113 | 114 | int 115 | ipcSendShareableHandles(ipcHandle *handle, const std::vector& shareableHandles, const std::vector& processes); 116 | 117 | int 118 | ipcCloseShareableHandle(ShareableHandle shHandle); 119 | 120 | #endif // HELPER_MULTIPROCESS_H 121 | -------------------------------------------------------------------------------- /Common/rendercheck_d3d11.cpp: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | //////////////////////////////////////////////////////////////////////////////// 29 | // 30 | // Utility funcs to wrap up saving a surface or the back buffer as a PPM file 31 | // In addition, wraps up a threshold comparision of two PPMs. 32 | // 33 | // These functions are designed to be used to implement an automated QA testing for SDK samples. 34 | // 35 | // Author: Bryan Dudash 36 | // Email: sdkfeedback@nvidia.com 37 | // 38 | // Copyright (c) NVIDIA Corporation. All rights reserved. 
39 | //////////////////////////////////////////////////////////////////////////////// 40 | 41 | #include <rendercheck_d3d11.h> 42 | #include <helper_functions.h> 43 | 44 | HRESULT CheckRenderD3D11::ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName) 45 | { 46 | ID3D11DeviceContext *pDeviceCtxt; 47 | pDevice->GetImmediateContext(&pDeviceCtxt); 48 | ID3D11RenderTargetView *pRTV = NULL; 49 | pDeviceCtxt->OMGetRenderTargets(1,&pRTV,NULL); 50 | 51 | ID3D11Resource *pSourceResource = NULL; 52 | pRTV->GetResource(&pSourceResource); 53 | 54 | return ResourceToPPM(pDevice,pSourceResource,zFileName); 55 | } 56 | 57 | HRESULT CheckRenderD3D11::ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName) 58 | { 59 | ID3D11DeviceContext *pDeviceCtxt; 60 | pDevice->GetImmediateContext(&pDeviceCtxt); 61 | D3D11_RESOURCE_DIMENSION rType; 62 | pResource->GetType(&rType); 63 | 64 | if (rType != D3D11_RESOURCE_DIMENSION_TEXTURE2D) 65 | { 66 | printf("SurfaceToPPM: pResource is not a 2D texture! Aborting...\n"); 67 | return E_FAIL; 68 | } 69 | 70 | ID3D11Texture2D *pSourceTexture = (ID3D11Texture2D *)pResource; 71 | ID3D11Texture2D *pTargetTexture = NULL; 72 | 73 | D3D11_TEXTURE2D_DESC desc; 74 | pSourceTexture->GetDesc(&desc); 75 | desc.BindFlags = 0; 76 | desc.CPUAccessFlags = D3D11_CPU_ACCESS_READ; 77 | desc.Usage = D3D11_USAGE_STAGING; 78 | 79 | if (FAILED(pDevice->CreateTexture2D(&desc,NULL,&pTargetTexture))) 80 | { 81 | printf("SurfaceToPPM: Unable to create target Texture resource! Aborting... \n"); 82 | return E_FAIL; 83 | } 84 | 85 | pDeviceCtxt->CopyResource(pTargetTexture,pSourceTexture); 86 | 87 | D3D11_MAPPED_SUBRESOURCE mappedTex2D; 88 | pDeviceCtxt->Map(pTargetTexture, 0, D3D11_MAP_READ,0,&mappedTex2D); 89 | 90 | // Need to convert from dx pitch to pitch=width 91 | unsigned char *pPPMData = new unsigned char[desc.Width*desc.Height*4]; 92 | 93 | for (unsigned int iHeight = 0; iHeight < desc.Height; iHeight++) 94 | { 95 | memcpy(&(pPPMData[iHeight*desc.Width*4]), (unsigned char *)(mappedTex2D.pData) + iHeight*mappedTex2D.RowPitch, desc.Width*4); 96 | } 97 | 98 | pDeviceCtxt->Unmap(pTargetTexture, 0); 99 | 100 | // Prepends the PPM header info and bumps byte data afterwards 101 | sdkSavePPM4ub(zFileName, pPPMData, desc.Width, desc.Height); 102 | 103 | delete [] pPPMData; 104 | pTargetTexture->Release(); 105 | 106 | return S_OK; 107 | } 108 | 109 | bool CheckRenderD3D11::PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, 110 | const float epsilon, const float threshold) 111 | { 112 | char *ref_file_path = sdkFindFilePath(ref_file, exec_path); 113 | 114 | if (ref_file_path == NULL) 115 | { 116 | printf("CheckRenderD3D11::PPMvsPPM unable to find <%s> in <%s> Aborting comparison!\n", ref_file, exec_path); 117 | printf(">>> Check info.xml and [project//data] folder <%s> <<<\n", ref_file); 118 | printf("Aborting comparison!\n"); 119 | printf(" FAILURE!\n"); 120 | return false; 121 | } 122 | 123 | return sdkComparePPM(src_file,ref_file_path,epsilon,threshold,true) == true; 124 | } -------------------------------------------------------------------------------- /Common/rendercheck_d3d11.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer.
8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | */ 27 | 28 | #pragma once 29 | 30 | #ifndef _RENDERCHECK_D3D11_H_ 31 | #define _RENDERCHECK_D3D11_H_ 32 | 33 | #include 34 | #include 35 | #include 36 | #include 37 | #include 38 | 39 | class CheckRenderD3D11 40 | { 41 | public: 42 | 43 | CheckRenderD3D11() {} 44 | 45 | static HRESULT ActiveRenderTargetToPPM(ID3D11Device *pDevice, const char *zFileName); 46 | static HRESULT ResourceToPPM(ID3D11Device *pDevice, ID3D11Resource *pResource, const char *zFileName); 47 | 48 | static bool PPMvsPPM(const char *src_file, const char *ref_file, const char *exec_path, 49 | const float epsilon, const float threshold = 0.0f); 50 | }; 51 | 52 | #endif -------------------------------------------------------------------------------- /Conkernels/NsightEclipse.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | concurrentKernels 5 | 6 | whole 7 | 8 | ./ 9 | ../ 10 | ../../common/inc 11 | 12 | 13 | Performance Strategies 14 | 15 | 16 | CUDA 17 | Concurrent Kernels 18 | 19 | 20 | 21 | 22 | 23 | true 24 | concurrentKernels.cu 25 | 26 | 1:CUDA Advanced Topics 27 | 1:Performance Strategies 28 | 29 | sm35 30 | sm37 31 | sm50 32 | sm52 33 | sm60 34 | sm61 35 | sm70 36 | sm72 37 | sm75 38 | sm80 39 | sm86 40 | 41 | 42 | x86_64 43 | linux 44 | 45 | 46 | windows7 47 | 48 | 49 | x86_64 50 | macosx 51 | 52 | 53 | arm 54 | 55 | 56 | ppc64le 57 | linux 58 | 59 | 60 | 61 | all 62 | 63 | Concurrent Kernels 64 | 65 | -------------------------------------------------------------------------------- /Conkernels/README.md: -------------------------------------------------------------------------------- 1 | # concurrentKernels - Concurrent Kernels 2 | 3 | ## Description 4 | 5 | This sample demonstrates the use of CUDA streams for concurrent execution of several kernels on GPU device. It also illustrates how to introduce dependencies between CUDA streams with the new cudaStreamWaitEvent function. 
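To make the pattern concrete, here is a minimal, self-contained sketch of streams plus `cudaStreamWaitEvent` (illustrative code, not taken from the sample; `kernelA` and `kernelB` are placeholder kernels):

```
#include <cuda_runtime.h>

__global__ void kernelA(float *d, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) d[i] += 1.0f; }
__global__ void kernelB(float *d, int n) { int i = blockIdx.x * blockDim.x + threadIdx.x; if (i < n) d[i] *= 2.0f; }

int main() {
  const int n = 1 << 20;
  float *d; cudaMalloc(&d, n * sizeof(float)); cudaMemset(d, 0, n * sizeof(float));
  cudaStream_t s1, s2; cudaStreamCreate(&s1); cudaStreamCreate(&s2);
  cudaEvent_t done; cudaEventCreate(&done);

  kernelA<<<(n + 255) / 256, 256, 0, s1>>>(d, n);  // launched into stream s1
  cudaEventRecord(done, s1);                       // event fires once kernelA completes
  cudaStreamWaitEvent(s2, done, 0);                // make stream s2 wait on that event
  kernelB<<<(n + 255) / 256, 256, 0, s2>>>(d, n);  // ordered after kernelA despite the separate stream

  cudaDeviceSynchronize();
  cudaFree(d); cudaStreamDestroy(s1); cudaStreamDestroy(s2); cudaEventDestroy(done);
  return 0;
}
```

Kernels launched into different streams with no such dependency between them are free to run concurrently, which is the behavior this sample measures.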
6 | 7 | ## Key Concepts 8 | 9 | Performance Strategies 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ## Prerequisites 26 | 27 | Download and install the [CUDA Toolkit 11.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 28 | 29 | ## Build and Run 30 | 31 | ### Windows 32 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 33 | ``` 34 | *_vs<version>.sln - for Visual Studio <version> 35 | ``` 36 | Each individual sample has its own set of solution files in its directory: 37 | 38 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 39 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 40 | 41 | ### Linux 42 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 43 | ``` 44 | $ cd <sample_dir> 45 | $ make 46 | ``` 47 | The sample makefiles can take advantage of certain options: 48 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 49 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
50 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
51 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 52 | * **dbg=1** - build with debug symbols 53 | ``` 54 | $ make dbg=1 55 | ``` 56 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 57 | ``` 58 | $ make SMS="50 60" 59 | ``` 60 | 61 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 62 | ``` 63 | $ make HOST_COMPILER=g++ 64 | ``` 65 | 66 | ## References (for more details) 67 | 68 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2015.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 14.00 3 | # Visual Studio 2015 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2015.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | concurrentKernels_vs2015 19 | concurrentKernels 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v140 27 | 28 | 29 | true 30 | 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(Platform)/$(Configuration)/ 44 | $(IncludePath) 45 | AllRules.ruleset 46 | 47 | 48 | 49 | 50 | ../../bin/win64/$(Configuration)/ 51 | 52 | 53 | 54 | Level3 55 | WIN32;_MBCS;%(PreprocessorDefinitions) 56 | ./;$(CudaToolkitDir)/include;../../Common; 57 | 58 | 59 | Console 60 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 61 | $(CudaToolkitLibDir); 62 | $(OutDir)/concurrentKernels.exe 63 | 64 | 65 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 66 | -Xcompiler "/wd 4819" %(AdditionalOptions) 67 | ./;../../Common 68 | WIN32 69 | 70 | 71 | 72 | 73 | Disabled 74 | MultiThreadedDebug 75 | 76 | 77 | true 78 | Default 79 | 80 | 81 | MTd 82 | 64 83 | 84 | 85 | 86 | 87 | MaxSpeed 88 | MultiThreaded 89 | 90 | 91 | false 92 | UseLinkTimeCodeGeneration 93 | 94 | 95 | 
MT 96 | 64 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2017.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2017 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2017.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | concurrentKernels_vs2017 19 | concurrentKernels 20 | 21 | 22 | 23 | $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) 24 | $(LatestTargetPlatformVersion) 25 | $(WindowsTargetPlatformVersion) 26 | 27 | 28 | 29 | Application 30 | MultiByte 31 | v141 32 | 33 | 34 | true 35 | 36 | 37 | true 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | $(Platform)/$(Configuration)/ 49 | $(IncludePath) 50 | AllRules.ruleset 51 | 52 | 53 | 54 | 55 | ../../bin/win64/$(Configuration)/ 56 | 57 | 58 | 59 | Level3 60 | WIN32;_MBCS;%(PreprocessorDefinitions) 61 | ./;$(CudaToolkitDir)/include;../../Common; 62 | 63 | 64 | Console 65 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 66 | $(CudaToolkitLibDir); 67 | $(OutDir)/concurrentKernels.exe 68 | 69 | 70 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 71 | -Xcompiler "/wd 4819" %(AdditionalOptions) 72 | ./;../../Common 73 | WIN32 74 | 75 | 76 | 77 | 78 | Disabled 79 | MultiThreadedDebug 80 | 81 | 82 | true 83 | Default 84 | 85 | 86 | MTd 87 | 64 88 | 89 | 90 | 91 | 92 | MaxSpeed 93 | MultiThreaded 94 | 95 | 96 | false 97 | UseLinkTimeCodeGeneration 98 | 99 | 100 | MT 101 | 64 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2019.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2019 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "concurrentKernels", "concurrentKernels_vs2019.vcxproj", 
"{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /Conkernels/concurrentKernels_vs2019.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | concurrentKernels_vs2019 19 | concurrentKernels 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v142 27 | 10.0 28 | 29 | 30 | true 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | $(Platform)/$(Configuration)/ 45 | $(IncludePath) 46 | AllRules.ruleset 47 | 48 | 49 | 50 | 51 | ../../bin/win64/$(Configuration)/ 52 | 53 | 54 | 55 | Level3 56 | WIN32;_MBCS;%(PreprocessorDefinitions) 57 | ./;$(CudaToolkitDir)/include;../../Common; 58 | 59 | 60 | Console 61 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 62 | $(CudaToolkitLibDir); 63 | $(OutDir)/concurrentKernels.exe 64 | 65 | 66 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 67 | -Xcompiler "/wd 4819" %(AdditionalOptions) 68 | ./;../../Common 69 | WIN32 70 | 71 | 72 | 73 | 74 | Disabled 75 | MultiThreadedDebug 76 | 77 | 78 | true 79 | Default 80 | 81 | 82 | MTd 83 | 64 84 | 85 | 86 | 87 | 88 | MaxSpeed 89 | MultiThreaded 90 | 91 | 92 | false 93 | UseLinkTimeCodeGeneration 94 | 95 | 96 | MT 97 | 64 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /DynParallel/.gitignore: -------------------------------------------------------------------------------- 1 | *.png -------------------------------------------------------------------------------- /DynParallel/Makefile: -------------------------------------------------------------------------------- 1 | NVCC=nvcc 2 | 3 | CUDAFLAGS= -arch=sm_86 --cudart=shared -rdc=true -Xcompiler -fopenmp -lpng 4 | 5 | OPT= -g -G 6 | 7 | RM=/bin/rm -f 8 | 9 | all: Dynamic_Parallelism Non_Dynamic_Parallelism 10 | 11 | 12 | main: Dynamic_Parallelism.o Non_Dynamic_Parallelism.o 13 | 14 | ${NVCC} ${OPT} -o main Dynamic_Parallelism.o 15 | ${NVCC} ${OPT} -o main Non_Dynamic_Parallelism.o 16 | 17 | 18 | Dynamic_Parallelism.o: Dynamic_Parallelism.cu Non_Dynamic_Parallelism.cu 19 | 20 | $(NVCC) ${OPT} $(CUDAFLAGS) -c Dynamic_Parallelism.cu 21 | $(NVCC) ${OPT} $(CUDAFLAGS) -c Non_Dynamic_Parallelism.cu 22 | 23 | Dynamic_Parallelism: Dynamic_Parallelism.o Non_Dynamic_Parallelism.o 24 | 25 | ${NVCC} ${CUDAFLAGS} -o Dynamic_Parallelism 
Dynamic_Parallelism.o 26 | ${NVCC} ${CUDAFLAGS} -o Non_Dynamic_Parallelism Non_Dynamic_Parallelism.o 27 | 28 | clean: 29 | 30 | ${RM} *.o Dynamic_Parallelism 31 | ${RM} *.o Non_Dynamic_Parallelism -------------------------------------------------------------------------------- /DynParallel/lib/libpng.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/libpng.lib -------------------------------------------------------------------------------- /DynParallel/lib/libpngd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/libpngd.lib -------------------------------------------------------------------------------- /DynParallel/lib/zlibstat.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/zlibstat.lib -------------------------------------------------------------------------------- /DynParallel/lib/zlibstatd.lib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/passlab/CUDAMicroBench/8c8cd594411c3060fc613a8cdbb3bab21fff1fe1/DynParallel/lib/zlibstatd.lib -------------------------------------------------------------------------------- /GSOverlap/NsightEclipse.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | globalToShmemAsyncCopy 5 | 6 | --std=c++11 7 | 8 | 9 | cudaEventCreate 10 | cudaEventRecord 11 | cudaEventQuery 12 | cudaEventDestroy 13 | cudaEventElapsedTime 14 | cudaEventSynchronize 15 | cudaMalloc 16 | cudaFree 17 | cudaMemcpy 18 | 19 | 20 | whole 21 | 22 | ./ 23 | ../ 24 | ../../common/inc 25 | 26 | 27 | CUDA Runtime API 28 | Linear Algebra 29 | CPP11 CUDA 30 | 31 | 32 | CUDA 33 | matrix multiply 34 | Async copy 35 | CPP11 36 | GCC 5.0.0 37 | 38 | 39 | 40 | 41 | 42 | true 43 | globalToShmemAsyncCopy.cu 44 | 45 | CPP11 46 | 47 | 48 | 1:CUDA Basic Topics 49 | 3:Linear Algebra 50 | 51 | sm35 52 | sm37 53 | sm50 54 | sm52 55 | sm60 56 | sm61 57 | sm70 58 | sm72 59 | sm75 60 | sm80 61 | sm86 62 | 63 | 64 | x86_64 65 | linux 66 | 67 | 68 | x86_64 69 | macosx 70 | 71 | 72 | arm 73 | 74 | 75 | ppc64le 76 | linux 77 | 78 | 79 | aarch64 80 | linux 81 | 82 | 83 | aarch64 84 | qnx 85 | 86 | 87 | windows7 88 | 89 | 90 | 91 | all 92 | 93 | Global Memory to Shared Memory Async Copy 94 | 95 | -------------------------------------------------------------------------------- /GSOverlap/README.md: -------------------------------------------------------------------------------- 1 | # globalToShmemAsyncCopy - Global Memory to Shared Memory Async Copy 2 | 3 | ## Description 4 | 5 | This sample implements matrix multiplication which uses asynchronous copy of data from global to shared memory when on compute capability 8.0 or higher. Also demonstrates arrive-wait barrier for synchronization. 
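For background, the async-copy pattern at the heart of the sample looks roughly like the sketch below (illustrative only, using the cooperative-groups `memcpy_async`/`wait` API available since CUDA 11; `scale` is a placeholder kernel, and on devices below compute capability 8.0 the copy falls back to a synchronous path):

```
#include <cooperative_groups.h>
#include <cooperative_groups/memcpy_async.h>

namespace cg = cooperative_groups;

__global__ void scale(const float *in, float *out, int n, float a) {
  __shared__ float tile[256];                        // staging tile in shared memory
  cg::thread_block block = cg::this_thread_block();
  int base = blockIdx.x * blockDim.x;
  if (base + (int)blockDim.x <= n) {
    // All threads in the block cooperatively issue the global->shared copy;
    // on SM 8.0+ this maps to the hardware async-copy path and bypasses registers.
    cg::memcpy_async(block, tile, in + base, sizeof(float) * blockDim.x);
    cg::wait(block);                                 // wait for the copy to land before using the tile
    out[base + threadIdx.x] = a * tile[threadIdx.x];
  }
}
```

Launching with a block size of 256 (e.g. `scale<<<n / 256, 256>>>(in, out, n, 2.0f)` for n a multiple of 256) keeps the shared-memory tile and the thread block in step.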
6 | 7 | ## Key Concepts 8 | 9 | CUDA Runtime API, Linear Algebra, CPP11 CUDA 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows, QNX 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l, aarch64 22 | 23 | ## CUDA APIs involved 24 | 25 | ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) 26 | cudaEventCreate, cudaEventRecord, cudaEventQuery, cudaEventDestroy, cudaEventElapsedTime, cudaEventSynchronize, cudaMalloc, cudaFree, cudaMemcpy 27 | 28 | ## Dependencies needed to build/run 29 | [CPP11](../../README.md#cpp11) 30 | 31 | ## Prerequisites 32 | 33 | Download and install the [CUDA Toolkit 11.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 34 | Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. 35 | 36 | ## Build and Run 37 | 38 | ### Windows 39 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 40 | ``` 41 | *_vs<version>.sln - for Visual Studio <version> 42 | ``` 43 | Each individual sample has its own set of solution files in its directory: 44 | 45 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 46 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 47 | 48 | ### Linux 49 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 50 | ``` 51 | $ cd <sample_dir> 52 | $ make 53 | ``` 54 | The sample makefiles can take advantage of certain options: 55 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l, aarch64. 56 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.<br/>
57 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
`$ make TARGET_ARCH=aarch64`
58 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 59 | * **dbg=1** - build with debug symbols 60 | ``` 61 | $ make dbg=1 62 | ``` 63 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 64 | ``` 65 | $ make SMS="50 60" 66 | ``` 67 | 68 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 69 | ``` 70 | $ make HOST_COMPILER=g++ 71 | ``` 72 | 73 | ## References (for more details) 74 | 75 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2015.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 14.00 3 | # Visual Studio 2015 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2015.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | globalToShmemAsyncCopy_vs2015 19 | globalToShmemAsyncCopy 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v140 27 | 28 | 29 | true 30 | 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(Platform)/$(Configuration)/ 44 | $(IncludePath) 45 | AllRules.ruleset 46 | 47 | 48 | 49 | 50 | ../../bin/win64/$(Configuration)/ 51 | 52 | 53 | 54 | Level3 55 | WIN32;_MBCS;%(PreprocessorDefinitions) 56 | ./;$(CudaToolkitDir)/include;../../Common; 57 | 58 | 59 | Console 60 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 61 | $(CudaToolkitLibDir); 62 | $(OutDir)/globalToShmemAsyncCopy.exe 63 | 64 | 65 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 66 | -Xcompiler "/wd 4819" %(AdditionalOptions) 67 | ./;../../Common 68 | WIN32 69 | 70 | 71 | 72 | 73 | Disabled 74 | MultiThreadedDebug 75 | 76 | 77 | true 78 | Default 79 | 80 | 81 | MTd 82 | 64 83 | 84 | 85 | 86 | 87 | MaxSpeed 88 | MultiThreaded 89 | 90 | 91 | false 92 | 
UseLinkTimeCodeGeneration 93 | 94 | 95 | MT 96 | 64 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2017.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2017 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2017.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | globalToShmemAsyncCopy_vs2017 19 | globalToShmemAsyncCopy 20 | 21 | 22 | 23 | $([Microsoft.Build.Utilities.ToolLocationHelper]::GetLatestSDKTargetPlatformVersion('Windows', '10.0')) 24 | $(LatestTargetPlatformVersion) 25 | $(WindowsTargetPlatformVersion) 26 | 27 | 28 | 29 | Application 30 | MultiByte 31 | v141 32 | 33 | 34 | true 35 | 36 | 37 | true 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | $(Platform)/$(Configuration)/ 49 | $(IncludePath) 50 | AllRules.ruleset 51 | 52 | 53 | 54 | 55 | ../../bin/win64/$(Configuration)/ 56 | 57 | 58 | 59 | Level3 60 | WIN32;_MBCS;%(PreprocessorDefinitions) 61 | ./;$(CudaToolkitDir)/include;../../Common; 62 | 63 | 64 | Console 65 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 66 | $(CudaToolkitLibDir); 67 | $(OutDir)/globalToShmemAsyncCopy.exe 68 | 69 | 70 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 71 | -Xcompiler "/wd 4819" %(AdditionalOptions) 72 | ./;../../Common 73 | WIN32 74 | 75 | 76 | 77 | 78 | Disabled 79 | MultiThreadedDebug 80 | 81 | 82 | true 83 | Default 84 | 85 | 86 | MTd 87 | 64 88 | 89 | 90 | 91 | 92 | MaxSpeed 93 | MultiThreaded 94 | 95 | 96 | false 97 | UseLinkTimeCodeGeneration 98 | 99 | 100 | MT 101 | 64 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2019.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2019 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = 
"globalToShmemAsyncCopy", "globalToShmemAsyncCopy_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /GSOverlap/globalToShmemAsyncCopy_vs2019.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | globalToShmemAsyncCopy_vs2019 19 | globalToShmemAsyncCopy 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v142 27 | 10.0 28 | 29 | 30 | true 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | $(Platform)/$(Configuration)/ 45 | $(IncludePath) 46 | AllRules.ruleset 47 | 48 | 49 | 50 | 51 | ../../bin/win64/$(Configuration)/ 52 | 53 | 54 | 55 | Level3 56 | WIN32;_MBCS;%(PreprocessorDefinitions) 57 | ./;$(CudaToolkitDir)/include;../../Common; 58 | 59 | 60 | Console 61 | cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 62 | $(CudaToolkitLibDir); 63 | $(OutDir)/globalToShmemAsyncCopy.exe 64 | 65 | 66 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 67 | -Xcompiler "/wd 4819" %(AdditionalOptions) 68 | ./;../../Common 69 | WIN32 70 | 71 | 72 | 73 | 74 | Disabled 75 | MultiThreadedDebug 76 | 77 | 78 | true 79 | Default 80 | 81 | 82 | MTd 83 | 64 84 | 85 | 86 | 87 | 88 | MaxSpeed 89 | MultiThreaded 90 | 91 | 92 | false 93 | UseLinkTimeCodeGeneration 94 | 95 | 96 | MT 97 | 64 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /HDOverlap/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o axpy_cuda axpy_cudakernel.cu 3 | -------------------------------------------------------------------------------- /HDOverlap/axpy_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test for new function memcpy_async in CUDA11 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include "cuda_runtime.h" 16 | #include "device_launch_parameters.h" 17 | 18 | double read_timer_ms() { 19 | struct timeb tm; 20 | ftime(&tm); 21 | return (double) tm.time * 1000.0 + (double) tm.millitm; 22 | } 23 | 24 | /* change this to do saxpy or daxpy : single precision or double precision*/ 25 | #define REAL double 26 | #define VEC_LEN 1024000 //use a fixed number for now 27 | /* zero out the entire vector */ 28 | void zero(REAL *A, int n) 29 | { 30 | int i; 31 | for (i = 0; i < n; i++) { 32 | A[i] = 0.0; 33 | } 34 | } 35 | 36 | __global__ 37 | void 38 | axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a) 39 | { 40 | int i = blockDim.x * blockIdx.x + threadIdx.x; 41 | if (i > 0 &&i < n) y[i] += a*x[i]; 42 | } 43 | 44 | double axpy_cuda_normal(REAL* x, REAL* y, int n, REAL a) { 45 | REAL *d_x, *d_y; 46 | cudaMalloc(&d_x, n*sizeof(REAL)); 47 | cudaMalloc(&d_y, n*sizeof(REAL)); 48 | double time = read_timer_ms(); 49 | 50 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 51 | cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 52 | time = read_timer_ms() - time; 53 | 54 | // Perform axpy elements 55 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 56 | cudaDeviceSynchronize(); 57 | 58 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 59 | cudaFree(d_x); 60 | cudaFree(d_y); 61 | return time; 62 | } 63 | 64 | double axpy_cuda_async(REAL* x, REAL* y, int n, REAL a) { 65 | cudaStream_t stream1; 66 | cudaError_t result; 67 | result = cudaStreamCreate(&stream1); 68 | 69 | REAL *d_x, *d_y; 70 | cudaMalloc(&d_x, n*sizeof(REAL)); 71 | cudaMalloc(&d_y, n*sizeof(REAL)); 72 | double time2 = read_timer_ms(); 73 | 74 | cudaMemcpyAsync(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice, stream1); 75 | cudaMemcpyAsync(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice, stream1); 76 | time2 = read_timer_ms() - time2; 77 | 78 | 79 | //cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 80 | //cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 81 | // Perform axpy elements 82 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 83 | cudaDeviceSynchronize(); 84 | 85 | 86 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 87 | cudaFree(d_x); 88 | cudaFree(d_y); 89 | return time2; 90 | } 91 | 92 | 93 | 94 | /* initialize a vector with random floating point numbers */ 95 | void init(REAL *A, int n) 96 | { 97 | int i; 98 | for (i = 0; i < n; i++) { 99 | A[i] = (double)drand48(); 100 | } 101 | } 102 | 103 | /*serial version */ 104 | void axpy(REAL* x, REAL* y, long n, REAL a) { 105 | int i; 106 | for (i = 1; i < n; ++i) 107 | { 108 | y[i] += a * x[i]; 109 | } 110 | } 111 | 112 | /* compare two arrays and return percentage of difference */ 113 | REAL check(REAL*A, REAL*B, int n) 114 | { 115 | int i; 116 | REAL diffsum =0.0, sum = 0.0; 117 | for (i = 0; i < n; i++) { 118 | diffsum += fabs(A[i] - B[i]); 119 | sum += fabs(B[i]); 120 | } 121 | return diffsum/sum; 122 | } 123 | 124 | int main(int argc, char *argv[]) 125 | { 126 | int n; 127 | REAL *y_cuda, *y, *x, *y_cuda_async; 128 | REAL a = 123.456; 129 | 130 | n = VEC_LEN; 131 | fprintf(stderr, "Usage: axpy \n"); 132 | if (argc >= 
2) { 133 | n = atoi(argv[1]); 134 | } 135 | y_cuda = (REAL *) malloc(n * sizeof(REAL)); 136 | y_cuda_async = (REAL *) malloc(n * sizeof(REAL)); 137 | y = (REAL *) malloc(n * sizeof(REAL)); 138 | x = (REAL *) malloc(n * sizeof(REAL)); 139 | 140 | srand48(1<<12); 141 | init(x, n); 142 | init(y_cuda, n); 143 | memcpy(y, y_cuda, n*sizeof(REAL)); 144 | memcpy(y_cuda_async, y_cuda, n*sizeof(REAL)); 145 | 146 | int i; 147 | int num_runs = 10; 148 | for (i=0; i 5 | axpy(1024000): checksum: 0.99919, time: 13.30ms 6 | axpy_async(1024000): checksum: 1.19903, time: 13.30ms 7 | Usage: axpy 8 | axpy(4096000): checksum: 0.99919, time: 94.20ms 9 | axpy_async(4096000): checksum: 1.19903, time: 98.20ms 10 | Usage: axpy 11 | axpy(10240000): checksum: 0.999191, time: 246.70ms 12 | axpy_async(10240000): checksum: 1.19903, time: 243.60ms 13 | Usage: axpy 14 | axpy(20480000): checksum: 0.999191, time: 518.00ms 15 | axpy_async(20480000): checksum: 1.19903, time: 500.00ms 16 | Usage: axpy 17 | axpy(40960000): checksum: 0.999191, time: 1021.20ms 18 | axpy_async(40960000): checksum: 1.19903, time: 989.60ms 19 | Usage: axpy 20 | axpy(102400000): checksum: 0.999191, time: 2395.00ms 21 | axpy_async(102400000): checksum: 1.19903, time: 2370.10ms 22 | -------------------------------------------------------------------------------- /HDOverlap/test.sh: -------------------------------------------------------------------------------- 1 | ./axpy_cuda 1024000 2 | ./axpy_cuda 4096000 3 | ./axpy_cuda 10240000 4 | ./axpy_cuda 20480000 5 | ./axpy_cuda 40960000 6 | ./axpy_cuda 102400000 -------------------------------------------------------------------------------- /LICENSE_BSD.txt: -------------------------------------------------------------------------------- 1 | Copyright (c) 2020 - 2021 HPCAS Lab (https://passlab.github.io) 2 | from University of North Carolina at Charlotte, and 3 | Lawrence Livermore National Laboratory, LLC. 4 | 5 | LLNL-CODE-825202 6 | 7 | All rights reserved. 8 | 9 | Funding for this research and development was provided by the National Science Foundation 10 | under award number CISE SHF-1551182 and CISE SHF-2015254. 11 | The development is also funded by LLNL under Contract DE-AC52-07NA27344 and LLNL-LDRD Program 12 | under project 18-ERD-006. 13 | 14 | Redistribution and use in source and binary forms, with or without modification, 15 | are permitted provided that the following conditions are met: 16 | 17 | 1. Redistributions of source code must retain the above copyright notice, 18 | this list of conditions and the following disclaimer. 19 | 2. Redistributions in binary form must reproduce the above copyright notice, 20 | this list of conditions and the following disclaimer in the documentation 21 | and/or other materials provided with the distribution. 22 | 3. Neither the name of the copyright holder nor the names of its contributors may 23 | be used to endorse or promote products derived from this software without specific prior written permission. 24 | 25 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR 26 | IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND 27 | FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR 28 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 30 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 31 | WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 32 | ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 33 | -------------------------------------------------------------------------------- /MemAlign/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -g -G -arch=sm_30 -o axpy_cuda axpy_cuda.c axpy_cudakernel.cu 3 | -------------------------------------------------------------------------------- /MemAlign/axpy.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL double 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern void axpy_cuda(REAL *x, REAL * y, int n, REAL a); 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /MemAlign/axpy_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental tests for aligned memory access and unaligned memory access 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <math.h> 11 | #include <sys/timeb.h> 12 | #include "axpy.h" 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define REAL double 22 | #define VEC_LEN 1024000 //use a fixed number for now 23 | /* zero out the entire vector */ 24 | void zero(REAL *A, int n) 25 | { 26 | int i; 27 | for (i = 0; i < n; i++) { 28 | A[i] = 0.0; 29 | } 30 | } 31 | 32 | /* initialize a vector with random floating point numbers */ 33 | void init(REAL *A, int n) 34 | { 35 | int i; 36 | for (i = 0; i < n; i++) { 37 | A[i] = (double)drand48(); 38 | } 39 | } 40 | 41 | /*serial version */ 42 | void axpy(REAL* x, REAL* y, long n, REAL a) { 43 | int i; 44 | for (i = 1; i < n; ++i) 45 | { 46 | y[i] += a * x[i]; 47 | } 48 | } 49 | 50 | /* compare two arrays and return percentage of difference */ 51 | REAL check(REAL*A, REAL*B, int n) 52 | { 53 | int i; 54 | REAL diffsum =0.0, sum = 0.0; 55 | for (i = 0; i < n; i++) { 56 | diffsum += fabs(A[i] - B[i]); 57 | sum += fabs(B[i]); 58 | } 59 | return diffsum/sum; 60 | } 61 | 62 | int main(int argc, char *argv[]) 63 | { 64 | int n; 65 | REAL *y_cuda, *y, *x; 66 | REAL a = 123.456; 67 | 68 | n = VEC_LEN; 69 | fprintf(stderr, "Usage: axpy \n"); 70 | if (argc >= 2) { 71 | n = atoi(argv[1]); 72 | } 73 | y_cuda = (REAL *) malloc(n * sizeof(REAL)); 74 | y = (REAL *) malloc(n * sizeof(REAL)); 75 | x = (REAL *) malloc(n * sizeof(REAL)); 76 | 77 | srand48(1<<12); 78 | init(x, n); 79 | init(y_cuda, n); 80 | memcpy(y, y_cuda, n*sizeof(REAL)); 81 | 82 | int i; 83 | int num_runs = 10; 84 | for (i=0; i<num_runs; i++) axpy_cuda(x, y_cuda, n, a); -------------------------------------------------------------------------------- /MemAlign/axpy_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #include "axpy.h" 7 | 8 | __global__ 9 | void 10 | axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a) 11 | { 12 | int i = blockDim.x * blockIdx.x + threadIdx.x; 13 | if (i > 0 && i < n) y[i] += a*x[i]; 14 | } 15 | 16 | __global__ 17 | void 18 | axpy_cudakernel_1perThread_misaligned(REAL* x, REAL* y, int n, REAL a) 19 | { 20 | int i = blockDim.x * blockIdx.x + threadIdx.x + 1; 21 | if (i < n) y[i] += a*x[i]; 22 | } 23 | 24 | __global__ 25 | void 26 | axpy_cudakernel_1perThread_warmup(REAL* x, REAL* y, int n, REAL a) 27 | { 28 | int i = blockDim.x * blockIdx.x + threadIdx.x; 29 | if (i > 1 && i < n) y[i] += a*x[i]; 30 | } 31 | 32 | 33 | void axpy_cuda(REAL* x, REAL* y, int n, REAL a) { 34 | REAL *d_x, *d_y; 35 | cudaMalloc(&d_x, n*sizeof(REAL)); 36 | cudaMalloc(&d_y, n*sizeof(REAL)); 37 | 38 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 39 | cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 40 | 41 | //warm up 42 | axpy_cudakernel_1perThread_warmup<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 43 | cudaDeviceSynchronize(); 44 | // Perform axpy elements 45 | axpy_cudakernel_1perThread_misaligned<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 46 | cudaDeviceSynchronize(); 47 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 48 | cudaDeviceSynchronize(); 49 | 50 | 51 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 52 | cudaFree(d_x); 53 | cudaFree(d_y); 54 | } 55 | 56 | -------------------------------------------------------------------------------- /MemAlign/test.sh: -------------------------------------------------------------------------------- 1 | nvprof ./axpy_cuda 1024000 2 | nvprof ./axpy_cuda 4096000 3 | nvprof ./axpy_cuda 10240000 4 | nvprof ./axpy_cuda 20480000 5 |
nvprof ./axpy_cuda 40960000 6 | nvprof ./axpy_cuda 102400000 -------------------------------------------------------------------------------- /MiniTransfer_SpMV/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o SpMV_cuda SpMV_cuda.c SpMV_cudakernel.cu 3 | -------------------------------------------------------------------------------- /MiniTransfer_SpMV/SpMV.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern double spmv_cuda_csr_discrete(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y_normal); 12 | extern double spmv_cuda_dense_discrete(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y_normal); 13 | extern double warmingup(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y_normal); 14 | extern void init_csr(int *ptr, REAL *data, int *indices, REAL *matrix, int num_rows, int nnz); 15 | extern double warmingup_dense(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 16 | extern double warmingup_csr(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 17 | extern void init_index(int * row, int * column, REAL *matrix, int num_rows); 18 | extern double spmv_cuda_unified(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 19 | extern double spmv_cuda_unified_count(const int num_rows, const REAL * x, int nnz, REAL* matrix, REAL *y); 20 | extern void init_index_count(int * row_nnz, int * row, int * column, REAL *matrix, int num_rows); 21 | //extern void init_ptr(int *ptr, REAL * matrix, int num_rows, int nnz); 22 | extern double read_timer_ms(); 23 | #ifdef __cplusplus 24 | } 25 | #endif 26 | -------------------------------------------------------------------------------- /MiniTransfer_SpMV/SpMV_cuda.output.carina.txt: -------------------------------------------------------------------------------- 1 | xyi2@cci-carina:~/CUDAMemBench/SpMV_new$ sh test.sh 2 | Usage: SpMV 3 | Spmv (dense) (524288): time: 2.30ms 4 | Spmv (csr) (524288): time: 14.88ms 5 | Spmv (unified) (524288): time: 68.68ms 6 | Spmv (unified_count) (524288): time: 68.44ms 7 | Usage: SpMV 8 | Spmv (dense) (524288): time: 1.60ms 9 | Spmv (csr) (524288): time: 12.98ms 10 | Spmv (unified) (524288): time: 68.92ms 11 | Spmv (unified_count) (524288): time: 69.24ms 12 | Usage: SpMV 13 | Spmv (dense) (262144): time: 1.62ms 14 | Spmv (csr) (262144): time: 8.38ms 15 | Spmv (unified) (262144): time: 37.80ms 16 | Spmv (unified_count) (262144): time: 37.92ms 17 | Usage: SpMV 18 | Spmv (dense) (131072): time: 1.62ms 19 | Spmv (csr) (131072): time: 5.28ms 20 | Spmv (unified) (131072): time: 20.70ms 21 | Spmv (unified_count) (131072): time: 20.80ms 22 | Usage: SpMV 23 | Spmv (dense) (65535): time: 1.60ms 24 | Spmv (csr) (65535): time: 4.04ms 25 | Spmv (unified) (65535): time: 12.58ms 26 | Spmv (unified_count) (65535): time: 12.58ms 27 | Usage: SpMV 28 | Spmv (dense) (32768): time: 1.58ms 29 | Spmv (csr) (32768): time: 3.24ms 30 | Spmv (unified) (32768): 
time: 8.32ms 31 | Spmv (unified_count) (32768): time: 8.30ms 32 | Usage: SpMV 33 | Spmv (dense) (16384): time: 1.58ms 34 | Spmv (csr) (16384): time: 3.08ms 35 | Spmv (unified) (16384): time: 6.40ms 36 | Spmv (unified_count) (16384): time: 6.32ms 37 | Usage: SpMV 38 | Spmv (dense) (8192): time: 1.58ms 39 | Spmv (csr) (8192): time: 2.80ms 40 | Spmv (unified) (8192): time: 5.82ms 41 | Spmv (unified_count) (8192): time: 5.68ms 42 | Usage: SpMV 43 | Spmv (dense) (4096): time: 1.58ms 44 | Spmv (csr) (4096): time: 2.74ms 45 | Spmv (unified) (4096): time: 5.44ms 46 | Spmv (unified_count) (4096): time: 5.40ms 47 | Usage: SpMV 48 | Spmv (dense) (2048): time: 1.60ms 49 | Spmv (csr) (2048): time: 2.70ms 50 | Spmv (unified) (2048): time: 5.64ms 51 | Spmv (unified_count) (2048): time: 5.42ms 52 | Usage: SpMV 53 | Spmv (dense) (1024): time: 1.62ms 54 | Spmv (csr) (1024): time: 2.68ms 55 | Spmv (unified) (1024): time: 5.22ms 56 | Spmv (unified_count) (1024): time: 5.22ms 57 | Usage: SpMV 58 | Spmv (dense) (512): time: 1.62ms 59 | Spmv (csr) (512): time: 2.66ms 60 | Spmv (unified) (512): time: 5.20ms 61 | Spmv (unified_count) (512): time: 5.14ms 62 | Usage: SpMV 63 | Spmv (dense) (256): time: 1.60ms 64 | Spmv (csr) (256): time: 2.64ms 65 | Spmv (unified) (256): time: 5.14ms 66 | Spmv (unified_count) (256): time: 5.04ms 67 | Usage: SpMV 68 | Spmv (dense) (128): time: 1.60ms 69 | Spmv (csr) (128): time: 2.64ms 70 | Spmv (unified) (128): time: 5.18ms 71 | Spmv (unified_count) (128): time: 5.22ms 72 | Usage: SpMV 73 | Spmv (dense) (64): time: 1.64ms 74 | Spmv (csr) (64): time: 2.64ms 75 | Spmv (unified) (64): time: 4.98ms 76 | Spmv (unified_count) (64): time: 4.94ms 77 | Usage: SpMV 78 | Spmv (dense) (32): time: 1.62ms 79 | Spmv (csr) (32): time: 2.66ms 80 | Spmv (unified) (32): time: 4.84ms 81 | Spmv (unified_count) (32): time: 4.86ms 82 | Usage: SpMV 83 | Spmv (dense) (16): time: 1.64ms 84 | Spmv (csr) (16): time: 2.66ms 85 | Spmv (unified) (16): time: 4.54ms 86 | Spmv (unified_count) (16): time: 4.58ms 87 | Usage: SpMV 88 | Spmv (dense) (8): time: 1.64ms 89 | Spmv (csr) (8): time: 2.66ms 90 | Spmv (unified) (8): time: 4.40ms 91 | Spmv (unified_count) (8): time: 4.38ms 92 | -------------------------------------------------------------------------------- /MiniTransfer_SpMV/test.sh: -------------------------------------------------------------------------------- 1 | ./SpMV_cuda 67108864 10240 2 | ./SpMV_cuda 33554432 10240 3 | ./SpMV_cuda 16777216 10240 4 | ./SpMV_cuda 8388608 10240 5 | ./SpMV_cuda 4194304 10240 6 | ./SpMV_cuda 2097152 10240 7 | ./SpMV_cuda 1048576 10240 8 | ./SpMV_cuda 524288 10240 9 | ./SpMV_cuda 262144 10240 10 | ./SpMV_cuda 131072 10240 11 | ./SpMV_cuda 65536 10240 12 | ./SpMV_cuda 32768 10240 13 | ./SpMV_cuda 16384 10240 14 | ./SpMV_cuda 8192 10240 15 | ./SpMV_cuda 4096 10240 16 | ./SpMV_cuda 2048 10240 17 | ./SpMV_cuda 1024 10240 18 | ./SpMV_cuda 512 10240 19 | ./SpMV_cuda 256 10240 20 | ./SpMV_cuda 128 10240 21 | ./SpMV_cuda 64 10240 22 | ./SpMV_cuda 32 10240 23 | ./SpMV_cuda 16 10240 24 | ./SpMV_cuda 8 10240 25 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | This work was produced under the auspices of the U.S. Department of 2 | Energy by Lawrence Livermore National Laboratory under Contract 3 | DE-AC52-07NA27344. 4 | 5 | This work was prepared as an account of work sponsored by an agency of 6 | the United States Government. 
Neither the United States Government nor 7 | Lawrence Livermore National Security, LLC, nor any of their employees 8 | makes any warranty, expressed or implied, or assumes any legal liability 9 | or responsibility for the accuracy, completeness, or usefulness of any 10 | information, apparatus, product, or process disclosed, or represents that 11 | its use would not infringe privately owned rights. 12 | 13 | Reference herein to any specific commercial product, process, or service 14 | by trade name, trademark, manufacturer, or otherwise does not necessarily 15 | constitute or imply its endorsement, recommendation, or favoring by the 16 | United States Government or Lawrence Livermore National Security, LLC. 17 | 18 | The views and opinions of authors expressed herein do not necessarily 19 | state or reflect those of the United States Government or Lawrence 20 | Livermore National Security, LLC, and shall not be used for advertising 21 | or product endorsement purposes. 22 | -------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o axpy_cuda axpy_cuda.c axpy_cudakernel.cu 3 | -------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/axpy.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern void axpy_cuda(REAL *x, REAL * y, int n, REAL a); 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/axpy_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test for texture memory using 1-D array 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <math.h> 11 | #include <sys/timeb.h> 12 | #include "axpy.h" 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define VEC_LEN 1024000//use a fixed number for now 22 | /* zero out the entire vector */ 23 | void zero(REAL *A, int n) 24 | { 25 | int i; 26 | for (i = 0; i < n; i++) { 27 | A[i] = 0.0; 28 | } 29 | } 30 | 31 | /* initialize a vector with random floating point numbers */ 32 | void init(REAL *A, int n) 33 | { 34 | int i; 35 | for (i = 0; i < n; i++) { 36 | A[i] = (float)drand48(); 37 | } 38 | } 39 | 40 | /*serial version */ 41 | void axpy(REAL* x, REAL* y, long n, REAL a) { 42 | int i; 43 | for (i = 0; i < n; ++i) 44 | { 45 | y[i] += a * x[i]; 46 | } 47 | } 48 | 49 | /* compare two arrays and return the total absolute difference */ 50 | REAL check(REAL*A, REAL*B, int n) 51 | { 52 | int i; 53 | REAL diffsum =0.0, sum = 0.0; 54 | for (i = 0; i < n; i++) { 55 | diffsum += fabs(A[i] - B[i]); 56 | sum += fabs(B[i]); 57 | } 58 | return diffsum; 59 | } 60 | 61 | int main(int argc, char *argv[]) 62 | { 63 | int n; 64 | REAL *y_cuda, *y, *x; 65 | REAL a = 123.456; 66 | 67 | n = VEC_LEN; 68 | fprintf(stderr, "Usage: axpy <n>\n"); 69 | if (argc >= 2) { 70 | n = atoi(argv[1]); 71 | } 72 | y_cuda = (REAL *) malloc(n * sizeof(REAL)); 73 | y = (REAL *) malloc(n * sizeof(REAL)); 74 | x = (REAL *) malloc(n * sizeof(REAL)); 75 | 76 | srand48(1<<12); 77 | init(x, n); 78 | init(y_cuda, n); 79 | memcpy(y, y_cuda, n*sizeof(REAL)); 80 | 81 | axpy(x, y, n, a); 82 | 83 | int i; 84 | int num_runs = 10; 85 | /* cuda version */ 86 | double elapsed = read_timer_ms(); 87 | for (i=0; i<num_runs; i++) axpy_cuda(x, y_cuda, n, a); 88 | elapsed = (read_timer_ms() - elapsed)/num_runs; 89 | 90 | /* the tail of main() was lost in extraction; minimally reconstructed below: 91 | verify against the serial result, report the average time, then clean up */ 92 | printf("axpy(%d): check (total abs diff): %g, avg time: %g ms\n", n, check(y_cuda, y, n), elapsed); 93 | 94 | free(y_cuda); 95 | free(y); 96 | free(x); 97 | return 0; 98 | }
-------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/axpy_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #include "axpy.h" 7 | 8 | 9 | #include <stdio.h> 10 | 11 | // 1-D float texture reference; bound to the read-only input vector x in axpy_cuda() 12 | texture<float, 1, cudaReadModeElementType> rT1; 13 | 14 | __global__ 15 | void 16 | axpy_cudakernel_warmingup(REAL* x, REAL* y, int n, REAL a) 17 | { 18 | int i = blockDim.x * blockIdx.x + threadIdx.x; 19 | if (i < n) y[i] += a*x[i]; 20 | } 21 | 22 | 23 | __global__ 24 | void 25 | axpy_cudakernel_1perThread_texture(REAL* y, int n, REAL a) 26 | { 27 | int i = blockDim.x * blockIdx.x + threadIdx.x; 28 | if (i < n) y[i] += a * tex1Dfetch(rT1, i); 29 | } 30 | 31 | __global__ 32 | void 33 | axpy_cudakernel_1perThread(REAL* x, REAL* y, int n, REAL a) 34 | { 35 | int i = blockDim.x * blockIdx.x + threadIdx.x; 36 | if (i < n) y[i] += a*x[i]; 37 | } 38 | 39 | void axpy_cuda(REAL* x, REAL* y, int n, REAL a) { 40 | REAL *d_x, *d_y; 41 | cudaMalloc(&d_x, n*sizeof(REAL)); 42 | cudaMalloc(&d_y, n*sizeof(REAL)); 43 | 44 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 45 | cudaMemcpy(d_y, y, n*sizeof(REAL), cudaMemcpyHostToDevice); 46 | 47 | cudaBindTexture(0, rT1, d_x); 48 | 49 | // Perform axpy elements 50 | axpy_cudakernel_warmingup<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 51 | cudaDeviceSynchronize(); 52 | axpy_cudakernel_1perThread_texture<<<(n+255)/256, 256>>>(d_y, n, a); 53 | cudaDeviceSynchronize(); 54 | axpy_cudakernel_1perThread<<<(n+255)/256, 256>>>(d_x, d_y, n, a); 55 | cudaDeviceSynchronize(); 56 | 57 | cudaMemcpy(y, d_y, n*sizeof(REAL), cudaMemcpyDeviceToHost); 58 | cudaUnbindTexture(rT1); 59 | 60 | cudaFree(d_x); 61 | cudaFree(d_y); 62 | } 63 | 
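The kernel file above uses the legacy texture-reference API (`texture<...>`, `cudaBindTexture`, `tex1Dfetch(ref, i)`), which is deprecated and was removed in CUDA 12; on current toolkits the same read-only-fetch experiment has to go through texture objects. A minimal sketch follows; the kernel and helper names here are illustrative, not part of the benchmark:
```
// Sketch: 1-D read-only fetch via the texture-object API (CUDA 5.0+).
// Names (axpy_1perThread_texobj, axpy_texobj) are illustrative only.
#include <cuda_runtime.h>

__global__ void axpy_1perThread_texobj(cudaTextureObject_t xTex, float* y, int n, float a)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i < n) y[i] += a * tex1Dfetch<float>(xTex, i);   // read x through the texture path
}

void axpy_texobj(const float* x, float* y, int n, float a)
{
    float *d_x, *d_y;
    cudaMalloc(&d_x, n * sizeof(float));
    cudaMalloc(&d_y, n * sizeof(float));
    cudaMemcpy(d_x, x, n * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_y, y, n * sizeof(float), cudaMemcpyHostToDevice);

    // Describe the linear device buffer backing the texture.
    cudaResourceDesc resDesc = {};
    resDesc.resType = cudaResourceTypeLinear;
    resDesc.res.linear.devPtr = d_x;
    resDesc.res.linear.desc = cudaCreateChannelDesc<float>();
    resDesc.res.linear.sizeInBytes = n * sizeof(float);

    cudaTextureDesc texDesc = {};
    texDesc.readMode = cudaReadModeElementType;

    cudaTextureObject_t xTex = 0;
    cudaCreateTextureObject(&xTex, &resDesc, &texDesc, NULL);

    axpy_1perThread_texobj<<<(n + 255) / 256, 256>>>(xTex, d_y, n, a);
    cudaDeviceSynchronize();

    cudaMemcpy(y, d_y, n * sizeof(float), cudaMemcpyDeviceToHost);
    cudaDestroyTextureObject(xTex);
    cudaFree(d_x);
    cudaFree(d_y);
}
```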
-------------------------------------------------------------------------------- /ReadOnlyMem_1D_Texture/test.sh: -------------------------------------------------------------------------------- 1 | nvprof ./axpy_cuda 1024000 2 | nvprof ./axpy_cuda 4096000 3 | nvprof ./axpy_cuda 10240000 4 | nvprof ./axpy_cuda 20480000 5 | nvprof ./axpy_cuda 102400000 -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/Makefile: -------------------------------------------------------------------------------- 1 | default: 2 | nvcc -o matadd_2D_cuda matadd_2D_cuda.c matadd_2D_cudakernel.cu 3 | -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/matadd_2D.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | 8 | 9 | #ifdef __cplusplus 10 | extern "C" { 11 | #endif 12 | extern void matadd(float * h_flMat1, float * h_flMat2, int iMatSizeM, int iMatSizeN, float * h_flMatSum); 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/matadd_2D_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test for texture memory using 2-D array 7 | #include <stdio.h> 8 | #include <stdlib.h> 9 | #include <string.h> 10 | #include <math.h> 11 | #include <sys/timeb.h> 12 | #include "matadd_2D.h" 13 | #include /* the names of these four headers were lost in extraction */ 14 | #include 15 | #include 16 | #include 17 | 18 | double read_timer_ms() { 19 | struct timeb tm; 20 | ftime(&tm); 21 | return (double) tm.time * 1000.0 + (double) tm.millitm; 22 | } 23 | 24 | #define VEC_LEN 1024//use a fixed number for now 25 | 26 | 27 | /* zero out the entire vector */ 28 | void zero(REAL *A, int n) 29 | { 30 | int i; 31 | for (i = 0; i < n; i++) { 32 | A[i] = 0.0; 33 | } 34 | } 35 | 36 | /* initialize a matrix with random floating point numbers */ 37 | void init_matrix(REAL *matrix, int m, int n) { 38 | for (int i = 0; i < m; i++) { 39 | for (int j = 0; j < n; j++) { 40 | matrix[i*n + j] = (REAL) drand48(); 41 | } 42 | } 43 | } 44 | 45 | /* NOTE: the bodies of the serial reference and the check routine, and the head of main(), 46 | were lost in extraction; the lines below are a minimal reconstruction consistent 47 | with how the surviving code calls them */ 48 | void mat_add_serial(REAL *A, REAL *B, int m, int n, REAL *C) { 49 | for (int i = 0; i < m; i++) { 50 | for (int j = 0; j < n; j++) { 51 | C[i*n + j] = A[i*n + j] + B[i*n + j]; 52 | } 53 | } 54 | } 55 | 56 | /* compare two arrays and return the relative difference */ 57 | REAL check(REAL *A, REAL *B, int n) 58 | { 59 | REAL diffsum = 0.0, sum = 0.0; 60 | for (int i = 0; i < n; i++) { 61 | diffsum += fabs(A[i] - B[i]); 62 | sum += fabs(B[i]); 63 | } 64 | return diffsum/sum; 65 | } 66 | 67 | int main(int argc, char *argv[]) 68 | { 69 | int N = VEC_LEN; 70 | fprintf(stderr, "Usage: matadd_2D <N>\n"); 71 | if (argc >= 2) { 72 | N = atoi(argv[1]); 73 | } 74 | 75 | int M=N; 76 | 77 | REAL *h_matrixA = (REAL*)malloc(M * N * sizeof(REAL)); 78 | REAL *h_matrixB = (REAL*)malloc(M * N * sizeof(REAL)); 79 | REAL *h_result = (REAL*)malloc(M * N * sizeof(REAL)); 80 | REAL *result_serial = (REAL*)malloc(M * N * sizeof(REAL)); 81 | 82 | init_matrix(h_matrixA, M, N); 83 | init_matrix(h_matrixB, M, N); 84 | 85 | int i; 86 | int num_runs = 5; 87 | mat_add_serial(h_matrixA, h_matrixB, M, N, result_serial); 88 | for (i=0; i<num_runs; i++) matadd(h_matrixA, h_matrixB, M, N, h_result); 89 | 90 | /* tail of main() reconstructed: verify against the serial result, then clean up */ 91 | printf("matadd_2D(%dx%d): check (relative diff): %g\n", M, N, check(h_result, result_serial, M * N)); 92 | 93 | free(h_matrixA); 94 | free(h_matrixB); 95 | free(h_result); 96 | free(result_serial); 97 | return 0; 98 | }
-------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/matadd_2D_cudakernel.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #include "matadd_2D.h" 7 | 8 | 9 | #include <stdio.h> 10 | 11 | #define BLOCK_SIZE 16 12 | 13 | texture<float, 2, cudaReadModeElementType> texMatrixA; 14 | texture<float, 2, cudaReadModeElementType> texMatrixB; 15 | 16 | //constant memory 17 | __constant__ int cons_M; 18 | __constant__ int cons_N; 19 | 20 | __global__ void add_warmingup(float * d_matrixA, float * d_matrixB, float *d_Result, int d_M, int d_N) 21 | { 22 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 23 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 24 | if(tidx < d_N && tidy < d_M) 25 | d_Result[tidy * d_N + tidx] = d_matrixA[tidy * d_N + tidx] + d_matrixB[tidy * d_N + tidx]; 26 | } 27 | 28 | /* NOTE: the kernel bodies from here through the head of matadd() were lost in extraction; 29 | the four kernels below are minimal reconstructions matching the launches in matadd() */ 30 | __global__ void add(float * d_matrixA, float * d_matrixB, float *d_Result, int d_M, int d_N) 31 | { 32 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 33 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 34 | if(tidx < d_N && tidy < d_M) 35 | d_Result[tidy * d_N + tidx] = d_matrixA[tidy * d_N + tidx] + d_matrixB[tidy * d_N + tidx]; 36 | } 37 | 38 | __global__ void add_const(float * d_matrixA, float * d_matrixB, float *d_Result) 39 | { 40 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 41 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 42 | if(tidx < cons_N && tidy < cons_M) 43 | d_Result[tidy * cons_N + tidx] = d_matrixA[tidy * cons_N + tidx] + d_matrixB[tidy * cons_N + tidx]; 44 | } 45 | 46 | __global__ void add_texture(float *d_Result, int d_M, int d_N) 47 | { 48 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 49 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 50 | if(tidx < d_N && tidy < d_M) 51 | d_Result[tidy * d_N + tidx] = tex2D(texMatrixA, tidx, tidy) + tex2D(texMatrixB, tidx, tidy); 52 | } 53 | 54 | __global__ void add_texture_constant(float *d_Result) 55 | { 56 | const int tidx = blockDim.x * blockIdx.x + threadIdx.x; 57 | const int tidy = blockDim.y * blockIdx.y + threadIdx.y; 58 | if(tidx < cons_N && tidy < cons_M) 59 | d_Result[tidy * cons_N + tidx] = tex2D(texMatrixA, tidx, tidy) + tex2D(texMatrixB, tidx, tidy); 60 | } 61 | 62 | void matadd(float * h_matrixA, float * h_matrixB, int M, int N, float * h_result) 63 | { 64 | cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>(); 65 | float *d_matrixA = NULL, *d_matrixB = NULL, *d_result = NULL; 66 | cudaMalloc(&d_matrixA, M * N * sizeof(float)); 67 | cudaMalloc(&d_matrixB, M * N * sizeof(float)); 68 | cudaMalloc(&d_result, M * N * sizeof(float)); 69 | 70 | cudaMemcpy(d_matrixA, h_matrixA, M * N * sizeof(float), cudaMemcpyHostToDevice); 71 | cudaMemcpy(d_matrixB, h_matrixB, M * N * sizeof(float), cudaMemcpyHostToDevice); 72 | cudaBindTexture2D(0, texMatrixA, d_matrixA, channelDesc, N, M, M * sizeof(float)); 73 | cudaBindTexture2D(0, texMatrixB, d_matrixB, channelDesc, N, M, M * sizeof(float)); 74 | 75 | cudaMemcpyToSymbol(cons_M,&M,sizeof(int),0); 76 | cudaMemcpyToSymbol(cons_N,&N,sizeof(int),0); 77 | 78 | dim3 blocks(1,1,1); 79 | dim3 threadsperblock(BLOCK_SIZE,BLOCK_SIZE,1); 80 | blocks.x=((M/BLOCK_SIZE) + (((M)%BLOCK_SIZE)==0?0:1)); 81 | blocks.y=((N/BLOCK_SIZE) + (((N)%BLOCK_SIZE)==0?0:1)); 82 | 83 | add_warmingup<<<blocks, threadsperblock>>>(d_matrixA,d_matrixB,d_result,M,N); 84 | cudaDeviceSynchronize(); 85 | add<<<blocks, threadsperblock>>>(d_matrixA,d_matrixB,d_result,M,N); 86 | cudaDeviceSynchronize(); 87 | add_const<<<blocks, threadsperblock>>>(d_matrixA,d_matrixB,d_result); 88 | cudaDeviceSynchronize(); 89 | add_texture<<<blocks, threadsperblock>>>(d_result,M,N); 90 | cudaDeviceSynchronize(); 91 | add_texture_constant<<<blocks, threadsperblock>>>(d_result); 92 | cudaDeviceSynchronize(); 93 | 94 | cudaMemcpy(h_result,d_result,M * N * sizeof(float), cudaMemcpyDeviceToHost); 95 | cudaUnbindTexture(texMatrixA); 96 | cudaUnbindTexture(texMatrixB); 97 | 98 | cudaFree(d_matrixA); 99 | cudaFree(d_matrixB); 100 | cudaFree(d_result); 101 | } 102 | -------------------------------------------------------------------------------- /ReadOnlyMem_2D_Texture/test.sh: 
-------------------------------------------------------------------------------- 1 | nvprof ./matadd_2D_cuda 1024 2 | nvprof ./matadd_2D_cuda 10240 3 | nvprof ./matadd_2D_cuda 20480 4 | nvprof ./matadd_2D_cuda 40960 -------------------------------------------------------------------------------- /Shmem/Makefile: -------------------------------------------------------------------------------- 1 | default: mm_omp_cuda 2 | 3 | clean: 4 | rm -rf ${OBJS} *.log *.out 5 | 6 | mm_omp_cuda: mm_omp_cuda.c mm_omp_cuda.h mm_kernel.cu 7 | nvcc mm_omp_cuda.c mm_kernel.cu -o mm_omp_cuda.out 8 | 9 | -------------------------------------------------------------------------------- /Shmem/mm_omp_cuda.c: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | /* 7 | * Square matrix multiplication 8 | * A[N][N] * B[N][N] = C[N][N] 9 | * 10 | */ 11 | #include 12 | #include 13 | #include 14 | #include 15 | #include 16 | #include "mm_omp_cuda.h" 17 | 18 | #define ALLOWED_DIFF 0.0001 19 | 20 | /* read timer in second */ 21 | double read_timer() { 22 | struct timeb tm; 23 | ftime(&tm); 24 | return (double) tm.time + (double) tm.millitm / 1000.0; 25 | } 26 | 27 | /* read timer in ms */ 28 | double read_timer_ms() { 29 | struct timeb tm; 30 | ftime(&tm); 31 | return (double) tm.time * 1000.0 + (double) tm.millitm; 32 | } 33 | 34 | #define REAL double 35 | 36 | void init(int N, REAL *A) { 37 | int i, j; 38 | 39 | for (i = 0; i < N; i++) { 40 | for (j = 0; j < N; j++) { 41 | A[i*N+j] = (REAL) drand48(); 42 | } 43 | } 44 | } 45 | 46 | 47 | void matmul_serial(int N, REAL *A, REAL *B, REAL *C) { 48 | int i,j,k; 49 | REAL temp; 50 | for (i = 0; i < N; i++) { 51 | for (j = 0; j < N; j++) { 52 | temp = 0; 53 | for (k = 0; k < N; k++) { 54 | temp += (A[i * N + k] * B[k * N + j]); 55 | } 56 | C[i * N + j] = temp; 57 | } 58 | } 59 | } 60 | 61 | int main(int argc, char *argv[]) { 62 | int N; 63 | 64 | int num_threads = 4; /* 4 is default number of threads */ 65 | if (argc < 2) { 66 | fprintf(stderr, "Usage: mm (default %d) [] (default %d)\n", N, num_threads); 67 | exit(1); 68 | } 69 | N = atoi(argv[1]); 70 | 71 | double elapsed_shmem; 72 | double elapsed_cuda; 73 | 74 | REAL *A = malloc(sizeof(REAL)*N*N); 75 | REAL *B = malloc(sizeof(REAL)*N*N); 76 | REAL *C_shmem = malloc(sizeof(REAL)*N*N); 77 | REAL *C = malloc(sizeof(REAL)*N*N); 78 | REAL *C_serial = malloc(sizeof(REAL)*N*N); 79 | 80 | srand48((1 << 12)); 81 | init(N, A); 82 | init(N, B); 83 | 84 | int i, j; 85 | int num_runs = 10; 86 | 87 | matmul_serial(N, A, B, C_serial); 88 | mm_kernel_shmem(A, B, C_shmem,N); 89 | 90 | elapsed_cuda = read_timer(); 91 | for (i=0; i ALLOWED_DIFF) { 106 | printf("C[%d][%d]: %g, C_omp[%d][%d]: %g\n", i, j, C[i * N + j], i, j, C_serial[i * N + j]); 107 | break; 108 | } 109 | } 110 | }; 111 | 112 | printf("======================================================================================================\n"); 113 | printf("\tMatrix Multiplication: A[N][N] * B[N][N] = C[N][N], N=%d\n", N); 114 | 
printf("------------------------------------------------------------------------------------------------------\n"); 115 | printf("Performance:\t\tRuntime (ms)\t MFLOPS\n"); 116 | printf("------------------------------------------------------------------------------------------------------\n"); 117 | printf("matmul_cuda:\t\t%4f\t%4f\n", elapsed_cuda * 1.0e3, ((((2.0 * N) * N) * N) / (1.0e6 * elapsed_cuda))); 118 | printf("------------------------------------------------------------------------------------------------------\n"); 119 | printf("matmul_shmem:\t\t%4f\t%4f\n", elapsed_shmem * 1.0e3, ((((2.0 * N) * N) * N) / (1.0e6 * elapsed_shmem))); 120 | 121 | return 0; 122 | } 123 | 124 | 125 | -------------------------------------------------------------------------------- /Shmem/mm_omp_cuda.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL double 7 | 8 | #ifdef __cplusplus 9 | extern "C" { 10 | #endif 11 | extern void mm_kernel(REAL*, REAL*, REAL*, int); 12 | extern void mm_kernel_shmem(REAL*, REAL*, REAL*, int); 13 | #ifdef __cplusplus 14 | } 15 | #endif 16 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/README.md: -------------------------------------------------------------------------------- 1 | # reduction - CUDA Parallel Reduction 2 | 3 | ## Description 4 | 5 | A parallel sum reduction that computes the sum of a large arrays of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction. 6 | 7 | ## Key Concepts 8 | 9 | Data-Parallel Algorithms, Performance Strategies 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows, MacOSX 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ## Prerequisites 26 | 27 | Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 28 | 29 | ## Build and Run 30 | 31 | ### Windows 32 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 33 | ``` 34 | *_vs.sln - for Visual Studio 35 | ``` 36 | Each individual sample has its own set of solution files in its directory: 37 | 38 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 
39 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 40 | 41 | ### Linux 42 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 43 | ``` 44 | $ cd <sample_dir> 45 | $ make 46 | ``` 47 | The samples' makefiles can take advantage of certain options: 48 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 49 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
50 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
51 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 52 | * **dbg=1** - build with debug symbols 53 | ``` 54 | $ make dbg=1 55 | ``` 56 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 57 | ``` 58 | $ make SMS="50 60" 59 | ``` 60 | 61 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 62 | ``` 63 | $ make HOST_COMPILER=g++ 64 | ``` 65 | 66 | ### Mac 67 | The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: 68 | ``` 69 | $ cd 70 | $ make 71 | ``` 72 | 73 | The samples makefiles can take advantage of certain options: 74 | 75 | * **dbg=1** - build with debug symbols 76 | ``` 77 | $ make dbg=1 78 | ``` 79 | 80 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". 81 | ``` 82 | $ make SMS="A B ..." 83 | ``` 84 | 85 | * **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 86 | ``` 87 | $ make HOST_COMPILER=clang 88 | ``` 89 | 90 | ## References (for more details) 91 | 92 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/reduction.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | 29 | #ifndef __REDUCTION_H__ 30 | #define __REDUCTION_H__ 31 | 32 | template 33 | void reduce(int size, int threads, int blocks, 34 | int whichKernel, T *d_idata, T *d_odata); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/result.txt: -------------------------------------------------------------------------------- 1 | ./reduction.out Starting... 2 | 3 | GPU Device 0: "Volta" with compute capability 7.0 4 | 5 | Using Device 0: Tesla V100-PCIE-32GB 6 | 7 | Reducing array of type int 8 | 9 | 16777216 elements 10 | 256 threads (max) 11 | 32768 blocks 12 | 13 | Reduction, Throughput = 67.8752 GB/s, Time = 0.00099 s, Size = 16777216 Elements, NumDevsUsed = 1, Workgroup = 256 14 | 15 | GPU result = 2139353471 16 | CPU result = 2139353471 17 | 18 | Test passed 19 | ./reduction.out Starting... 20 | 21 | GPU Device 0: "Volta" with compute capability 7.0 22 | 23 | Using Device 0: Tesla V100-PCIE-32GB 24 | 25 | Reducing array of type int 26 | 27 | 33554432 elements 28 | 256 threads (max) 29 | 65536 blocks 30 | 31 | Reduction, Throughput = 161.0948 GB/s, Time = 0.00097 s, Size = 33554432 Elements, NumDevsUsed = 1, Workgroup = 256 32 | 33 | GPU result = -16317892 34 | CPU result = -16317892 35 | 36 | Test passed 37 | ./reduction.out Starting... 38 | 39 | GPU Device 0: "Volta" with compute capability 7.0 40 | 41 | Using Device 0: Tesla V100-PCIE-32GB 42 | 43 | Reducing array of type int 44 | 45 | 67108864 elements 46 | 256 threads (max) 47 | 131072 blocks 48 | 49 | Reduction, Throughput = 292.4071 GB/s, Time = 0.00103 s, Size = 67108864 Elements, NumDevsUsed = 1, Workgroup = 256 50 | 51 | GPU result = -32918757 52 | CPU result = -32918757 53 | 54 | Test passed 55 | ./reduction.out Starting... 56 | 57 | GPU Device 0: "Volta" with compute capability 7.0 58 | 59 | Using Device 0: Tesla V100-PCIE-32GB 60 | 61 | Reducing array of type int 62 | 63 | 134217728 elements 64 | 256 threads (max) 65 | 262144 blocks 66 | 67 | Reduction, Throughput = 459.6851 GB/s, Time = 0.00121 s, Size = 134217728 Elements, NumDevsUsed = 1, Workgroup = 256 68 | 69 | GPU result = -66248749 70 | CPU result = -66248749 71 | 72 | Test passed 73 | -------------------------------------------------------------------------------- /Shuffle/cuda_global/test.sh: -------------------------------------------------------------------------------- 1 | ./reduction.out n=16777216 2 | ./reduction.out n=33554432 3 | ./reduction.out n=67108864 4 | ./reduction.out n=134217728 -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/README.md: -------------------------------------------------------------------------------- 1 | # reduction - CUDA Parallel Reduction 2 | 3 | ## Description 4 | 5 | A parallel sum reduction that computes the sum of a large arrays of values. This sample demonstrates several important optimization strategies for Data-Parallel Algorithms like reduction. 
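The difference from the cuda_global variant is where the last levels of the reduction tree happen: instead of bouncing partial sums through shared or global memory, each warp folds its 32 values register-to-register with `__shfl_down_sync`. A minimal sketch of that inner step, assuming an int reduction as in result.txt below (illustrative, not the sample's exact kernel):
```
// Warp-level sum via shuffles: lane 0 ends up holding the warp's total.
__inline__ __device__ int warpReduceSum(int val)
{
    // Fold the warp in half log2(32) = 5 times; 0xffffffff means all lanes participate.
    for (int offset = 16; offset > 0; offset >>= 1)
        val += __shfl_down_sync(0xffffffff, val, offset);
    return val;
}

__global__ void reduceWarpShuffle(const int* in, int* out, int n)
{
    int sum = 0;
    // Grid-stride loop so any grid size covers all n elements.
    for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x)
        sum += in[i];
    sum = warpReduceSum(sum);
    if ((threadIdx.x & 31) == 0)   // one atomic per warp
        atomicAdd(out, sum);
}
```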
6 | 7 | ## Key Concepts 8 | 9 | Data-Parallel Algorithms, Performance Strategies 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.0 ](https://developer.nvidia.com/cuda-gpus) [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows, MacOSX 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ## Prerequisites 26 | 27 | Download and install the [CUDA Toolkit 10.2](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 28 | 29 | ## Build and Run 30 | 31 | ### Windows 32 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 33 | ``` 34 | *_vs<version>.sln - for Visual Studio <version> 35 | ``` 36 | Each individual sample has its own set of solution files in its directory: 37 | 38 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 39 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 40 | 41 | ### Linux 42 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 43 | ``` 44 | $ cd <sample_dir> 45 | $ make 46 | ``` 47 | The samples' makefiles can take advantage of certain options: 48 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 49 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
50 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
51 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 52 | * **dbg=1** - build with debug symbols 53 | ``` 54 | $ make dbg=1 55 | ``` 56 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 57 | ``` 58 | $ make SMS="50 60" 59 | ``` 60 | 61 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 62 | ``` 63 | $ make HOST_COMPILER=g++ 64 | ``` 65 | 66 | ### Mac 67 | The Mac samples are built using makefiles. To use the makefiles, change directory into the sample directory you wish to build, and run make: 68 | ``` 69 | $ cd 70 | $ make 71 | ``` 72 | 73 | The samples makefiles can take advantage of certain options: 74 | 75 | * **dbg=1** - build with debug symbols 76 | ``` 77 | $ make dbg=1 78 | ``` 79 | 80 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where "A B ..." is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use SMS="50 60". 81 | ``` 82 | $ make SMS="A B ..." 83 | ``` 84 | 85 | * **HOST_COMPILER=** - override the default clang host compiler. See the [Mac Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#system-requirements) for a list of supported host compilers. 86 | ``` 87 | $ make HOST_COMPILER=clang 88 | ``` 89 | 90 | ## References (for more details) 91 | 92 | -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/reduction.h: -------------------------------------------------------------------------------- 1 | /* Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. 2 | * 3 | * Redistribution and use in source and binary forms, with or without 4 | * modification, are permitted provided that the following conditions 5 | * are met: 6 | * * Redistributions of source code must retain the above copyright 7 | * notice, this list of conditions and the following disclaimer. 8 | * * Redistributions in binary form must reproduce the above copyright 9 | * notice, this list of conditions and the following disclaimer in the 10 | * documentation and/or other materials provided with the distribution. 11 | * * Neither the name of NVIDIA CORPORATION nor the names of its 12 | * contributors may be used to endorse or promote products derived 13 | * from this software without specific prior written permission. 14 | * 15 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY 16 | * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 17 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 18 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR 19 | * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 20 | * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 21 | * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR 22 | * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY 23 | * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 | */ 27 | 28 | 29 | #ifndef __REDUCTION_H__ 30 | #define __REDUCTION_H__ 31 | 32 | template 33 | void reduce(int size, int threads, int blocks, 34 | int whichKernel, T *d_idata, T *d_odata); 35 | 36 | #endif 37 | -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/result.txt: -------------------------------------------------------------------------------- 1 | ./reduction.out Starting... 2 | 3 | GPU Device 0: "Volta" with compute capability 7.0 4 | 5 | Using Device 0: Tesla V100-PCIE-32GB 6 | 7 | Reducing array of type int 8 | 9 | 16777216 elements 10 | 256 threads (max) 11 | 32768 blocks 12 | 13 | Reduction, Throughput = 73.3799 GB/s, Time = 0.00091 s, Size = 16777216 Elements, NumDevsUsed = 1, Workgroup = 256 14 | 15 | GPU result = 2139353471 16 | CPU result = 2139353471 17 | 18 | Test passed 19 | ./reduction.out Starting... 20 | 21 | GPU Device 0: "Volta" with compute capability 7.0 22 | 23 | Using Device 0: Tesla V100-PCIE-32GB 24 | 25 | Reducing array of type int 26 | 27 | 33554432 elements 28 | 256 threads (max) 29 | 65536 blocks 30 | 31 | Reduction, Throughput = 161.9071 GB/s, Time = 0.00083 s, Size = 33554432 Elements, NumDevsUsed = 1, Workgroup = 256 32 | 33 | GPU result = -16317892 34 | CPU result = -16317892 35 | 36 | Test passed 37 | ./reduction.out Starting... 38 | 39 | GPU Device 0: "Volta" with compute capability 7.0 40 | 41 | Using Device 0: Tesla V100-PCIE-32GB 42 | 43 | Reducing array of type int 44 | 45 | 67108864 elements 46 | 256 threads (max) 47 | 131072 blocks 48 | 49 | Reduction, Throughput = 323.6463 GB/s, Time = 0.00083 s, Size = 67108864 Elements, NumDevsUsed = 1, Workgroup = 256 50 | 51 | GPU result = -32918757 52 | CPU result = -32918757 53 | 54 | Test passed 55 | ./reduction.out Starting... 
56 | 57 | GPU Device 0: "Volta" with compute capability 7.0 58 | 59 | Using Device 0: Tesla V100-PCIE-32GB 60 | 61 | Reducing array of type int 62 | 63 | 134217728 elements 64 | 256 threads (max) 65 | 262144 blocks 66 | 67 | Reduction, Throughput = 590.8185 GB/s, Time = 0.00091 s, Size = 134217728 Elements, NumDevsUsed = 1, Workgroup = 256 68 | 69 | GPU result = -66248749 70 | CPU result = -66248749 71 | 72 | Test passed 73 | -------------------------------------------------------------------------------- /Shuffle/cuda_shuffle/test.sh: -------------------------------------------------------------------------------- 1 | ./reduction.out n=16777216 2 | ./reduction.out n=33554432 3 | ./reduction.out n=67108864 4 | ./reduction.out n=134217728 -------------------------------------------------------------------------------- /TaskGraph/NsightEclipse.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | conjugateGradientCudaGraphs 5 | 6 | cudaStreamBeginCapture 7 | cudaStreamEndCapture 8 | cudaGraphCreate 9 | cudaGraphLaunch 10 | cudaGraphInstantiate 11 | cudaGraphExecDestroy 12 | cudaGraphDestroy 13 | 14 | 15 | whole 16 | 17 | ./ 18 | ../ 19 | ../../common/inc 20 | 21 | 22 | Linear Algebra 23 | CUBLAS Library 24 | CUSPARSE Library 25 | 26 | 27 | CUDA 28 | CUBLAS 29 | CUSPARSE 30 | Sparse Matrix 31 | 32 | 33 | cublas_static 34 | cublasLt_static 35 | cusparse_static 36 | culibos 37 | 38 | 39 | 40 | true 41 | conjugateGradientCudaGraphs.cu 42 | 43 | CUBLAS 44 | CUSPARSE 45 | 46 | 47 | 1:CUDA Advanced Topics 48 | 3:Linear Algebra 49 | 1:CUDA Graphs 50 | 51 | sm35 52 | sm37 53 | sm50 54 | sm52 55 | sm60 56 | sm61 57 | sm70 58 | sm72 59 | sm75 60 | sm80 61 | sm86 62 | 63 | 64 | x86_64 65 | linux 66 | 67 | 68 | windows7 69 | 70 | 71 | x86_64 72 | macosx 73 | 74 | 75 | arm 76 | 77 | 78 | ppc64le 79 | linux 80 | 81 | 82 | 83 | all 84 | 85 | Conjugate Gradient using Cuda Graphs 86 | exe 87 | 88 | -------------------------------------------------------------------------------- /TaskGraph/README.md: -------------------------------------------------------------------------------- 1 | # conjugateGradientCudaGraphs - Conjugate Gradient using Cuda Graphs 2 | 3 | ## Description 4 | 5 | This sample implements a conjugate gradient solver on GPU using CUBLAS and CUSPARSE library calls captured and called using CUDA Graph APIs. 
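In outline, the sample records one solver iteration's kernel and library calls into a stream under capture, instantiates the captured graph once, and then replays it each iteration, using the runtime APIs listed under "CUDA APIs involved" below. A minimal sketch of that capture-then-replay pattern; the placeholder kernel stands in for the CUBLAS/CUSPARSE calls of the real CG solver:
```
// Sketch: record work into a graph once, replay it many times.
// cg_iteration is a stand-in for one CG iteration, not the sample's code.
#include <cuda_runtime.h>

__global__ void cg_iteration(float* x, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) x[i] *= 0.5f;   // placeholder work
}

void run_with_graph(float* d_x, int n, int iters, cudaStream_t stream)
{
    cudaGraph_t graph;
    cudaGraphExec_t graphExec;

    // Work issued to 'stream' between Begin/EndCapture is recorded, not executed.
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    cg_iteration<<<(n + 255) / 256, 256, 0, stream>>>(d_x, n);
    cudaStreamEndCapture(stream, &graph);

    // Instantiate once (CUDA 10/11 signature), then relaunch with low per-launch overhead.
    cudaGraphInstantiate(&graphExec, graph, NULL, NULL, 0);
    for (int i = 0; i < iters; i++)
        cudaGraphLaunch(graphExec, stream);
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(graphExec);
    cudaGraphDestroy(graph);
}
```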
6 | 7 | ## Key Concepts 8 | 9 | Linear Algebra, CUBLAS Library, CUSPARSE Library 10 | 11 | ## Supported SM Architectures 12 | 13 | [SM 3.5 ](https://developer.nvidia.com/cuda-gpus) [SM 3.7 ](https://developer.nvidia.com/cuda-gpus) [SM 5.0 ](https://developer.nvidia.com/cuda-gpus) [SM 5.2 ](https://developer.nvidia.com/cuda-gpus) [SM 6.0 ](https://developer.nvidia.com/cuda-gpus) [SM 6.1 ](https://developer.nvidia.com/cuda-gpus) [SM 7.0 ](https://developer.nvidia.com/cuda-gpus) [SM 7.2 ](https://developer.nvidia.com/cuda-gpus) [SM 7.5 ](https://developer.nvidia.com/cuda-gpus) [SM 8.0 ](https://developer.nvidia.com/cuda-gpus) [SM 8.6 ](https://developer.nvidia.com/cuda-gpus) 14 | 15 | ## Supported OSes 16 | 17 | Linux, Windows 18 | 19 | ## Supported CPU Architecture 20 | 21 | x86_64, ppc64le, armv7l 22 | 23 | ## CUDA APIs involved 24 | 25 | ### [CUDA Runtime API](http://docs.nvidia.com/cuda/cuda-runtime-api/index.html) 26 | cudaStreamBeginCapture, cudaStreamEndCapture, cudaGraphCreate, cudaGraphLaunch, cudaGraphInstantiate, cudaGraphExecDestroy, cudaGraphDestroy 27 | 28 | ## Dependencies needed to build/run 29 | [CUBLAS](../../README.md#cublas), [CUSPARSE](../../README.md#cusparse) 30 | 31 | ## Prerequisites 32 | 33 | Download and install the [CUDA Toolkit 11.1](https://developer.nvidia.com/cuda-downloads) for your corresponding platform. 34 | Make sure the dependencies mentioned in the [Dependencies](#dependencies-needed-to-buildrun) section above are installed. 35 | 36 | ## Build and Run 37 | 38 | ### Windows 39 | The Windows samples are built using the Visual Studio IDE. Solution files (.sln) are provided for each supported version of Visual Studio, using the format: 40 | ``` 41 | *_vs<version>.sln - for Visual Studio <version> 42 | ``` 43 | Each individual sample has its own set of solution files in its directory: 44 | 45 | To build/examine all the samples at once, the complete solution files should be used. To build/examine a single sample, the individual sample solution files should be used. 46 | > **Note:** Some samples require that the Microsoft DirectX SDK (June 2010 or newer) be installed and that the VC++ directory paths are properly set up (**Tools > Options...**). Check the DirectX Dependencies section for details. 47 | 48 | ### Linux 49 | The Linux samples are built using makefiles. To use the makefiles, change the current directory to the sample directory you wish to build, and run make: 50 | ``` 51 | $ cd <sample_dir> 52 | $ make 53 | ``` 54 | The samples' makefiles can take advantage of certain options: 55 | * **TARGET_ARCH=<arch>** - cross-compile targeting a specific architecture. Allowed architectures are x86_64, ppc64le, armv7l. 56 | By default, TARGET_ARCH is set to HOST_ARCH. On an x86_64 machine, not setting TARGET_ARCH is the equivalent of setting TARGET_ARCH=x86_64.
57 | `$ make TARGET_ARCH=x86_64`
`$ make TARGET_ARCH=ppc64le`
`$ make TARGET_ARCH=armv7l`
58 | See [here](http://docs.nvidia.com/cuda/cuda-samples/index.html#cross-samples) for more details. 59 | * **dbg=1** - build with debug symbols 60 | ``` 61 | $ make dbg=1 62 | ``` 63 | * **SMS="A B ..."** - override the SM architectures for which the sample will be built, where `"A B ..."` is a space-delimited list of SM architectures. For example, to generate SASS for SM 50 and SM 60, use `SMS="50 60"`. 64 | ``` 65 | $ make SMS="50 60" 66 | ``` 67 | 68 | * **HOST_COMPILER=** - override the default g++ host compiler. See the [Linux Installation Guide](http://docs.nvidia.com/cuda/cuda-installation-guide-linux/index.html#system-requirements) for a list of supported host compilers. 69 | ``` 70 | $ make HOST_COMPILER=g++ 71 | ``` 72 | 73 | ## References (for more details) 74 | 75 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2015.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 14.00 3 | # Visual Studio 2015 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2015.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2015.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | conjugateGradientCudaGraphs_vs2015 19 | conjugateGradientCudaGraphs 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v140 27 | 28 | 29 | true 30 | 31 | 32 | true 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | $(Platform)/$(Configuration)/ 44 | $(IncludePath) 45 | AllRules.ruleset 46 | 47 | 48 | 49 | 50 | ../../bin/win64/$(Configuration)/ 51 | 52 | 53 | 54 | Level3 55 | WIN32;_MBCS;%(PreprocessorDefinitions) 56 | ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); 57 | 58 | 59 | Console 60 | cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 61 | $(CudaToolkitLibDir); 62 | $(OutDir)/conjugateGradientCudaGraphs.exe 63 | 64 | 65 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 66 | -Xcompiler "/wd 4819" %(AdditionalOptions) 67 | ./;../../Common 68 | WIN32 69 | 70 | 71 | 72 | 73 | Disabled 74 | MultiThreadedDebug 75 | 76 | 77 | true 78 | Default 79 | 80 | 81 | MTd 82 | 64 83 
| 84 | 85 | 86 | 87 | MaxSpeed 88 | MultiThreaded 89 | 90 | 91 | false 92 | UseLinkTimeCodeGeneration 93 | 94 | 95 | MT 96 | 64 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2017.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2017 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2017.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2019.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 2019 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "conjugateGradientCudaGraphs", "conjugateGradientCudaGraphs_vs2019.vcxproj", "{997E0757-EA74-4A4E-A0FC-47D8C8831A15}" 5 | EndProject 6 | Global 7 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 8 | Debug|x64 = Debug|x64 9 | Release|x64 = Release|x64 10 | EndGlobalSection 11 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 12 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.ActiveCfg = Debug|x64 13 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Debug|x64.Build.0 = Debug|x64 14 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.ActiveCfg = Release|x64 15 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15}.Release|x64.Build.0 = Release|x64 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /TaskGraph/conjugateGradientCudaGraphs_vs2019.vcxproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | $(VCTargetsPath)\BuildCustomizations 5 | 6 | 7 | 8 | Debug 9 | x64 10 | 11 | 12 | Release 13 | x64 14 | 15 | 16 | 17 | {997E0757-EA74-4A4E-A0FC-47D8C8831A15} 18 | conjugateGradientCudaGraphs_vs2019 19 | conjugateGradientCudaGraphs 20 | 21 | 22 | 23 | 24 | Application 25 | MultiByte 26 | v142 27 | 10.0 28 | 29 | 30 | true 31 | 32 | 33 | true 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | $(Platform)/$(Configuration)/ 45 | $(IncludePath) 46 | AllRules.ruleset 47 | 48 | 49 | 50 | 51 | ../../bin/win64/$(Configuration)/ 52 | 53 | 54 | 55 | Level3 56 | WIN32;_MBCS;%(PreprocessorDefinitions) 57 | ./;$(CudaToolkitDir)/include;../../Common;$(CudaToolkitIncludeDir); 58 | 59 | 60 | Console 61 | 
cublas.lib;cusparse.lib;cudart_static.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 62 | $(CudaToolkitLibDir); 63 | $(OutDir)/conjugateGradientCudaGraphs.exe 64 | 65 | 66 | compute_35,sm_35;compute_37,sm_37;compute_50,sm_50;compute_52,sm_52;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70;compute_75,sm_75;compute_80,sm_80;compute_86,sm_86; 67 | -Xcompiler "/wd 4819" %(AdditionalOptions) 68 | ./;../../Common 69 | WIN32 70 | 71 | 72 | 73 | 74 | Disabled 75 | MultiThreadedDebug 76 | 77 | 78 | true 79 | Default 80 | 81 | 82 | MTd 83 | 64 84 | 85 | 86 | 87 | 88 | MaxSpeed 89 | MultiThreaded 90 | 91 | 92 | false 93 | UseLinkTimeCodeGeneration 94 | 95 | 96 | MT 97 | 64 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /UniMem/LowAccessDensityTest.h: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | #define REAL float 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | extern void LowAccessDensityTest_cuda(REAL* x, REAL* y, long int n, REAL a, int stride); 11 | extern void LowAccessDensityTest_cuda_unified(REAL* x, REAL* y, long int n, REAL a, int stride); 12 | #ifdef __cplusplus 13 | } 14 | #endif 15 | -------------------------------------------------------------------------------- /UniMem/LowAccessDensityTest_cuda.cu: -------------------------------------------------------------------------------- 1 | //******************************************************************************************************************// 2 | // Copyright (c) 2021, University of North Carolina at Charlotte 3 | // and Lawrence Livermore National Security, LLC. 
4 | // SPDX-License-Identifier: (BSD-3-Clause) 5 | //*****************************************************************************************************************// 6 | // Experimental test input for Accelerator directives 7 | // simplest scalar*vector operations 8 | // Liao 1/15/2013 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include "LowAccessDensityTest.h" 15 | 16 | 17 | double read_timer_ms() { 18 | struct timeb tm; 19 | ftime(&tm); 20 | return (double) tm.time * 1000.0 + (double) tm.millitm; 21 | } 22 | 23 | /* change this to do saxpy or daxpy : single precision or double precision*/ 24 | #define REAL float 25 | #define VEC_LEN 102400000//use a fixed number for now 26 | #define STRIDE 1024 27 | 28 | /* zero out the entire vector */ 29 | void zero(REAL *A, long int n) 30 | { 31 | int i; 32 | for (i = 0; i < n; i++) { 33 | A[i] = 0.0; 34 | } 35 | } 36 | 37 | /* initialize a vector with random floating point numbers */ 38 | void init(REAL *A, long int n) 39 | { 40 | int i; 41 | for (i = 0; i < n; i++) { 42 | A[i] = (double)drand48(); 43 | } 44 | } 45 | 46 | __global__ 47 | void 48 | LowAccessDensityTest_cudakernel(REAL* x, REAL* y, int n, REAL a, int stride) 49 | { 50 | int i = blockDim.x * blockIdx.x + threadIdx.x; 51 | if (i < (n/stride)) y[i] = a*x[i*stride]; 52 | } 53 | 54 | void LowAccessDensityTest_cuda_discrete_memory(REAL* x, REAL* y, long int n, REAL a, int stride) { 55 | REAL *d_x, *d_y; 56 | cudaMalloc(&d_x, n*sizeof(REAL)); 57 | cudaMalloc(&d_y, (n/stride)*sizeof(REAL)); 58 | 59 | cudaMemcpy(d_x, x, n*sizeof(REAL), cudaMemcpyHostToDevice); 60 | LowAccessDensityTest_cudakernel<<<(n+255)/256, 256>>>(d_x, d_y, n, a, stride); 61 | cudaDeviceSynchronize(); 62 | //cudaMemcpy(y, d_y, (n/stride)*sizeof(REAL), cudaMemcpyDeviceToHost); 63 | 64 | cudaFree(d_x); 65 | cudaFree(d_y); 66 | } 67 | 68 | /* return the measured time */ 69 | double LowAccessDensityTest_cuda_unified_memory(REAL* x, REAL* y, long int n, REAL a, int stride) { 70 | 71 | double elapsed1 = read_timer_ms(); 72 | REAL *x2; 73 | cudaMallocManaged(&x2, n*sizeof(REAL)); 74 | elapsed1 = (read_timer_ms() - elapsed1); 75 | 76 | //initial unified memory, should not count time here 77 | memcpy(x2, x, n*sizeof(REAL)); 78 | 79 | double elapsed2 = read_timer_ms(); 80 | REAL *d_y; 81 | cudaMalloc(&d_y, (n/stride)*sizeof(REAL)); 82 | 83 | LowAccessDensityTest_cudakernel<<<(n+255)/256, 256>>>(x2, d_y, n, a, stride); 84 | cudaDeviceSynchronize(); 85 | elapsed2 = (read_timer_ms() - elapsed2); 86 | //cudaMemcpy(y, d_y, (n/stride)*sizeof(REAL), cudaMemcpyDeviceToHost); 87 | 88 | cudaFree(x2); 89 | cudaFree(d_y); 90 | 91 | return elapsed1 + elapsed2; 92 | } 93 | 94 | 95 | /*serial version */ 96 | void serial(REAL* x, REAL* y, long n, REAL a, int stride) { 97 | int i; 98 | for (i = 0; i < (n/stride); i++) 99 | { 100 | y[i] = a * x[i*stride]; 101 | } 102 | } 103 | 104 | /* compare two arrays and return percentage of difference */ 105 | REAL check(REAL*A, REAL*B, long int n) 106 | { 107 | int i; 108 | REAL diffsum =0.0, sum = 0.0; 109 | for (i = 0; i < n; i++) { 110 | diffsum += fabs(A[i] - B[i]); 111 | sum += fabs(B[i]); 112 | } 113 | return diffsum/sum; 114 | } 115 | 116 | int main(int argc, char *argv[]) 117 | { 118 | long int n; 119 | int stride = STRIDE; 120 | REAL *y_cuda, *y, *x, *y_cuda_unified; 121 | REAL a = 123.456; 122 | 123 | n = VEC_LEN; 124 | fprintf(stderr, "Usage: Low Access Test \n"); 125 | if (argc >= 2) { 126 | stride = atoi(argv[1]); 127 | } 128 | if (argc >= 3) { 129 | n = 
atoi(argv[2]); 130 | } 131 | y_cuda = (REAL *) malloc((n/stride) * sizeof(REAL)); 132 | y_cuda_unified = (REAL *) malloc((n/stride) * sizeof(REAL)); 133 | y = (REAL *) malloc((n/stride) * sizeof(REAL)); 134 | x = (REAL *) malloc(n * sizeof(REAL)); 135 | 136 | srand48(1<<12); 137 | init(x, n); 138 | 139 | serial(x, y, n, a, stride); 140 | 141 | int i; 142 | int num_runs = 100; 143 | /* cuda version */ 144 | //warming up 145 | LowAccessDensityTest_cuda_discrete_memory(x, y_cuda, n, a, stride); 146 | 147 | double elapsed = read_timer_ms(); 148 | for (i=0; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define REAL double 22 | #define VEC_LEN 102400000//use a fixed number for now 23 | #define STRIDE 1024 24 | 25 | /* zero out the entire vector */ 26 | void zero(REAL *A, long int n) 27 | { 28 | int i; 29 | for (i = 0; i < n; i++) { 30 | A[i] = 0.0; 31 | } 32 | } 33 | 34 | /* initialize a vector with random floating point numbers */ 35 | void init(REAL *A, long int n) 36 | { 37 | int i; 38 | for (i = 0; i < n; i++) { 39 | A[i] = (double)drand48(); 40 | } 41 | } 42 | 43 | /*serial version */ 44 | void serial_kernel(REAL* x, REAL* y, long n, REAL a, int stride) { 45 | int i; 46 | for (i = 0; i < n; i+=stride) 47 | { 48 | y[i] += a * x[i]; 49 | } 50 | } 51 | 52 | /*omp version */ 53 | void omp_kernel(REAL* x, REAL* y, long n, REAL a, int stride) { 54 | int i; 55 | #pragma omp parallel for shared(x,y,a,n,stride) private(i) 56 | for (i = 0; i < n; i+=stride) 57 | { 58 | y[i] += a * x[i]; 59 | } 60 | } 61 | 62 | /*omp gpu version */ 63 | void omp_gpu_kernel(REAL* x, REAL* y, long n, REAL a, int stride) { 64 | int i; 65 | //#pragma omp target teams distribute parallel for map(tofrom:y) map(to:x,a,n,stride) 66 | #pragma omp target map(to:a,n,x[0:n]) map(tofrom:y[0:n]) 67 | #pragma parallel for 68 | for (i = 0; i < n; i+=stride) 69 | { 70 | y[i] += a * x[i]; 71 | } 72 | } 73 | 74 | 75 | 76 | /* compare two arrays and return percentage of difference */ 77 | REAL check(REAL*A, REAL*B, long int n) 78 | { 79 | int i; 80 | REAL diffRatioSum= 0.0; 81 | for (i = 0; i < n; i++) { 82 | REAL diff = fabs(A[i] - B[i]); 83 | if (fabs(B[i])==0.0) 84 | diffRatioSum+=0.0; 85 | else 86 | diffRatioSum += diff/fabs(B[i]); 87 | } 88 | return diffRatioSum/n; 89 | } 90 | 91 | int main(int argc, char *argv[]) 92 | { 93 | long int n; 94 | int stride = STRIDE; 95 | REAL *y_omp, *y, *x; 96 | REAL a = 123.456; 97 | 98 | n = VEC_LEN; 99 | fprintf(stderr, "Usage: %s [vec_len]\n", argv[0]); 100 | if (argc >= 2) { 101 | stride = atoi(argv[1]); 102 | } 103 | 104 | if (argc >= 3) { 105 | n = atoi(argv[2]); 106 | } 107 | printf("vec len(n_=%ld, stride=%d\n", n, stride); 108 | 109 | // same input x 110 | x = (REAL *) malloc(n * sizeof(REAL)); 111 | if (x==NULL) 112 | { 113 | fprintf(stderr, "malloc returns NULL: out of memory\n"); 114 | abort(); 115 | } 116 | srand48(time(NULL)); 117 | init(x, n); 118 | 119 | // output for serial and omp version 120 | y = (REAL *) malloc(n * sizeof(REAL)); 121 | if (y==NULL) 122 | { 123 | fprintf(stderr, "y malloc returns NULL: out of memory\n"); 124 | abort(); 125 | } 126 | 127 | y_omp = (REAL *) malloc(n * sizeof(REAL)); 128 | if (y_omp==NULL) 129 | { 130 | fprintf(stderr, "y_omp malloc returns NULL: out of memory\n"); 131 | abort(); 132 | } 133 | 134 
| REAL* y_omp_gpu = (REAL *) malloc(n * sizeof(REAL)); 135 | if (y_omp_gpu==NULL) 136 | { 137 | fprintf(stderr, "y_omp malloc returns NULL: out of memory\n"); 138 | abort(); 139 | } 140 | 141 | 142 | // serial version as a reference 143 | serial_kernel(x, y, n, a, stride); 144 | 145 | int i; 146 | int num_runs = 100; 147 | 148 | /* OMP version */ 149 | double elapsed = read_timer_ms(); 150 | for (i=0; i 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include "warpDivergenceTest.h" 13 | 14 | double read_timer_ms() { 15 | struct timeb tm; 16 | ftime(&tm); 17 | return (double) tm.time * 1000.0 + (double) tm.millitm; 18 | } 19 | 20 | /* change this to do saxpy or daxpy : single precision or double precision*/ 21 | #define REAL float 22 | #define VEC_LEN 32000 //use a fixed number for now 23 | /* zero out the entire vector */ 24 | void zero(REAL *A, int n) 25 | { 26 | int i; 27 | for (i = 0; i < n; i++) { 28 | A[i] = 0.0; 29 | } 30 | } 31 | 32 | /* initialize a vector with random floating point numbers */ 33 | void init(REAL *A, int n) 34 | { 35 | int i; 36 | for (i = 0; i < n; i++) { 37 | A[i] = (float)drand48(); 38 | } 39 | } 40 | 41 | /*serial version */ 42 | void warpDivergenceSerial(REAL* x, REAL* y, REAL* z, int n) { 43 | int i; 44 | for (i = 0; i < n; ++i) 45 | { 46 | if(i%2 == 0) z[i] = 2 * x[i] + 3 * y[i]; 47 | else z[i] = 3 * x[i] + 2 * y[i]; 48 | } 49 | } 50 | 51 | void NoWarpDivergenceSerial(REAL* x, REAL* y, REAL* z, int n) { 52 | int i; 53 | for (i = 0; i < n; ++i) 54 | { 55 | if((i/32)%2 ==0 ) z[i] = 2 * x[i] + 3 * y[i]; 56 | else z[i] = 3 * x[i] + 2 * y[i]; 57 | } 58 | } 59 | 60 | /* compare two arrays and return percentage of difference */ 61 | REAL check(REAL*A, REAL*B, int n) 62 | { 63 | int i; 64 | REAL diffsum =0.0, sum = 0.0; 65 | for (i = 0; i < n; i++) { 66 | diffsum += fabs(A[i] - B[i]); 67 | sum += fabs(B[i]); 68 | } 69 | return diffsum/sum; 70 | } 71 | 72 | int main(int argc, char *argv[]) 73 | { 74 | int n; 75 | REAL *x, *y, *warp_divergence, *no_warp_divergence, *warp_divergence_serial, *no_warp_divergence_serial; 76 | 77 | n = VEC_LEN; 78 | fprintf(stderr, "Usage: warpDivergenceTest \n"); 79 | if (argc >= 2) { 80 | n = atoi(argv[1]); 81 | } 82 | x = (REAL *) malloc(n * sizeof(REAL)); 83 | y = (REAL *) malloc(n * sizeof(REAL)); 84 | warp_divergence = (REAL *) malloc(n * sizeof(REAL)); 85 | no_warp_divergence = (REAL *) malloc(n * sizeof(REAL)); 86 | 87 | warp_divergence_serial = (REAL *) malloc(n * sizeof(REAL)); 88 | no_warp_divergence_serial = (REAL *) malloc(n * sizeof(REAL)); 89 | 90 | 91 | srand48(1<<12); 92 | init(x, n); 93 | //init(y, n); 94 | memcpy(y, x, n*sizeof(REAL)); 95 | 96 | 97 | int i; 98 | int num_runs = 10; 99 | 100 | warpDivergenceSerial(x,y,warp_divergence_serial,n); 101 | NoWarpDivergenceSerial(x,y,no_warp_divergence_serial,n); 102 | /* cuda version */ 103 | double elapsed = read_timer_ms(); 104 | for (i=0; i>> (d_x, d_y, d_warp_divergence); 50 | cudaDeviceSynchronize(); 51 | 52 | warpDivergence<<<(n+255)/256, 256>>>(d_x, d_y, d_warp_divergence); 53 | cudaDeviceSynchronize(); 54 | 55 | noWarpDivergence<<<(n+255)/256, 256>>>(d_x, d_y, d_no_warp_divergence); 56 | cudaDeviceSynchronize(); 57 | 58 | cudaMemcpy(warp_divergence, d_warp_divergence, n*sizeof(REAL), cudaMemcpyDeviceToHost); 59 | cudaMemcpy(no_warp_divergence, d_no_warp_divergence, n*sizeof(REAL), cudaMemcpyDeviceToHost); 60 | 61 | 62 | cudaFree(d_x); 63 | cudaFree(d_y); 64 | 65 | cudaFree(d_warp_divergence); 66 | cudaFree(d_no_warp_divergence); 67 | 68 | 69 | } 70 | 
--------------------------------------------------------------------------------
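The definitions of the two device kernels timed by WarpDivRedux were lost in the listing above; only their launches survive. The sketch below is consistent with the surviving serial references: warpDivergenceSerial branches per thread on i % 2, so both sides of the branch execute serially within every warp, while NoWarpDivergenceSerial branches per warp on (i/32) % 2, so each warp takes a single path. No bounds check appears because the harness's default VEC_LEN of 32000 is a multiple of the 256-thread block:
```
// Sketch of the lost kernels, mirroring the serial versions above.
#define REAL float

__global__ void warpDivergence(REAL* x, REAL* y, REAL* z)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // The branch flips every thread: a warp executes both paths serially.
    if (i % 2 == 0) z[i] = 2 * x[i] + 3 * y[i];
    else            z[i] = 3 * x[i] + 2 * y[i];
}

__global__ void noWarpDivergence(REAL* x, REAL* y, REAL* z)
{
    int i = blockDim.x * blockIdx.x + threadIdx.x;
    // The branch flips every 32 threads: each warp takes exactly one path.
    if ((i / 32) % 2 == 0) z[i] = 2 * x[i] + 3 * y[i];
    else                   z[i] = 3 * x[i] + 2 * y[i];
}
```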