├── .gitignore
├── simpleCUBLASEx
    ├── readme.txt
    ├── simpleCUBLAS.cpp
    ├── Makefile
    ├── helper_cuda.h
    └── helper_string.h
├── simpleCUBLASHgemm
    ├── readme.txt
    ├── simpleCUBLAS.cpp
    ├── Makefile
    ├── helper_cuda.h
    └── helper_string.h
├── simpleCUBLASSgemm
    ├── readme.txt
    ├── simpleCUBLAS.cpp
    ├── Makefile
    └── helper_cuda.h
├── .gitmodules
├── wmma_cuda_fortran
    ├── Makefile
    ├── main.CUF
    └── wmma_mod.CUF
└── README.md


/.gitignore:
--------------------------------------------------------------------------------
1 | *.o
2 | *.mod
3 | core
4 | cudaTensorCoreGemm
5 | simpleCUBLAS
6 | half
7 | notensor
8 | *.swp
9 | 


--------------------------------------------------------------------------------
/simpleCUBLASEx/readme.txt:
--------------------------------------------------------------------------------
1 | Sample: simpleCUBLAS
2 | Minimum spec: SM 3.0
3 | 
4 | Example of using CUBLAS using the new CUBLAS API interface available in CUDA 4.0.
5 | 
6 | Key concepts:
7 | Image Processing
8 | CUBLAS Library
9 | 


--------------------------------------------------------------------------------
/simpleCUBLASHgemm/readme.txt:
--------------------------------------------------------------------------------
1 | Sample: simpleCUBLAS
2 | Minimum spec: SM 3.0
3 | 
4 | Example of using CUBLAS using the new CUBLAS API interface available in CUDA 4.0.
5 | 
6 | Key concepts:
7 | Image Processing
8 | CUBLAS Library
9 | 


--------------------------------------------------------------------------------
/simpleCUBLASSgemm/readme.txt:
--------------------------------------------------------------------------------
1 | Sample: simpleCUBLAS
2 | Minimum spec: SM 3.0
3 | 
4 | Example of using CUBLAS using the new CUBLAS API interface available in CUDA 4.0.
5 | 
6 | Key concepts:
7 | Image Processing
8 | CUBLAS Library
9 | 


--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
 1 | [submodule "cutlass"]
 2 | 	path = cutlass
 3 | 	url = https://github.com/NVIDIA/cutlass
 4 | [submodule "apex"]
 5 | 	path = apex
 6 | 	url = https://github.com/NVIDIA/apex.git
 7 | [submodule "pictc"]
 8 | 	path = pictc
 9 | 	url = https://github.com/vishalmehta1991/pictc
10 | 


--------------------------------------------------------------------------------
/wmma_cuda_fortran/Makefile:
--------------------------------------------------------------------------------
 1 | PLATFORM=linux86-64
 2 | YEAR=2019
 3 | main: main.o wmma_mod.o  # link both objects into the executable main
 4 | 	pgfortran -o $@ $^ -Mcuda=cc70
 5 | main.o: wmma_mod.o main.CUF  # main.CUF uses module m, which wmma_mod.CUF provides
 6 | # Suffix rule: compile any .CUF into the matching .o. $(PGI) is not set in this Makefile; presumably it comes from the environment.
 7 | .SUFFIXES: .CUF .o
 8 | .CUF.o:
 9 | 	pgfortran -fast -o $@ -c -Mcuda=cc70 -I $(PGI)/$(PLATFORM)/$(YEAR)/examples/CUDA-Fortran/TensorCores/Utils $<
10 | .PHONY: clean
11 | clean:
12 | 	-rm -f *.o *.mod core main
13 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | NVIDIA Tensor Core Examples
 2 | ===========================
 3 | 
 4 | This repository collects multiple examples for using NVIDIA Tensor Cores.
 5 | Please see individual examples for their licensing requirements.
 6 | 
 7 | 
 8 | Examples
 9 | --------
10 | 
11 | * [cudaTensorCoreGemm](cudaTensorCoreGemm/readme.txt) - Implements a GEMM operation using WMMA instructions
12 | * [simpleCUBLASEx](simpleCUBLASEx/readme.txt) - Demonstrates an SGEMM using Tensor Cores via the cublasGemmEx
13 |   API
14 | * [simpleCUBLASHgemm](simpleCUBLASHgemm/readme.txt) - Demonstrates calling HGEMM directly from cuBLAS
15 | * [simpleCUBLASSgemm](simpleCUBLASSgemm) - Demonstrates using Tensor Cores implicitly from SGEMM
16 | * [CUTLASS WMMA GEMM](https://github.com/NVIDIA/cutlass/tree/master/examples/05_wmma_gemm) - Using WMMA instructions from the CUTLASS
17 |   framework.
18 | * [pictc](https://github.com/vishalmehta1991/pictc) - Implements a simple Particle-In-Cell pusher using Tensor Cores
19 | * [DCGAN](https://github.com/NVIDIA/apex/tree/master/examples/dcgan) - Illustrates using Automatic Mixed Precision
20 |   (AMP) within PyTorch using the DCGAN network.
21 | * [ImageNet](https://github.com/NVIDIA/apex/tree/master/examples/imagenet) - Illustrates using Automatic Mixed
22 |   Precision (AMP) with imagenet.
23 | 
24 | Instructions
25 | ------------
26 | 
27 | Some examples are stored in git submodules. It is necessary to call
28 | `git submodule update --init` after cloning or clone with the `--recurse-submodules`
29 | option.
30 | 


--------------------------------------------------------------------------------
/wmma_cuda_fortran/main.CUF:
--------------------------------------------------------------------------------
 1 | !
 2 | ! Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
 3 | !
 4 | ! NOTICE TO USER:
 5 | !
 6 | ! This source code is subject to NVIDIA ownership rights under U.S. and
 7 | ! international Copyright laws.  Users and possessors of this source code
 8 | ! are hereby granted a nonexclusive, royalty-free license to use this code
 9 | ! in individual and commercial software.
10 | !
11 | ! NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
12 | ! CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
13 | ! IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
14 | ! REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
15 | ! MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
16 | ! IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
17 | ! OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
18 | ! OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
19 | ! OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
20 | ! OR PERFORMANCE OF THIS SOURCE CODE.
21 | !
22 | ! U.S. Government End Users.   This source code is a "commercial item" as
23 | ! that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
24 | ! "commercial computer  software"  and "commercial computer software
25 | ! documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
26 | ! and is provided to the U.S. Government only as a commercial end item.
27 | ! Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
28 | ! 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
29 | ! source code with only those rights set forth herein.
30 | !
31 | ! Any use of this source code in individual and commercial software must
32 | ! include, in the user documentation and internal comments to the code,
33 | ! the above Disclaimer and U.S. Government End Users Notice.
34 | !
35 | ! Host driver: launches the wmma_16x16 kernel on one 16x16 problem and
36 | ! compares the device result with matmul evaluated on the host.
37 | program main
38 |   use m
39 |   use cudafor
40 |   integer, parameter :: m = 16, n=m, k=m
41 |   real(4) :: a(m,k), b(k,n), c(m,n), cref(m,n)
42 |   real(4), device :: c_d(m,n)
43 |   real(2), device :: ah_d(m,k), bh_d(k,n)
44 | 
45 |   ! Inputs are whole numbers 0..3, copied to the kind-2 device arrays;
46 |   ! presumably chosen so the exact comparison below is meaningful.
47 |   call random_number(a); a = int(4.*a); ah_d = a
48 |   call random_number(b); b = int(4.*b); bh_d = b
49 | 
50 |   cref = matmul(a, b)
51 |   c = 0.0
52 |   call wmma_16x16<<<1,32>>>(ah_d, bh_d, c_d)  ! one block of 32 threads
53 |   c = c_d
54 | 
55 |   ! Report both outcomes so that a mismatch is visible to the user.
56 |   if (sum(abs(c-cref)) == 0.0) then
57 |     write(*,*) 'Test passed'
58 |   else
59 |     write(*,*) 'Test failed'
60 |   end if
61 | end program main
62 | 


--------------------------------------------------------------------------------
/wmma_cuda_fortran/wmma_mod.CUF:
--------------------------------------------------------------------------------
 1 | !
 2 | ! Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
 3 | !
 4 | ! NOTICE TO USER:
 5 | !
 6 | ! This source code is subject to NVIDIA ownership rights under U.S. and
 7 | ! international Copyright laws.  Users and possessors of this source code
 8 | ! are hereby granted a nonexclusive, royalty-free license to use this code
 9 | ! in individual and commercial software.
10 | !
11 | ! NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
12 | ! CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
13 | ! IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
14 | ! REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
15 | ! MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
16 | ! IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
17 | ! OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
18 | ! OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
19 | ! OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
20 | ! OR PERFORMANCE OF THIS SOURCE CODE.
21 | !
22 | ! U.S. Government End Users.   This source code is a "commercial item" as
23 | ! that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
24 | ! "commercial computer  software"  and "commercial computer software
25 | ! documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
26 | ! and is provided to the U.S. Government only as a commercial end item.
27 | ! Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
28 | ! 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
29 | ! source code with only those rights set forth herein.
30 | !
31 | ! Any use of this source code in individual and commercial software must
32 | ! include, in the user documentation and internal comments to the code,
33 | ! the above Disclaimer and U.S. Government End Users Notice.
34 | !
35 | #include "cuf_macros.CUF" 
36 | module m  ! device code for the tensor-core test driven by main.CUF
37 | contains
38 |   attributes(global) subroutine wmma_16x16(a, b, c)  ! kernel; main.CUF launches it as <<<1,32>>> and expects c = matmul(a, b)
39 |     use wmma
40 |     real(2), intent(in) :: a(16,*), b(16,*)  ! kind-2 real inputs
41 |     real(4) :: c(16,*)  ! kind-4 real output
42 |     WMMASubMatrix(WMMAMatrixA, 16, 16, 16, Real, WMMAColMajor) :: sa
43 |     WMMASubMatrix(WMMAMatrixB, 16, 16, 16, Real, WMMAColMajor) :: sb
44 |     WMMASubMatrix(WMMAMatrixC, 16, 16, 16, Real, WMMAKind4) :: sc
45 |     ! Clear the accumulator tile, load both operand tiles, multiply, then store the result to c.
46 |     sc = 0.0_4
47 |     call wmmaLoadMatrix(sa, a(1,1), 16)  ! the trailing 16 is presumably the leading dimension; confirm against the wmma module
48 |     call wmmaLoadMatrix(sb, b(1,1), 16)
49 |     call wmmaMatMul(sc, sa, sb, sc)  ! presumably sc = sa*sb + sc, with sc zeroed above
50 |     call wmmaStoreMatrix(c(1,1), sc, 16)
51 |   end subroutine wmma_16x16 
52 | end module m
53 | 


--------------------------------------------------------------------------------
/simpleCUBLASHgemm/simpleCUBLAS.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO USER:
  5 |  *
  6 |  * This source code is subject to NVIDIA ownership rights under U.S. and
  7 |  * international Copyright laws.  Users and possessors of this source code
  8 |  * are hereby granted a nonexclusive, royalty-free license to use this code
  9 |  * in individual and commercial software.
 10 |  *
 11 |  * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 12 |  * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 13 |  * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 14 |  * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 15 |  * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 16 |  * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 17 |  * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 18 |  * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 19 |  * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 20 |  * OR PERFORMANCE OF THIS SOURCE CODE.
 21 |  *
 22 |  * U.S. Government End Users.   This source code is a "commercial item" as
 23 |  * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 24 |  * "commercial computer  software"  and "commercial computer software
 25 |  * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 26 |  * and is provided to the U.S. Government only as a commercial end item.
 27 |  * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 28 |  * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 29 |  * source code with only those rights set forth herein.
 30 |  *
 31 |  * Any use of this source code in individual and commercial software must
 32 |  * include, in the user documentation and internal comments to the code,
 33 |  * the above Disclaimer and U.S. Government End Users Notice.
 34 |  */
 35 | 
 36 | /* This example demonstrates how to use the CUBLAS library
 37 |  * by multiplying two half-precision matrices on the device
 38 |  * with cublasHgemm and comparing the result to the same
 39 |  * operation performed on the host.
 40 |  */
 41 | 
 42 | /* Includes, system */
 43 | #include <stdio.h>
 44 | #include <stdlib.h>
 45 | #include <string.h>
 46 | 
 47 | /* Includes, cuda */
 48 | #include <cublas_v2.h>
 49 | #include <cuda_runtime.h>
 50 | #include <helper_cuda.h>
 51 | 
 52 | /* Matrix size: every matrix in this sample is N x N */
 53 | #define N (512)
 54 | 
 55 | /* Host reference HGEMM on n x n matrices: C = alpha * A * B + beta * C.
 56 |  * Element (r, c) of each matrix is stored at index c * n + r, and C is
 57 |  * updated in place. */
 58 | static void simple_hgemm(int n, half alpha, const half *A, const half *B,
 59 |                          half beta, half *C) {
 60 |   for (int row = 0; row < n; ++row) {
 61 |     for (int col = 0; col < n; ++col) {
 62 |       half *out = &C[col * n + row];
 63 | 
 64 |       /* The running sum is stored back into a half after every term. */
 65 |       half acc = 0.0f;
 66 |       for (int inner = 0; inner < n; ++inner) {
 67 |         acc = acc + A[inner * n + row] * B[col * n + inner];
 68 |       }
 69 | 
 70 |       *out = alpha * acc + beta * (*out);
 71 |     }
 72 |   }
 73 | }
 74 | 
 75 | /* Main: multiplies two N x N half matrices with cublasHgemm and checks the
 76 |  * result against simple_hgemm run on the host. Exits with EXIT_SUCCESS only
 77 |  * when the relative error between both results is within tolerance. */
 78 | int main(int argc, char **argv) {
 79 |   cublasStatus_t status;
 80 |   half *h_A;
 81 |   half *h_B;
 82 |   half *h_C;
 83 |   half *h_C_ref;
 84 |   half *d_A = 0;
 85 |   half *d_B = 0;
 86 |   half *d_C = 0;
 87 |   half alpha = 1.0f;
 88 |   half beta = 0.0f;
 89 |   int n2 = N * N;
 90 |   int i;
 91 |   /* Norms are kept in float: the squared sums below do not fit in a half */
 92 |   float error_norm;
 93 |   float ref_norm;
 94 |   float diff;
 95 |   cublasHandle_t handle;
 96 | 
 97 |   int dev = findCudaDevice(argc, (const char **)argv);
 98 | 
 99 |   if (dev == -1) {
100 |     return EXIT_FAILURE;
101 |   }
102 | 
103 |   /* Initialize CUBLAS */
104 |   printf("simpleCUBLAS test running..\n");
105 | 
106 |   status = cublasCreate(&handle);
107 | 
108 |   if (status != CUBLAS_STATUS_SUCCESS) {
109 |     fprintf(stderr, "!!!! CUBLAS initialization error\n");
110 |     return EXIT_FAILURE;
111 |   }
112 | 
113 |   /* Allocate host memory for the matrices */
114 |   h_A = reinterpret_cast<half *>(malloc(n2 * sizeof(h_A[0])));
115 | 
116 |   if (h_A == 0) {
117 |     fprintf(stderr, "!!!! host memory allocation error (A)\n");
118 |     return EXIT_FAILURE;
119 |   }
120 | 
121 |   h_B = reinterpret_cast<half *>(malloc(n2 * sizeof(h_B[0])));
122 | 
123 |   if (h_B == 0) {
124 |     fprintf(stderr, "!!!! host memory allocation error (B)\n");
125 |     return EXIT_FAILURE;
126 |   }
127 | 
128 |   h_C = reinterpret_cast<half *>(malloc(n2 * sizeof(h_C[0])));
129 | 
130 |   if (h_C == 0) {
131 |     fprintf(stderr, "!!!! host memory allocation error (C)\n");
132 |     return EXIT_FAILURE;
133 |   }
134 | 
135 |   /* Fill the matrices with test data */
136 |   for (i = 0; i < n2; i++) {
137 |     h_A[i] = static_cast<half>(rand() / static_cast<float>(RAND_MAX));
138 |     h_B[i] = static_cast<half>(rand() / static_cast<float>(RAND_MAX));
139 |     h_C[i] = static_cast<half>(rand() / static_cast<float>(RAND_MAX));
140 |   }
141 | 
142 |   /* Allocate device memory for the matrices */
143 |   if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
144 |       cudaSuccess) {
145 |     fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
146 |     return EXIT_FAILURE;
147 |   }
148 | 
149 |   if (cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0])) !=
150 |       cudaSuccess) {
151 |     fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
152 |     return EXIT_FAILURE;
153 |   }
154 | 
155 |   if (cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0])) !=
156 |       cudaSuccess) {
157 |     fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
158 |     return EXIT_FAILURE;
159 |   }
160 | 
161 |   /* Initialize the device matrices with the host matrices */
162 |   status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
163 | 
164 |   if (status != CUBLAS_STATUS_SUCCESS) {
165 |     fprintf(stderr, "!!!! device access error (write A)\n");
166 |     return EXIT_FAILURE;
167 |   }
168 | 
169 |   status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
170 | 
171 |   if (status != CUBLAS_STATUS_SUCCESS) {
172 |     fprintf(stderr, "!!!! device access error (write B)\n");
173 |     return EXIT_FAILURE;
174 |   }
175 | 
176 |   status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
177 | 
178 |   if (status != CUBLAS_STATUS_SUCCESS) {
179 |     fprintf(stderr, "!!!! device access error (write C)\n");
180 |     return EXIT_FAILURE;
181 |   }
182 | 
183 |   /* Allows cuBLAS to use Tensor Cores for the cublasHgemm call below */
184 |   status = cublasSetMathMode( handle, CUBLAS_TENSOR_OP_MATH );
185 | 
186 |   if (status != CUBLAS_STATUS_SUCCESS) {
187 |     fprintf(stderr, "!!!! failed to enable Tensor Cores\n");
188 |     return EXIT_FAILURE;
189 |   }
190 | 
191 |   /* Performs operation using plain C code; the result is written into the
192 |    * h_C buffer, which h_C_ref refers to from here on */
193 |   simple_hgemm(N, alpha, h_A, h_B, beta, h_C);
194 |   h_C_ref = h_C;
195 | 
196 |   /* Performs operation using cublas */
197 |   status = cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A,
198 |                        N, d_B, N, &beta, d_C, N);
199 | 
200 |   if (status != CUBLAS_STATUS_SUCCESS) {
201 |     fprintf(stderr, "!!!! kernel execution error.\n");
202 |     return EXIT_FAILURE;
203 |   }
204 | 
205 |   /* Allocate host memory for reading back the result from device memory */
206 |   h_C = reinterpret_cast<half *>(malloc(n2 * sizeof(h_C[0])));
207 | 
208 |   if (h_C == 0) {
209 |     fprintf(stderr, "!!!! host memory allocation error (C)\n");
210 |     return EXIT_FAILURE;
211 |   }
212 | 
213 |   /* Read the result back */
214 |   status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
215 | 
216 |   if (status != CUBLAS_STATUS_SUCCESS) {
217 |     fprintf(stderr, "!!!! device access error (read C)\n");
218 |     return EXIT_FAILURE;
219 |   }
220 | 
221 |   /* Check result against reference. Every element is widened to float
222 |    * before it is squared and summed: a half cannot hold values above 65504,
223 |    * so a half accumulator would overflow to infinity here and make the
224 |    * relative error below evaluate to zero regardless of the actual data. */
225 |   error_norm = 0.0f;
226 |   ref_norm = 0.0f;
227 | 
228 |   for (i = 0; i < n2; ++i) {
229 |     const float ref = static_cast<float>(h_C_ref[i]);
230 | 
231 |     diff = ref - static_cast<float>(h_C[i]);
232 |     error_norm += diff * diff;
233 |     ref_norm += ref * ref;
234 |   }
235 | 
236 |   error_norm = static_cast<float>(sqrt(static_cast<double>(error_norm)));
237 |   ref_norm = static_cast<float>(sqrt(static_cast<double>(ref_norm)));
238 | 
239 |   if (fabs(ref_norm) < 1e-7) {
240 |     fprintf(stderr, "!!!! reference norm is 0\n");
241 |     return EXIT_FAILURE;
242 |   }
243 | 
244 |   /* Memory clean up */
245 |   free(h_A);
246 |   free(h_B);
247 |   free(h_C);
248 |   free(h_C_ref);
249 | 
250 |   if (cudaFree(d_A) != cudaSuccess) {
251 |     fprintf(stderr, "!!!! memory free error (A)\n");
252 |     return EXIT_FAILURE;
253 |   }
254 | 
255 |   if (cudaFree(d_B) != cudaSuccess) {
256 |     fprintf(stderr, "!!!! memory free error (B)\n");
257 |     return EXIT_FAILURE;
258 |   }
259 | 
260 |   if (cudaFree(d_C) != cudaSuccess) {
261 |     fprintf(stderr, "!!!! memory free error (C)\n");
262 |     return EXIT_FAILURE;
263 |   }
264 | 
265 |   /* Shutdown */
266 |   status = cublasDestroy(handle);
267 | 
268 |   if (status != CUBLAS_STATUS_SUCCESS) {
269 |     fprintf(stderr, "!!!! shutdown error (A)\n");
270 |     return EXIT_FAILURE;
271 |   }
272 | 
273 |   /* Half carries roughly three decimal digits, so the two results cannot
274 |    * agree to 1e-6; use the 1e-2 bound of the Tensor Core SGEMM sample.
275 |    * NOTE(review): bound not validated on hardware - tune if needed. */
276 |   if (error_norm / ref_norm < 1e-2f) {
277 |     printf("simpleCUBLAS test passed.\n");
278 |     exit(EXIT_SUCCESS);
279 |   } else {
280 |     printf("simpleCUBLAS test failed.\n");
281 |     exit(EXIT_FAILURE);
282 |   }
283 | }
284 | 


--------------------------------------------------------------------------------
/simpleCUBLASSgemm/simpleCUBLAS.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO USER:
  5 |  *
  6 |  * This source code is subject to NVIDIA ownership rights under U.S. and
  7 |  * international Copyright laws.  Users and possessors of this source code
  8 |  * are hereby granted a nonexclusive, royalty-free license to use this code
  9 |  * in individual and commercial software.
 10 |  *
 11 |  * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 12 |  * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 13 |  * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 14 |  * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 15 |  * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 16 |  * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 17 |  * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 18 |  * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 19 |  * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 20 |  * OR PERFORMANCE OF THIS SOURCE CODE.
 21 |  *
 22 |  * U.S. Government End Users.   This source code is a "commercial item" as
 23 |  * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 24 |  * "commercial computer  software"  and "commercial computer software
 25 |  * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 26 |  * and is provided to the U.S. Government only as a commercial end item.
 27 |  * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 28 |  * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 29 |  * source code with only those rights set forth herein.
 30 |  *
 31 |  * Any use of this source code in individual and commercial software must
 32 |  * include, in the user documentation and internal comments to the code,
 33 |  * the above Disclaimer and U.S. Government End Users Notice.
 34 |  */
 35 | 
 36 | /* This example demonstrates how to use the CUBLAS library
 37 |  * by multiplying two single-precision matrices on the device
 38 |  * with cublasSgemm and comparing the result to the same
 39 |  * operation performed on the host.
 40 |  */
 41 | 
 42 | /* Includes, system */
 43 | #include <stdio.h>
 44 | #include <stdlib.h>
 45 | #include <string.h>
 46 | 
 47 | /* Includes, cuda */
 48 | #include <cublas_v2.h>
 49 | #include <cuda_runtime.h>
 50 | #include <helper_cuda.h>
 51 | 
 52 | /* Matrix size - Size increased for Tensor Core use. */
 53 | #define N (1024)
 54 | /* Matrix size - This has been kept small to reduce host runtime */
 55 | //#define N (512)
 56 | 
 57 | /* Host reference GEMM, C = alpha * A * B + beta * C, rounded through half */
 58 | static void simple_sgemm(int n, float alpha, const float *A, const float *B,
 59 |                          float beta, float *C) {
 60 |   for (int row = 0; row < n; ++row) {
 61 |     for (int col = 0; col < n; ++col) {
 62 |       float *out = &C[col * n + row]; /* element (row, col), updated in place */
 63 |       half acc = 0.0f;
 64 | 
 65 |       for (int inner = 0; inner < n; ++inner) {
 66 |         const half lhs = static_cast<half>(A[inner * n + row]);
 67 |         const half rhs = static_cast<half>(B[col * n + inner]);
 68 |         acc = acc + lhs * rhs;
 69 |       }
 70 | 
 71 |       *out = static_cast<float>(static_cast<half>(alpha) * static_cast<half>(acc) +
 72 |                                 static_cast<half>(beta) * static_cast<half>(*out));
 73 |     }
 74 |   }
 75 | }
 76 | 
 77 | /* Main: runs cublasSgemm on N x N matrices and compares it with simple_sgemm on the host */
 78 | int main(int argc, char **argv) {
 79 |   cublasStatus_t status;
 80 |   float *h_A;
 81 |   float *h_B;
 82 |   float *h_C;
 83 |   float *h_C_ref;
 84 |   float *d_A = 0;
 85 |   float *d_B = 0;
 86 |   float *d_C = 0;
 87 |   float alpha = 1.0f;
 88 |   float beta = 0.0f;
 89 |   int n2 = N * N; /* number of elements per matrix */
 90 |   int i;
 91 |   float error_norm;
 92 |   float ref_norm;
 93 |   float diff;
 94 |   cublasHandle_t handle;
 95 | 
 96 |   int dev = findCudaDevice(argc, (const char **)argv);
 97 | 
 98 |   if (dev == -1) {
 99 |     return EXIT_FAILURE;
100 |   }
101 | 
102 |   /* Initialize CUBLAS */
103 |   printf("simpleCUBLAS test running..\n");
104 | 
105 |   status = cublasCreate(&handle);
106 | 
107 |   if (status != CUBLAS_STATUS_SUCCESS) {
108 |     fprintf(stderr, "!!!! CUBLAS initialization error\n");
109 |     return EXIT_FAILURE;
110 |   }
111 | 
112 |   /* Allocate host memory for the matrices */
113 |   h_A = reinterpret_cast<float *>(malloc(n2 * sizeof(h_A[0])));
114 | 
115 |   if (h_A == 0) {
116 |     fprintf(stderr, "!!!! host memory allocation error (A)\n");
117 |     return EXIT_FAILURE;
118 |   }
119 | 
120 |   h_B = reinterpret_cast<float *>(malloc(n2 * sizeof(h_B[0])));
121 | 
122 |   if (h_B == 0) {
123 |     fprintf(stderr, "!!!! host memory allocation error (B)\n");
124 |     return EXIT_FAILURE;
125 |   }
126 | 
127 |   h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(h_C[0])));
128 | 
129 |   if (h_C == 0) {
130 |     fprintf(stderr, "!!!! host memory allocation error (C)\n");
131 |     return EXIT_FAILURE;
132 |   }
133 | 
134 |   /* Fill the matrices with test data: pseudo-random values in [0, 1] */
135 |   for (i = 0; i < n2; i++) {
136 |     h_A[i] = rand() / static_cast<float>(RAND_MAX);
137 |     h_B[i] = rand() / static_cast<float>(RAND_MAX);
138 |     h_C[i] = rand() / static_cast<float>(RAND_MAX);
139 |   }
140 | 
141 |   /* Allocate device memory for the matrices */
142 |   if (cudaMalloc(reinterpret_cast<void **>(&d_A), n2 * sizeof(d_A[0])) !=
143 |       cudaSuccess) {
144 |     fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
145 |     return EXIT_FAILURE;
146 |   }
147 | 
148 |   if (cudaMalloc(reinterpret_cast<void **>(&d_B), n2 * sizeof(d_B[0])) !=
149 |       cudaSuccess) {
150 |     fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
151 |     return EXIT_FAILURE;
152 |   }
153 | 
154 |   if (cudaMalloc(reinterpret_cast<void **>(&d_C), n2 * sizeof(d_C[0])) !=
155 |       cudaSuccess) {
156 |     fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
157 |     return EXIT_FAILURE;
158 |   }
159 | 
160 |   /* Initialize the device matrices with the host matrices */
161 |   status = cublasSetVector(n2, sizeof(h_A[0]), h_A, 1, d_A, 1);
162 | 
163 |   if (status != CUBLAS_STATUS_SUCCESS) {
164 |     fprintf(stderr, "!!!! device access error (write A)\n");
165 |     return EXIT_FAILURE;
166 |   }
167 | 
168 |   status = cublasSetVector(n2, sizeof(h_B[0]), h_B, 1, d_B, 1);
169 | 
170 |   if (status != CUBLAS_STATUS_SUCCESS) {
171 |     fprintf(stderr, "!!!! device access error (write B)\n");
172 |     return EXIT_FAILURE;
173 |   }
174 | 
175 |   status = cublasSetVector(n2, sizeof(h_C[0]), h_C, 1, d_C, 1);
176 | 
177 |   if (status != CUBLAS_STATUS_SUCCESS) {
178 |     fprintf(stderr, "!!!! device access error (write C)\n");
179 |     return EXIT_FAILURE;
180 |   }
181 | 
182 |   /* Enables automatic use of Tensor Cores from cublasSgemm */
183 |   status = cublasSetMathMode( handle, CUBLAS_TENSOR_OP_MATH );
184 | 
185 |   if (status != CUBLAS_STATUS_SUCCESS) {
186 |     fprintf(stderr, "!!!! failed to enable Tensor Cores\n");
187 |     return EXIT_FAILURE;
188 |   }
189 | 
190 |   /* Performs operation using plain C code; the result is written into h_C */
191 |   simple_sgemm(N, alpha, h_A, h_B, beta, h_C);
192 |   h_C_ref = h_C; /* keep the host result; h_C is pointed at a new buffer below */
193 | 
194 |   /* Performs operation using cublas */
195 |   status = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha, d_A,
196 |                        N, d_B, N, &beta, d_C, N);
197 | 
198 |   if (status != CUBLAS_STATUS_SUCCESS) {
199 |     fprintf(stderr, "!!!! kernel execution error.\n");
200 |     return EXIT_FAILURE;
201 |   }
202 | 
203 |   /* Allocate host memory for reading back the result from device memory */
204 |   h_C = reinterpret_cast<float *>(malloc(n2 * sizeof(h_C[0])));
205 | 
206 |   if (h_C == 0) {
207 |     fprintf(stderr, "!!!! host memory allocation error (C)\n");
208 |     return EXIT_FAILURE;
209 |   }
210 | 
211 |   /* Read the result back */
212 |   status = cublasGetVector(n2, sizeof(h_C[0]), d_C, 1, h_C, 1);
213 | 
214 |   if (status != CUBLAS_STATUS_SUCCESS) {
215 |     fprintf(stderr, "!!!! device access error (read C)\n");
216 |     return EXIT_FAILURE;
217 |   }
218 | 
219 |   /* Check result against reference: Euclidean norms of the difference and of the reference */
220 |   error_norm = 0;
221 |   ref_norm = 0;
222 | 
223 |   for (i = 0; i < n2; ++i) {
224 |     diff = h_C_ref[i] - h_C[i];
225 |     error_norm += diff * diff;
226 |     ref_norm += h_C_ref[i] * h_C_ref[i];
227 |   }
228 | 
229 |   error_norm = static_cast<float>(sqrt(static_cast<double>(error_norm)));
230 |   ref_norm = static_cast<float>(sqrt(static_cast<double>(ref_norm)));
231 | 
232 |   if (fabs(ref_norm) < 1e-7) { /* guards the division by ref_norm at the end */
233 |     fprintf(stderr, "!!!! reference norm is 0\n");
234 |     return EXIT_FAILURE;
235 |   }
236 | 
237 |   /* Memory clean up */
238 |   free(h_A);
239 |   free(h_B);
240 |   free(h_C);
241 |   free(h_C_ref);
242 | 
243 |   if (cudaFree(d_A) != cudaSuccess) {
244 |     fprintf(stderr, "!!!! memory free error (A)\n");
245 |     return EXIT_FAILURE;
246 |   }
247 | 
248 |   if (cudaFree(d_B) != cudaSuccess) {
249 |     fprintf(stderr, "!!!! memory free error (B)\n");
250 |     return EXIT_FAILURE;
251 |   }
252 | 
253 |   if (cudaFree(d_C) != cudaSuccess) {
254 |     fprintf(stderr, "!!!! memory free error (C)\n");
255 |     return EXIT_FAILURE;
256 |   }
257 | 
258 |   /* Shutdown */
259 |   status = cublasDestroy(handle);
260 | 
261 |   if (status != CUBLAS_STATUS_SUCCESS) {
262 |     fprintf(stderr, "!!!! shutdown error (A)\n");
263 |     return EXIT_FAILURE;
264 |   }
265 |   /* Pass when the relative error is below 1e-2 */
266 |   if (error_norm / ref_norm < 1e-2f) {
267 |     printf("simpleCUBLAS test passed.\n");
268 |     exit(EXIT_SUCCESS);
269 |   } else {
270 |     printf("simpleCUBLAS test failed.\n");
271 |     exit(EXIT_FAILURE);
272 |   }
273 | }
274 | 


--------------------------------------------------------------------------------
/simpleCUBLASEx/simpleCUBLAS.cpp:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NOTICE TO USER:
  5 |  *
  6 |  * This source code is subject to NVIDIA ownership rights under U.S. and
  7 |  * international Copyright laws.  Users and possessors of this source code
  8 |  * are hereby granted a nonexclusive, royalty-free license to use this code
  9 |  * in individual and commercial software.
 10 |  *
 11 |  * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 12 |  * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 13 |  * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 14 |  * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 15 |  * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 16 |  * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 17 |  * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 18 |  * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 19 |  * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
 20 |  * OR PERFORMANCE OF THIS SOURCE CODE.
 21 |  *
 22 |  * U.S. Government End Users.   This source code is a "commercial item" as
 23 |  * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
 24 |  * "commercial computer  software"  and "commercial computer software
 25 |  * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
 26 |  * and is provided to the U.S. Government only as a commercial end item.
 27 |  * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 28 |  * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 29 |  * source code with only those rights set forth herein.
 30 |  *
 31 |  * Any use of this source code in individual and commercial software must
 32 |  * include, in the user documentation and internal comments to the code,
 33 |  * the above Disclaimer and U.S. Government End Users Notice.
 34 |  */
 35 | 
 36 | /* This example demonstrates how to use the CUBLAS library
 37 |  * by multiplying two matrices on the device with cublasGemmEx
 38 |  * and comparing the result to the same operation performed
 39 |  * on the host.
 40 |  */
 41 | 
 42 | /* Includes, system */
 43 | #include <stdio.h>
 44 | #include <stdlib.h>
 45 | #include <string.h>
 46 | 
 47 | /* Includes, cuda */
 48 | #include <cublas_v2.h>
 49 | #include <cuda_runtime.h>
 50 | #include <helper_cuda.h>
 51 | 
 52 | /* Matrix size - Size increased for Tensor Core use. */
 53 | #define N (1024)
 54 | /* Alternative matrix size - kept small to reduce host runtime (disabled) */
 55 | //#define N (512)
 56 | 
 57 | /* Host reference GEMM: C = alpha * A * B + beta * C for n x n matrices whose
 58 |  * element (row, col) lives at [col * n + row]. Operands are cast to half
 59 |  * before use and the running dot product is kept in a half. */
 60 | static void simple_sgemm(int n, float alpha, const float *A, const float *B,
 61 |                          float beta, float *C) {
 62 |   for (int row = 0; row < n; ++row) {
 63 |     for (int col = 0; col < n; ++col) {
 64 |       float *out = &C[col * n + row];
 65 |       half acc = 0.0f;
 66 | 
 67 |       for (int inner = 0; inner < n; ++inner) {
 68 |         const half lhs = (half)A[inner * n + row];
 69 |         const half rhs = (half)B[col * n + inner];
 70 |         acc = acc + lhs * rhs;
 71 |       }
 72 |       *out = (float)((half)alpha * acc + (half)beta * (half)(*out));
 73 |     }
 74 |   }
 75 | }
 76 | 
 77 | /* Main
 78 |  *
 79 |  * Fills three N x N matrices with random values, computes the GEMM once on
 80 |  * the host with simple_sgemm and once on the device with cublasGemmEx, and
 81 |  * exits with EXIT_SUCCESS only when the relative error between the two
 82 |  * results is below 1e-2. A failing setup, transfer or cleanup step returns
 83 |  * EXIT_FAILURE early.
 84 |  */
 85 | int main(int argc, char **argv) {
 86 |   const int elem_count = N * N;
 87 |   const size_t matrix_bytes = elem_count * sizeof(float);
 88 | 
 89 |   /* Scalars passed to both GEMM implementations */
 90 |   float alpha = 1.0f;
 91 |   float beta = 0.0f;
 92 | 
 93 |   /* Device-side matrices, allocated further down */
 94 |   float *dev_a = 0;
 95 |   float *dev_b = 0;
 96 |   float *dev_c = 0;
 97 |   cublasHandle_t handle;
 98 | 
 99 |   /* Give up immediately when the device helper reports -1 */
100 |   if (findCudaDevice(argc, (const char **)argv) == -1) {
101 |     return EXIT_FAILURE;
102 |   }
103 | 
104 |   /* Bring up cuBLAS */
105 |   printf("simpleCUBLAS test running..\n");
106 | 
107 |   if (cublasCreate(&handle) != CUBLAS_STATUS_SUCCESS) {
108 |     fprintf(stderr, "!!!! CUBLAS initialization error\n");
109 |     return EXIT_FAILURE;
110 |   }
111 | 
112 |   /* Host buffers for A, B and C */
113 |   float *host_a = static_cast<float *>(malloc(matrix_bytes));
114 | 
115 |   if (!host_a) {
116 |     fprintf(stderr, "!!!! host memory allocation error (A)\n");
117 |     return EXIT_FAILURE;
118 |   }
119 | 
120 |   float *host_b = static_cast<float *>(malloc(matrix_bytes));
121 | 
122 |   if (!host_b) {
123 |     fprintf(stderr, "!!!! host memory allocation error (B)\n");
124 |     return EXIT_FAILURE;
125 |   }
126 | 
127 |   float *host_c = static_cast<float *>(malloc(matrix_bytes));
128 | 
129 |   if (!host_c) {
130 |     fprintf(stderr, "!!!! host memory allocation error (C)\n");
131 |     return EXIT_FAILURE;
132 |   }
133 | 
134 |   /* Populate all three matrices from rand(), scaled by RAND_MAX */
135 |   for (int idx = 0; idx < elem_count; ++idx) {
136 |     host_a[idx] = rand() / static_cast<float>(RAND_MAX);
137 |     host_b[idx] = rand() / static_cast<float>(RAND_MAX);
138 |     host_c[idx] = rand() / static_cast<float>(RAND_MAX);
139 |   }
140 | 
141 |   /* Device buffers for A, B and C */
142 |   if (cudaMalloc(reinterpret_cast<void **>(&dev_a), matrix_bytes) !=
143 |       cudaSuccess) {
144 |     fprintf(stderr, "!!!! device memory allocation error (allocate A)\n");
145 |     return EXIT_FAILURE;
146 |   }
147 | 
148 |   if (cudaMalloc(reinterpret_cast<void **>(&dev_b), matrix_bytes) !=
149 |       cudaSuccess) {
150 |     fprintf(stderr, "!!!! device memory allocation error (allocate B)\n");
151 |     return EXIT_FAILURE;
152 |   }
153 | 
154 |   if (cudaMalloc(reinterpret_cast<void **>(&dev_c), matrix_bytes) !=
155 |       cudaSuccess) {
156 |     fprintf(stderr, "!!!! device memory allocation error (allocate C)\n");
157 |     return EXIT_FAILURE;
158 |   }
159 | 
160 |   /* Copy the host matrices into the device buffers */
161 |   if (cublasSetVector(elem_count, sizeof(float), host_a, 1, dev_a, 1) !=
162 |       CUBLAS_STATUS_SUCCESS) {
163 |     fprintf(stderr, "!!!! device access error (write A)\n");
164 |     return EXIT_FAILURE;
165 |   }
166 | 
167 |   if (cublasSetVector(elem_count, sizeof(float), host_b, 1, dev_b, 1) !=
168 |       CUBLAS_STATUS_SUCCESS) {
169 |     fprintf(stderr, "!!!! device access error (write B)\n");
170 |     return EXIT_FAILURE;
171 |   }
172 | 
173 |   if (cublasSetVector(elem_count, sizeof(float), host_c, 1, dev_c, 1) !=
174 |       CUBLAS_STATUS_SUCCESS) {
175 |     fprintf(stderr, "!!!! device access error (write C)\n");
176 |     return EXIT_FAILURE;
177 |   }
178 | 
179 |   /* Host reference. simple_sgemm writes into host_c, so keep that buffer as
180 |    * the expected result before host_c is pointed at a new allocation. */
181 |   simple_sgemm(N, alpha, host_a, host_b, beta, host_c);
182 |   float *host_ref = host_c;
183 | 
184 |   /* The same product on the device through cuBLAS */
185 |   if (cublasGemmEx(handle, CUBLAS_OP_N, CUBLAS_OP_N, N, N, N, &alpha,
186 |                    dev_a, CUDA_R_32F, N,          /* A holds 32-bit floats */
187 |                    dev_b, CUDA_R_32F, N, &beta,   /* B holds 32-bit floats */
188 |                    dev_c, CUDA_R_32F, N,          /* C holds 32-bit floats */
189 |                    CUDA_R_32F,                    /* 32-bit compute type */
190 |                    CUBLAS_GEMM_DEFAULT_TENSOR_OP) /* tensor-op GEMM algorithm */
191 |       != CUBLAS_STATUS_SUCCESS) {
192 |     fprintf(stderr, "!!!! kernel execution error.\n");
193 |     return EXIT_FAILURE;
194 |   }
195 | 
196 |   /* Separate host buffer that receives the device result */
197 |   host_c = static_cast<float *>(malloc(matrix_bytes));
198 | 
199 |   if (!host_c) {
200 |     fprintf(stderr, "!!!! host memory allocation error (C)\n");
201 |     return EXIT_FAILURE;
202 |   }
203 | 
204 |   /* Fetch the device result */
205 |   if (cublasGetVector(elem_count, sizeof(float), dev_c, 1, host_c, 1) !=
206 |       CUBLAS_STATUS_SUCCESS) {
207 |     fprintf(stderr, "!!!! device access error (read C)\n");
208 |     return EXIT_FAILURE;
209 |   }
210 | 
211 |   /* Sum squared differences and squared reference values */
212 |   float error_sq = 0;
213 |   float ref_sq = 0;
214 | 
215 |   for (int idx = 0; idx < elem_count; ++idx) {
216 |     const float delta = host_ref[idx] - host_c[idx];
217 |     error_sq += delta * delta;
218 |     ref_sq += host_ref[idx] * host_ref[idx];
219 |   }
220 | 
221 |   /* The norms are the square roots of the accumulated sums */
222 |   const float error_norm =
223 |       static_cast<float>(sqrt(static_cast<double>(error_sq)));
224 |   const float ref_norm =
225 |       static_cast<float>(sqrt(static_cast<double>(ref_sq)));
226 | 
227 |   /* The ratio below divides by ref_norm, so reject a (near) zero value */
228 |   if (fabs(ref_norm) < 1e-7) {
229 |     fprintf(stderr, "!!!! reference norm is 0\n");
230 |     return EXIT_FAILURE;
231 |   }
232 | 
233 |   /* Release host buffers */
234 |   free(host_a);
235 |   free(host_b);
236 |   free(host_c);
237 |   free(host_ref);
238 | 
239 |   /* Release device buffers */
240 |   if (cudaFree(dev_a) != cudaSuccess) {
241 |     fprintf(stderr, "!!!! memory free error (A)\n");
242 |     return EXIT_FAILURE;
243 |   }
244 | 
245 |   if (cudaFree(dev_b) != cudaSuccess) {
246 |     fprintf(stderr, "!!!! memory free error (B)\n");
247 |     return EXIT_FAILURE;
248 |   }
249 | 
250 |   if (cudaFree(dev_c) != cudaSuccess) {
251 |     fprintf(stderr, "!!!! memory free error (C)\n");
252 |     return EXIT_FAILURE;
253 |   }
254 | 
255 |   /* Tear down cuBLAS */
256 |   if (cublasDestroy(handle) != CUBLAS_STATUS_SUCCESS) {
257 |     fprintf(stderr, "!!!! shutdown error (A)\n");
258 |     return EXIT_FAILURE;
259 |   }
260 | 
261 |   /* Fail unless the relative error is below 1e-2 */
262 |   if (!(error_norm / ref_norm < 1e-2f)) {
263 |     printf("simpleCUBLAS test failed.\n");
264 |     exit(EXIT_FAILURE);
265 |   }
266 | 
267 |   printf("simpleCUBLAS test passed.\n");
268 |   exit(EXIT_SUCCESS);
269 | }
270 | 


--------------------------------------------------------------------------------
/simpleCUBLASEx/Makefile:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | #
  3 | # Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  4 | #
  5 | # NOTICE TO USER:
  6 | #
  7 | # This source code is subject to NVIDIA ownership rights under U.S. and
  8 | # international Copyright laws.
  9 | #
 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 11 | # CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 12 | # IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 19 | # OR PERFORMANCE OF THIS SOURCE CODE.
 20 | #
 21 | # U.S. Government End Users.  This source code is a "commercial item" as
 22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
 23 | # "commercial computer software" and "commercial computer software
 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 25 | # and is provided to the U.S. Government only as a commercial end item.
 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 28 | # source code with only those rights set forth herein.
 29 | #
 30 | ################################################################################
 31 | #
 32 | # Makefile project only supported on Mac OS X and Linux Platforms
 33 | #
 34 | ################################################################################
 35 | 
 36 | # Location of the CUDA Toolkit (?= keeps a CUDA_PATH already set by the caller)
 37 | CUDA_PATH ?= /usr/local/cuda-10.1
 38 | 
 39 | ##############################
 40 | # start deprecated interface #
 41 | ##############################
 42 | ifeq ($(x86_64),1)
 43 |     $(info WARNING - x86_64 variable has been deprecated)
 44 |     $(info WARNING - please use TARGET_ARCH=x86_64 instead)
 45 |     TARGET_ARCH ?= x86_64
 46 | endif
 47 | ifeq ($(ARMv7),1)
 48 |     $(info WARNING - ARMv7 variable has been deprecated)
 49 |     $(info WARNING - please use TARGET_ARCH=armv7l instead)
 50 |     TARGET_ARCH ?= armv7l
 51 | endif
 52 | ifeq ($(aarch64),1)
 53 |     $(info WARNING - aarch64 variable has been deprecated)
 54 |     $(info WARNING - please use TARGET_ARCH=aarch64 instead)
 55 |     TARGET_ARCH ?= aarch64
 56 | endif
 57 | ifeq ($(ppc64le),1)
 58 |     $(info WARNING - ppc64le variable has been deprecated)
 59 |     $(info WARNING - please use TARGET_ARCH=ppc64le instead)
 60 |     TARGET_ARCH ?= ppc64le
 61 | endif
 62 | ifneq ($(GCC),)
 63 |     $(info WARNING - GCC variable has been deprecated)
 64 |     $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
 65 |     HOST_COMPILER ?= $(GCC)
 66 | endif
 67 | ifneq ($(abi),)
 68 |     $(error ERROR - abi variable has been removed)
 69 | endif
 70 | ############################
 71 | # end deprecated interface #
 72 | ############################
 73 | 
 74 | # architecture: TARGET_ARCH defaults to the host (uname -m); TARGET_SIZE is set to 64 or 32 for cross builds, else taken from getconf LONG_BIT
 75 | HOST_ARCH   := $(shell uname -m)
 76 | TARGET_ARCH ?= $(HOST_ARCH)
 77 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 78 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 79 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 80 |             TARGET_SIZE := 64
 81 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 82 |             TARGET_SIZE := 32
 83 |         endif
 84 |     else
 85 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 86 |     endif
 87 | else
 88 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 89 | endif
 90 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 91 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 92 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 93 |     endif
 94 | endif
 95 | 
 96 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
 97 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
 98 |     TARGET_ARCH = armv7l
 99 | endif
100 | 
101 | # operating system: TARGET_OS defaults to the lower-cased host `uname -s`; only linux, darwin, qnx and android are accepted
102 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
103 | TARGET_OS ?= $(HOST_OS)
104 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
105 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
106 | endif
107 | 
108 | # host compiler: pick a default for darwin and for each cross-compile pair, fall back to g++, and hand it to nvcc via -ccbin
109 | ifeq ($(TARGET_OS),darwin)
110 |     ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
111 |         HOST_COMPILER ?= clang++
112 |     endif
113 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
114 |     ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
115 |         ifeq ($(TARGET_OS),linux)
116 |             HOST_COMPILER ?= arm-linux-gnueabihf-g++
117 |         else ifeq ($(TARGET_OS),qnx)
118 |             ifeq ($(QNX_HOST),)
119 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
120 |             endif
121 |             ifeq ($(QNX_TARGET),)
122 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
123 |             endif
124 |             export QNX_HOST
125 |             export QNX_TARGET
126 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
127 |         else ifeq ($(TARGET_OS),android)
128 |             HOST_COMPILER ?= arm-linux-androideabi-g++
129 |         endif
130 |     else ifeq ($(TARGET_ARCH),aarch64)
131 |         ifeq ($(TARGET_OS), linux)
132 |             HOST_COMPILER ?= aarch64-linux-gnu-g++
133 |         else ifeq ($(TARGET_OS),qnx)
134 |             ifeq ($(QNX_HOST),)
135 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
136 |             endif
137 |             ifeq ($(QNX_TARGET),)
138 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
139 |             endif
140 |             export QNX_HOST
141 |             export QNX_TARGET
142 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
143 |         else ifeq ($(TARGET_OS), android)
144 |             HOST_COMPILER ?= aarch64-linux-android-clang++
145 |         endif
146 |     else ifeq ($(TARGET_ARCH),ppc64le)
147 |         HOST_COMPILER ?= powerpc64le-linux-gnu-g++
148 |     endif
149 | endif
150 | HOST_COMPILER ?= g++
151 | NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
152 | 
153 | # internal flags: NVCCFLAGS goes to nvcc directly; CCFLAGS and LDFLAGS are forwarded later with -Xcompiler / -Xlinker
154 | NVCCFLAGS   := -m${TARGET_SIZE}
155 | CCFLAGS     :=
156 | LDFLAGS     :=
157 | 
158 | # build flags that depend on the target OS or on the host/target pair
159 | ifeq ($(TARGET_OS),darwin)
160 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
161 |     CCFLAGS += -arch $(HOST_ARCH)
162 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
163 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
164 |     CCFLAGS += -mfloat-abi=hard
165 | else ifeq ($(TARGET_OS),android)
166 |     LDFLAGS += -pie
167 |     CCFLAGS += -fpie -fpic -fexceptions
168 | endif
169 | 
170 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
171 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
172 |         ifneq ($(TARGET_FS),)
173 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
174 |             ifeq ($(GCCVERSIONLTEQ46),1)
175 |                 CCFLAGS += --sysroot=$(TARGET_FS)
176 |             endif
177 |             LDFLAGS += --sysroot=$(TARGET_FS)
178 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
179 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
180 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
181 |         endif
182 |     endif
183 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
184 |         ifneq ($(TARGET_FS),)
185 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
186 |             ifeq ($(GCCVERSIONLTEQ46),1)
187 |                 CCFLAGS += --sysroot=$(TARGET_FS)
188 |             endif
189 |             LDFLAGS += --sysroot=$(TARGET_FS)
190 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
191 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
192 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
193 |             LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
194 |             CCFLAGS += -isystem=$(TARGET_FS)/usr/include
195 |             CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
196 |         endif
197 |     endif
198 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
199 |         CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
200 |         LDFLAGS += -lsocket
201 |         LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu
202 |     endif
203 | endif
204 | 
205 | # Per-target install sub-directory of the toolkit (set here but not referenced by any other line of this Makefile)
206 | CUDA_INSTALL_TARGET_DIR :=
207 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
208 |     CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
209 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
210 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
211 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
212 |     CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
213 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
214 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
215 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
216 |     CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
217 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
218 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
219 | else ifeq ($(TARGET_ARCH),ppc64le)
220 |     CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
221 | endif
222 | 
223 | # Debug build flags: dbg=1 adds -g -G to NVCCFLAGS; BUILD_TYPE records which variant was chosen
224 | ifeq ($(dbg),1)
225 |       NVCCFLAGS += -g -G
226 |       BUILD_TYPE := debug
227 | else
228 |       BUILD_TYPE := release
229 | endif
230 | 
231 | ALL_CCFLAGS :=
232 | ALL_CCFLAGS += $(NVCCFLAGS)
233 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
234 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
235 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
236 | 
237 | ALL_LDFLAGS :=
238 | ALL_LDFLAGS += $(ALL_CCFLAGS)
239 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
240 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
241 | 
242 | # Include search path (this directory only) and the list of link libraries, filled in below
243 | INCLUDES  := -I.
244 | LIBRARIES :=
245 | 
246 | ################################################################################
247 | 
248 | # Gencode arguments: SMS lists the SM versions to build for (empty unless the caller sets it); a caller-supplied GENCODE_FLAGS skips this whole section
249 | SMS ?=
250 | 
251 | ifeq ($(GENCODE_FLAGS),)
252 | # Generate SASS code for each SM architecture listed in $(SMS)
253 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
254 | 
255 | ifeq ($(SMS),)
256 | # No SMS given: generate only PTX code for compute_70
257 | GENCODE_FLAGS += -gencode arch=compute_70,code=compute_70
258 | endif
259 | 
260 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
261 | HIGHEST_SM := $(lastword $(sort $(SMS)))
262 | ifneq ($(HIGHEST_SM),)
263 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
264 | endif
265 | endif
266 | 
267 | LIBRARIES += -lcublas
268 | 
269 | ################################################################################
270 | 
271 | # Target rules. all/build/run/clean/clobber are actions rather than files,
272 | # so they are declared phony and still run if a same-named file exists.
273 | .PHONY: all build run clean clobber
274 | all: build
275 | build: simpleCUBLAS
276 | 
277 | simpleCUBLAS.o:simpleCUBLAS.cpp
278 | 	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
279 | 
280 | simpleCUBLAS: simpleCUBLAS.o
281 | 	$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
282 | 
283 | run: build
284 | 	./simpleCUBLAS
285 | 
286 | clean:
287 | 	rm -f simpleCUBLAS simpleCUBLAS.o
288 | clobber: clean
289 | 


--------------------------------------------------------------------------------
/simpleCUBLASHgemm/Makefile:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | #
  3 | # Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  4 | #
  5 | # NOTICE TO USER:
  6 | #
  7 | # This source code is subject to NVIDIA ownership rights under U.S. and
  8 | # international Copyright laws.
  9 | #
 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 11 | # CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 12 | # IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 19 | # OR PERFORMANCE OF THIS SOURCE CODE.
 20 | #
 21 | # U.S. Government End Users.  This source code is a "commercial item" as
 22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
 23 | # "commercial computer software" and "commercial computer software
 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 25 | # and is provided to the U.S. Government only as a commercial end item.
 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 28 | # source code with only those rights set forth herein.
 29 | #
 30 | ################################################################################
 31 | #
 32 | # Makefile project only supported on Mac OS X and Linux Platforms
 33 | #
 34 | ################################################################################
 35 | 
 36 | # Location of the CUDA Toolkit (?= keeps a CUDA_PATH already set by the caller)
 37 | CUDA_PATH ?= /usr/local/cuda-10.1
 38 | 
 39 | ##############################
 40 | # start deprecated interface #
 41 | ##############################
 42 | ifeq ($(x86_64),1)
 43 |     $(info WARNING - x86_64 variable has been deprecated)
 44 |     $(info WARNING - please use TARGET_ARCH=x86_64 instead)
 45 |     TARGET_ARCH ?= x86_64
 46 | endif
 47 | ifeq ($(ARMv7),1)
 48 |     $(info WARNING - ARMv7 variable has been deprecated)
 49 |     $(info WARNING - please use TARGET_ARCH=armv7l instead)
 50 |     TARGET_ARCH ?= armv7l
 51 | endif
 52 | ifeq ($(aarch64),1)
 53 |     $(info WARNING - aarch64 variable has been deprecated)
 54 |     $(info WARNING - please use TARGET_ARCH=aarch64 instead)
 55 |     TARGET_ARCH ?= aarch64
 56 | endif
 57 | ifeq ($(ppc64le),1)
 58 |     $(info WARNING - ppc64le variable has been deprecated)
 59 |     $(info WARNING - please use TARGET_ARCH=ppc64le instead)
 60 |     TARGET_ARCH ?= ppc64le
 61 | endif
 62 | ifneq ($(GCC),)
 63 |     $(info WARNING - GCC variable has been deprecated)
 64 |     $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
 65 |     HOST_COMPILER ?= $(GCC)
 66 | endif
 67 | ifneq ($(abi),)
 68 |     $(error ERROR - abi variable has been removed)
 69 | endif
 70 | ############################
 71 | # end deprecated interface #
 72 | ############################
 73 | 
 74 | # architecture: TARGET_ARCH defaults to the host (uname -m); TARGET_SIZE is set to 64 or 32 for cross builds, else taken from getconf LONG_BIT
 75 | HOST_ARCH   := $(shell uname -m)
 76 | TARGET_ARCH ?= $(HOST_ARCH)
 77 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 78 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 79 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 80 |             TARGET_SIZE := 64
 81 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 82 |             TARGET_SIZE := 32
 83 |         endif
 84 |     else
 85 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 86 |     endif
 87 | else
 88 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 89 | endif
 90 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 91 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 92 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 93 |     endif
 94 | endif
 95 | 
 96 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
 97 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
 98 |     TARGET_ARCH = armv7l
 99 | endif
100 | 
101 | # operating system: TARGET_OS defaults to the lower-cased host `uname -s`; only linux, darwin, qnx and android are accepted
102 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
103 | TARGET_OS ?= $(HOST_OS)
104 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
105 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
106 | endif
107 | 
108 | # host compiler: pick a default for darwin and for each cross-compile pair, fall back to g++, and hand it to nvcc via -ccbin
109 | ifeq ($(TARGET_OS),darwin)
110 |     ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
111 |         HOST_COMPILER ?= clang++
112 |     endif
113 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
114 |     ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
115 |         ifeq ($(TARGET_OS),linux)
116 |             HOST_COMPILER ?= arm-linux-gnueabihf-g++
117 |         else ifeq ($(TARGET_OS),qnx)
118 |             ifeq ($(QNX_HOST),)
119 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
120 |             endif
121 |             ifeq ($(QNX_TARGET),)
122 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
123 |             endif
124 |             export QNX_HOST
125 |             export QNX_TARGET
126 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
127 |         else ifeq ($(TARGET_OS),android)
128 |             HOST_COMPILER ?= arm-linux-androideabi-g++
129 |         endif
130 |     else ifeq ($(TARGET_ARCH),aarch64)
131 |         ifeq ($(TARGET_OS), linux)
132 |             HOST_COMPILER ?= aarch64-linux-gnu-g++
133 |         else ifeq ($(TARGET_OS),qnx)
134 |             ifeq ($(QNX_HOST),)
135 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
136 |             endif
137 |             ifeq ($(QNX_TARGET),)
138 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
139 |             endif
140 |             export QNX_HOST
141 |             export QNX_TARGET
142 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
143 |         else ifeq ($(TARGET_OS), android)
144 |             HOST_COMPILER ?= aarch64-linux-android-clang++
145 |         endif
146 |     else ifeq ($(TARGET_ARCH),ppc64le)
147 |         HOST_COMPILER ?= powerpc64le-linux-gnu-g++
148 |     endif
149 | endif
150 | HOST_COMPILER ?= g++
151 | NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
152 | 
153 | # internal flags: NVCCFLAGS goes to nvcc directly; CCFLAGS and LDFLAGS are forwarded later with -Xcompiler / -Xlinker
154 | NVCCFLAGS   := -m${TARGET_SIZE}
155 | CCFLAGS     :=
156 | LDFLAGS     :=
157 | 
158 | # build flags that depend on the target OS or on the host/target pair
159 | ifeq ($(TARGET_OS),darwin)
160 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
161 |     CCFLAGS += -arch $(HOST_ARCH)
162 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
163 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
164 |     CCFLAGS += -mfloat-abi=hard
165 | else ifeq ($(TARGET_OS),android)
166 |     LDFLAGS += -pie
167 |     CCFLAGS += -fpie -fpic -fexceptions
168 | endif
169 | 
170 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
171 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
172 |         ifneq ($(TARGET_FS),)
173 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
174 |             ifeq ($(GCCVERSIONLTEQ46),1)
175 |                 CCFLAGS += --sysroot=$(TARGET_FS)
176 |             endif
177 |             LDFLAGS += --sysroot=$(TARGET_FS)
178 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
179 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
180 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
181 |         endif
182 |     endif
183 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
184 |         ifneq ($(TARGET_FS),)
185 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
186 |             ifeq ($(GCCVERSIONLTEQ46),1)
187 |                 CCFLAGS += --sysroot=$(TARGET_FS)
188 |             endif
189 |             LDFLAGS += --sysroot=$(TARGET_FS)
190 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
191 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
192 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
193 |             LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
194 |             CCFLAGS += -isystem=$(TARGET_FS)/usr/include
195 |             CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
196 |         endif
197 |     endif
198 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
199 |         CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
200 |         LDFLAGS += -lsocket
201 |         LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu
202 |     endif
203 | endif
204 | 
205 | # Per-target install sub-directory of the toolkit (set here but not referenced by any other line of this Makefile)
206 | CUDA_INSTALL_TARGET_DIR :=
207 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
208 |     CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
209 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
210 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
211 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
212 |     CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
213 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
214 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
215 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
216 |     CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
217 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
218 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
219 | else ifeq ($(TARGET_ARCH),ppc64le)
220 |     CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
221 | endif
222 | 
223 | # Debug build flags: dbg=1 adds -g -G to NVCCFLAGS; BUILD_TYPE records which variant was chosen
224 | ifeq ($(dbg),1)
225 |       NVCCFLAGS += -g -G
226 |       BUILD_TYPE := debug
227 | else
228 |       BUILD_TYPE := release
229 | endif
230 | 
231 | ALL_CCFLAGS :=
232 | ALL_CCFLAGS += $(NVCCFLAGS)
233 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
234 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
235 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
236 | 
237 | ALL_LDFLAGS :=
238 | ALL_LDFLAGS += $(ALL_CCFLAGS)
239 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
240 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
241 | 
242 | # Include search path (this directory only) and the list of link libraries, filled in below
243 | INCLUDES  := -I.
244 | LIBRARIES :=
245 | 
246 | ################################################################################
247 | 
248 | # Gencode arguments: SMS lists the SM versions to build for (empty unless the caller sets it); a caller-supplied GENCODE_FLAGS skips this whole section
249 | SMS ?=
250 | 
251 | ifeq ($(GENCODE_FLAGS),)
252 | # Generate SASS code for each SM architecture listed in $(SMS)
253 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
254 | 
255 | ifeq ($(SMS),)
256 | # No SMS given: generate only PTX code for compute_70
257 | GENCODE_FLAGS += -gencode arch=compute_70,code=compute_70
258 | endif
259 | 
260 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility
261 | HIGHEST_SM := $(lastword $(sort $(SMS)))
262 | ifneq ($(HIGHEST_SM),)
263 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
264 | endif
265 | endif
266 | 
267 | LIBRARIES += -lcublas
268 | 
269 | ################################################################################
270 | # Target rules (all, build, run, clean and clobber name actions, not files, so they are declared phony)
271 | .PHONY: all build run clean clobber
272 | all: build
273 | 
274 | build: simpleCUBLAS
275 | 
276 | simpleCUBLAS.o:simpleCUBLAS.cpp
277 | 	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
278 | 
279 | simpleCUBLAS: simpleCUBLAS.o
280 | 	$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
281 | 
282 | run: build
283 | 	./simpleCUBLAS
284 | 
285 | clean:
286 | 	rm -f simpleCUBLAS simpleCUBLAS.o
287 | 
288 | clobber: clean
289 | 


--------------------------------------------------------------------------------
/simpleCUBLASSgemm/Makefile:
--------------------------------------------------------------------------------
  1 | ################################################################################
  2 | #
  3 | # Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
  4 | #
  5 | # NOTICE TO USER:
  6 | #
  7 | # This source code is subject to NVIDIA ownership rights under U.S. and
  8 | # international Copyright laws.
  9 | #
 10 | # NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
 11 | # CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
 12 | # IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
 13 | # REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
 14 | # MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 15 | # IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
 16 | # OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
 17 | # OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
 18 | # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
 19 | # OR PERFORMANCE OF THIS SOURCE CODE.
 20 | #
 21 | # U.S. Government End Users.  This source code is a "commercial item" as
 22 | # that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting  of
 23 | # "commercial computer software" and "commercial computer software
 24 | # documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
 25 | # and is provided to the U.S. Government only as a commercial end item.
 26 | # Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
 27 | # 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
 28 | # source code with only those rights set forth herein.
 29 | #
 30 | ################################################################################
 31 | #
 32 | # Makefile project (only supported on Mac OS X and Linux platforms)
 33 | #
 34 | ################################################################################
 35 | 
 36 | # Location of the CUDA Toolkit (?= only sets it when CUDA_PATH is not already defined)
 37 | CUDA_PATH ?= /usr/local/cuda-10.1
 38 | # Legacy switches (x86_64=1, ARMv7=1, aarch64=1, ppc64le=1, GCC=...) still work but print a warning; abi=... is a hard error
 39 | ##############################
 40 | # start deprecated interface #
 41 | ##############################
 42 | ifeq ($(x86_64),1)
 43 |     $(info WARNING - x86_64 variable has been deprecated)
 44 |     $(info WARNING - please use TARGET_ARCH=x86_64 instead)
 45 |     TARGET_ARCH ?= x86_64
 46 | endif
 47 | ifeq ($(ARMv7),1)
 48 |     $(info WARNING - ARMv7 variable has been deprecated)
 49 |     $(info WARNING - please use TARGET_ARCH=armv7l instead)
 50 |     TARGET_ARCH ?= armv7l
 51 | endif
 52 | ifeq ($(aarch64),1)
 53 |     $(info WARNING - aarch64 variable has been deprecated)
 54 |     $(info WARNING - please use TARGET_ARCH=aarch64 instead)
 55 |     TARGET_ARCH ?= aarch64
 56 | endif
 57 | ifeq ($(ppc64le),1)
 58 |     $(info WARNING - ppc64le variable has been deprecated)
 59 |     $(info WARNING - please use TARGET_ARCH=ppc64le instead)
 60 |     TARGET_ARCH ?= ppc64le
 61 | endif
 62 | ifneq ($(GCC),)
 63 |     $(info WARNING - GCC variable has been deprecated)
 64 |     $(info WARNING - please use HOST_COMPILER=$(GCC) instead)
 65 |     HOST_COMPILER ?= $(GCC)
 66 | endif
 67 | ifneq ($(abi),)
 68 |     $(error ERROR - abi variable has been removed)
 69 | endif
 70 | ############################
 71 | # end deprecated interface #
 72 | ############################
 73 | 
 74 | # architecture: TARGET_ARCH defaults to the host's `uname -m`; TARGET_SIZE is 64 or 32 for cross builds and `getconf LONG_BIT` for native builds
 75 | HOST_ARCH   := $(shell uname -m)
 76 | TARGET_ARCH ?= $(HOST_ARCH)
 77 | ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le armv7l))
 78 |     ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 79 |         ifneq (,$(filter $(TARGET_ARCH),x86_64 aarch64 ppc64le))
 80 |             TARGET_SIZE := 64
 81 |         else ifneq (,$(filter $(TARGET_ARCH),armv7l))
 82 |             TARGET_SIZE := 32
 83 |         endif
 84 |     else
 85 |         TARGET_SIZE := $(shell getconf LONG_BIT)
 86 |     endif
 87 | else
 88 |     $(error ERROR - unsupported value $(TARGET_ARCH) for TARGET_ARCH!)
 89 | endif
 90 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
 91 |     ifeq (,$(filter $(HOST_ARCH)-$(TARGET_ARCH),aarch64-armv7l x86_64-armv7l x86_64-aarch64 x86_64-ppc64le))
 92 |         $(error ERROR - cross compiling from $(HOST_ARCH) to $(TARGET_ARCH) is not supported!)
 93 |     endif
 94 | endif
 95 | 
 96 | # When on native aarch64 system with userspace of 32-bit, change TARGET_ARCH to armv7l
 97 | ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_SIZE),aarch64-aarch64-32)
 98 |     TARGET_ARCH = armv7l
 99 | endif
100 | 
101 | # operating system: TARGET_OS defaults to the lower-cased `uname -s`; only linux, darwin, qnx and android are accepted
102 | HOST_OS   := $(shell uname -s 2>/dev/null | tr "[:upper:]" "[:lower:]")
103 | TARGET_OS ?= $(HOST_OS)
104 | ifeq (,$(filter $(TARGET_OS),linux darwin qnx android))
105 |     $(error ERROR - unsupported value $(TARGET_OS) for TARGET_OS!)
106 | endif
107 | 
108 | # host compiler: default is clang++ on darwin with Xcode major version >= 5, a target-specific cross compiler when cross-compiling, otherwise g++
109 | ifeq ($(TARGET_OS),darwin)
110 |     ifeq ($(shell expr `xcodebuild -version | grep -i xcode | awk '{print $$2}' | cut -d'.' -f1` \>= 5),1)
111 |         HOST_COMPILER ?= clang++
112 |     endif
113 | else ifneq ($(TARGET_ARCH),$(HOST_ARCH))
114 |     ifeq ($(HOST_ARCH)-$(TARGET_ARCH),x86_64-armv7l)
115 |         ifeq ($(TARGET_OS),linux)
116 |             HOST_COMPILER ?= arm-linux-gnueabihf-g++
117 |         else ifeq ($(TARGET_OS),qnx)
118 |             ifeq ($(QNX_HOST),)
119 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
120 |             endif
121 |             ifeq ($(QNX_TARGET),)
122 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
123 |             endif
124 |             export QNX_HOST
125 |             export QNX_TARGET
126 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/arm-unknown-nto-qnx6.6.0eabi-g++
127 |         else ifeq ($(TARGET_OS),android)
128 |             HOST_COMPILER ?= arm-linux-androideabi-g++
129 |         endif
130 |     else ifeq ($(TARGET_ARCH),aarch64)
131 |         ifeq ($(TARGET_OS), linux)
132 |             HOST_COMPILER ?= aarch64-linux-gnu-g++
133 |         else ifeq ($(TARGET_OS),qnx)
134 |             ifeq ($(QNX_HOST),)
135 |                 $(error ERROR - QNX_HOST must be passed to the QNX host toolchain)
136 |             endif
137 |             ifeq ($(QNX_TARGET),)
138 |                 $(error ERROR - QNX_TARGET must be passed to the QNX target toolchain)
139 |             endif
140 |             export QNX_HOST
141 |             export QNX_TARGET
142 |             HOST_COMPILER ?= $(QNX_HOST)/usr/bin/aarch64-unknown-nto-qnx7.0.0-g++
143 |         else ifeq ($(TARGET_OS), android)
144 |             HOST_COMPILER ?= aarch64-linux-android-clang++
145 |         endif
146 |     else ifeq ($(TARGET_ARCH),ppc64le)
147 |         HOST_COMPILER ?= powerpc64le-linux-gnu-g++
148 |     endif
149 | endif
150 | HOST_COMPILER ?= g++
151 | NVCC          := $(CUDA_PATH)/bin/nvcc -ccbin $(HOST_COMPILER)
152 | 
153 | # internal flags: NVCCFLAGS go to nvcc directly; CCFLAGS and LDFLAGS are wrapped with -Xcompiler / -Xlinker further down
154 | NVCCFLAGS   := -m${TARGET_SIZE}
155 | CCFLAGS     :=
156 | LDFLAGS     :=
157 | 
158 | # build flags selected by target OS (darwin, android) or by the x86_64 -> armv7l linux cross build
159 | ifeq ($(TARGET_OS),darwin)
160 |     LDFLAGS += -rpath $(CUDA_PATH)/lib
161 |     CCFLAGS += -arch $(HOST_ARCH)
162 | else ifeq ($(HOST_ARCH)-$(TARGET_ARCH)-$(TARGET_OS),x86_64-armv7l-linux)
163 |     LDFLAGS += --dynamic-linker=/lib/ld-linux-armhf.so.3
164 |     CCFLAGS += -mfloat-abi=hard
165 | else ifeq ($(TARGET_OS),android)
166 |     LDFLAGS += -pie
167 |     CCFLAGS += -fpie -fpic -fexceptions
168 | endif
169 | # cross builds only: sysroot, rpath-link and include flags (the armv7l/aarch64 linux parts apply only when TARGET_FS is set)
170 | ifneq ($(TARGET_ARCH),$(HOST_ARCH))
171 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
172 |         ifneq ($(TARGET_FS),)
173 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
174 |             ifeq ($(GCCVERSIONLTEQ46),1)
175 |                 CCFLAGS += --sysroot=$(TARGET_FS)
176 |             endif
177 |             LDFLAGS += --sysroot=$(TARGET_FS)
178 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib
179 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib
180 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/arm-linux-gnueabihf
181 |         endif
182 |     endif
183 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
184 |         ifneq ($(TARGET_FS),)
185 |             GCCVERSIONLTEQ46 := $(shell expr `$(HOST_COMPILER) -dumpversion` \<= 4.6)
186 |             ifeq ($(GCCVERSIONLTEQ46),1)
187 |                 CCFLAGS += --sysroot=$(TARGET_FS)
188 |             endif
189 |             LDFLAGS += --sysroot=$(TARGET_FS)
190 |             LDFLAGS += -rpath-link=$(TARGET_FS)/lib -L $(TARGET_FS)/lib
191 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib -L $(TARGET_FS)/usr/lib
192 |             LDFLAGS += -rpath-link=$(TARGET_FS)/usr/lib/aarch64-linux-gnu -L $(TARGET_FS)/usr/lib/aarch64-linux-gnu
193 |             LDFLAGS += --unresolved-symbols=ignore-in-shared-libs
194 |             CCFLAGS += -isystem=$(TARGET_FS)/usr/include
195 |             CCFLAGS += -isystem=$(TARGET_FS)/usr/include/aarch64-linux-gnu
196 |         endif
197 |     endif
198 |     ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
199 |         CCFLAGS += -DWIN_INTERFACE_CUSTOM -I/usr/include/aarch64-qnx-gnu
200 |         LDFLAGS += -lsocket
201 |         LDFLAGS += -rpath=/usr/lib/aarch64-qnx-gnu -L/usr/lib/aarch64-qnx-gnu
202 |     endif
203 | endif
204 | 
205 | # Install directory of different arch (CUDA_INSTALL_TARGET_DIR is set here but not referenced by any rule in this Makefile)
206 | CUDA_INSTALL_TARGET_DIR :=
207 | ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-linux)
208 |     CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-gnueabihf/
209 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-linux)
210 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux/
211 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-android)
212 |     CUDA_INSTALL_TARGET_DIR = targets/armv7-linux-androideabi/
213 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-android)
214 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-linux-androideabi/
215 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),armv7l-qnx)
216 |     CUDA_INSTALL_TARGET_DIR = targets/ARMv7-linux-QNX/
217 | else ifeq ($(TARGET_ARCH)-$(TARGET_OS),aarch64-qnx)
218 |     CUDA_INSTALL_TARGET_DIR = targets/aarch64-qnx/
219 | else ifeq ($(TARGET_ARCH),ppc64le)
220 |     CUDA_INSTALL_TARGET_DIR = targets/ppc64le-linux/
221 | endif
222 | 
223 | # Debug build flags: dbg=1 adds -g -G to the nvcc flags; BUILD_TYPE records debug or release
224 | ifeq ($(dbg),1)
225 |       NVCCFLAGS += -g -G
226 |       BUILD_TYPE := debug
227 | else
228 |       BUILD_TYPE := release
229 | endif
230 | # Compile flags: nvcc flags as-is, host-compiler flags each prefixed with -Xcompiler
231 | ALL_CCFLAGS :=
232 | ALL_CCFLAGS += $(NVCCFLAGS)
233 | ALL_CCFLAGS += $(EXTRA_NVCCFLAGS)
234 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(CCFLAGS))
235 | ALL_CCFLAGS += $(addprefix -Xcompiler ,$(EXTRA_CCFLAGS))
236 | # Link flags: all compile flags, plus linker flags each prefixed with -Xlinker
237 | ALL_LDFLAGS :=
238 | ALL_LDFLAGS += $(ALL_CCFLAGS)
239 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(LDFLAGS))
240 | ALL_LDFLAGS += $(addprefix -Xlinker ,$(EXTRA_LDFLAGS))
241 | 
242 | # Common includes and paths for CUDA (LIBRARIES starts empty and is extended further down)
243 | INCLUDES  := -I.
244 | LIBRARIES :=
245 | 
246 | ################################################################################
247 | 
248 | # Gencode arguments: SMS is a space-separated list of SM versions; a non-empty GENCODE_FLAGS skips this whole section
249 | SMS ?=
250 | 
251 | ifeq ($(GENCODE_FLAGS),)
252 | # Generate SASS code for each SM architecture listed in $(SMS)
253 | $(foreach sm,$(SMS),$(eval GENCODE_FLAGS += -gencode arch=compute_$(sm),code=sm_$(sm)))
254 | 
255 | ifeq ($(SMS),)
256 | # Generate PTX code from SM 70 (default used when SMS is empty)
257 | GENCODE_FLAGS += -gencode arch=compute_70,code=compute_70
258 | endif
259 | 
260 | # Generate PTX code from the highest SM architecture in $(SMS) to guarantee forward-compatibility (note: sort orders words lexically, not numerically)
261 | HIGHEST_SM := $(lastword $(sort $(SMS)))
262 | ifneq ($(HIGHEST_SM),)
263 | GENCODE_FLAGS += -gencode arch=compute_$(HIGHEST_SM),code=compute_$(HIGHEST_SM)
264 | endif
265 | endif
266 | 
267 | LIBRARIES += -lcublas
268 | 
269 | ################################################################################
270 | # Target rules (all, build, run, clean and clobber name actions, not files, so they are declared phony)
271 | .PHONY: all build run clean clobber
272 | all: build
273 | 
274 | build: simpleCUBLAS
275 | 
276 | simpleCUBLAS.o:simpleCUBLAS.cpp
277 | 	$(NVCC) $(INCLUDES) $(ALL_CCFLAGS) $(GENCODE_FLAGS) -o $@ -c $<
278 | 
279 | simpleCUBLAS: simpleCUBLAS.o
280 | 	$(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
281 | 
282 | run: build
283 | 	./simpleCUBLAS
284 | 
285 | clean:
286 | 	rm -f simpleCUBLAS simpleCUBLAS.o
287 | 
288 | clobber: clean
289 | 


--------------------------------------------------------------------------------
/simpleCUBLASEx/helper_cuda.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | ////////////////////////////////////////////////////////////////////////////////
 13 | // These are CUDA Helper functions for initialization and error checking
 14 | 
 15 | #ifndef COMMON_HELPER_CUDA_H_
 16 | #define COMMON_HELPER_CUDA_H_
 17 | 
 18 | #pragma once
 19 | 
 20 | #include <stdint.h>
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | 
 25 | #include <helper_string.h>
 26 | 
 27 | #ifndef EXIT_WAIVED
 28 | #define EXIT_WAIVED 2
 29 | #endif
 30 | 
 31 | // Note: your SDK sample is required to include the proper header files.
 32 | // Please refer to the CUDA examples for the needed CUDA headers, which
 33 | // may change depending on which CUDA functions are used.
 34 | 
 35 | // CUDA Runtime error messages: name of a cudaError_t, as returned by cudaGetErrorName()
 36 | #ifdef __DRIVER_TYPES_H__  // overload is compiled only when this macro is defined
 37 | static const char *_cudaGetErrorEnum(cudaError_t error) {
 38 |   return cudaGetErrorName(error);
 39 | }
 40 | #endif
 41 | 
 42 | #ifdef CUDA_DRIVER_API
 43 | // CUDA Driver API errors: name of a CUresult obtained through cuGetErrorName()
 44 | static const char *_cudaGetErrorEnum(CUresult error) {
 45 |   static char unknown[] = "<unknown>";
 46 |   const char *ret = NULL;
 47 |   cuGetErrorName(error, &ret);  // return status is ignored; only ret is inspected
 48 |   return ret ? ret : unknown;  // fall back to "<unknown>" when ret is still NULL
 49 | }
 50 | #endif
 51 | 
 52 | #ifdef CUBLAS_API_H_
 53 | // cuBLAS API errors: enumerator name of a cublasStatus_t as a string literal
 54 | static const char *_cudaGetErrorEnum(cublasStatus_t error) {
 55 |   switch (error) {
 56 |     case CUBLAS_STATUS_SUCCESS:
 57 |       return "CUBLAS_STATUS_SUCCESS";
 58 | 
 59 |     case CUBLAS_STATUS_NOT_INITIALIZED:
 60 |       return "CUBLAS_STATUS_NOT_INITIALIZED";
 61 | 
 62 |     case CUBLAS_STATUS_ALLOC_FAILED:
 63 |       return "CUBLAS_STATUS_ALLOC_FAILED";
 64 | 
 65 |     case CUBLAS_STATUS_INVALID_VALUE:
 66 |       return "CUBLAS_STATUS_INVALID_VALUE";
 67 | 
 68 |     case CUBLAS_STATUS_ARCH_MISMATCH:
 69 |       return "CUBLAS_STATUS_ARCH_MISMATCH";
 70 | 
 71 |     case CUBLAS_STATUS_MAPPING_ERROR:
 72 |       return "CUBLAS_STATUS_MAPPING_ERROR";
 73 | 
 74 |     case CUBLAS_STATUS_EXECUTION_FAILED:
 75 |       return "CUBLAS_STATUS_EXECUTION_FAILED";
 76 | 
 77 |     case CUBLAS_STATUS_INTERNAL_ERROR:
 78 |       return "CUBLAS_STATUS_INTERNAL_ERROR";
 79 | 
 80 |     case CUBLAS_STATUS_NOT_SUPPORTED:
 81 |       return "CUBLAS_STATUS_NOT_SUPPORTED";
 82 | 
 83 |     case CUBLAS_STATUS_LICENSE_ERROR:
 84 |       return "CUBLAS_STATUS_LICENSE_ERROR";
 85 |   }
 86 | 
 87 |   return "<unknown>";  // status value not matched by any case above
 88 | }
 89 | #endif
 90 | 
 91 | #ifdef _CUFFT_H_
 92 | // cuFFT API errors: enumerator name of a cufftResult as a string literal
 93 | static const char *_cudaGetErrorEnum(cufftResult error) {
 94 |   switch (error) {
 95 |     case CUFFT_SUCCESS:
 96 |       return "CUFFT_SUCCESS";
 97 | 
 98 |     case CUFFT_INVALID_PLAN:
 99 |       return "CUFFT_INVALID_PLAN";
100 | 
101 |     case CUFFT_ALLOC_FAILED:
102 |       return "CUFFT_ALLOC_FAILED";
103 | 
104 |     case CUFFT_INVALID_TYPE:
105 |       return "CUFFT_INVALID_TYPE";
106 | 
107 |     case CUFFT_INVALID_VALUE:
108 |       return "CUFFT_INVALID_VALUE";
109 | 
110 |     case CUFFT_INTERNAL_ERROR:
111 |       return "CUFFT_INTERNAL_ERROR";
112 | 
113 |     case CUFFT_EXEC_FAILED:
114 |       return "CUFFT_EXEC_FAILED";
115 | 
116 |     case CUFFT_SETUP_FAILED:
117 |       return "CUFFT_SETUP_FAILED";
118 | 
119 |     case CUFFT_INVALID_SIZE:
120 |       return "CUFFT_INVALID_SIZE";
121 | 
122 |     case CUFFT_UNALIGNED_DATA:
123 |       return "CUFFT_UNALIGNED_DATA";
124 | 
125 |     case CUFFT_INCOMPLETE_PARAMETER_LIST:
126 |       return "CUFFT_INCOMPLETE_PARAMETER_LIST";
127 | 
128 |     case CUFFT_INVALID_DEVICE:
129 |       return "CUFFT_INVALID_DEVICE";
130 | 
131 |     case CUFFT_PARSE_ERROR:
132 |       return "CUFFT_PARSE_ERROR";
133 | 
134 |     case CUFFT_NO_WORKSPACE:
135 |       return "CUFFT_NO_WORKSPACE";
136 | 
137 |     case CUFFT_NOT_IMPLEMENTED:
138 |       return "CUFFT_NOT_IMPLEMENTED";
139 | 
140 |     case CUFFT_LICENSE_ERROR:
141 |       return "CUFFT_LICENSE_ERROR";
142 | 
143 |     case CUFFT_NOT_SUPPORTED:
144 |       return "CUFFT_NOT_SUPPORTED";
145 |   }
146 | 
147 |   return "<unknown>";  // result value not matched by any case above
148 | }
149 | #endif
150 | 
151 | #ifdef CUSPARSEAPI
152 | // cuSPARSE API errors: enumerator name of a cusparseStatus_t as a string literal
153 | static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
154 |   switch (error) {
155 |     case CUSPARSE_STATUS_SUCCESS:
156 |       return "CUSPARSE_STATUS_SUCCESS";
157 | 
158 |     case CUSPARSE_STATUS_NOT_INITIALIZED:
159 |       return "CUSPARSE_STATUS_NOT_INITIALIZED";
160 | 
161 |     case CUSPARSE_STATUS_ALLOC_FAILED:
162 |       return "CUSPARSE_STATUS_ALLOC_FAILED";
163 | 
164 |     case CUSPARSE_STATUS_INVALID_VALUE:
165 |       return "CUSPARSE_STATUS_INVALID_VALUE";
166 | 
167 |     case CUSPARSE_STATUS_ARCH_MISMATCH:
168 |       return "CUSPARSE_STATUS_ARCH_MISMATCH";
169 | 
170 |     case CUSPARSE_STATUS_MAPPING_ERROR:
171 |       return "CUSPARSE_STATUS_MAPPING_ERROR";
172 | 
173 |     case CUSPARSE_STATUS_EXECUTION_FAILED:
174 |       return "CUSPARSE_STATUS_EXECUTION_FAILED";
175 | 
176 |     case CUSPARSE_STATUS_INTERNAL_ERROR:
177 |       return "CUSPARSE_STATUS_INTERNAL_ERROR";
178 | 
179 |     case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
180 |       return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
181 |   }
182 | 
183 |   return "<unknown>";  // status value not matched by any case above
184 | }
185 | #endif
186 | 
187 | #ifdef CUSOLVER_COMMON_H_
188 | // cuSOLVER API errors: enumerator name of a cusolverStatus_t as a string literal
189 | static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
190 |   switch (error) {
191 |     case CUSOLVER_STATUS_SUCCESS:
192 |       return "CUSOLVER_STATUS_SUCCESS";
193 |     case CUSOLVER_STATUS_NOT_INITIALIZED:
194 |       return "CUSOLVER_STATUS_NOT_INITIALIZED";
195 |     case CUSOLVER_STATUS_ALLOC_FAILED:
196 |       return "CUSOLVER_STATUS_ALLOC_FAILED";
197 |     case CUSOLVER_STATUS_INVALID_VALUE:
198 |       return "CUSOLVER_STATUS_INVALID_VALUE";
199 |     case CUSOLVER_STATUS_ARCH_MISMATCH:
200 |       return "CUSOLVER_STATUS_ARCH_MISMATCH";
201 |     case CUSOLVER_STATUS_MAPPING_ERROR:
202 |       return "CUSOLVER_STATUS_MAPPING_ERROR";
203 |     case CUSOLVER_STATUS_EXECUTION_FAILED:
204 |       return "CUSOLVER_STATUS_EXECUTION_FAILED";
205 |     case CUSOLVER_STATUS_INTERNAL_ERROR:
206 |       return "CUSOLVER_STATUS_INTERNAL_ERROR";
207 |     case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
208 |       return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
209 |     case CUSOLVER_STATUS_NOT_SUPPORTED:
210 |       return "CUSOLVER_STATUS_NOT_SUPPORTED";  // no trailing blank, so the text equals the enumerator name
211 |     case CUSOLVER_STATUS_ZERO_PIVOT:
212 |       return "CUSOLVER_STATUS_ZERO_PIVOT";
213 |     case CUSOLVER_STATUS_INVALID_LICENSE:
214 |       return "CUSOLVER_STATUS_INVALID_LICENSE";
215 |   }
216 | 
217 |   return "<unknown>";  // status value not matched by any case above
218 | }
219 | #endif
220 | 
221 | #ifdef CURAND_H_
222 | // cuRAND API errors: enumerator name of a curandStatus_t as a string literal
223 | static const char *_cudaGetErrorEnum(curandStatus_t error) {
224 |   switch (error) {
225 |     case CURAND_STATUS_SUCCESS:
226 |       return "CURAND_STATUS_SUCCESS";
227 | 
228 |     case CURAND_STATUS_VERSION_MISMATCH:
229 |       return "CURAND_STATUS_VERSION_MISMATCH";
230 | 
231 |     case CURAND_STATUS_NOT_INITIALIZED:
232 |       return "CURAND_STATUS_NOT_INITIALIZED";
233 | 
234 |     case CURAND_STATUS_ALLOCATION_FAILED:
235 |       return "CURAND_STATUS_ALLOCATION_FAILED";
236 | 
237 |     case CURAND_STATUS_TYPE_ERROR:
238 |       return "CURAND_STATUS_TYPE_ERROR";
239 | 
240 |     case CURAND_STATUS_OUT_OF_RANGE:
241 |       return "CURAND_STATUS_OUT_OF_RANGE";
242 | 
243 |     case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
244 |       return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
245 | 
246 |     case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
247 |       return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
248 | 
249 |     case CURAND_STATUS_LAUNCH_FAILURE:
250 |       return "CURAND_STATUS_LAUNCH_FAILURE";
251 | 
252 |     case CURAND_STATUS_PREEXISTING_FAILURE:
253 |       return "CURAND_STATUS_PREEXISTING_FAILURE";
254 | 
255 |     case CURAND_STATUS_INITIALIZATION_FAILED:
256 |       return "CURAND_STATUS_INITIALIZATION_FAILED";
257 | 
258 |     case CURAND_STATUS_ARCH_MISMATCH:
259 |       return "CURAND_STATUS_ARCH_MISMATCH";
260 | 
261 |     case CURAND_STATUS_INTERNAL_ERROR:
262 |       return "CURAND_STATUS_INTERNAL_ERROR";
263 |   }
264 | 
265 |   return "<unknown>";  // status value not matched by any case above
266 | }
267 | #endif
268 | 
269 | #ifdef NVJPEGAPI
270 | // nvJPEG API errors: enumerator name of an nvjpegStatus_t as a string literal
271 | static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
272 |   switch (error) {
273 |     case NVJPEG_STATUS_SUCCESS:
274 |       return "NVJPEG_STATUS_SUCCESS";
275 | 
276 |     case NVJPEG_STATUS_NOT_INITIALIZED:
277 |       return "NVJPEG_STATUS_NOT_INITIALIZED";
278 | 
279 |     case NVJPEG_STATUS_INVALID_PARAMETER:
280 |       return "NVJPEG_STATUS_INVALID_PARAMETER";
281 | 
282 |     case NVJPEG_STATUS_BAD_JPEG:
283 |       return "NVJPEG_STATUS_BAD_JPEG";
284 | 
285 |     case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
286 |       return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
287 | 
288 |     case NVJPEG_STATUS_ALLOCATOR_FAILURE:
289 |       return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
290 | 
291 |     case NVJPEG_STATUS_EXECUTION_FAILED:
292 |       return "NVJPEG_STATUS_EXECUTION_FAILED";
293 | 
294 |     case NVJPEG_STATUS_ARCH_MISMATCH:
295 |       return "NVJPEG_STATUS_ARCH_MISMATCH";
296 | 
297 |     case NVJPEG_STATUS_INTERNAL_ERROR:
298 |       return "NVJPEG_STATUS_INTERNAL_ERROR";
299 |   }
300 | 
301 |   return "<unknown>";  // status value not matched by any case above
302 | }
303 | #endif
304 | 
305 | #ifdef NV_NPPIDEFS_H
306 | // NPP API errors: name of an NppStatus as a string literal; the case list depends on NPP_VERSION_MAJOR/MINOR
307 | static const char *_cudaGetErrorEnum(NppStatus error) {
308 |   switch (error) {
309 |     case NPP_NOT_SUPPORTED_MODE_ERROR:
310 |       return "NPP_NOT_SUPPORTED_MODE_ERROR";
311 | 
312 |     case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
313 |       return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
314 | 
315 |     case NPP_RESIZE_NO_OPERATION_ERROR:
316 |       return "NPP_RESIZE_NO_OPERATION_ERROR";
317 | 
318 |     case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
319 |       return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
320 | 
321 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000  // NPP version 5.0 or lower
322 |     // Older enumerator spellings; several are reported under their newer names
323 |     case NPP_BAD_ARG_ERROR:
324 |       return "NPP_BAD_ARGUMENT_ERROR";
325 | 
326 |     case NPP_COEFF_ERROR:
327 |       return "NPP_COEFFICIENT_ERROR";
328 | 
329 |     case NPP_RECT_ERROR:
330 |       return "NPP_RECTANGLE_ERROR";
331 | 
332 |     case NPP_QUAD_ERROR:
333 |       return "NPP_QUADRANGLE_ERROR";
334 | 
335 |     case NPP_MEM_ALLOC_ERR:
336 |       return "NPP_MEMORY_ALLOCATION_ERROR";
337 | 
338 |     case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
339 |       return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
340 | 
341 |     case NPP_INVALID_INPUT:
342 |       return "NPP_INVALID_INPUT";
343 | 
344 |     case NPP_POINTER_ERROR:
345 |       return "NPP_POINTER_ERROR";
346 | 
347 |     case NPP_WARNING:
348 |       return "NPP_WARNING";
349 | 
350 |     case NPP_ODD_ROI_WARNING:
351 |       return "NPP_ODD_ROI_WARNING";
352 | #else
353 | 
354 |     // These are for CUDA 5.5 or higher
355 |     case NPP_BAD_ARGUMENT_ERROR:
356 |       return "NPP_BAD_ARGUMENT_ERROR";
357 | 
358 |     case NPP_COEFFICIENT_ERROR:
359 |       return "NPP_COEFFICIENT_ERROR";
360 | 
361 |     case NPP_RECTANGLE_ERROR:
362 |       return "NPP_RECTANGLE_ERROR";
363 | 
364 |     case NPP_QUADRANGLE_ERROR:
365 |       return "NPP_QUADRANGLE_ERROR";
366 | 
367 |     case NPP_MEMORY_ALLOCATION_ERR:
368 |       return "NPP_MEMORY_ALLOCATION_ERROR";
369 | 
370 |     case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
371 |       return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
372 | 
373 |     case NPP_INVALID_HOST_POINTER_ERROR:
374 |       return "NPP_INVALID_HOST_POINTER_ERROR";
375 | 
376 |     case NPP_INVALID_DEVICE_POINTER_ERROR:
377 |       return "NPP_INVALID_DEVICE_POINTER_ERROR";
378 | #endif
379 | 
380 |     case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
381 |       return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
382 | 
383 |     case NPP_TEXTURE_BIND_ERROR:
384 |       return "NPP_TEXTURE_BIND_ERROR";
385 | 
386 |     case NPP_WRONG_INTERSECTION_ROI_ERROR:
387 |       return "NPP_WRONG_INTERSECTION_ROI_ERROR";
388 | 
389 |     case NPP_NOT_EVEN_STEP_ERROR:
390 |       return "NPP_NOT_EVEN_STEP_ERROR";
391 | 
392 |     case NPP_INTERPOLATION_ERROR:
393 |       return "NPP_INTERPOLATION_ERROR";
394 | 
395 |     case NPP_RESIZE_FACTOR_ERROR:
396 |       return "NPP_RESIZE_FACTOR_ERROR";
397 | 
398 |     case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
399 |       return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
400 | 
401 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000  // same version split as above
402 | 
403 |     case NPP_MEMFREE_ERR:
404 |       return "NPP_MEMFREE_ERR";
405 | 
406 |     case NPP_MEMSET_ERR:
407 |       return "NPP_MEMSET_ERR";
408 | 
409 |     case NPP_MEMCPY_ERR:
410 |       return "NPP_MEMCPY_ERROR";  // NOTE(review): uses the _ERROR spelling, unlike the _ERR siblings in this branch
411 | 
412 |     case NPP_MIRROR_FLIP_ERR:
413 |       return "NPP_MIRROR_FLIP_ERR";
414 | #else
415 | 
416 |     case NPP_MEMFREE_ERROR:
417 |       return "NPP_MEMFREE_ERROR";
418 | 
419 |     case NPP_MEMSET_ERROR:
420 |       return "NPP_MEMSET_ERROR";
421 | 
422 |     case NPP_MEMCPY_ERROR:
423 |       return "NPP_MEMCPY_ERROR";
424 | 
425 |     case NPP_MIRROR_FLIP_ERROR:
426 |       return "NPP_MIRROR_FLIP_ERROR";
427 | #endif
428 | 
429 |     case NPP_ALIGNMENT_ERROR:
430 |       return "NPP_ALIGNMENT_ERROR";
431 | 
432 |     case NPP_STEP_ERROR:
433 |       return "NPP_STEP_ERROR";
434 | 
435 |     case NPP_SIZE_ERROR:
436 |       return "NPP_SIZE_ERROR";
437 | 
438 |     case NPP_NULL_POINTER_ERROR:
439 |       return "NPP_NULL_POINTER_ERROR";
440 | 
441 |     case NPP_CUDA_KERNEL_EXECUTION_ERROR:
442 |       return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
443 | 
444 |     case NPP_NOT_IMPLEMENTED_ERROR:
445 |       return "NPP_NOT_IMPLEMENTED_ERROR";
446 | 
447 |     case NPP_ERROR:
448 |       return "NPP_ERROR";
449 | 
450 |     case NPP_SUCCESS:
451 |       return "NPP_SUCCESS";
452 | 
453 |     case NPP_WRONG_INTERSECTION_QUAD_WARNING:
454 |       return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
455 | 
456 |     case NPP_MISALIGNED_DST_ROI_WARNING:
457 |       return "NPP_MISALIGNED_DST_ROI_WARNING";
458 | 
459 |     case NPP_AFFINE_QUAD_INCORRECT_WARNING:
460 |       return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
461 | 
462 |     case NPP_DOUBLE_SIZE_WARNING:
463 |       return "NPP_DOUBLE_SIZE_WARNING";
464 | 
465 |     case NPP_WRONG_INTERSECTION_ROI_WARNING:
466 |       return "NPP_WRONG_INTERSECTION_ROI_WARNING";
467 | 
468 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
469 |     /* These are 6.0 or higher */
470 |     case NPP_LUT_PALETTE_BITSIZE_ERROR:
471 |       return "NPP_LUT_PALETTE_BITSIZE_ERROR";
472 | 
473 |     case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
474 |       return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
475 | 
476 |     case NPP_QUALITY_INDEX_ERROR:
477 |       return "NPP_QUALITY_INDEX_ERROR";
478 | 
479 |     case NPP_CHANNEL_ORDER_ERROR:
480 |       return "NPP_CHANNEL_ORDER_ERROR";
481 | 
482 |     case NPP_ZERO_MASK_VALUE_ERROR:
483 |       return "NPP_ZERO_MASK_VALUE_ERROR";
484 | 
485 |     case NPP_NUMBER_OF_CHANNELS_ERROR:
486 |       return "NPP_NUMBER_OF_CHANNELS_ERROR";
487 | 
488 |     case NPP_COI_ERROR:
489 |       return "NPP_COI_ERROR";
490 | 
491 |     case NPP_DIVISOR_ERROR:
492 |       return "NPP_DIVISOR_ERROR";
493 | 
494 |     case NPP_CHANNEL_ERROR:
495 |       return "NPP_CHANNEL_ERROR";
496 | 
497 |     case NPP_STRIDE_ERROR:
498 |       return "NPP_STRIDE_ERROR";
499 | 
500 |     case NPP_ANCHOR_ERROR:
501 |       return "NPP_ANCHOR_ERROR";
502 | 
503 |     case NPP_MASK_SIZE_ERROR:
504 |       return "NPP_MASK_SIZE_ERROR";
505 | 
506 |     case NPP_MOMENT_00_ZERO_ERROR:
507 |       return "NPP_MOMENT_00_ZERO_ERROR";
508 | 
509 |     case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
510 |       return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
511 | 
512 |     case NPP_THRESHOLD_ERROR:
513 |       return "NPP_THRESHOLD_ERROR";
514 | 
515 |     case NPP_CONTEXT_MATCH_ERROR:
516 |       return "NPP_CONTEXT_MATCH_ERROR";
517 | 
518 |     case NPP_FFT_FLAG_ERROR:
519 |       return "NPP_FFT_FLAG_ERROR";
520 | 
521 |     case NPP_FFT_ORDER_ERROR:
522 |       return "NPP_FFT_ORDER_ERROR";
523 | 
524 |     case NPP_SCALE_RANGE_ERROR:
525 |       return "NPP_SCALE_RANGE_ERROR";
526 | 
527 |     case NPP_DATA_TYPE_ERROR:
528 |       return "NPP_DATA_TYPE_ERROR";
529 | 
530 |     case NPP_OUT_OFF_RANGE_ERROR:
531 |       return "NPP_OUT_OFF_RANGE_ERROR";
532 | 
533 |     case NPP_DIVIDE_BY_ZERO_ERROR:
534 |       return "NPP_DIVIDE_BY_ZERO_ERROR";
535 | 
536 |     case NPP_RANGE_ERROR:
537 |       return "NPP_RANGE_ERROR";
538 | 
539 |     case NPP_NO_MEMORY_ERROR:
540 |       return "NPP_NO_MEMORY_ERROR";
541 | 
542 |     case NPP_ERROR_RESERVED:
543 |       return "NPP_ERROR_RESERVED";
544 | 
545 |     case NPP_NO_OPERATION_WARNING:
546 |       return "NPP_NO_OPERATION_WARNING";
547 | 
548 |     case NPP_DIVIDE_BY_ZERO_WARNING:
549 |       return "NPP_DIVIDE_BY_ZERO_WARNING";
550 | #endif
551 | 
552 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
553 |     /* These are 7.0 or higher */
554 |     case NPP_OVERFLOW_ERROR:
555 |       return "NPP_OVERFLOW_ERROR";
556 | 
557 |     case NPP_CORRUPTED_DATA_ERROR:
558 |       return "NPP_CORRUPTED_DATA_ERROR";
559 | #endif
560 |   }
561 | 
562 |   return "<unknown>";  // status value not matched by any case compiled in above
563 | }
564 | #endif
565 | 
566 | #ifdef __DRIVER_TYPES_H__  // in this case DEVICE_RESET expands to a cudaDeviceReset() call
567 | #ifndef DEVICE_RESET  // a DEVICE_RESET that is already defined is left untouched
568 | #define DEVICE_RESET cudaDeviceReset();
569 | #endif
570 | #else  // otherwise DEVICE_RESET expands to nothing
571 | #ifndef DEVICE_RESET
572 | #define DEVICE_RESET
573 | #endif
574 | #endif
575 | // check(): if result is non-zero, prints file:line, the numeric code, its name and the failing expression to stderr, then exits
576 | template <typename T>
577 | void check(T result, char const *const func, const char *const file,
578 |            int const line) {
579 |   if (result) {  // zero is treated as success for every status type passed in
580 |     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
581 |             static_cast<int>(result), _cudaGetErrorEnum(result), func);  // int argument matches the %d conversion
582 |     DEVICE_RESET
583 |     // Make sure we call CUDA Device Reset before exiting
584 |     exit(EXIT_FAILURE);
585 |   }
586 | }
587 | 
588 | #ifdef __DRIVER_TYPES_H__
589 | // checkCudaErrors(val): passes val to check() together with its source text
590 | // (#val), __FILE__ and __LINE__; check() reports and exits when val is non-zero
591 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
592 | 
593 | // getLastCudaError(msg): calls cudaGetLastError() and, on an error, prints msg with file/line, then exits
594 | #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
595 | 
596 | inline void __getLastCudaError(const char *errorMessage, const char *file,
597 |                                const int line) {
598 |   cudaError_t err = cudaGetLastError();
599 | 
600 |   if (cudaSuccess != err) {
601 |     fprintf(stderr,
602 |             "%s(%i) : getLastCudaError() CUDA error :"
603 |             " %s : (%d) %s.\n",
604 |             file, line, errorMessage, static_cast<int>(err),
605 |             cudaGetErrorString(err));
606 |     DEVICE_RESET
607 |     exit(EXIT_FAILURE);
608 |   }
609 | }
610 | 
611 | // printLastCudaError(msg): prints the same report as getLastCudaError (the text still
612 | // reads "getLastCudaError()") but does not exit the program in case an error is detected.
613 | #define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
614 | 
615 | inline void __printLastCudaError(const char *errorMessage, const char *file,
616 |                                  const int line) {
617 |   cudaError_t err = cudaGetLastError();
618 | 
619 |   if (cudaSuccess != err) {
620 |     fprintf(stderr,
621 |             "%s(%i) : getLastCudaError() CUDA error :"
622 |             " %s : (%d) %s.\n",
623 |             file, line, errorMessage, static_cast<int>(err),
624 |             cudaGetErrorString(err));
625 |   }
626 | }
627 | #endif
628 | 
// MAX(a, b) evaluates to the larger of its two arguments. Each argument and
// the whole expansion are parenthesized so that arguments containing
// lower-precedence operators (e.g. MAX(x & m, y)) group as written at the
// call site. The selected argument is evaluated twice, so do not pass
// expressions with side effects.
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
632 | 
// Float To Int conversion
//
// Rounds to the nearest integer with halfway cases going away from zero:
// 0.5 is added for non-negative inputs and subtracted for negative ones
// before the conversion to int truncates. The sum is computed in double
// because the 0.5 constant is a double.
inline int ftoi(float value) {
  const double bias = (value >= 0) ? 0.5 : -0.5;
  return static_cast<int>(value + bias);
}
638 | 
639 | // Beginning of GPU Architecture definitions
// Returns the number of cores per SM for the given SM version.
//
// major, minor: SM version; they are combined into the key
//               (major << 4) + minor, i.e. 0xMm in hexadecimal notation.
//
// If the key is not in the table, a notice is printed to stdout and the
// value of the last table entry is returned so that callers can keep running.
inline int _ConvertSMVer2Cores(int major, int minor) {
  struct ArchEntry {
    int sm;     // 0xMm: M = SM major version, m = SM minor version
    int cores;  // cores per SM for that version
  };

  static const ArchEntry kArchTable[] = {
      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
      {0x52, 128}, {0x53, 128}, {0x60, 64},  {0x61, 128}, {0x62, 128},
      {0x70, 64},  {0x72, 64},  {0x75, 64}};
  const int entry_count =
      static_cast<int>(sizeof(kArchTable) / sizeof(kArchTable[0]));

  const int wanted = (major << 4) + minor;

  for (int i = 0; i < entry_count; ++i) {
    if (kArchTable[i].sm == wanted) {
      return kArchTable[i].cores;
    }
  }

  // Unknown version: fall back to the last entry of the table.
  const int fallback = kArchTable[entry_count - 1].cores;
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      "  Default to use %d Cores/SM\n",
      major, minor, fallback);
  return fallback;
}
683 |   // end of GPU Architecture definitions
684 | 
685 | #ifdef __CUDA_RUNTIME_H__
686 | // General GPU Device CUDA Initialization
687 | inline int gpuDeviceInit(int devID) {
688 |   int device_count;
689 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
690 | 
691 |   if (device_count == 0) {
692 |     fprintf(stderr,
693 |             "gpuDeviceInit() CUDA error: "
694 |             "no devices supporting CUDA.\n");
695 |     exit(EXIT_FAILURE);
696 |   }
697 | 
698 |   if (devID < 0) {
699 |     devID = 0;
700 |   }
701 | 
702 |   if (devID > device_count - 1) {
703 |     fprintf(stderr, "\n");
704 |     fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
705 |             device_count);
706 |     fprintf(stderr,
707 |             ">> gpuDeviceInit (-device=%d) is not a valid"
708 |             " GPU device. <<\n",
709 |             devID);
710 |     fprintf(stderr, "\n");
711 |     return -devID;
712 |   }
713 | 
714 |   cudaDeviceProp deviceProp;
715 |   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
716 | 
717 |   if (deviceProp.computeMode == cudaComputeModeProhibited) {
718 |     fprintf(stderr,
719 |             "Error: device is running in <Compute Mode "
720 |             "Prohibited>, no threads can use cudaSetDevice().\n");
721 |     return -1;
722 |   }
723 | 
724 |   if (deviceProp.major < 1) {
725 |     fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
726 |     exit(EXIT_FAILURE);
727 |   }
728 | 
729 |   checkCudaErrors(cudaSetDevice(devID));
730 |   printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
731 | 
732 |   return devID;
733 | }
734 | 
735 | // This function returns the best GPU (with maximum GFLOPS)
736 | inline int gpuGetMaxGflopsDeviceId() {
737 |   int current_device = 0, sm_per_multiproc = 0;
738 |   int max_perf_device = 0;
739 |   int device_count = 0;
740 |   int devices_prohibited = 0;
741 | 
742 |   uint64_t max_compute_perf = 0;
743 |   cudaDeviceProp deviceProp;
744 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
745 | 
746 |   if (device_count == 0) {
747 |     fprintf(stderr,
748 |             "gpuGetMaxGflopsDeviceId() CUDA error:"
749 |             " no devices supporting CUDA.\n");
750 |     exit(EXIT_FAILURE);
751 |   }
752 | 
753 |   // Find the best CUDA capable GPU device
754 |   current_device = 0;
755 | 
756 |   while (current_device < device_count) {
757 |     cudaGetDeviceProperties(&deviceProp, current_device);
758 | 
759 |     // If this GPU is not running on Compute Mode prohibited,
760 |     // then we can add it to the list
761 |     if (deviceProp.computeMode != cudaComputeModeProhibited) {
762 |       if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
763 |         sm_per_multiproc = 1;
764 |       } else {
765 |         sm_per_multiproc =
766 |             _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
767 |       }
768 | 
769 |       uint64_t compute_perf = (uint64_t)deviceProp.multiProcessorCount *
770 |                               sm_per_multiproc * deviceProp.clockRate;
771 | 
772 |       if (compute_perf > max_compute_perf) {
773 |         max_compute_perf = compute_perf;
774 |         max_perf_device = current_device;
775 |       }
776 |     } else {
777 |       devices_prohibited++;
778 |     }
779 | 
780 |     ++current_device;
781 |   }
782 | 
783 |   if (devices_prohibited == device_count) {
784 |     fprintf(stderr,
785 |             "gpuGetMaxGflopsDeviceId() CUDA error:"
786 |             " all devices have compute mode prohibited.\n");
787 |     exit(EXIT_FAILURE);
788 |   }
789 | 
790 |   return max_perf_device;
791 | }
792 | 
793 | // Initialization code to find the best CUDA Device
794 | inline int findCudaDevice(int argc, const char **argv) {
795 |   cudaDeviceProp deviceProp;
796 |   int devID = 0;
797 | 
798 |   // If the command-line has a device number specified, use it
799 |   if (checkCmdLineFlag(argc, argv, "device")) {
800 |     devID = getCmdLineArgumentInt(argc, argv, "device=");
801 | 
802 |     if (devID < 0) {
803 |       printf("Invalid command line parameter\n ");
804 |       exit(EXIT_FAILURE);
805 |     } else {
806 |       devID = gpuDeviceInit(devID);
807 | 
808 |       if (devID < 0) {
809 |         printf("exiting...\n");
810 |         exit(EXIT_FAILURE);
811 |       }
812 |     }
813 |   } else {
814 |     // Otherwise pick the device with highest Gflops/s
815 |     devID = gpuGetMaxGflopsDeviceId();
816 |     checkCudaErrors(cudaSetDevice(devID));
817 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
818 |     printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
819 |            deviceProp.name, deviceProp.major, deviceProp.minor);
820 |   }
821 | 
822 |   return devID;
823 | }
824 | 
825 | inline int findIntegratedGPU() {
826 |   int current_device = 0;
827 |   int device_count = 0;
828 |   int devices_prohibited = 0;
829 | 
830 |   cudaDeviceProp deviceProp;
831 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
832 | 
833 |   if (device_count == 0) {
834 |     fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
835 |     exit(EXIT_FAILURE);
836 |   }
837 | 
838 |   // Find the integrated GPU which is compute capable
839 |   while (current_device < device_count) {
840 |     cudaGetDeviceProperties(&deviceProp, current_device);
841 | 
842 |     // If GPU is integrated and is not running on Compute Mode prohibited,
843 |     // then cuda can map to GLES resource
844 |     if (deviceProp.integrated &&
845 |         (deviceProp.computeMode != cudaComputeModeProhibited)) {
846 |       checkCudaErrors(cudaSetDevice(current_device));
847 |       checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
848 |       printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
849 |              current_device, deviceProp.name, deviceProp.major,
850 |              deviceProp.minor);
851 | 
852 |       return current_device;
853 |     } else {
854 |       devices_prohibited++;
855 |     }
856 | 
857 |     current_device++;
858 |   }
859 | 
860 |   if (devices_prohibited == device_count) {
861 |     fprintf(stderr,
862 |             "CUDA error:"
863 |             " No GLES-CUDA Interop capable GPU found.\n");
864 |     exit(EXIT_FAILURE);
865 |   }
866 | 
867 |   return -1;
868 | }
869 | 
870 | // General check for CUDA GPU SM Capabilities
871 | inline bool checkCudaCapabilities(int major_version, int minor_version) {
872 |   cudaDeviceProp deviceProp;
873 |   deviceProp.major = 0;
874 |   deviceProp.minor = 0;
875 |   int dev;
876 | 
877 |   checkCudaErrors(cudaGetDevice(&dev));
878 |   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
879 | 
880 |   if ((deviceProp.major > major_version) ||
881 |       (deviceProp.major == major_version &&
882 |        deviceProp.minor >= minor_version)) {
883 |     printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
884 |            deviceProp.name, deviceProp.major, deviceProp.minor);
885 |     return true;
886 |   } else {
887 |     printf(
888 |         "  No GPU device was found that can support "
889 |         "CUDA compute capability %d.%d.\n",
890 |         major_version, minor_version);
891 |     return false;
892 |   }
893 | }
894 | #endif
895 | 
896 |   // end of CUDA Helper Functions
897 | 
898 | #endif  // COMMON_HELPER_CUDA_H_
899 | 


--------------------------------------------------------------------------------
/simpleCUBLASHgemm/helper_cuda.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | ////////////////////////////////////////////////////////////////////////////////
 13 | // These are CUDA Helper functions for initialization and error checking
 14 | 
 15 | #ifndef COMMON_HELPER_CUDA_H_
 16 | #define COMMON_HELPER_CUDA_H_
 17 | 
 18 | #pragma once
 19 | 
 20 | #include <stdint.h>
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | 
 25 | #include <helper_string.h>
 26 | 
 27 | #ifndef EXIT_WAIVED
 28 | #define EXIT_WAIVED 2
 29 | #endif
 30 | 
 31 | // Note: your SDK sample is required to include the proper header files
 32 | // itself. Please refer to the CUDA examples for the CUDA headers that are
 33 | // needed, which may change depending on which CUDA functions are used.
 34 | 
 35 | // CUDA Runtime error messages
 36 | #ifdef __DRIVER_TYPES_H__
 37 | static const char *_cudaGetErrorEnum(cudaError_t error) {
 38 |   return cudaGetErrorName(error);
 39 | }
 40 | #endif
 41 | 
 42 | #ifdef CUDA_DRIVER_API
 43 | // CUDA Driver API errors
 44 | static const char *_cudaGetErrorEnum(CUresult error) {
 45 |   static char unknown[] = "<unknown>";
 46 |   const char *ret = NULL;
 47 |   cuGetErrorName(error, &ret);
 48 |   return ret ? ret : unknown;
 49 | }
 50 | #endif
 51 | 
 52 | #ifdef CUBLAS_API_H_
 53 | // cuBLAS API errors
 54 | static const char *_cudaGetErrorEnum(cublasStatus_t error) {
 55 |   switch (error) {
 56 |     case CUBLAS_STATUS_SUCCESS:
 57 |       return "CUBLAS_STATUS_SUCCESS";
 58 | 
 59 |     case CUBLAS_STATUS_NOT_INITIALIZED:
 60 |       return "CUBLAS_STATUS_NOT_INITIALIZED";
 61 | 
 62 |     case CUBLAS_STATUS_ALLOC_FAILED:
 63 |       return "CUBLAS_STATUS_ALLOC_FAILED";
 64 | 
 65 |     case CUBLAS_STATUS_INVALID_VALUE:
 66 |       return "CUBLAS_STATUS_INVALID_VALUE";
 67 | 
 68 |     case CUBLAS_STATUS_ARCH_MISMATCH:
 69 |       return "CUBLAS_STATUS_ARCH_MISMATCH";
 70 | 
 71 |     case CUBLAS_STATUS_MAPPING_ERROR:
 72 |       return "CUBLAS_STATUS_MAPPING_ERROR";
 73 | 
 74 |     case CUBLAS_STATUS_EXECUTION_FAILED:
 75 |       return "CUBLAS_STATUS_EXECUTION_FAILED";
 76 | 
 77 |     case CUBLAS_STATUS_INTERNAL_ERROR:
 78 |       return "CUBLAS_STATUS_INTERNAL_ERROR";
 79 | 
 80 |     case CUBLAS_STATUS_NOT_SUPPORTED:
 81 |       return "CUBLAS_STATUS_NOT_SUPPORTED";
 82 | 
 83 |     case CUBLAS_STATUS_LICENSE_ERROR:
 84 |       return "CUBLAS_STATUS_LICENSE_ERROR";
 85 |   }
 86 | 
 87 |   return "<unknown>";
 88 | }
 89 | #endif
 90 | 
 91 | #ifdef _CUFFT_H_
 92 | // cuFFT API errors
 93 | static const char *_cudaGetErrorEnum(cufftResult error) {
 94 |   switch (error) {
 95 |     case CUFFT_SUCCESS:
 96 |       return "CUFFT_SUCCESS";
 97 | 
 98 |     case CUFFT_INVALID_PLAN:
 99 |       return "CUFFT_INVALID_PLAN";
100 | 
101 |     case CUFFT_ALLOC_FAILED:
102 |       return "CUFFT_ALLOC_FAILED";
103 | 
104 |     case CUFFT_INVALID_TYPE:
105 |       return "CUFFT_INVALID_TYPE";
106 | 
107 |     case CUFFT_INVALID_VALUE:
108 |       return "CUFFT_INVALID_VALUE";
109 | 
110 |     case CUFFT_INTERNAL_ERROR:
111 |       return "CUFFT_INTERNAL_ERROR";
112 | 
113 |     case CUFFT_EXEC_FAILED:
114 |       return "CUFFT_EXEC_FAILED";
115 | 
116 |     case CUFFT_SETUP_FAILED:
117 |       return "CUFFT_SETUP_FAILED";
118 | 
119 |     case CUFFT_INVALID_SIZE:
120 |       return "CUFFT_INVALID_SIZE";
121 | 
122 |     case CUFFT_UNALIGNED_DATA:
123 |       return "CUFFT_UNALIGNED_DATA";
124 | 
125 |     case CUFFT_INCOMPLETE_PARAMETER_LIST:
126 |       return "CUFFT_INCOMPLETE_PARAMETER_LIST";
127 | 
128 |     case CUFFT_INVALID_DEVICE:
129 |       return "CUFFT_INVALID_DEVICE";
130 | 
131 |     case CUFFT_PARSE_ERROR:
132 |       return "CUFFT_PARSE_ERROR";
133 | 
134 |     case CUFFT_NO_WORKSPACE:
135 |       return "CUFFT_NO_WORKSPACE";
136 | 
137 |     case CUFFT_NOT_IMPLEMENTED:
138 |       return "CUFFT_NOT_IMPLEMENTED";
139 | 
140 |     case CUFFT_LICENSE_ERROR:
141 |       return "CUFFT_LICENSE_ERROR";
142 | 
143 |     case CUFFT_NOT_SUPPORTED:
144 |       return "CUFFT_NOT_SUPPORTED";
145 |   }
146 | 
147 |   return "<unknown>";
148 | }
149 | #endif
150 | 
151 | #ifdef CUSPARSEAPI
152 | // cuSPARSE API errors
153 | static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
154 |   switch (error) {
155 |     case CUSPARSE_STATUS_SUCCESS:
156 |       return "CUSPARSE_STATUS_SUCCESS";
157 | 
158 |     case CUSPARSE_STATUS_NOT_INITIALIZED:
159 |       return "CUSPARSE_STATUS_NOT_INITIALIZED";
160 | 
161 |     case CUSPARSE_STATUS_ALLOC_FAILED:
162 |       return "CUSPARSE_STATUS_ALLOC_FAILED";
163 | 
164 |     case CUSPARSE_STATUS_INVALID_VALUE:
165 |       return "CUSPARSE_STATUS_INVALID_VALUE";
166 | 
167 |     case CUSPARSE_STATUS_ARCH_MISMATCH:
168 |       return "CUSPARSE_STATUS_ARCH_MISMATCH";
169 | 
170 |     case CUSPARSE_STATUS_MAPPING_ERROR:
171 |       return "CUSPARSE_STATUS_MAPPING_ERROR";
172 | 
173 |     case CUSPARSE_STATUS_EXECUTION_FAILED:
174 |       return "CUSPARSE_STATUS_EXECUTION_FAILED";
175 | 
176 |     case CUSPARSE_STATUS_INTERNAL_ERROR:
177 |       return "CUSPARSE_STATUS_INTERNAL_ERROR";
178 | 
179 |     case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
180 |       return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
181 |   }
182 | 
183 |   return "<unknown>";
184 | }
185 | #endif
186 | 
187 | #ifdef CUSOLVER_COMMON_H_
188 | // cuSOLVER API errors
189 | static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
190 |   switch (error) {
191 |     case CUSOLVER_STATUS_SUCCESS:
192 |       return "CUSOLVER_STATUS_SUCCESS";
193 |     case CUSOLVER_STATUS_NOT_INITIALIZED:
194 |       return "CUSOLVER_STATUS_NOT_INITIALIZED";
195 |     case CUSOLVER_STATUS_ALLOC_FAILED:
196 |       return "CUSOLVER_STATUS_ALLOC_FAILED";
197 |     case CUSOLVER_STATUS_INVALID_VALUE:
198 |       return "CUSOLVER_STATUS_INVALID_VALUE";
199 |     case CUSOLVER_STATUS_ARCH_MISMATCH:
200 |       return "CUSOLVER_STATUS_ARCH_MISMATCH";
201 |     case CUSOLVER_STATUS_MAPPING_ERROR:
202 |       return "CUSOLVER_STATUS_MAPPING_ERROR";
203 |     case CUSOLVER_STATUS_EXECUTION_FAILED:
204 |       return "CUSOLVER_STATUS_EXECUTION_FAILED";
205 |     case CUSOLVER_STATUS_INTERNAL_ERROR:
206 |       return "CUSOLVER_STATUS_INTERNAL_ERROR";
207 |     case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
208 |       return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
209 |     case CUSOLVER_STATUS_NOT_SUPPORTED:
210 |       return "CUSOLVER_STATUS_NOT_SUPPORTED ";
211 |     case CUSOLVER_STATUS_ZERO_PIVOT:
212 |       return "CUSOLVER_STATUS_ZERO_PIVOT";
213 |     case CUSOLVER_STATUS_INVALID_LICENSE:
214 |       return "CUSOLVER_STATUS_INVALID_LICENSE";
215 |   }
216 | 
217 |   return "<unknown>";
218 | }
219 | #endif
220 | 
221 | #ifdef CURAND_H_
222 | // cuRAND API errors
223 | static const char *_cudaGetErrorEnum(curandStatus_t error) {
224 |   switch (error) {
225 |     case CURAND_STATUS_SUCCESS:
226 |       return "CURAND_STATUS_SUCCESS";
227 | 
228 |     case CURAND_STATUS_VERSION_MISMATCH:
229 |       return "CURAND_STATUS_VERSION_MISMATCH";
230 | 
231 |     case CURAND_STATUS_NOT_INITIALIZED:
232 |       return "CURAND_STATUS_NOT_INITIALIZED";
233 | 
234 |     case CURAND_STATUS_ALLOCATION_FAILED:
235 |       return "CURAND_STATUS_ALLOCATION_FAILED";
236 | 
237 |     case CURAND_STATUS_TYPE_ERROR:
238 |       return "CURAND_STATUS_TYPE_ERROR";
239 | 
240 |     case CURAND_STATUS_OUT_OF_RANGE:
241 |       return "CURAND_STATUS_OUT_OF_RANGE";
242 | 
243 |     case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
244 |       return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
245 | 
246 |     case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
247 |       return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
248 | 
249 |     case CURAND_STATUS_LAUNCH_FAILURE:
250 |       return "CURAND_STATUS_LAUNCH_FAILURE";
251 | 
252 |     case CURAND_STATUS_PREEXISTING_FAILURE:
253 |       return "CURAND_STATUS_PREEXISTING_FAILURE";
254 | 
255 |     case CURAND_STATUS_INITIALIZATION_FAILED:
256 |       return "CURAND_STATUS_INITIALIZATION_FAILED";
257 | 
258 |     case CURAND_STATUS_ARCH_MISMATCH:
259 |       return "CURAND_STATUS_ARCH_MISMATCH";
260 | 
261 |     case CURAND_STATUS_INTERNAL_ERROR:
262 |       return "CURAND_STATUS_INTERNAL_ERROR";
263 |   }
264 | 
265 |   return "<unknown>";
266 | }
267 | #endif
268 | 
269 | #ifdef NVJPEGAPI
270 | // nvJPEG API errors
271 | static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
272 |   switch (error) {
273 |     case NVJPEG_STATUS_SUCCESS:
274 |       return "NVJPEG_STATUS_SUCCESS";
275 | 
276 |     case NVJPEG_STATUS_NOT_INITIALIZED:
277 |       return "NVJPEG_STATUS_NOT_INITIALIZED";
278 | 
279 |     case NVJPEG_STATUS_INVALID_PARAMETER:
280 |       return "NVJPEG_STATUS_INVALID_PARAMETER";
281 | 
282 |     case NVJPEG_STATUS_BAD_JPEG:
283 |       return "NVJPEG_STATUS_BAD_JPEG";
284 | 
285 |     case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
286 |       return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
287 | 
288 |     case NVJPEG_STATUS_ALLOCATOR_FAILURE:
289 |       return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
290 | 
291 |     case NVJPEG_STATUS_EXECUTION_FAILED:
292 |       return "NVJPEG_STATUS_EXECUTION_FAILED";
293 | 
294 |     case NVJPEG_STATUS_ARCH_MISMATCH:
295 |       return "NVJPEG_STATUS_ARCH_MISMATCH";
296 | 
297 |     case NVJPEG_STATUS_INTERNAL_ERROR:
298 |       return "NVJPEG_STATUS_INTERNAL_ERROR";
299 |   }
300 | 
301 |   return "<unknown>";
302 | }
303 | #endif
304 | 
305 | #ifdef NV_NPPIDEFS_H
306 | // NPP API errors
307 | static const char *_cudaGetErrorEnum(NppStatus error) {
308 |   switch (error) {
309 |     case NPP_NOT_SUPPORTED_MODE_ERROR:
310 |       return "NPP_NOT_SUPPORTED_MODE_ERROR";
311 | 
312 |     case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
313 |       return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
314 | 
315 |     case NPP_RESIZE_NO_OPERATION_ERROR:
316 |       return "NPP_RESIZE_NO_OPERATION_ERROR";
317 | 
318 |     case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
319 |       return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
320 | 
321 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
322 | 
323 |     case NPP_BAD_ARG_ERROR:
324 |       return "NPP_BAD_ARGUMENT_ERROR";
325 | 
326 |     case NPP_COEFF_ERROR:
327 |       return "NPP_COEFFICIENT_ERROR";
328 | 
329 |     case NPP_RECT_ERROR:
330 |       return "NPP_RECTANGLE_ERROR";
331 | 
332 |     case NPP_QUAD_ERROR:
333 |       return "NPP_QUADRANGLE_ERROR";
334 | 
335 |     case NPP_MEM_ALLOC_ERR:
336 |       return "NPP_MEMORY_ALLOCATION_ERROR";
337 | 
338 |     case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
339 |       return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
340 | 
341 |     case NPP_INVALID_INPUT:
342 |       return "NPP_INVALID_INPUT";
343 | 
344 |     case NPP_POINTER_ERROR:
345 |       return "NPP_POINTER_ERROR";
346 | 
347 |     case NPP_WARNING:
348 |       return "NPP_WARNING";
349 | 
350 |     case NPP_ODD_ROI_WARNING:
351 |       return "NPP_ODD_ROI_WARNING";
352 | #else
353 | 
354 |     // These are for CUDA 5.5 or higher
355 |     case NPP_BAD_ARGUMENT_ERROR:
356 |       return "NPP_BAD_ARGUMENT_ERROR";
357 | 
358 |     case NPP_COEFFICIENT_ERROR:
359 |       return "NPP_COEFFICIENT_ERROR";
360 | 
361 |     case NPP_RECTANGLE_ERROR:
362 |       return "NPP_RECTANGLE_ERROR";
363 | 
364 |     case NPP_QUADRANGLE_ERROR:
365 |       return "NPP_QUADRANGLE_ERROR";
366 | 
367 |     case NPP_MEMORY_ALLOCATION_ERR:
368 |       return "NPP_MEMORY_ALLOCATION_ERROR";
369 | 
370 |     case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
371 |       return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
372 | 
373 |     case NPP_INVALID_HOST_POINTER_ERROR:
374 |       return "NPP_INVALID_HOST_POINTER_ERROR";
375 | 
376 |     case NPP_INVALID_DEVICE_POINTER_ERROR:
377 |       return "NPP_INVALID_DEVICE_POINTER_ERROR";
378 | #endif
379 | 
380 |     case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
381 |       return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
382 | 
383 |     case NPP_TEXTURE_BIND_ERROR:
384 |       return "NPP_TEXTURE_BIND_ERROR";
385 | 
386 |     case NPP_WRONG_INTERSECTION_ROI_ERROR:
387 |       return "NPP_WRONG_INTERSECTION_ROI_ERROR";
388 | 
389 |     case NPP_NOT_EVEN_STEP_ERROR:
390 |       return "NPP_NOT_EVEN_STEP_ERROR";
391 | 
392 |     case NPP_INTERPOLATION_ERROR:
393 |       return "NPP_INTERPOLATION_ERROR";
394 | 
395 |     case NPP_RESIZE_FACTOR_ERROR:
396 |       return "NPP_RESIZE_FACTOR_ERROR";
397 | 
398 |     case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
399 |       return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
400 | 
401 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
402 | 
403 |     case NPP_MEMFREE_ERR:
404 |       return "NPP_MEMFREE_ERR";
405 | 
406 |     case NPP_MEMSET_ERR:
407 |       return "NPP_MEMSET_ERR";
408 | 
409 |     case NPP_MEMCPY_ERR:
410 |       return "NPP_MEMCPY_ERROR";
411 | 
412 |     case NPP_MIRROR_FLIP_ERR:
413 |       return "NPP_MIRROR_FLIP_ERR";
414 | #else
415 | 
416 |     case NPP_MEMFREE_ERROR:
417 |       return "NPP_MEMFREE_ERROR";
418 | 
419 |     case NPP_MEMSET_ERROR:
420 |       return "NPP_MEMSET_ERROR";
421 | 
422 |     case NPP_MEMCPY_ERROR:
423 |       return "NPP_MEMCPY_ERROR";
424 | 
425 |     case NPP_MIRROR_FLIP_ERROR:
426 |       return "NPP_MIRROR_FLIP_ERROR";
427 | #endif
428 | 
429 |     case NPP_ALIGNMENT_ERROR:
430 |       return "NPP_ALIGNMENT_ERROR";
431 | 
432 |     case NPP_STEP_ERROR:
433 |       return "NPP_STEP_ERROR";
434 | 
435 |     case NPP_SIZE_ERROR:
436 |       return "NPP_SIZE_ERROR";
437 | 
438 |     case NPP_NULL_POINTER_ERROR:
439 |       return "NPP_NULL_POINTER_ERROR";
440 | 
441 |     case NPP_CUDA_KERNEL_EXECUTION_ERROR:
442 |       return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
443 | 
444 |     case NPP_NOT_IMPLEMENTED_ERROR:
445 |       return "NPP_NOT_IMPLEMENTED_ERROR";
446 | 
447 |     case NPP_ERROR:
448 |       return "NPP_ERROR";
449 | 
450 |     case NPP_SUCCESS:
451 |       return "NPP_SUCCESS";
452 | 
453 |     case NPP_WRONG_INTERSECTION_QUAD_WARNING:
454 |       return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
455 | 
456 |     case NPP_MISALIGNED_DST_ROI_WARNING:
457 |       return "NPP_MISALIGNED_DST_ROI_WARNING";
458 | 
459 |     case NPP_AFFINE_QUAD_INCORRECT_WARNING:
460 |       return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
461 | 
462 |     case NPP_DOUBLE_SIZE_WARNING:
463 |       return "NPP_DOUBLE_SIZE_WARNING";
464 | 
465 |     case NPP_WRONG_INTERSECTION_ROI_WARNING:
466 |       return "NPP_WRONG_INTERSECTION_ROI_WARNING";
467 | 
468 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
469 |     /* These are 6.0 or higher */
470 |     case NPP_LUT_PALETTE_BITSIZE_ERROR:
471 |       return "NPP_LUT_PALETTE_BITSIZE_ERROR";
472 | 
473 |     case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
474 |       return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
475 | 
476 |     case NPP_QUALITY_INDEX_ERROR:
477 |       return "NPP_QUALITY_INDEX_ERROR";
478 | 
479 |     case NPP_CHANNEL_ORDER_ERROR:
480 |       return "NPP_CHANNEL_ORDER_ERROR";
481 | 
482 |     case NPP_ZERO_MASK_VALUE_ERROR:
483 |       return "NPP_ZERO_MASK_VALUE_ERROR";
484 | 
485 |     case NPP_NUMBER_OF_CHANNELS_ERROR:
486 |       return "NPP_NUMBER_OF_CHANNELS_ERROR";
487 | 
488 |     case NPP_COI_ERROR:
489 |       return "NPP_COI_ERROR";
490 | 
491 |     case NPP_DIVISOR_ERROR:
492 |       return "NPP_DIVISOR_ERROR";
493 | 
494 |     case NPP_CHANNEL_ERROR:
495 |       return "NPP_CHANNEL_ERROR";
496 | 
497 |     case NPP_STRIDE_ERROR:
498 |       return "NPP_STRIDE_ERROR";
499 | 
500 |     case NPP_ANCHOR_ERROR:
501 |       return "NPP_ANCHOR_ERROR";
502 | 
503 |     case NPP_MASK_SIZE_ERROR:
504 |       return "NPP_MASK_SIZE_ERROR";
505 | 
506 |     case NPP_MOMENT_00_ZERO_ERROR:
507 |       return "NPP_MOMENT_00_ZERO_ERROR";
508 | 
509 |     case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
510 |       return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
511 | 
512 |     case NPP_THRESHOLD_ERROR:
513 |       return "NPP_THRESHOLD_ERROR";
514 | 
515 |     case NPP_CONTEXT_MATCH_ERROR:
516 |       return "NPP_CONTEXT_MATCH_ERROR";
517 | 
518 |     case NPP_FFT_FLAG_ERROR:
519 |       return "NPP_FFT_FLAG_ERROR";
520 | 
521 |     case NPP_FFT_ORDER_ERROR:
522 |       return "NPP_FFT_ORDER_ERROR";
523 | 
524 |     case NPP_SCALE_RANGE_ERROR:
525 |       return "NPP_SCALE_RANGE_ERROR";
526 | 
527 |     case NPP_DATA_TYPE_ERROR:
528 |       return "NPP_DATA_TYPE_ERROR";
529 | 
530 |     case NPP_OUT_OFF_RANGE_ERROR:
531 |       return "NPP_OUT_OFF_RANGE_ERROR";
532 | 
533 |     case NPP_DIVIDE_BY_ZERO_ERROR:
534 |       return "NPP_DIVIDE_BY_ZERO_ERROR";
535 | 
536 |     case NPP_RANGE_ERROR:
537 |       return "NPP_RANGE_ERROR";
538 | 
539 |     case NPP_NO_MEMORY_ERROR:
540 |       return "NPP_NO_MEMORY_ERROR";
541 | 
542 |     case NPP_ERROR_RESERVED:
543 |       return "NPP_ERROR_RESERVED";
544 | 
545 |     case NPP_NO_OPERATION_WARNING:
546 |       return "NPP_NO_OPERATION_WARNING";
547 | 
548 |     case NPP_DIVIDE_BY_ZERO_WARNING:
549 |       return "NPP_DIVIDE_BY_ZERO_WARNING";
550 | #endif
551 | 
552 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
553 |     /* These are 7.0 or higher */
554 |     case NPP_OVERFLOW_ERROR:
555 |       return "NPP_OVERFLOW_ERROR";
556 | 
557 |     case NPP_CORRUPTED_DATA_ERROR:
558 |       return "NPP_CORRUPTED_DATA_ERROR";
559 | #endif
560 |   }
561 | 
562 |   return "<unknown>";
563 | }
564 | #endif
565 | 
// DEVICE_RESET is expanded by the error helpers in this header right before
// they call exit(). Unless it has already been defined, it becomes a
// cudaDeviceReset() call (including the trailing semicolon) when
// __DRIVER_TYPES_H__ is defined, and expands to nothing otherwise.
#ifndef DEVICE_RESET
#ifdef __DRIVER_TYPES_H__
#define DEVICE_RESET cudaDeviceReset();
#else
#define DEVICE_RESET
#endif
#endif
575 | 
576 | template <typename T>
577 | void check(T result, char const *const func, const char *const file,
578 |            int const line) {
579 |   if (result) {
580 |     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
581 |             static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
582 |     DEVICE_RESET
583 |     // Make sure we call CUDA Device Reset before exiting
584 |     exit(EXIT_FAILURE);
585 |   }
586 | }
587 | 
588 | #ifdef __DRIVER_TYPES_H__
589 | // This will output the proper CUDA error strings in the event
590 | // that a CUDA host call returns an error
591 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
592 | 
593 | // This will output the proper error string when calling cudaGetLastError
594 | #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
595 | 
596 | inline void __getLastCudaError(const char *errorMessage, const char *file,
597 |                                const int line) {
598 |   cudaError_t err = cudaGetLastError();
599 | 
600 |   if (cudaSuccess != err) {
601 |     fprintf(stderr,
602 |             "%s(%i) : getLastCudaError() CUDA error :"
603 |             " %s : (%d) %s.\n",
604 |             file, line, errorMessage, static_cast<int>(err),
605 |             cudaGetErrorString(err));
606 |     DEVICE_RESET
607 |     exit(EXIT_FAILURE);
608 |   }
609 | }
610 | 
611 | // This will only print the proper error string when calling cudaGetLastError
612 | // but not exit program incase error detected.
613 | #define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
614 | 
615 | inline void __printLastCudaError(const char *errorMessage, const char *file,
616 |                                  const int line) {
617 |   cudaError_t err = cudaGetLastError();
618 | 
619 |   if (cudaSuccess != err) {
620 |     fprintf(stderr,
621 |             "%s(%i) : getLastCudaError() CUDA error :"
622 |             " %s : (%d) %s.\n",
623 |             file, line, errorMessage, static_cast<int>(err),
624 |             cudaGetErrorString(err));
625 |   }
626 | }
627 | #endif
628 | 
// MAX(a, b) evaluates to the larger of its two arguments. Each argument and
// the whole expansion are parenthesized so that arguments containing
// lower-precedence operators (e.g. MAX(x & m, y)) group as written at the
// call site. The selected argument is evaluated twice, so do not pass
// expressions with side effects.
#ifndef MAX
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
632 | 
// Float To Int conversion
//
// Rounds to the nearest integer with halfway cases going away from zero:
// 0.5 is added for non-negative inputs and subtracted for negative ones
// before the conversion to int truncates. The sum is computed in double
// because the 0.5 constant is a double.
inline int ftoi(float value) {
  const double bias = (value >= 0) ? 0.5 : -0.5;
  return static_cast<int>(value + bias);
}
638 | 
639 | // Beginning of GPU Architecture definitions
// Returns the number of cores per SM for the given SM version.
//
// major, minor: SM version; they are combined into the key
//               (major << 4) + minor, i.e. 0xMm in hexadecimal notation.
//
// If the key is not in the table, a notice is printed to stdout and the
// value of the last table entry is returned so that callers can keep running.
inline int _ConvertSMVer2Cores(int major, int minor) {
  struct ArchEntry {
    int sm;     // 0xMm: M = SM major version, m = SM minor version
    int cores;  // cores per SM for that version
  };

  static const ArchEntry kArchTable[] = {
      {0x30, 192}, {0x32, 192}, {0x35, 192}, {0x37, 192}, {0x50, 128},
      {0x52, 128}, {0x53, 128}, {0x60, 64},  {0x61, 128}, {0x62, 128},
      {0x70, 64},  {0x72, 64},  {0x75, 64}};
  const int entry_count =
      static_cast<int>(sizeof(kArchTable) / sizeof(kArchTable[0]));

  const int wanted = (major << 4) + minor;

  for (int i = 0; i < entry_count; ++i) {
    if (kArchTable[i].sm == wanted) {
      return kArchTable[i].cores;
    }
  }

  // Unknown version: fall back to the last entry of the table.
  const int fallback = kArchTable[entry_count - 1].cores;
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      "  Default to use %d Cores/SM\n",
      major, minor, fallback);
  return fallback;
}
683 |   // end of GPU Architecture definitions
684 | 
685 | #ifdef __CUDA_RUNTIME_H__
686 | // General GPU Device CUDA Initialization
687 | inline int gpuDeviceInit(int devID) {
688 |   int device_count;
689 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
690 | 
691 |   if (device_count == 0) {
692 |     fprintf(stderr,
693 |             "gpuDeviceInit() CUDA error: "
694 |             "no devices supporting CUDA.\n");
695 |     exit(EXIT_FAILURE);
696 |   }
697 | 
698 |   if (devID < 0) {
699 |     devID = 0;
700 |   }
701 | 
702 |   if (devID > device_count - 1) {
703 |     fprintf(stderr, "\n");
704 |     fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
705 |             device_count);
706 |     fprintf(stderr,
707 |             ">> gpuDeviceInit (-device=%d) is not a valid"
708 |             " GPU device. <<\n",
709 |             devID);
710 |     fprintf(stderr, "\n");
711 |     return -devID;
712 |   }
713 | 
714 |   cudaDeviceProp deviceProp;
715 |   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
716 | 
717 |   if (deviceProp.computeMode == cudaComputeModeProhibited) {
718 |     fprintf(stderr,
719 |             "Error: device is running in <Compute Mode "
720 |             "Prohibited>, no threads can use cudaSetDevice().\n");
721 |     return -1;
722 |   }
723 | 
724 |   if (deviceProp.major < 1) {
725 |     fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
726 |     exit(EXIT_FAILURE);
727 |   }
728 | 
729 |   checkCudaErrors(cudaSetDevice(devID));
730 |   printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
731 | 
732 |   return devID;
733 | }
734 | 
735 | // This function returns the best GPU (with maximum GFLOPS)
736 | inline int gpuGetMaxGflopsDeviceId() {
737 |   int current_device = 0, sm_per_multiproc = 0;
738 |   int max_perf_device = 0;
739 |   int device_count = 0;
740 |   int devices_prohibited = 0;
741 | 
742 |   uint64_t max_compute_perf = 0;
743 |   cudaDeviceProp deviceProp;
744 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
745 | 
746 |   if (device_count == 0) {
747 |     fprintf(stderr,
748 |             "gpuGetMaxGflopsDeviceId() CUDA error:"
749 |             " no devices supporting CUDA.\n");
750 |     exit(EXIT_FAILURE);
751 |   }
752 | 
753 |   // Find the best CUDA capable GPU device
754 |   current_device = 0;
755 | 
756 |   while (current_device < device_count) {
757 |     cudaGetDeviceProperties(&deviceProp, current_device);
758 | 
759 |     // If this GPU is not running on Compute Mode prohibited,
760 |     // then we can add it to the list
761 |     if (deviceProp.computeMode != cudaComputeModeProhibited) {
762 |       if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
763 |         sm_per_multiproc = 1;
764 |       } else {
765 |         sm_per_multiproc =
766 |             _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
767 |       }
768 | 
769 |       uint64_t compute_perf = (uint64_t)deviceProp.multiProcessorCount *
770 |                               sm_per_multiproc * deviceProp.clockRate;
771 | 
772 |       if (compute_perf > max_compute_perf) {
773 |         max_compute_perf = compute_perf;
774 |         max_perf_device = current_device;
775 |       }
776 |     } else {
777 |       devices_prohibited++;
778 |     }
779 | 
780 |     ++current_device;
781 |   }
782 | 
783 |   if (devices_prohibited == device_count) {
784 |     fprintf(stderr,
785 |             "gpuGetMaxGflopsDeviceId() CUDA error:"
786 |             " all devices have compute mode prohibited.\n");
787 |     exit(EXIT_FAILURE);
788 |   }
789 | 
790 |   return max_perf_device;
791 | }
792 | 
793 | // Initialization code to find the best CUDA Device
794 | inline int findCudaDevice(int argc, const char **argv) {
795 |   cudaDeviceProp deviceProp;
796 |   int devID = 0;
797 | 
798 |   // If the command-line has a device number specified, use it
799 |   if (checkCmdLineFlag(argc, argv, "device")) {
800 |     devID = getCmdLineArgumentInt(argc, argv, "device=");
801 | 
802 |     if (devID < 0) {
803 |       printf("Invalid command line parameter\n ");
804 |       exit(EXIT_FAILURE);
805 |     } else {
806 |       devID = gpuDeviceInit(devID);
807 | 
808 |       if (devID < 0) {
809 |         printf("exiting...\n");
810 |         exit(EXIT_FAILURE);
811 |       }
812 |     }
813 |   } else {
814 |     // Otherwise pick the device with highest Gflops/s
815 |     devID = gpuGetMaxGflopsDeviceId();
816 |     checkCudaErrors(cudaSetDevice(devID));
817 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
818 |     printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
819 |            deviceProp.name, deviceProp.major, deviceProp.minor);
820 |   }
821 | 
822 |   return devID;
823 | }
824 | 
825 | inline int findIntegratedGPU() {
826 |   int current_device = 0;
827 |   int device_count = 0;
828 |   int devices_prohibited = 0;
829 | 
830 |   cudaDeviceProp deviceProp;
831 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
832 | 
833 |   if (device_count == 0) {
834 |     fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
835 |     exit(EXIT_FAILURE);
836 |   }
837 | 
838 |   // Find the integrated GPU which is compute capable
839 |   while (current_device < device_count) {
840 |     cudaGetDeviceProperties(&deviceProp, current_device);
841 | 
842 |     // If GPU is integrated and is not running on Compute Mode prohibited,
843 |     // then cuda can map to GLES resource
844 |     if (deviceProp.integrated &&
845 |         (deviceProp.computeMode != cudaComputeModeProhibited)) {
846 |       checkCudaErrors(cudaSetDevice(current_device));
847 |       checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
848 |       printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
849 |              current_device, deviceProp.name, deviceProp.major,
850 |              deviceProp.minor);
851 | 
852 |       return current_device;
853 |     } else {
854 |       devices_prohibited++;
855 |     }
856 | 
857 |     current_device++;
858 |   }
859 | 
860 |   if (devices_prohibited == device_count) {
861 |     fprintf(stderr,
862 |             "CUDA error:"
863 |             " No GLES-CUDA Interop capable GPU found.\n");
864 |     exit(EXIT_FAILURE);
865 |   }
866 | 
867 |   return -1;
868 | }
869 | 
870 | // General check for CUDA GPU SM Capabilities
871 | inline bool checkCudaCapabilities(int major_version, int minor_version) {
872 |   cudaDeviceProp deviceProp;
873 |   deviceProp.major = 0;
874 |   deviceProp.minor = 0;
875 |   int dev;
876 | 
877 |   checkCudaErrors(cudaGetDevice(&dev));
878 |   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
879 | 
880 |   if ((deviceProp.major > major_version) ||
881 |       (deviceProp.major == major_version &&
882 |        deviceProp.minor >= minor_version)) {
883 |     printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
884 |            deviceProp.name, deviceProp.major, deviceProp.minor);
885 |     return true;
886 |   } else {
887 |     printf(
888 |         "  No GPU device was found that can support "
889 |         "CUDA compute capability %d.%d.\n",
890 |         major_version, minor_version);
891 |     return false;
892 |   }
893 | }
894 | #endif
895 | 
896 |   // end of CUDA Helper Functions
897 | 
898 | #endif  // COMMON_HELPER_CUDA_H_
899 | 


--------------------------------------------------------------------------------
/simpleCUBLASSgemm/helper_cuda.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | ////////////////////////////////////////////////////////////////////////////////
 13 | // These are CUDA Helper functions for initialization and error checking
 14 | 
 15 | #ifndef COMMON_HELPER_CUDA_H_
 16 | #define COMMON_HELPER_CUDA_H_
 17 | 
 18 | #pragma once
 19 | 
 20 | #include <stdint.h>
 21 | #include <stdio.h>
 22 | #include <stdlib.h>
 23 | #include <string.h>
 24 | 
 25 | #include <helper_string.h>
 26 | 
 27 | #ifndef EXIT_WAIVED
 28 | #define EXIT_WAIVED 2
 29 | #endif
 30 | 
 31 | // Note, it is required that your SDK sample to include the proper header
 32 | // files, please refer the CUDA examples for examples of the needed CUDA
 33 | // headers, which may change depending on which CUDA functions are used.
 34 | 
 35 | // CUDA Runtime error messages
 36 | #ifdef __DRIVER_TYPES_H__
 37 | static const char *_cudaGetErrorEnum(cudaError_t error) {
 38 |   return cudaGetErrorName(error);
 39 | }
 40 | #endif
 41 | 
 42 | #ifdef CUDA_DRIVER_API
 43 | // CUDA Driver API errors
 44 | static const char *_cudaGetErrorEnum(CUresult error) {
 45 |   static char unknown[] = "<unknown>";
 46 |   const char *ret = NULL;
 47 |   cuGetErrorName(error, &ret);
 48 |   return ret ? ret : unknown;
 49 | }
 50 | #endif
 51 | 
 52 | #ifdef CUBLAS_API_H_
 53 | // cuBLAS API errors
 54 | static const char *_cudaGetErrorEnum(cublasStatus_t error) {
 55 |   switch (error) {
 56 |     case CUBLAS_STATUS_SUCCESS:
 57 |       return "CUBLAS_STATUS_SUCCESS";
 58 | 
 59 |     case CUBLAS_STATUS_NOT_INITIALIZED:
 60 |       return "CUBLAS_STATUS_NOT_INITIALIZED";
 61 | 
 62 |     case CUBLAS_STATUS_ALLOC_FAILED:
 63 |       return "CUBLAS_STATUS_ALLOC_FAILED";
 64 | 
 65 |     case CUBLAS_STATUS_INVALID_VALUE:
 66 |       return "CUBLAS_STATUS_INVALID_VALUE";
 67 | 
 68 |     case CUBLAS_STATUS_ARCH_MISMATCH:
 69 |       return "CUBLAS_STATUS_ARCH_MISMATCH";
 70 | 
 71 |     case CUBLAS_STATUS_MAPPING_ERROR:
 72 |       return "CUBLAS_STATUS_MAPPING_ERROR";
 73 | 
 74 |     case CUBLAS_STATUS_EXECUTION_FAILED:
 75 |       return "CUBLAS_STATUS_EXECUTION_FAILED";
 76 | 
 77 |     case CUBLAS_STATUS_INTERNAL_ERROR:
 78 |       return "CUBLAS_STATUS_INTERNAL_ERROR";
 79 | 
 80 |     case CUBLAS_STATUS_NOT_SUPPORTED:
 81 |       return "CUBLAS_STATUS_NOT_SUPPORTED";
 82 | 
 83 |     case CUBLAS_STATUS_LICENSE_ERROR:
 84 |       return "CUBLAS_STATUS_LICENSE_ERROR";
 85 |   }
 86 | 
 87 |   return "<unknown>";
 88 | }
 89 | #endif
 90 | 
 91 | #ifdef _CUFFT_H_
 92 | // cuFFT API errors
 93 | static const char *_cudaGetErrorEnum(cufftResult error) {
 94 |   switch (error) {
 95 |     case CUFFT_SUCCESS:
 96 |       return "CUFFT_SUCCESS";
 97 | 
 98 |     case CUFFT_INVALID_PLAN:
 99 |       return "CUFFT_INVALID_PLAN";
100 | 
101 |     case CUFFT_ALLOC_FAILED:
102 |       return "CUFFT_ALLOC_FAILED";
103 | 
104 |     case CUFFT_INVALID_TYPE:
105 |       return "CUFFT_INVALID_TYPE";
106 | 
107 |     case CUFFT_INVALID_VALUE:
108 |       return "CUFFT_INVALID_VALUE";
109 | 
110 |     case CUFFT_INTERNAL_ERROR:
111 |       return "CUFFT_INTERNAL_ERROR";
112 | 
113 |     case CUFFT_EXEC_FAILED:
114 |       return "CUFFT_EXEC_FAILED";
115 | 
116 |     case CUFFT_SETUP_FAILED:
117 |       return "CUFFT_SETUP_FAILED";
118 | 
119 |     case CUFFT_INVALID_SIZE:
120 |       return "CUFFT_INVALID_SIZE";
121 | 
122 |     case CUFFT_UNALIGNED_DATA:
123 |       return "CUFFT_UNALIGNED_DATA";
124 | 
125 |     case CUFFT_INCOMPLETE_PARAMETER_LIST:
126 |       return "CUFFT_INCOMPLETE_PARAMETER_LIST";
127 | 
128 |     case CUFFT_INVALID_DEVICE:
129 |       return "CUFFT_INVALID_DEVICE";
130 | 
131 |     case CUFFT_PARSE_ERROR:
132 |       return "CUFFT_PARSE_ERROR";
133 | 
134 |     case CUFFT_NO_WORKSPACE:
135 |       return "CUFFT_NO_WORKSPACE";
136 | 
137 |     case CUFFT_NOT_IMPLEMENTED:
138 |       return "CUFFT_NOT_IMPLEMENTED";
139 | 
140 |     case CUFFT_LICENSE_ERROR:
141 |       return "CUFFT_LICENSE_ERROR";
142 | 
143 |     case CUFFT_NOT_SUPPORTED:
144 |       return "CUFFT_NOT_SUPPORTED";
145 |   }
146 | 
147 |   return "<unknown>";
148 | }
149 | #endif
150 | 
151 | #ifdef CUSPARSEAPI
152 | // cuSPARSE API errors
153 | static const char *_cudaGetErrorEnum(cusparseStatus_t error) {
154 |   switch (error) {
155 |     case CUSPARSE_STATUS_SUCCESS:
156 |       return "CUSPARSE_STATUS_SUCCESS";
157 | 
158 |     case CUSPARSE_STATUS_NOT_INITIALIZED:
159 |       return "CUSPARSE_STATUS_NOT_INITIALIZED";
160 | 
161 |     case CUSPARSE_STATUS_ALLOC_FAILED:
162 |       return "CUSPARSE_STATUS_ALLOC_FAILED";
163 | 
164 |     case CUSPARSE_STATUS_INVALID_VALUE:
165 |       return "CUSPARSE_STATUS_INVALID_VALUE";
166 | 
167 |     case CUSPARSE_STATUS_ARCH_MISMATCH:
168 |       return "CUSPARSE_STATUS_ARCH_MISMATCH";
169 | 
170 |     case CUSPARSE_STATUS_MAPPING_ERROR:
171 |       return "CUSPARSE_STATUS_MAPPING_ERROR";
172 | 
173 |     case CUSPARSE_STATUS_EXECUTION_FAILED:
174 |       return "CUSPARSE_STATUS_EXECUTION_FAILED";
175 | 
176 |     case CUSPARSE_STATUS_INTERNAL_ERROR:
177 |       return "CUSPARSE_STATUS_INTERNAL_ERROR";
178 | 
179 |     case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
180 |       return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
181 |   }
182 | 
183 |   return "<unknown>";
184 | }
185 | #endif
186 | 
187 | #ifdef CUSOLVER_COMMON_H_
188 | // cuSOLVER API errors
189 | static const char *_cudaGetErrorEnum(cusolverStatus_t error) {
190 |   switch (error) {
191 |     case CUSOLVER_STATUS_SUCCESS:
192 |       return "CUSOLVER_STATUS_SUCCESS";
193 |     case CUSOLVER_STATUS_NOT_INITIALIZED:
194 |       return "CUSOLVER_STATUS_NOT_INITIALIZED";
195 |     case CUSOLVER_STATUS_ALLOC_FAILED:
196 |       return "CUSOLVER_STATUS_ALLOC_FAILED";
197 |     case CUSOLVER_STATUS_INVALID_VALUE:
198 |       return "CUSOLVER_STATUS_INVALID_VALUE";
199 |     case CUSOLVER_STATUS_ARCH_MISMATCH:
200 |       return "CUSOLVER_STATUS_ARCH_MISMATCH";
201 |     case CUSOLVER_STATUS_MAPPING_ERROR:
202 |       return "CUSOLVER_STATUS_MAPPING_ERROR";
203 |     case CUSOLVER_STATUS_EXECUTION_FAILED:
204 |       return "CUSOLVER_STATUS_EXECUTION_FAILED";
205 |     case CUSOLVER_STATUS_INTERNAL_ERROR:
206 |       return "CUSOLVER_STATUS_INTERNAL_ERROR";
207 |     case CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED:
208 |       return "CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED";
209 |     case CUSOLVER_STATUS_NOT_SUPPORTED:
210 |       return "CUSOLVER_STATUS_NOT_SUPPORTED ";
211 |     case CUSOLVER_STATUS_ZERO_PIVOT:
212 |       return "CUSOLVER_STATUS_ZERO_PIVOT";
213 |     case CUSOLVER_STATUS_INVALID_LICENSE:
214 |       return "CUSOLVER_STATUS_INVALID_LICENSE";
215 |   }
216 | 
217 |   return "<unknown>";
218 | }
219 | #endif
220 | 
221 | #ifdef CURAND_H_
222 | // cuRAND API errors
223 | static const char *_cudaGetErrorEnum(curandStatus_t error) {
224 |   switch (error) {
225 |     case CURAND_STATUS_SUCCESS:
226 |       return "CURAND_STATUS_SUCCESS";
227 | 
228 |     case CURAND_STATUS_VERSION_MISMATCH:
229 |       return "CURAND_STATUS_VERSION_MISMATCH";
230 | 
231 |     case CURAND_STATUS_NOT_INITIALIZED:
232 |       return "CURAND_STATUS_NOT_INITIALIZED";
233 | 
234 |     case CURAND_STATUS_ALLOCATION_FAILED:
235 |       return "CURAND_STATUS_ALLOCATION_FAILED";
236 | 
237 |     case CURAND_STATUS_TYPE_ERROR:
238 |       return "CURAND_STATUS_TYPE_ERROR";
239 | 
240 |     case CURAND_STATUS_OUT_OF_RANGE:
241 |       return "CURAND_STATUS_OUT_OF_RANGE";
242 | 
243 |     case CURAND_STATUS_LENGTH_NOT_MULTIPLE:
244 |       return "CURAND_STATUS_LENGTH_NOT_MULTIPLE";
245 | 
246 |     case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED:
247 |       return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED";
248 | 
249 |     case CURAND_STATUS_LAUNCH_FAILURE:
250 |       return "CURAND_STATUS_LAUNCH_FAILURE";
251 | 
252 |     case CURAND_STATUS_PREEXISTING_FAILURE:
253 |       return "CURAND_STATUS_PREEXISTING_FAILURE";
254 | 
255 |     case CURAND_STATUS_INITIALIZATION_FAILED:
256 |       return "CURAND_STATUS_INITIALIZATION_FAILED";
257 | 
258 |     case CURAND_STATUS_ARCH_MISMATCH:
259 |       return "CURAND_STATUS_ARCH_MISMATCH";
260 | 
261 |     case CURAND_STATUS_INTERNAL_ERROR:
262 |       return "CURAND_STATUS_INTERNAL_ERROR";
263 |   }
264 | 
265 |   return "<unknown>";
266 | }
267 | #endif
268 | 
269 | #ifdef NVJPEGAPI
270 | // nvJPEG API errors
271 | static const char *_cudaGetErrorEnum(nvjpegStatus_t error) {
272 |   switch (error) {
273 |     case NVJPEG_STATUS_SUCCESS:
274 |       return "NVJPEG_STATUS_SUCCESS";
275 | 
276 |     case NVJPEG_STATUS_NOT_INITIALIZED:
277 |       return "NVJPEG_STATUS_NOT_INITIALIZED";
278 | 
279 |     case NVJPEG_STATUS_INVALID_PARAMETER:
280 |       return "NVJPEG_STATUS_INVALID_PARAMETER";
281 | 
282 |     case NVJPEG_STATUS_BAD_JPEG:
283 |       return "NVJPEG_STATUS_BAD_JPEG";
284 | 
285 |     case NVJPEG_STATUS_JPEG_NOT_SUPPORTED:
286 |       return "NVJPEG_STATUS_JPEG_NOT_SUPPORTED";
287 | 
288 |     case NVJPEG_STATUS_ALLOCATOR_FAILURE:
289 |       return "NVJPEG_STATUS_ALLOCATOR_FAILURE";
290 | 
291 |     case NVJPEG_STATUS_EXECUTION_FAILED:
292 |       return "NVJPEG_STATUS_EXECUTION_FAILED";
293 | 
294 |     case NVJPEG_STATUS_ARCH_MISMATCH:
295 |       return "NVJPEG_STATUS_ARCH_MISMATCH";
296 | 
297 |     case NVJPEG_STATUS_INTERNAL_ERROR:
298 |       return "NVJPEG_STATUS_INTERNAL_ERROR";
299 |   }
300 | 
301 |   return "<unknown>";
302 | }
303 | #endif
304 | 
305 | #ifdef NV_NPPIDEFS_H
306 | // NPP API errors
307 | static const char *_cudaGetErrorEnum(NppStatus error) {
308 |   switch (error) {
309 |     case NPP_NOT_SUPPORTED_MODE_ERROR:
310 |       return "NPP_NOT_SUPPORTED_MODE_ERROR";
311 | 
312 |     case NPP_ROUND_MODE_NOT_SUPPORTED_ERROR:
313 |       return "NPP_ROUND_MODE_NOT_SUPPORTED_ERROR";
314 | 
315 |     case NPP_RESIZE_NO_OPERATION_ERROR:
316 |       return "NPP_RESIZE_NO_OPERATION_ERROR";
317 | 
318 |     case NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY:
319 |       return "NPP_NOT_SUFFICIENT_COMPUTE_CAPABILITY";
320 | 
321 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
322 | 
323 |     case NPP_BAD_ARG_ERROR:
324 |       return "NPP_BAD_ARGUMENT_ERROR";
325 | 
326 |     case NPP_COEFF_ERROR:
327 |       return "NPP_COEFFICIENT_ERROR";
328 | 
329 |     case NPP_RECT_ERROR:
330 |       return "NPP_RECTANGLE_ERROR";
331 | 
332 |     case NPP_QUAD_ERROR:
333 |       return "NPP_QUADRANGLE_ERROR";
334 | 
335 |     case NPP_MEM_ALLOC_ERR:
336 |       return "NPP_MEMORY_ALLOCATION_ERROR";
337 | 
338 |     case NPP_HISTO_NUMBER_OF_LEVELS_ERROR:
339 |       return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
340 | 
341 |     case NPP_INVALID_INPUT:
342 |       return "NPP_INVALID_INPUT";
343 | 
344 |     case NPP_POINTER_ERROR:
345 |       return "NPP_POINTER_ERROR";
346 | 
347 |     case NPP_WARNING:
348 |       return "NPP_WARNING";
349 | 
350 |     case NPP_ODD_ROI_WARNING:
351 |       return "NPP_ODD_ROI_WARNING";
352 | #else
353 | 
354 |     // These are for CUDA 5.5 or higher
355 |     case NPP_BAD_ARGUMENT_ERROR:
356 |       return "NPP_BAD_ARGUMENT_ERROR";
357 | 
358 |     case NPP_COEFFICIENT_ERROR:
359 |       return "NPP_COEFFICIENT_ERROR";
360 | 
361 |     case NPP_RECTANGLE_ERROR:
362 |       return "NPP_RECTANGLE_ERROR";
363 | 
364 |     case NPP_QUADRANGLE_ERROR:
365 |       return "NPP_QUADRANGLE_ERROR";
366 | 
367 |     case NPP_MEMORY_ALLOCATION_ERR:
368 |       return "NPP_MEMORY_ALLOCATION_ERROR";
369 | 
370 |     case NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR:
371 |       return "NPP_HISTOGRAM_NUMBER_OF_LEVELS_ERROR";
372 | 
373 |     case NPP_INVALID_HOST_POINTER_ERROR:
374 |       return "NPP_INVALID_HOST_POINTER_ERROR";
375 | 
376 |     case NPP_INVALID_DEVICE_POINTER_ERROR:
377 |       return "NPP_INVALID_DEVICE_POINTER_ERROR";
378 | #endif
379 | 
380 |     case NPP_LUT_NUMBER_OF_LEVELS_ERROR:
381 |       return "NPP_LUT_NUMBER_OF_LEVELS_ERROR";
382 | 
383 |     case NPP_TEXTURE_BIND_ERROR:
384 |       return "NPP_TEXTURE_BIND_ERROR";
385 | 
386 |     case NPP_WRONG_INTERSECTION_ROI_ERROR:
387 |       return "NPP_WRONG_INTERSECTION_ROI_ERROR";
388 | 
389 |     case NPP_NOT_EVEN_STEP_ERROR:
390 |       return "NPP_NOT_EVEN_STEP_ERROR";
391 | 
392 |     case NPP_INTERPOLATION_ERROR:
393 |       return "NPP_INTERPOLATION_ERROR";
394 | 
395 |     case NPP_RESIZE_FACTOR_ERROR:
396 |       return "NPP_RESIZE_FACTOR_ERROR";
397 | 
398 |     case NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR:
399 |       return "NPP_HAAR_CLASSIFIER_PIXEL_MATCH_ERROR";
400 | 
401 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) <= 0x5000
402 | 
403 |     case NPP_MEMFREE_ERR:
404 |       return "NPP_MEMFREE_ERR";
405 | 
406 |     case NPP_MEMSET_ERR:
407 |       return "NPP_MEMSET_ERR";
408 | 
409 |     case NPP_MEMCPY_ERR:
410 |       return "NPP_MEMCPY_ERROR";
411 | 
412 |     case NPP_MIRROR_FLIP_ERR:
413 |       return "NPP_MIRROR_FLIP_ERR";
414 | #else
415 | 
416 |     case NPP_MEMFREE_ERROR:
417 |       return "NPP_MEMFREE_ERROR";
418 | 
419 |     case NPP_MEMSET_ERROR:
420 |       return "NPP_MEMSET_ERROR";
421 | 
422 |     case NPP_MEMCPY_ERROR:
423 |       return "NPP_MEMCPY_ERROR";
424 | 
425 |     case NPP_MIRROR_FLIP_ERROR:
426 |       return "NPP_MIRROR_FLIP_ERROR";
427 | #endif
428 | 
429 |     case NPP_ALIGNMENT_ERROR:
430 |       return "NPP_ALIGNMENT_ERROR";
431 | 
432 |     case NPP_STEP_ERROR:
433 |       return "NPP_STEP_ERROR";
434 | 
435 |     case NPP_SIZE_ERROR:
436 |       return "NPP_SIZE_ERROR";
437 | 
438 |     case NPP_NULL_POINTER_ERROR:
439 |       return "NPP_NULL_POINTER_ERROR";
440 | 
441 |     case NPP_CUDA_KERNEL_EXECUTION_ERROR:
442 |       return "NPP_CUDA_KERNEL_EXECUTION_ERROR";
443 | 
444 |     case NPP_NOT_IMPLEMENTED_ERROR:
445 |       return "NPP_NOT_IMPLEMENTED_ERROR";
446 | 
447 |     case NPP_ERROR:
448 |       return "NPP_ERROR";
449 | 
450 |     case NPP_SUCCESS:
451 |       return "NPP_SUCCESS";
452 | 
453 |     case NPP_WRONG_INTERSECTION_QUAD_WARNING:
454 |       return "NPP_WRONG_INTERSECTION_QUAD_WARNING";
455 | 
456 |     case NPP_MISALIGNED_DST_ROI_WARNING:
457 |       return "NPP_MISALIGNED_DST_ROI_WARNING";
458 | 
459 |     case NPP_AFFINE_QUAD_INCORRECT_WARNING:
460 |       return "NPP_AFFINE_QUAD_INCORRECT_WARNING";
461 | 
462 |     case NPP_DOUBLE_SIZE_WARNING:
463 |       return "NPP_DOUBLE_SIZE_WARNING";
464 | 
465 |     case NPP_WRONG_INTERSECTION_ROI_WARNING:
466 |       return "NPP_WRONG_INTERSECTION_ROI_WARNING";
467 | 
468 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x6000
469 |     /* These are 6.0 or higher */
470 |     case NPP_LUT_PALETTE_BITSIZE_ERROR:
471 |       return "NPP_LUT_PALETTE_BITSIZE_ERROR";
472 | 
473 |     case NPP_ZC_MODE_NOT_SUPPORTED_ERROR:
474 |       return "NPP_ZC_MODE_NOT_SUPPORTED_ERROR";
475 | 
476 |     case NPP_QUALITY_INDEX_ERROR:
477 |       return "NPP_QUALITY_INDEX_ERROR";
478 | 
479 |     case NPP_CHANNEL_ORDER_ERROR:
480 |       return "NPP_CHANNEL_ORDER_ERROR";
481 | 
482 |     case NPP_ZERO_MASK_VALUE_ERROR:
483 |       return "NPP_ZERO_MASK_VALUE_ERROR";
484 | 
485 |     case NPP_NUMBER_OF_CHANNELS_ERROR:
486 |       return "NPP_NUMBER_OF_CHANNELS_ERROR";
487 | 
488 |     case NPP_COI_ERROR:
489 |       return "NPP_COI_ERROR";
490 | 
491 |     case NPP_DIVISOR_ERROR:
492 |       return "NPP_DIVISOR_ERROR";
493 | 
494 |     case NPP_CHANNEL_ERROR:
495 |       return "NPP_CHANNEL_ERROR";
496 | 
497 |     case NPP_STRIDE_ERROR:
498 |       return "NPP_STRIDE_ERROR";
499 | 
500 |     case NPP_ANCHOR_ERROR:
501 |       return "NPP_ANCHOR_ERROR";
502 | 
503 |     case NPP_MASK_SIZE_ERROR:
504 |       return "NPP_MASK_SIZE_ERROR";
505 | 
506 |     case NPP_MOMENT_00_ZERO_ERROR:
507 |       return "NPP_MOMENT_00_ZERO_ERROR";
508 | 
509 |     case NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR:
510 |       return "NPP_THRESHOLD_NEGATIVE_LEVEL_ERROR";
511 | 
512 |     case NPP_THRESHOLD_ERROR:
513 |       return "NPP_THRESHOLD_ERROR";
514 | 
515 |     case NPP_CONTEXT_MATCH_ERROR:
516 |       return "NPP_CONTEXT_MATCH_ERROR";
517 | 
518 |     case NPP_FFT_FLAG_ERROR:
519 |       return "NPP_FFT_FLAG_ERROR";
520 | 
521 |     case NPP_FFT_ORDER_ERROR:
522 |       return "NPP_FFT_ORDER_ERROR";
523 | 
524 |     case NPP_SCALE_RANGE_ERROR:
525 |       return "NPP_SCALE_RANGE_ERROR";
526 | 
527 |     case NPP_DATA_TYPE_ERROR:
528 |       return "NPP_DATA_TYPE_ERROR";
529 | 
530 |     case NPP_OUT_OFF_RANGE_ERROR:
531 |       return "NPP_OUT_OFF_RANGE_ERROR";
532 | 
533 |     case NPP_DIVIDE_BY_ZERO_ERROR:
534 |       return "NPP_DIVIDE_BY_ZERO_ERROR";
535 | 
536 |     case NPP_RANGE_ERROR:
537 |       return "NPP_RANGE_ERROR";
538 | 
539 |     case NPP_NO_MEMORY_ERROR:
540 |       return "NPP_NO_MEMORY_ERROR";
541 | 
542 |     case NPP_ERROR_RESERVED:
543 |       return "NPP_ERROR_RESERVED";
544 | 
545 |     case NPP_NO_OPERATION_WARNING:
546 |       return "NPP_NO_OPERATION_WARNING";
547 | 
548 |     case NPP_DIVIDE_BY_ZERO_WARNING:
549 |       return "NPP_DIVIDE_BY_ZERO_WARNING";
550 | #endif
551 | 
552 | #if ((NPP_VERSION_MAJOR << 12) + (NPP_VERSION_MINOR << 4)) >= 0x7000
553 |     /* These are 7.0 or higher */
554 |     case NPP_OVERFLOW_ERROR:
555 |       return "NPP_OVERFLOW_ERROR";
556 | 
557 |     case NPP_CORRUPTED_DATA_ERROR:
558 |       return "NPP_CORRUPTED_DATA_ERROR";
559 | #endif
560 |   }
561 | 
562 |   return "<unknown>";
563 | }
564 | #endif
565 | 
// DEVICE_RESET is placed before exit() calls in this header. Unless it was
// already defined, it expands to a cudaDeviceReset() call when
// __DRIVER_TYPES_H__ is defined, and to nothing otherwise.
#ifndef DEVICE_RESET
#ifdef __DRIVER_TYPES_H__
#define DEVICE_RESET cudaDeviceReset();
#else
#define DEVICE_RESET
#endif
#endif
575 | 
576 | template <typename T>
577 | void check(T result, char const *const func, const char *const file,
578 |            int const line) {
579 |   if (result) {
580 |     fprintf(stderr, "CUDA error at %s:%d code=%d(%s) \"%s\" \n", file, line,
581 |             static_cast<unsigned int>(result), _cudaGetErrorEnum(result), func);
582 |     DEVICE_RESET
583 |     // Make sure we call CUDA Device Reset before exiting
584 |     exit(EXIT_FAILURE);
585 |   }
586 | }
587 | 
588 | #ifdef __DRIVER_TYPES_H__
589 | // This will output the proper CUDA error strings in the event
590 | // that a CUDA host call returns an error
591 | #define checkCudaErrors(val) check((val), #val, __FILE__, __LINE__)
592 | 
593 | // This will output the proper error string when calling cudaGetLastError
594 | #define getLastCudaError(msg) __getLastCudaError(msg, __FILE__, __LINE__)
595 | 
596 | inline void __getLastCudaError(const char *errorMessage, const char *file,
597 |                                const int line) {
598 |   cudaError_t err = cudaGetLastError();
599 | 
600 |   if (cudaSuccess != err) {
601 |     fprintf(stderr,
602 |             "%s(%i) : getLastCudaError() CUDA error :"
603 |             " %s : (%d) %s.\n",
604 |             file, line, errorMessage, static_cast<int>(err),
605 |             cudaGetErrorString(err));
606 |     DEVICE_RESET
607 |     exit(EXIT_FAILURE);
608 |   }
609 | }
610 | 
611 | // This will only print the proper error string when calling cudaGetLastError
612 | // but not exit program incase error detected.
613 | #define printLastCudaError(msg) __printLastCudaError(msg, __FILE__, __LINE__)
614 | 
615 | inline void __printLastCudaError(const char *errorMessage, const char *file,
616 |                                  const int line) {
617 |   cudaError_t err = cudaGetLastError();
618 | 
619 |   if (cudaSuccess != err) {
620 |     fprintf(stderr,
621 |             "%s(%i) : getLastCudaError() CUDA error :"
622 |             " %s : (%d) %s.\n",
623 |             file, line, errorMessage, static_cast<int>(err),
624 |             cudaGetErrorString(err));
625 |   }
626 | }
627 | #endif
628 | 
#ifndef MAX
// Larger of the two arguments. Both arguments are fully parenthesized so that
// operands containing lower-precedence operators (e.g. MAX(x & mask, y))
// group as written. Each argument may still be evaluated twice, so avoid
// passing expressions with side effects.
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#endif
632 | 
// Float To Int conversion
// Rounds to the nearest integer; exact halves move away from zero.
inline int ftoi(float value) {
  // Shift by half a unit toward the value's own sign, then let the cast
  // truncate toward zero.
  if (value >= 0) {
    return static_cast<int>(value + 0.5);
  }
  return static_cast<int>(value - 0.5);
}
638 | 
// Beginning of GPU Architecture definitions

// Maps a compute capability (major.minor) to the number of cores per SM.
//
// Returns the table value for the given SM version. When the version is not
// listed, a notice is printed to stdout and the value of the last table row
// is returned so that callers can keep running.
inline int _ConvertSMVer2Cores(int major, int minor) {
  // One table row: the SM version packed as 0xMm (hexadecimal notation,
  // M = SM major version, m = SM minor version) and its cores per SM.
  typedef struct {
    int SM;
    int Cores;
  } sSMtoCores;

  // Constant lookup data, built once. The {-1, -1} row terminates the table.
  static const sSMtoCores nGpuArchCoresPerSM[] = {
      {0x30, 192},
      {0x32, 192},
      {0x35, 192},
      {0x37, 192},
      {0x50, 128},
      {0x52, 128},
      {0x53, 128},
      {0x60,  64},
      {0x61, 128},
      {0x62, 128},
      {0x70,  64},
      {0x72,  64},
      {0x75,  64},
      {0x80,  64},
      {0x86, 128},
      {0x87, 128},
      {0x89, 128},
      {0x90, 128},
      {-1, -1}};

  const int smVersion = (major << 4) + minor;
  int index = 0;

  while (nGpuArchCoresPerSM[index].SM != -1) {
    if (nGpuArchCoresPerSM[index].SM == smVersion) {
      return nGpuArchCoresPerSM[index].Cores;
    }

    index++;
  }

  // Not found: index now sits on the terminator, so index - 1 is the last
  // real row. Fall back to it so the caller can still run.
  printf(
      "MapSMtoCores for SM %d.%d is undefined."
      "  Default to use %d Cores/SM\n",
      major, minor, nGpuArchCoresPerSM[index - 1].Cores);
  return nGpuArchCoresPerSM[index - 1].Cores;
}
683 |   // end of GPU Architecture definitions
684 | 
685 | #ifdef __CUDA_RUNTIME_H__
686 | // General GPU Device CUDA Initialization
687 | inline int gpuDeviceInit(int devID) {
688 |   int device_count;
689 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
690 | 
691 |   if (device_count == 0) {
692 |     fprintf(stderr,
693 |             "gpuDeviceInit() CUDA error: "
694 |             "no devices supporting CUDA.\n");
695 |     exit(EXIT_FAILURE);
696 |   }
697 | 
698 |   if (devID < 0) {
699 |     devID = 0;
700 |   }
701 | 
702 |   if (devID > device_count - 1) {
703 |     fprintf(stderr, "\n");
704 |     fprintf(stderr, ">> %d CUDA capable GPU device(s) detected. <<\n",
705 |             device_count);
706 |     fprintf(stderr,
707 |             ">> gpuDeviceInit (-device=%d) is not a valid"
708 |             " GPU device. <<\n",
709 |             devID);
710 |     fprintf(stderr, "\n");
711 |     return -devID;
712 |   }
713 | 
714 |   cudaDeviceProp deviceProp;
715 |   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
716 | 
717 |   if (deviceProp.computeMode == cudaComputeModeProhibited) {
718 |     fprintf(stderr,
719 |             "Error: device is running in <Compute Mode "
720 |             "Prohibited>, no threads can use cudaSetDevice().\n");
721 |     return -1;
722 |   }
723 | 
724 |   if (deviceProp.major < 1) {
725 |     fprintf(stderr, "gpuDeviceInit(): GPU device does not support CUDA.\n");
726 |     exit(EXIT_FAILURE);
727 |   }
728 | 
729 |   checkCudaErrors(cudaSetDevice(devID));
730 |   printf("gpuDeviceInit() CUDA Device [%d]: \"%s\n", devID, deviceProp.name);
731 | 
732 |   return devID;
733 | }
734 | 
735 | // This function returns the best GPU (with maximum GFLOPS)
736 | inline int gpuGetMaxGflopsDeviceId() {
737 |   int current_device = 0, sm_per_multiproc = 0;
738 |   int max_perf_device = 0;
739 |   int device_count = 0;
740 |   int devices_prohibited = 0;
741 | 
742 |   uint64_t max_compute_perf = 0;
743 |   cudaDeviceProp deviceProp;
744 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
745 | 
746 |   if (device_count == 0) {
747 |     fprintf(stderr,
748 |             "gpuGetMaxGflopsDeviceId() CUDA error:"
749 |             " no devices supporting CUDA.\n");
750 |     exit(EXIT_FAILURE);
751 |   }
752 | 
753 |   // Find the best CUDA capable GPU device
754 |   current_device = 0;
755 | 
756 |   while (current_device < device_count) {
757 |     cudaGetDeviceProperties(&deviceProp, current_device);
758 | 
759 |     // If this GPU is not running on Compute Mode prohibited,
760 |     // then we can add it to the list
761 |     if (deviceProp.computeMode != cudaComputeModeProhibited) {
762 |       if (deviceProp.major == 9999 && deviceProp.minor == 9999) {
763 |         sm_per_multiproc = 1;
764 |       } else {
765 |         sm_per_multiproc =
766 |             _ConvertSMVer2Cores(deviceProp.major, deviceProp.minor);
767 |       }
768 | 
769 |       uint64_t compute_perf = (uint64_t)deviceProp.multiProcessorCount *
770 |                               sm_per_multiproc * deviceProp.clockRate;
771 | 
772 |       if (compute_perf > max_compute_perf) {
773 |         max_compute_perf = compute_perf;
774 |         max_perf_device = current_device;
775 |       }
776 |     } else {
777 |       devices_prohibited++;
778 |     }
779 | 
780 |     ++current_device;
781 |   }
782 | 
783 |   if (devices_prohibited == device_count) {
784 |     fprintf(stderr,
785 |             "gpuGetMaxGflopsDeviceId() CUDA error:"
786 |             " all devices have compute mode prohibited.\n");
787 |     exit(EXIT_FAILURE);
788 |   }
789 | 
790 |   return max_perf_device;
791 | }
792 | 
793 | // Initialization code to find the best CUDA Device
794 | inline int findCudaDevice(int argc, const char **argv) {
795 |   cudaDeviceProp deviceProp;
796 |   int devID = 0;
797 | 
798 |   // If the command-line has a device number specified, use it
799 |   if (checkCmdLineFlag(argc, argv, "device")) {
800 |     devID = getCmdLineArgumentInt(argc, argv, "device=");
801 | 
802 |     if (devID < 0) {
803 |       printf("Invalid command line parameter\n ");
804 |       exit(EXIT_FAILURE);
805 |     } else {
806 |       devID = gpuDeviceInit(devID);
807 | 
808 |       if (devID < 0) {
809 |         printf("exiting...\n");
810 |         exit(EXIT_FAILURE);
811 |       }
812 |     }
813 |   } else {
814 |     // Otherwise pick the device with highest Gflops/s
815 |     devID = gpuGetMaxGflopsDeviceId();
816 |     checkCudaErrors(cudaSetDevice(devID));
817 |     checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
818 |     printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n", devID,
819 |            deviceProp.name, deviceProp.major, deviceProp.minor);
820 |   }
821 | 
822 |   return devID;
823 | }
824 | 
825 | inline int findIntegratedGPU() {
826 |   int current_device = 0;
827 |   int device_count = 0;
828 |   int devices_prohibited = 0;
829 | 
830 |   cudaDeviceProp deviceProp;
831 |   checkCudaErrors(cudaGetDeviceCount(&device_count));
832 | 
833 |   if (device_count == 0) {
834 |     fprintf(stderr, "CUDA error: no devices supporting CUDA.\n");
835 |     exit(EXIT_FAILURE);
836 |   }
837 | 
838 |   // Find the integrated GPU which is compute capable
839 |   while (current_device < device_count) {
840 |     cudaGetDeviceProperties(&deviceProp, current_device);
841 | 
842 |     // If GPU is integrated and is not running on Compute Mode prohibited,
843 |     // then cuda can map to GLES resource
844 |     if (deviceProp.integrated &&
845 |         (deviceProp.computeMode != cudaComputeModeProhibited)) {
846 |       checkCudaErrors(cudaSetDevice(current_device));
847 |       checkCudaErrors(cudaGetDeviceProperties(&deviceProp, current_device));
848 |       printf("GPU Device %d: \"%s\" with compute capability %d.%d\n\n",
849 |              current_device, deviceProp.name, deviceProp.major,
850 |              deviceProp.minor);
851 | 
852 |       return current_device;
853 |     } else {
854 |       devices_prohibited++;
855 |     }
856 | 
857 |     current_device++;
858 |   }
859 | 
860 |   if (devices_prohibited == device_count) {
861 |     fprintf(stderr,
862 |             "CUDA error:"
863 |             " No GLES-CUDA Interop capable GPU found.\n");
864 |     exit(EXIT_FAILURE);
865 |   }
866 | 
867 |   return -1;
868 | }
869 | 
870 | // General check for CUDA GPU SM Capabilities
871 | inline bool checkCudaCapabilities(int major_version, int minor_version) {
872 |   cudaDeviceProp deviceProp;
873 |   deviceProp.major = 0;
874 |   deviceProp.minor = 0;
875 |   int dev;
876 | 
877 |   checkCudaErrors(cudaGetDevice(&dev));
878 |   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
879 | 
880 |   if ((deviceProp.major > major_version) ||
881 |       (deviceProp.major == major_version &&
882 |        deviceProp.minor >= minor_version)) {
883 |     printf("  Device %d: <%16s >, Compute SM %d.%d detected\n", dev,
884 |            deviceProp.name, deviceProp.major, deviceProp.minor);
885 |     return true;
886 |   } else {
887 |     printf(
888 |         "  No GPU device was found that can support "
889 |         "CUDA compute capability %d.%d.\n",
890 |         major_version, minor_version);
891 |     return false;
892 |   }
893 | }
894 | #endif
895 | 
896 |   // end of CUDA Helper Functions
897 | 
898 | #endif  // COMMON_HELPER_CUDA_H_
899 | 


--------------------------------------------------------------------------------
/simpleCUBLASEx/helper_string.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | // These are helper functions for the SDK samples (string parsing, timers, etc)
 13 | #ifndef COMMON_HELPER_STRING_H_
 14 | #define COMMON_HELPER_STRING_H_
 15 | 
 16 | #include <stdio.h>
 17 | #include <stdlib.h>
 18 | #include <fstream>
 19 | #include <string>
 20 | 
 21 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
 22 | #ifndef _CRT_SECURE_NO_DEPRECATE
 23 | #define _CRT_SECURE_NO_DEPRECATE
 24 | #endif
 25 | #ifndef STRCASECMP
 26 | #define STRCASECMP _stricmp
 27 | #endif
 28 | #ifndef STRNCASECMP
 29 | #define STRNCASECMP _strnicmp
 30 | #endif
 31 | #ifndef STRCPY
 32 | #define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
 33 | #endif
 34 | 
 35 | #ifndef FOPEN
 36 | #define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
 37 | #endif
 38 | #ifndef FOPEN_FAIL
 39 | #define FOPEN_FAIL(result) (result != 0)
 40 | #endif
 41 | #ifndef SSCANF
 42 | #define SSCANF sscanf_s
 43 | #endif
 44 | #ifndef SPRINTF
 45 | #define SPRINTF sprintf_s
 46 | #endif
 47 | #else  // Linux Includes
 48 | #include <string.h>
 49 | #include <strings.h>
 50 | 
 51 | #ifndef STRCASECMP
 52 | #define STRCASECMP strcasecmp
 53 | #endif
 54 | #ifndef STRNCASECMP
 55 | #define STRNCASECMP strncasecmp
 56 | #endif
 57 | #ifndef STRCPY
 58 | #define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
 59 | #endif
 60 | 
 61 | #ifndef FOPEN
 62 | #define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
 63 | #endif
 64 | #ifndef FOPEN_FAIL
 65 | #define FOPEN_FAIL(result) (result == NULL)
 66 | #endif
 67 | #ifndef SSCANF
 68 | #define SSCANF sscanf
 69 | #endif
 70 | #ifndef SPRINTF
 71 | #define SPRINTF sprintf
 72 | #endif
 73 | #endif
 74 | 
 75 | #ifndef EXIT_WAIVED
 76 | #define EXIT_WAIVED 2
 77 | #endif
 78 | 
 79 | // CUDA Utility Helper Functions
 80 | inline int stringRemoveDelimiter(char delimiter, const char *string) {
 81 |   int string_start = 0;
 82 | 
 83 |   while (string[string_start] == delimiter) {
 84 |     string_start++;
 85 |   }
 86 | 
 87 |   if (string_start >= static_cast<int>(strlen(string) - 1)) {
 88 |     return 0;
 89 |   }
 90 | 
 91 |   return string_start;
 92 | }
 93 | 
 94 | inline int getFileExtension(char *filename, char **extension) {
 95 |   int string_length = static_cast<int>(strlen(filename));
 96 | 
 97 |   while (filename[string_length--] != '.') {
 98 |     if (string_length == 0) break;
 99 |   }
100 | 
101 |   if (string_length > 0) string_length += 2;
102 | 
103 |   if (string_length == 0)
104 |     *extension = NULL;
105 |   else
106 |     *extension = &filename[string_length];
107 | 
108 |   return string_length;
109 | }
110 | 
111 | inline bool checkCmdLineFlag(const int argc, const char **argv,
112 |                              const char *string_ref) {
113 |   bool bFound = false;
114 | 
115 |   if (argc >= 1) {
116 |     for (int i = 1; i < argc; i++) {
117 |       int string_start = stringRemoveDelimiter('-', argv[i]);
118 |       const char *string_argv = &argv[i][string_start];
119 | 
120 |       const char *equal_pos = strchr(string_argv, '=');
121 |       int argv_length = static_cast<int>(
122 |           equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
123 | 
124 |       int length = static_cast<int>(strlen(string_ref));
125 | 
126 |       if (length == argv_length &&
127 |           !STRNCASECMP(string_argv, string_ref, length)) {
128 |         bFound = true;
129 |         continue;
130 |       }
131 |     }
132 |   }
133 | 
134 |   return bFound;
135 | }
136 | 
137 | // This function wraps the CUDA Driver API into a template function
138 | template <class T>
139 | inline bool getCmdLineArgumentValue(const int argc, const char **argv,
140 |                                     const char *string_ref, T *value) {
141 |   bool bFound = false;
142 | 
143 |   if (argc >= 1) {
144 |     for (int i = 1; i < argc; i++) {
145 |       int string_start = stringRemoveDelimiter('-', argv[i]);
146 |       const char *string_argv = &argv[i][string_start];
147 |       int length = static_cast<int>(strlen(string_ref));
148 | 
149 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
150 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
151 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
152 |           *value = (T)atoi(&string_argv[length + auto_inc]);
153 |         }
154 | 
155 |         bFound = true;
156 |         i = argc;
157 |       }
158 |     }
159 |   }
160 | 
161 |   return bFound;
162 | }
163 | 
164 | inline int getCmdLineArgumentInt(const int argc, const char **argv,
165 |                                  const char *string_ref) {
166 |   bool bFound = false;
167 |   int value = -1;
168 | 
169 |   if (argc >= 1) {
170 |     for (int i = 1; i < argc; i++) {
171 |       int string_start = stringRemoveDelimiter('-', argv[i]);
172 |       const char *string_argv = &argv[i][string_start];
173 |       int length = static_cast<int>(strlen(string_ref));
174 | 
175 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
176 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
177 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
178 |           value = atoi(&string_argv[length + auto_inc]);
179 |         } else {
180 |           value = 0;
181 |         }
182 | 
183 |         bFound = true;
184 |         continue;
185 |       }
186 |     }
187 |   }
188 | 
189 |   if (bFound) {
190 |     return value;
191 |   } else {
192 |     return 0;
193 |   }
194 | }
195 | 
196 | inline float getCmdLineArgumentFloat(const int argc, const char **argv,
197 |                                      const char *string_ref) {
198 |   bool bFound = false;
199 |   float value = -1;
200 | 
201 |   if (argc >= 1) {
202 |     for (int i = 1; i < argc; i++) {
203 |       int string_start = stringRemoveDelimiter('-', argv[i]);
204 |       const char *string_argv = &argv[i][string_start];
205 |       int length = static_cast<int>(strlen(string_ref));
206 | 
207 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
208 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
209 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
210 |           value = static_cast<float>(atof(&string_argv[length + auto_inc]));
211 |         } else {
212 |           value = 0.f;
213 |         }
214 | 
215 |         bFound = true;
216 |         continue;
217 |       }
218 |     }
219 |   }
220 | 
221 |   if (bFound) {
222 |     return value;
223 |   } else {
224 |     return 0;
225 |   }
226 | }
227 | 
228 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
229 |                                      const char *string_ref,
230 |                                      char **string_retval) {
231 |   bool bFound = false;
232 | 
233 |   if (argc >= 1) {
234 |     for (int i = 1; i < argc; i++) {
235 |       int string_start = stringRemoveDelimiter('-', argv[i]);
236 |       char *string_argv = const_cast<char*>(&argv[i][string_start]);
237 |       int length = static_cast<int>(strlen(string_ref));
238 | 
239 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
240 |         *string_retval = &string_argv[length + 1];
241 |         bFound = true;
242 |         continue;
243 |       }
244 |     }
245 |   }
246 | 
247 |   if (!bFound) {
248 |     *string_retval = NULL;
249 |   }
250 | 
251 |   return bFound;
252 | }
253 | 
254 | //////////////////////////////////////////////////////////////////////////////
255 | //! Find the path for a file assuming that
256 | //! files are found in the searchPath.
257 | //!
258 | //! @return the path if succeeded, otherwise 0
259 | //! @param filename         name of the file
260 | //! @param executable_path  optional absolute path of the executable
261 | //////////////////////////////////////////////////////////////////////////////
262 | inline char *sdkFindFilePath(const char *filename,
263 |                              const char *executable_path) {
264 |   // <executable_name> defines a variable that is replaced with the name of the
265 |   // executable
266 | 
267 |   // Typical relative search paths to locate needed companion files (e.g. sample
268 |   // input data, or JIT source files) The origin for the relative search may be
269 |   // the .exe file, a .bat file launching an .exe, a browser .exe launching the
270 |   // .exe or .bat, etc
271 |   const char *searchPath[] = {
272 |       "./",  // same dir
273 |       "./<executable_name>_data_files/",
274 |       "./common/",                      // "/common/" subdir
275 |       "./common/data/",                 // "/common/data/" subdir
276 |       "./data/",                        // "/data/" subdir
277 |       "./src/",                         // "/src/" subdir
278 |       "./src/<executable_name>/data/",  // "/src/<executable_name>/data/" subdir
279 |       "./inc/",                         // "/inc/" subdir
280 |       "./0_Simple/",                    // "/0_Simple/" subdir
281 |       "./1_Utilities/",                 // "/1_Utilities/" subdir
282 |       "./2_Graphics/",                  // "/2_Graphics/" subdir
283 |       "./3_Imaging/",                   // "/3_Imaging/" subdir
284 |       "./4_Finance/",                   // "/4_Finance/" subdir
285 |       "./5_Simulations/",               // "/5_Simulations/" subdir
286 |       "./6_Advanced/",                  // "/6_Advanced/" subdir
287 |       "./7_CUDALibraries/",             // "/7_CUDALibraries/" subdir
288 |       "./8_Android/",                   // "/8_Android/" subdir
289 |       "./samples/",                     // "/samples/" subdir
290 | 
291 |       "./0_Simple/<executable_name>/data/",  // "/0_Simple/<executable_name>/data/"
292 |                                              // subdir
293 |       "./1_Utilities/<executable_name>/data/",  // "/1_Utilities/<executable_name>/data/"
294 |                                                 // subdir
295 |       "./2_Graphics/<executable_name>/data/",  // "/2_Graphics/<executable_name>/data/"
296 |                                                // subdir
297 |       "./3_Imaging/<executable_name>/data/",  // "/3_Imaging/<executable_name>/data/"
298 |                                               // subdir
299 |       "./4_Finance/<executable_name>/data/",  // "/4_Finance/<executable_name>/data/"
300 |                                               // subdir
301 |       "./5_Simulations/<executable_name>/data/",  // "/5_Simulations/<executable_name>/data/"
302 |                                                   // subdir
303 |       "./6_Advanced/<executable_name>/data/",  // "/6_Advanced/<executable_name>/data/"
304 |                                                // subdir
305 |       "./7_CUDALibraries/<executable_name>/",  // "/7_CUDALibraries/<executable_name>/"
306 |                                                // subdir
307 |       "./7_CUDALibraries/<executable_name>/data/",  // "/7_CUDALibraries/<executable_name>/data/"
308 |                                                     // subdir
309 | 
310 |       "../",              // up 1 in tree
311 |       "../common/",       // up 1 in tree, "/common/" subdir
312 |       "../common/data/",  // up 1 in tree, "/common/data/" subdir
313 |       "../data/",         // up 1 in tree, "/data/" subdir
314 |       "../src/",          // up 1 in tree, "/src/" subdir
315 |       "../inc/",          // up 1 in tree, "/inc/" subdir
316 | 
317 |       "../0_Simple/<executable_name>/data/",  // up 1 in tree,
318 |                                               // "/0_Simple/<executable_name>/"
319 |                                               // subdir
320 |       "../1_Utilities/<executable_name>/data/",  // up 1 in tree,
321 |                                                  // "/1_Utilities/<executable_name>/"
322 |                                                  // subdir
323 |       "../2_Graphics/<executable_name>/data/",  // up 1 in tree,
324 |                                                 // "/2_Graphics/<executable_name>/"
325 |                                                 // subdir
326 |       "../3_Imaging/<executable_name>/data/",  // up 1 in tree,
327 |                                                // "/3_Imaging/<executable_name>/"
328 |                                                // subdir
329 |       "../4_Finance/<executable_name>/data/",  // up 1 in tree,
330 |                                                // "/4_Finance/<executable_name>/"
331 |                                                // subdir
332 |       "../5_Simulations/<executable_name>/data/",  // up 1 in tree,
333 |                                                    // "/5_Simulations/<executable_name>/"
334 |                                                    // subdir
335 |       "../6_Advanced/<executable_name>/data/",  // up 1 in tree,
336 |                                                 // "/6_Advanced/<executable_name>/"
337 |                                                 // subdir
338 |       "../7_CUDALibraries/<executable_name>/data/",  // up 1 in tree,
339 |                                                      // "/7_CUDALibraries/<executable_name>/"
340 |                                                      // subdir
341 |       "../8_Android/<executable_name>/data/",  // up 1 in tree,
342 |                                                // "/8_Android/<executable_name>/"
343 |                                                // subdir
344 |       "../samples/<executable_name>/data/",  // up 1 in tree,
345 |                                              // "/samples/<executable_name>/"
346 |                                              // subdir
347 |       "../../",                              // up 2 in tree
348 |       "../../common/",                       // up 2 in tree, "/common/" subdir
349 |       "../../common/data/",  // up 2 in tree, "/common/data/" subdir
350 |       "../../data/",         // up 2 in tree, "/data/" subdir
351 |       "../../src/",          // up 2 in tree, "/src/" subdir
352 |       "../../inc/",          // up 2 in tree, "/inc/" subdir
353 |       "../../sandbox/<executable_name>/data/",  // up 2 in tree,
354 |                                                 // "/sandbox/<executable_name>/"
355 |                                                 // subdir
356 |       "../../0_Simple/<executable_name>/data/",  // up 2 in tree,
357 |                                                  // "/0_Simple/<executable_name>/"
358 |                                                  // subdir
359 |       "../../1_Utilities/<executable_name>/data/",  // up 2 in tree,
360 |                                                     // "/1_Utilities/<executable_name>/"
361 |                                                     // subdir
362 |       "../../2_Graphics/<executable_name>/data/",  // up 2 in tree,
363 |                                                    // "/2_Graphics/<executable_name>/"
364 |                                                    // subdir
365 |       "../../3_Imaging/<executable_name>/data/",  // up 2 in tree,
366 |                                                   // "/3_Imaging/<executable_name>/"
367 |                                                   // subdir
368 |       "../../4_Finance/<executable_name>/data/",  // up 2 in tree,
369 |                                                   // "/4_Finance/<executable_name>/"
370 |                                                   // subdir
371 |       "../../5_Simulations/<executable_name>/data/",  // up 2 in tree,
372 |                                                       // "/5_Simulations/<executable_name>/"
373 |                                                       // subdir
374 |       "../../6_Advanced/<executable_name>/data/",  // up 2 in tree,
375 |                                                    // "/6_Advanced/<executable_name>/"
376 |                                                    // subdir
377 |       "../../7_CUDALibraries/<executable_name>/data/",  // up 2 in tree,
378 |                                                         // "/7_CUDALibraries/<executable_name>/"
379 |                                                         // subdir
380 |       "../../8_Android/<executable_name>/data/",  // up 2 in tree,
381 |                                                   // "/8_Android/<executable_name>/"
382 |                                                   // subdir
383 |       "../../samples/<executable_name>/data/",  // up 2 in tree,
384 |                                                 // "/samples/<executable_name>/"
385 |                                                 // subdir
386 |       "../../../",                              // up 3 in tree
387 |       "../../../src/<executable_name>/",        // up 3 in tree,
388 |                                           // "/src/<executable_name>/" subdir
389 |       "../../../src/<executable_name>/data/",  // up 3 in tree,
390 |                                                // "/src/<executable_name>/data/"
391 |                                                // subdir
392 |       "../../../src/<executable_name>/src/",   // up 3 in tree,
393 |                                                // "/src/<executable_name>/src/"
394 |                                                // subdir
395 |       "../../../src/<executable_name>/inc/",   // up 3 in tree,
396 |                                                // "/src/<executable_name>/inc/"
397 |                                                // subdir
398 |       "../../../sandbox/<executable_name>/",   // up 3 in tree,
399 |                                                // "/sandbox/<executable_name>/"
400 |                                                // subdir
401 |       "../../../sandbox/<executable_name>/data/",  // up 3 in tree,
402 |                                                    // "/sandbox/<executable_name>/data/"
403 |                                                    // subdir
404 |       "../../../sandbox/<executable_name>/src/",  // up 3 in tree,
405 |                                                   // "/sandbox/<executable_name>/src/"
406 |                                                   // subdir
407 |       "../../../sandbox/<executable_name>/inc/",  // up 3 in tree,
408 |                                                   // "/sandbox/<executable_name>/inc/"
409 |                                                   // subdir
410 |       "../../../0_Simple/<executable_name>/data/",  // up 3 in tree,
411 |                                                     // "/0_Simple/<executable_name>/"
412 |                                                     // subdir
413 |       "../../../1_Utilities/<executable_name>/data/",  // up 3 in tree,
414 |                                                        // "/1_Utilities/<executable_name>/"
415 |                                                        // subdir
416 |       "../../../2_Graphics/<executable_name>/data/",  // up 3 in tree,
417 |                                                       // "/2_Graphics/<executable_name>/"
418 |                                                       // subdir
419 |       "../../../3_Imaging/<executable_name>/data/",  // up 3 in tree,
420 |                                                      // "/3_Imaging/<executable_name>/"
421 |                                                      // subdir
422 |       "../../../4_Finance/<executable_name>/data/",  // up 3 in tree,
423 |                                                      // "/4_Finance/<executable_name>/"
424 |                                                      // subdir
425 |       "../../../5_Simulations/<executable_name>/data/",  // up 3 in tree,
426 |                                                          // "/5_Simulations/<executable_name>/"
427 |                                                          // subdir
428 |       "../../../6_Advanced/<executable_name>/data/",  // up 3 in tree,
429 |                                                       // "/6_Advanced/<executable_name>/"
430 |                                                       // subdir
431 |       "../../../7_CUDALibraries/<executable_name>/data/",  // up 3 in tree,
432 |                                                            // "/7_CUDALibraries/<executable_name>/"
433 |                                                            // subdir
434 |       "../../../8_Android/<executable_name>/data/",  // up 3 in tree,
435 |                                                      // "/8_Android/<executable_name>/"
436 |                                                      // subdir
437 |       "../../../0_Simple/<executable_name>/",  // up 3 in tree,
438 |                                                // "/0_Simple/<executable_name>/"
439 |                                                // subdir
440 |       "../../../1_Utilities/<executable_name>/",  // up 3 in tree,
441 |                                                   // "/1_Utilities/<executable_name>/"
442 |                                                   // subdir
443 |       "../../../2_Graphics/<executable_name>/",  // up 3 in tree,
444 |                                                  // "/2_Graphics/<executable_name>/"
445 |                                                  // subdir
446 |       "../../../3_Imaging/<executable_name>/",  // up 3 in tree,
447 |                                                 // "/3_Imaging/<executable_name>/"
448 |                                                 // subdir
449 |       "../../../4_Finance/<executable_name>/",  // up 3 in tree,
450 |                                                 // "/4_Finance/<executable_name>/"
451 |                                                 // subdir
452 |       "../../../5_Simulations/<executable_name>/",  // up 3 in tree,
453 |                                                     // "/5_Simulations/<executable_name>/"
454 |                                                     // subdir
455 |       "../../../6_Advanced/<executable_name>/",  // up 3 in tree,
456 |                                                  // "/6_Advanced/<executable_name>/"
457 |                                                  // subdir
458 |       "../../../7_CUDALibraries/<executable_name>/",  // up 3 in tree,
459 |                                                       // "/7_CUDALibraries/<executable_name>/"
460 |                                                       // subdir
461 |       "../../../8_Android/<executable_name>/",  // up 3 in tree,
462 |                                                 // "/8_Android/<executable_name>/"
463 |                                                 // subdir
464 |       "../../../samples/<executable_name>/data/",  // up 3 in tree,
465 |                                                    // "/samples/<executable_name>/"
466 |                                                    // subdir
467 |       "../../../common/",       // up 3 in tree, "../../../common/" subdir
468 |       "../../../common/data/",  // up 3 in tree, "../../../common/data/" subdir
469 |       "../../../data/",         // up 3 in tree, "../../../data/" subdir
470 |       "../../../../",           // up 4 in tree
471 |       "../../../../src/<executable_name>/",  // up 4 in tree,
472 |                                              // "/src/<executable_name>/" subdir
473 |       "../../../../src/<executable_name>/data/",  // up 4 in tree,
474 |                                                   // "/src/<executable_name>/data/"
475 |                                                   // subdir
476 |       "../../../../src/<executable_name>/src/",  // up 4 in tree,
477 |                                                  // "/src/<executable_name>/src/"
478 |                                                  // subdir
479 |       "../../../../src/<executable_name>/inc/",  // up 4 in tree,
480 |                                                  // "/src/<executable_name>/inc/"
481 |                                                  // subdir
482 |       "../../../../sandbox/<executable_name>/",  // up 4 in tree,
483 |                                                  // "/sandbox/<executable_name>/"
484 |                                                  // subdir
485 |       "../../../../sandbox/<executable_name>/data/",  // up 4 in tree,
486 |                                                       // "/sandbox/<executable_name>/data/"
487 |                                                       // subdir
488 |       "../../../../sandbox/<executable_name>/src/",  // up 4 in tree,
489 |                                                      // "/sandbox/<executable_name>/src/"
490 |                                                      // subdir
491 |       "../../../../sandbox/<executable_name>/inc/",  // up 4 in tree,
492 |                                                      // "/sandbox/<executable_name>/inc/"
493 |                                                      // subdir
494 |       "../../../../0_Simple/<executable_name>/data/",  // up 4 in tree,
495 |                                                        // "/0_Simple/<executable_name>/"
496 |                                                        // subdir
497 |       "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree,
498 |                                                           // "/1_Utilities/<executable_name>/"
499 |                                                           // subdir
500 |       "../../../../2_Graphics/<executable_name>/data/",  // up 4 in tree,
501 |                                                          // "/2_Graphics/<executable_name>/"
502 |                                                          // subdir
503 |       "../../../../3_Imaging/<executable_name>/data/",  // up 4 in tree,
504 |                                                         // "/3_Imaging/<executable_name>/"
505 |                                                         // subdir
506 |       "../../../../4_Finance/<executable_name>/data/",  // up 4 in tree,
507 |                                                         // "/4_Finance/<executable_name>/"
508 |                                                         // subdir
509 |       "../../../../5_Simulations/<executable_name>/data/",  // up 4 in tree,
510 |                                                             // "/5_Simulations/<executable_name>/"
511 |                                                             // subdir
512 |       "../../../../6_Advanced/<executable_name>/data/",  // up 4 in tree,
513 |                                                          // "/6_Advanced/<executable_name>/"
514 |                                                          // subdir
515 |       "../../../../7_CUDALibraries/<executable_name>/data/",  // up 4 in tree,
516 |                                                               // "/7_CUDALibraries/<executable_name>/"
517 |                                                               // subdir
518 |       "../../../../8_Android/<executable_name>/data/",  // up 4 in tree,
519 |                                                         // "/8_Android/<executable_name>/"
520 |                                                         // subdir
521 |       "../../../../0_Simple/<executable_name>/",  // up 4 in tree,
522 |                                                   // "/0_Simple/<executable_name>/"
523 |                                                   // subdir
524 |       "../../../../1_Utilities/<executable_name>/",  // up 4 in tree,
525 |                                                      // "/1_Utilities/<executable_name>/"
526 |                                                      // subdir
527 |       "../../../../2_Graphics/<executable_name>/",  // up 4 in tree,
528 |                                                     // "/2_Graphics/<executable_name>/"
529 |                                                     // subdir
530 |       "../../../../3_Imaging/<executable_name>/",  // up 4 in tree,
531 |                                                    // "/3_Imaging/<executable_name>/"
532 |                                                    // subdir
533 |       "../../../../4_Finance/<executable_name>/",  // up 4 in tree,
534 |                                                    // "/4_Finance/<executable_name>/"
535 |                                                    // subdir
536 |       "../../../../5_Simulations/<executable_name>/",  // up 4 in tree,
537 |                                                        // "/5_Simulations/<executable_name>/"
538 |                                                        // subdir
539 |       "../../../../6_Advanced/<executable_name>/",  // up 4 in tree,
540 |                                                     // "/6_Advanced/<executable_name>/"
541 |                                                     // subdir
542 |       "../../../../7_CUDALibraries/<executable_name>/",  // up 4 in tree,
543 |                                                          // "/7_CUDALibraries/<executable_name>/"
544 |                                                          // subdir
545 |       "../../../../8_Android/<executable_name>/",  // up 4 in tree,
546 |                                                    // "/8_Android/<executable_name>/"
547 |                                                    // subdir
548 |       "../../../../samples/<executable_name>/data/",  // up 4 in tree,
549 |                                                       // "/samples/<executable_name>/"
550 |                                                       // subdir
551 |       "../../../../common/",       // up 4 in tree, "../../../common/" subdir
552 |       "../../../../common/data/",  // up 4 in tree, "../../../common/data/"
553 |                                    // subdir
554 |       "../../../../data/",         // up 4 in tree, "../../../data/" subdir
555 |       "../../../../../",           // up 5 in tree
556 |       "../../../../../src/<executable_name>/",  // up 5 in tree,
557 |                                                 // "/src/<executable_name>/"
558 |                                                 // subdir
559 |       "../../../../../src/<executable_name>/data/",  // up 5 in tree,
560 |                                                      // "/src/<executable_name>/data/"
561 |                                                      // subdir
562 |       "../../../../../src/<executable_name>/src/",  // up 5 in tree,
563 |                                                     // "/src/<executable_name>/src/"
564 |                                                     // subdir
565 |       "../../../../../src/<executable_name>/inc/",  // up 5 in tree,
566 |                                                     // "/src/<executable_name>/inc/"
567 |                                                     // subdir
568 |       "../../../../../sandbox/<executable_name>/",  // up 5 in tree,
569 |                                                     // "/sandbox/<executable_name>/"
570 |                                                     // subdir
571 |       "../../../../../sandbox/<executable_name>/data/",  // up 5 in tree,
572 |                                                          // "/sandbox/<executable_name>/data/"
573 |                                                          // subdir
574 |       "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree,
575 |                                                         // "/sandbox/<executable_name>/src/"
576 |                                                         // subdir
577 |       "../../../../../sandbox/<executable_name>/inc/",  // up 5 in tree,
578 |                                                         // "/sandbox/<executable_name>/inc/"
579 |                                                         // subdir
580 |       "../../../../../0_Simple/<executable_name>/data/",  // up 5 in tree,
581 |                                                           // "/0_Simple/<executable_name>/"
582 |                                                           // subdir
583 |       "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree,
584 |                                                              // "/1_Utilities/<executable_name>/"
585 |                                                              // subdir
586 |       "../../../../../2_Graphics/<executable_name>/data/",  // up 5 in tree,
587 |                                                             // "/2_Graphics/<executable_name>/"
588 |                                                             // subdir
589 |       "../../../../../3_Imaging/<executable_name>/data/",  // up 5 in tree,
590 |                                                            // "/3_Imaging/<executable_name>/"
591 |                                                            // subdir
592 |       "../../../../../4_Finance/<executable_name>/data/",  // up 5 in tree,
593 |                                                            // "/4_Finance/<executable_name>/"
594 |                                                            // subdir
595 |       "../../../../../5_Simulations/<executable_name>/data/",  // up 5 in tree,
596 |                                                                // "/5_Simulations/<executable_name>/"
597 |                                                                // subdir
598 |       "../../../../../6_Advanced/<executable_name>/data/",  // up 5 in tree,
599 |                                                             // "/6_Advanced/<executable_name>/"
600 |                                                             // subdir
601 |       "../../../../../7_CUDALibraries/<executable_name>/data/",  // up 5 in
602 |                                                                  // tree,
603 |                                                                  // "/7_CUDALibraries/<executable_name>/"
604 |                                                                  // subdir
605 |       "../../../../../8_Android/<executable_name>/data/",  // up 5 in tree,
606 |                                                            // "/8_Android/<executable_name>/"
607 |                                                            // subdir
608 |       "../../../../../samples/<executable_name>/data/",  // up 5 in tree,
609 |                                                          // "/samples/<executable_name>/"
610 |                                                          // subdir
611 |       "../../../../../common/",       // up 5 in tree, "../../../common/" subdir
612 |       "../../../../../common/data/",  // up 5 in tree, "../../../common/data/"
613 |                                       // subdir
614 |   };
615 | 
616 |   // Extract the executable name
617 |   std::string executable_name;
618 | 
619 |   if (executable_path != 0) {
620 |     executable_name = std::string(executable_path);
621 | 
622 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
623 |     // Windows path delimiter
624 |     size_t delimiter_pos = executable_name.find_last_of('\\');
625 |     executable_name.erase(0, delimiter_pos + 1);
626 | 
627 |     if (executable_name.rfind(".exe") != std::string::npos) {
628 |       // we strip .exe, only if the .exe is found
629 |       executable_name.resize(executable_name.size() - 4);
630 |     }
631 | 
632 | #else
633 |     // Linux & OSX path delimiter
634 |     size_t delimiter_pos = executable_name.find_last_of('/');
635 |     executable_name.erase(0, delimiter_pos + 1);
636 | #endif
637 |   }
638 | 
639 |   // Loop over all search paths and return the first hit
640 |   for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
641 |     std::string path(searchPath[i]);
642 |     size_t executable_name_pos = path.find("<executable_name>");
643 | 
644 |     // If there is executable_name variable in the searchPath
645 |     // replace it with the value
646 |     if (executable_name_pos != std::string::npos) {
647 |       if (executable_path != 0) {
648 |         path.replace(executable_name_pos, strlen("<executable_name>"),
649 |                      executable_name);
650 |       } else {
651 |         // Skip this path entry if no executable argument is given
652 |         continue;
653 |       }
654 |     }
655 | 
656 | #ifdef _DEBUG
657 |     printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
658 | #endif
659 | 
660 |     // Test if the file exists
661 |     path.append(filename);
662 |     FILE *fp;
663 |     FOPEN(fp, path.c_str(), "rb");
664 | 
665 |     if (fp != NULL) {
666 |       fclose(fp);
667 |       // File found
668 |       // returning an allocated array here for backwards compatibility reasons
669 |       char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
670 |       STRCPY(file_path, path.length() + 1, path.c_str());
671 |       return file_path;
672 |     }
673 | 
674 |     if (fp) {
675 |       fclose(fp);
676 |     }
677 |   }
678 | 
679 |   // File not found
680 |   return 0;
681 | }
682 | 
683 | #endif  // COMMON_HELPER_STRING_H_
684 | 


--------------------------------------------------------------------------------
/simpleCUBLASHgemm/helper_string.h:
--------------------------------------------------------------------------------
  1 | /**
  2 |  * Copyright 1993-2013 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * Please refer to the NVIDIA end user license agreement (EULA) associated
  5 |  * with this source code for terms and conditions that govern your use of
  6 |  * this software. Any use, reproduction, disclosure, or distribution of
  7 |  * this software and related documentation outside the terms of the EULA
  8 |  * is strictly prohibited.
  9 |  *
 10 |  */
 11 | 
 12 | // These are helper functions for the SDK samples (string parsing, timers, etc)
 13 | #ifndef COMMON_HELPER_STRING_H_
 14 | #define COMMON_HELPER_STRING_H_
 15 | 
 16 | #include <stdio.h>
 17 | #include <stdlib.h>
 18 | #include <fstream>
 19 | #include <string>
 20 | 
// Portability layer: each macro below is only defined when the including
// translation unit has not already provided its own definition.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// NOTE(review): this is defined after <stdio.h>/<stdlib.h> were included
// above, so it may come too late to influence those headers - confirm.
#ifndef _CRT_SECURE_NO_DEPRECATE
#define _CRT_SECURE_NO_DEPRECATE
#endif
// Case-insensitive string comparison (whole string / first n characters).
#ifndef STRCASECMP
#define STRCASECMP _stricmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP _strnicmp
#endif
// String copy; on this branch nLength is forwarded to strcpy_s.
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy_s(sFilePath, nLength, sPath)
#endif

// Opens a file into fHandle; the macro's value is the result of fopen_s.
#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) fopen_s(&fHandle, filename, mode)
#endif
// Tests the value produced by FOPEN: a non-zero result means failure here.
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result != 0)
#endif
#ifndef SSCANF
#define SSCANF sscanf_s
#endif
#ifndef SPRINTF
#define SPRINTF sprintf_s
#endif
#else  // Linux Includes
#include <string.h>
#include <strings.h>

// Case-insensitive string comparison (whole string / first n characters).
#ifndef STRCASECMP
#define STRCASECMP strcasecmp
#endif
#ifndef STRNCASECMP
#define STRNCASECMP strncasecmp
#endif
// String copy; on this branch nLength is accepted but ignored (plain strcpy).
#ifndef STRCPY
#define STRCPY(sFilePath, nLength, sPath) strcpy(sFilePath, sPath)
#endif

// Opens a file into fHandle; the macro's value is the assigned FILE pointer.
#ifndef FOPEN
#define FOPEN(fHandle, filename, mode) (fHandle = fopen(filename, mode))
#endif
// Tests the value produced by FOPEN: a NULL result means failure here.
#ifndef FOPEN_FAIL
#define FOPEN_FAIL(result) (result == NULL)
#endif
#ifndef SSCANF
#define SSCANF sscanf
#endif
#ifndef SPRINTF
#define SPRINTF sprintf
#endif
#endif

// Numeric code 2 under the name EXIT_WAIVED.
// NOTE(review): presumably the process exit status used when a sample is
// skipped rather than failed - confirm against callers.
#ifndef EXIT_WAIVED
#define EXIT_WAIVED 2
#endif
 78 | 
 79 | // CUDA Utility Helper Functions
 80 | inline int stringRemoveDelimiter(char delimiter, const char *string) {
 81 |   int string_start = 0;
 82 | 
 83 |   while (string[string_start] == delimiter) {
 84 |     string_start++;
 85 |   }
 86 | 
 87 |   if (string_start >= static_cast<int>(strlen(string) - 1)) {
 88 |     return 0;
 89 |   }
 90 | 
 91 |   return string_start;
 92 | }
 93 | 
 94 | inline int getFileExtension(char *filename, char **extension) {
 95 |   int string_length = static_cast<int>(strlen(filename));
 96 | 
 97 |   while (filename[string_length--] != '.') {
 98 |     if (string_length == 0) break;
 99 |   }
100 | 
101 |   if (string_length > 0) string_length += 2;
102 | 
103 |   if (string_length == 0)
104 |     *extension = NULL;
105 |   else
106 |     *extension = &filename[string_length];
107 | 
108 |   return string_length;
109 | }
110 | 
111 | inline bool checkCmdLineFlag(const int argc, const char **argv,
112 |                              const char *string_ref) {
113 |   bool bFound = false;
114 | 
115 |   if (argc >= 1) {
116 |     for (int i = 1; i < argc; i++) {
117 |       int string_start = stringRemoveDelimiter('-', argv[i]);
118 |       const char *string_argv = &argv[i][string_start];
119 | 
120 |       const char *equal_pos = strchr(string_argv, '=');
121 |       int argv_length = static_cast<int>(
122 |           equal_pos == 0 ? strlen(string_argv) : equal_pos - string_argv);
123 | 
124 |       int length = static_cast<int>(strlen(string_ref));
125 | 
126 |       if (length == argv_length &&
127 |           !STRNCASECMP(string_argv, string_ref, length)) {
128 |         bFound = true;
129 |         continue;
130 |       }
131 |     }
132 |   }
133 | 
134 |   return bFound;
135 | }
136 | 
137 | // This function wraps the CUDA Driver API into a template function
138 | template <class T>
139 | inline bool getCmdLineArgumentValue(const int argc, const char **argv,
140 |                                     const char *string_ref, T *value) {
141 |   bool bFound = false;
142 | 
143 |   if (argc >= 1) {
144 |     for (int i = 1; i < argc; i++) {
145 |       int string_start = stringRemoveDelimiter('-', argv[i]);
146 |       const char *string_argv = &argv[i][string_start];
147 |       int length = static_cast<int>(strlen(string_ref));
148 | 
149 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
150 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
151 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
152 |           *value = (T)atoi(&string_argv[length + auto_inc]);
153 |         }
154 | 
155 |         bFound = true;
156 |         i = argc;
157 |       }
158 |     }
159 |   }
160 | 
161 |   return bFound;
162 | }
163 | 
164 | inline int getCmdLineArgumentInt(const int argc, const char **argv,
165 |                                  const char *string_ref) {
166 |   bool bFound = false;
167 |   int value = -1;
168 | 
169 |   if (argc >= 1) {
170 |     for (int i = 1; i < argc; i++) {
171 |       int string_start = stringRemoveDelimiter('-', argv[i]);
172 |       const char *string_argv = &argv[i][string_start];
173 |       int length = static_cast<int>(strlen(string_ref));
174 | 
175 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
176 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
177 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
178 |           value = atoi(&string_argv[length + auto_inc]);
179 |         } else {
180 |           value = 0;
181 |         }
182 | 
183 |         bFound = true;
184 |         continue;
185 |       }
186 |     }
187 |   }
188 | 
189 |   if (bFound) {
190 |     return value;
191 |   } else {
192 |     return 0;
193 |   }
194 | }
195 | 
196 | inline float getCmdLineArgumentFloat(const int argc, const char **argv,
197 |                                      const char *string_ref) {
198 |   bool bFound = false;
199 |   float value = -1;
200 | 
201 |   if (argc >= 1) {
202 |     for (int i = 1; i < argc; i++) {
203 |       int string_start = stringRemoveDelimiter('-', argv[i]);
204 |       const char *string_argv = &argv[i][string_start];
205 |       int length = static_cast<int>(strlen(string_ref));
206 | 
207 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
208 |         if (length + 1 <= static_cast<int>(strlen(string_argv))) {
209 |           int auto_inc = (string_argv[length] == '=') ? 1 : 0;
210 |           value = static_cast<float>(atof(&string_argv[length + auto_inc]));
211 |         } else {
212 |           value = 0.f;
213 |         }
214 | 
215 |         bFound = true;
216 |         continue;
217 |       }
218 |     }
219 |   }
220 | 
221 |   if (bFound) {
222 |     return value;
223 |   } else {
224 |     return 0;
225 |   }
226 | }
227 | 
228 | inline bool getCmdLineArgumentString(const int argc, const char **argv,
229 |                                      const char *string_ref,
230 |                                      char **string_retval) {
231 |   bool bFound = false;
232 | 
233 |   if (argc >= 1) {
234 |     for (int i = 1; i < argc; i++) {
235 |       int string_start = stringRemoveDelimiter('-', argv[i]);
236 |       char *string_argv = const_cast<char*>(&argv[i][string_start]);
237 |       int length = static_cast<int>(strlen(string_ref));
238 | 
239 |       if (!STRNCASECMP(string_argv, string_ref, length)) {
240 |         *string_retval = &string_argv[length + 1];
241 |         bFound = true;
242 |         continue;
243 |       }
244 |     }
245 |   }
246 | 
247 |   if (!bFound) {
248 |     *string_retval = NULL;
249 |   }
250 | 
251 |   return bFound;
252 | }
253 | 
254 | //////////////////////////////////////////////////////////////////////////////
255 | //! Find the path for a file assuming that
256 | //! files are found in the searchPath.
257 | //!
258 | //! @return the path if succeeded, otherwise 0
259 | //! @param filename         name of the file
260 | //! @param executable_path  optional absolute path of the executable
261 | //////////////////////////////////////////////////////////////////////////////
262 | inline char *sdkFindFilePath(const char *filename,
263 |                              const char *executable_path) {
264 |   // <executable_name> defines a variable that is replaced with the name of the
265 |   // executable
266 | 
267 |   // Typical relative search paths to locate needed companion files (e.g. sample
268 |   // input data, or JIT source files) The origin for the relative search may be
269 |   // the .exe file, a .bat file launching an .exe, a browser .exe launching the
270 |   // .exe or .bat, etc
271 |   const char *searchPath[] = {
272 |       "./",  // same dir
273 |       "./<executable_name>_data_files/",
274 |       "./common/",                      // "/common/" subdir
275 |       "./common/data/",                 // "/common/data/" subdir
276 |       "./data/",                        // "/data/" subdir
277 |       "./src/",                         // "/src/" subdir
278 |       "./src/<executable_name>/data/",  // "/src/<executable_name>/data/" subdir
279 |       "./inc/",                         // "/inc/" subdir
280 |       "./0_Simple/",                    // "/0_Simple/" subdir
281 |       "./1_Utilities/",                 // "/1_Utilities/" subdir
282 |       "./2_Graphics/",                  // "/2_Graphics/" subdir
283 |       "./3_Imaging/",                   // "/3_Imaging/" subdir
284 |       "./4_Finance/",                   // "/4_Finance/" subdir
285 |       "./5_Simulations/",               // "/5_Simulations/" subdir
286 |       "./6_Advanced/",                  // "/6_Advanced/" subdir
287 |       "./7_CUDALibraries/",             // "/7_CUDALibraries/" subdir
288 |       "./8_Android/",                   // "/8_Android/" subdir
289 |       "./samples/",                     // "/samples/" subdir
290 | 
291 |       "./0_Simple/<executable_name>/data/",  // "/0_Simple/<executable_name>/data/"
292 |                                              // subdir
293 |       "./1_Utilities/<executable_name>/data/",  // "/1_Utilities/<executable_name>/data/"
294 |                                                 // subdir
295 |       "./2_Graphics/<executable_name>/data/",  // "/2_Graphics/<executable_name>/data/"
296 |                                                // subdir
297 |       "./3_Imaging/<executable_name>/data/",  // "/3_Imaging/<executable_name>/data/"
298 |                                               // subdir
299 |       "./4_Finance/<executable_name>/data/",  // "/4_Finance/<executable_name>/data/"
300 |                                               // subdir
301 |       "./5_Simulations/<executable_name>/data/",  // "/5_Simulations/<executable_name>/data/"
302 |                                                   // subdir
303 |       "./6_Advanced/<executable_name>/data/",  // "/6_Advanced/<executable_name>/data/"
304 |                                                // subdir
305 |       "./7_CUDALibraries/<executable_name>/",  // "/7_CUDALibraries/<executable_name>/"
306 |                                                // subdir
307 |       "./7_CUDALibraries/<executable_name>/data/",  // "/7_CUDALibraries/<executable_name>/data/"
308 |                                                     // subdir
309 | 
310 |       "../",              // up 1 in tree
311 |       "../common/",       // up 1 in tree, "/common/" subdir
312 |       "../common/data/",  // up 1 in tree, "/common/data/" subdir
313 |       "../data/",         // up 1 in tree, "/data/" subdir
314 |       "../src/",          // up 1 in tree, "/src/" subdir
315 |       "../inc/",          // up 1 in tree, "/inc/" subdir
316 | 
317 |       "../0_Simple/<executable_name>/data/",  // up 1 in tree,
318 |                                               // "/0_Simple/<executable_name>/"
319 |                                               // subdir
320 |       "../1_Utilities/<executable_name>/data/",  // up 1 in tree,
321 |                                                  // "/1_Utilities/<executable_name>/"
322 |                                                  // subdir
323 |       "../2_Graphics/<executable_name>/data/",  // up 1 in tree,
324 |                                                 // "/2_Graphics/<executable_name>/"
325 |                                                 // subdir
326 |       "../3_Imaging/<executable_name>/data/",  // up 1 in tree,
327 |                                                // "/3_Imaging/<executable_name>/"
328 |                                                // subdir
329 |       "../4_Finance/<executable_name>/data/",  // up 1 in tree,
330 |                                                // "/4_Finance/<executable_name>/"
331 |                                                // subdir
332 |       "../5_Simulations/<executable_name>/data/",  // up 1 in tree,
333 |                                                    // "/5_Simulations/<executable_name>/"
334 |                                                    // subdir
335 |       "../6_Advanced/<executable_name>/data/",  // up 1 in tree,
336 |                                                 // "/6_Advanced/<executable_name>/"
337 |                                                 // subdir
338 |       "../7_CUDALibraries/<executable_name>/data/",  // up 1 in tree,
339 |                                                      // "/7_CUDALibraries/<executable_name>/"
340 |                                                      // subdir
341 |       "../8_Android/<executable_name>/data/",  // up 1 in tree,
342 |                                                // "/8_Android/<executable_name>/"
343 |                                                // subdir
344 |       "../samples/<executable_name>/data/",  // up 1 in tree,
345 |                                              // "/samples/<executable_name>/"
346 |                                              // subdir
347 |       "../../",                              // up 2 in tree
348 |       "../../common/",                       // up 2 in tree, "/common/" subdir
349 |       "../../common/data/",  // up 2 in tree, "/common/data/" subdir
350 |       "../../data/",         // up 2 in tree, "/data/" subdir
351 |       "../../src/",          // up 2 in tree, "/src/" subdir
352 |       "../../inc/",          // up 2 in tree, "/inc/" subdir
353 |       "../../sandbox/<executable_name>/data/",  // up 2 in tree,
354 |                                                 // "/sandbox/<executable_name>/"
355 |                                                 // subdir
356 |       "../../0_Simple/<executable_name>/data/",  // up 2 in tree,
357 |                                                  // "/0_Simple/<executable_name>/"
358 |                                                  // subdir
359 |       "../../1_Utilities/<executable_name>/data/",  // up 2 in tree,
360 |                                                     // "/1_Utilities/<executable_name>/"
361 |                                                     // subdir
362 |       "../../2_Graphics/<executable_name>/data/",  // up 2 in tree,
363 |                                                    // "/2_Graphics/<executable_name>/"
364 |                                                    // subdir
365 |       "../../3_Imaging/<executable_name>/data/",  // up 2 in tree,
366 |                                                   // "/3_Imaging/<executable_name>/"
367 |                                                   // subdir
368 |       "../../4_Finance/<executable_name>/data/",  // up 2 in tree,
369 |                                                   // "/4_Finance/<executable_name>/"
370 |                                                   // subdir
371 |       "../../5_Simulations/<executable_name>/data/",  // up 2 in tree,
372 |                                                       // "/5_Simulations/<executable_name>/"
373 |                                                       // subdir
374 |       "../../6_Advanced/<executable_name>/data/",  // up 2 in tree,
375 |                                                    // "/6_Advanced/<executable_name>/"
376 |                                                    // subdir
377 |       "../../7_CUDALibraries/<executable_name>/data/",  // up 2 in tree,
378 |                                                         // "/7_CUDALibraries/<executable_name>/"
379 |                                                         // subdir
380 |       "../../8_Android/<executable_name>/data/",  // up 2 in tree,
381 |                                                   // "/8_Android/<executable_name>/"
382 |                                                   // subdir
383 |       "../../samples/<executable_name>/data/",  // up 2 in tree,
384 |                                                 // "/samples/<executable_name>/"
385 |                                                 // subdir
386 |       "../../../",                              // up 3 in tree
387 |       "../../../src/<executable_name>/",        // up 3 in tree,
388 |                                           // "/src/<executable_name>/" subdir
389 |       "../../../src/<executable_name>/data/",  // up 3 in tree,
390 |                                                // "/src/<executable_name>/data/"
391 |                                                // subdir
392 |       "../../../src/<executable_name>/src/",   // up 3 in tree,
393 |                                                // "/src/<executable_name>/src/"
394 |                                                // subdir
395 |       "../../../src/<executable_name>/inc/",   // up 3 in tree,
396 |                                                // "/src/<executable_name>/inc/"
397 |                                                // subdir
398 |       "../../../sandbox/<executable_name>/",   // up 3 in tree,
399 |                                                // "/sandbox/<executable_name>/"
400 |                                                // subdir
401 |       "../../../sandbox/<executable_name>/data/",  // up 3 in tree,
402 |                                                    // "/sandbox/<executable_name>/data/"
403 |                                                    // subdir
404 |       "../../../sandbox/<executable_name>/src/",  // up 3 in tree,
405 |                                                   // "/sandbox/<executable_name>/src/"
406 |                                                   // subdir
407 |       "../../../sandbox/<executable_name>/inc/",  // up 3 in tree,
408 |                                                   // "/sandbox/<executable_name>/inc/"
409 |                                                   // subdir
410 |       "../../../0_Simple/<executable_name>/data/",  // up 3 in tree,
411 |                                                     // "/0_Simple/<executable_name>/"
412 |                                                     // subdir
413 |       "../../../1_Utilities/<executable_name>/data/",  // up 3 in tree,
414 |                                                        // "/1_Utilities/<executable_name>/"
415 |                                                        // subdir
416 |       "../../../2_Graphics/<executable_name>/data/",  // up 3 in tree,
417 |                                                       // "/2_Graphics/<executable_name>/"
418 |                                                       // subdir
419 |       "../../../3_Imaging/<executable_name>/data/",  // up 3 in tree,
420 |                                                      // "/3_Imaging/<executable_name>/"
421 |                                                      // subdir
422 |       "../../../4_Finance/<executable_name>/data/",  // up 3 in tree,
423 |                                                      // "/4_Finance/<executable_name>/"
424 |                                                      // subdir
425 |       "../../../5_Simulations/<executable_name>/data/",  // up 3 in tree,
426 |                                                          // "/5_Simulations/<executable_name>/"
427 |                                                          // subdir
428 |       "../../../6_Advanced/<executable_name>/data/",  // up 3 in tree,
429 |                                                       // "/6_Advanced/<executable_name>/"
430 |                                                       // subdir
431 |       "../../../7_CUDALibraries/<executable_name>/data/",  // up 3 in tree,
432 |                                                            // "/7_CUDALibraries/<executable_name>/"
433 |                                                            // subdir
434 |       "../../../8_Android/<executable_name>/data/",  // up 3 in tree,
435 |                                                      // "/8_Android/<executable_name>/"
436 |                                                      // subdir
437 |       "../../../0_Simple/<executable_name>/",  // up 3 in tree,
438 |                                                // "/0_Simple/<executable_name>/"
439 |                                                // subdir
440 |       "../../../1_Utilities/<executable_name>/",  // up 3 in tree,
441 |                                                   // "/1_Utilities/<executable_name>/"
442 |                                                   // subdir
443 |       "../../../2_Graphics/<executable_name>/",  // up 3 in tree,
444 |                                                  // "/2_Graphics/<executable_name>/"
445 |                                                  // subdir
446 |       "../../../3_Imaging/<executable_name>/",  // up 3 in tree,
447 |                                                 // "/3_Imaging/<executable_name>/"
448 |                                                 // subdir
449 |       "../../../4_Finance/<executable_name>/",  // up 3 in tree,
450 |                                                 // "/4_Finance/<executable_name>/"
451 |                                                 // subdir
452 |       "../../../5_Simulations/<executable_name>/",  // up 3 in tree,
453 |                                                     // "/5_Simulations/<executable_name>/"
454 |                                                     // subdir
455 |       "../../../6_Advanced/<executable_name>/",  // up 3 in tree,
456 |                                                  // "/6_Advanced/<executable_name>/"
457 |                                                  // subdir
458 |       "../../../7_CUDALibraries/<executable_name>/",  // up 3 in tree,
459 |                                                       // "/7_CUDALibraries/<executable_name>/"
460 |                                                       // subdir
461 |       "../../../8_Android/<executable_name>/",  // up 3 in tree,
462 |                                                 // "/8_Android/<executable_name>/"
463 |                                                 // subdir
464 |       "../../../samples/<executable_name>/data/",  // up 3 in tree,
465 |                                                    // "/samples/<executable_name>/"
466 |                                                    // subdir
467 |       "../../../common/",       // up 3 in tree, "../../../common/" subdir
468 |       "../../../common/data/",  // up 3 in tree, "../../../common/data/" subdir
469 |       "../../../data/",         // up 3 in tree, "../../../data/" subdir
470 |       "../../../../",           // up 4 in tree
471 |       "../../../../src/<executable_name>/",  // up 4 in tree,
472 |                                              // "/src/<executable_name>/" subdir
473 |       "../../../../src/<executable_name>/data/",  // up 4 in tree,
474 |                                                   // "/src/<executable_name>/data/"
475 |                                                   // subdir
476 |       "../../../../src/<executable_name>/src/",  // up 4 in tree,
477 |                                                  // "/src/<executable_name>/src/"
478 |                                                  // subdir
479 |       "../../../../src/<executable_name>/inc/",  // up 4 in tree,
480 |                                                  // "/src/<executable_name>/inc/"
481 |                                                  // subdir
482 |       "../../../../sandbox/<executable_name>/",  // up 4 in tree,
483 |                                                  // "/sandbox/<executable_name>/"
484 |                                                  // subdir
485 |       "../../../../sandbox/<executable_name>/data/",  // up 4 in tree,
486 |                                                       // "/sandbox/<executable_name>/data/"
487 |                                                       // subdir
488 |       "../../../../sandbox/<executable_name>/src/",  // up 4 in tree,
489 |                                                      // "/sandbox/<executable_name>/src/"
490 |                                                      // subdir
491 |       "../../../../sandbox/<executable_name>/inc/",  // up 4 in tree,
492 |                                                      // "/sandbox/<executable_name>/inc/"
493 |                                                      // subdir
494 |       "../../../../0_Simple/<executable_name>/data/",  // up 4 in tree,
495 |                                                        // "/0_Simple/<executable_name>/"
496 |                                                        // subdir
497 |       "../../../../1_Utilities/<executable_name>/data/",  // up 4 in tree,
498 |                                                           // "/1_Utilities/<executable_name>/"
499 |                                                           // subdir
500 |       "../../../../2_Graphics/<executable_name>/data/",  // up 4 in tree,
501 |                                                          // "/2_Graphics/<executable_name>/"
502 |                                                          // subdir
503 |       "../../../../3_Imaging/<executable_name>/data/",  // up 4 in tree,
504 |                                                         // "/3_Imaging/<executable_name>/"
505 |                                                         // subdir
506 |       "../../../../4_Finance/<executable_name>/data/",  // up 4 in tree,
507 |                                                         // "/4_Finance/<executable_name>/"
508 |                                                         // subdir
509 |       "../../../../5_Simulations/<executable_name>/data/",  // up 4 in tree,
510 |                                                             // "/5_Simulations/<executable_name>/"
511 |                                                             // subdir
512 |       "../../../../6_Advanced/<executable_name>/data/",  // up 4 in tree,
513 |                                                          // "/6_Advanced/<executable_name>/"
514 |                                                          // subdir
515 |       "../../../../7_CUDALibraries/<executable_name>/data/",  // up 4 in tree,
516 |                                                               // "/7_CUDALibraries/<executable_name>/"
517 |                                                               // subdir
518 |       "../../../../8_Android/<executable_name>/data/",  // up 4 in tree,
519 |                                                         // "/8_Android/<executable_name>/"
520 |                                                         // subdir
521 |       "../../../../0_Simple/<executable_name>/",  // up 4 in tree,
522 |                                                   // "/0_Simple/<executable_name>/"
523 |                                                   // subdir
524 |       "../../../../1_Utilities/<executable_name>/",  // up 4 in tree,
525 |                                                      // "/1_Utilities/<executable_name>/"
526 |                                                      // subdir
527 |       "../../../../2_Graphics/<executable_name>/",  // up 4 in tree,
528 |                                                     // "/2_Graphics/<executable_name>/"
529 |                                                     // subdir
530 |       "../../../../3_Imaging/<executable_name>/",  // up 4 in tree,
531 |                                                    // "/3_Imaging/<executable_name>/"
532 |                                                    // subdir
533 |       "../../../../4_Finance/<executable_name>/",  // up 4 in tree,
534 |                                                    // "/4_Finance/<executable_name>/"
535 |                                                    // subdir
536 |       "../../../../5_Simulations/<executable_name>/",  // up 4 in tree,
537 |                                                        // "/5_Simulations/<executable_name>/"
538 |                                                        // subdir
539 |       "../../../../6_Advanced/<executable_name>/",  // up 4 in tree,
540 |                                                     // "/6_Advanced/<executable_name>/"
541 |                                                     // subdir
542 |       "../../../../7_CUDALibraries/<executable_name>/",  // up 4 in tree,
543 |                                                          // "/7_CUDALibraries/<executable_name>/"
544 |                                                          // subdir
545 |       "../../../../8_Android/<executable_name>/",  // up 4 in tree,
546 |                                                    // "/8_Android/<executable_name>/"
547 |                                                    // subdir
548 |       "../../../../samples/<executable_name>/data/",  // up 4 in tree,
549 |                                                       // "/samples/<executable_name>/"
550 |                                                       // subdir
551 |       "../../../../common/",       // up 4 in tree, "../../../../common/" subdir
552 |       "../../../../common/data/",  // up 4 in tree, "../../../../common/data/"
553 |                                    // subdir
554 |       "../../../../data/",         // up 4 in tree, "../../../../data/" subdir
555 |       "../../../../../",           // up 5 in tree
556 |       "../../../../../src/<executable_name>/",  // up 5 in tree,
557 |                                                 // "/src/<executable_name>/"
558 |                                                 // subdir
559 |       "../../../../../src/<executable_name>/data/",  // up 5 in tree,
560 |                                                      // "/src/<executable_name>/data/"
561 |                                                      // subdir
562 |       "../../../../../src/<executable_name>/src/",  // up 5 in tree,
563 |                                                     // "/src/<executable_name>/src/"
564 |                                                     // subdir
565 |       "../../../../../src/<executable_name>/inc/",  // up 5 in tree,
566 |                                                     // "/src/<executable_name>/inc/"
567 |                                                     // subdir
568 |       "../../../../../sandbox/<executable_name>/",  // up 5 in tree,
569 |                                                     // "/sandbox/<executable_name>/"
570 |                                                     // subdir
571 |       "../../../../../sandbox/<executable_name>/data/",  // up 5 in tree,
572 |                                                          // "/sandbox/<executable_name>/data/"
573 |                                                          // subdir
574 |       "../../../../../sandbox/<executable_name>/src/",  // up 5 in tree,
575 |                                                         // "/sandbox/<executable_name>/src/"
576 |                                                         // subdir
577 |       "../../../../../sandbox/<executable_name>/inc/",  // up 5 in tree,
578 |                                                         // "/sandbox/<executable_name>/inc/"
579 |                                                         // subdir
580 |       "../../../../../0_Simple/<executable_name>/data/",  // up 5 in tree,
581 |                                                           // "/0_Simple/<executable_name>/"
582 |                                                           // subdir
583 |       "../../../../../1_Utilities/<executable_name>/data/",  // up 5 in tree,
584 |                                                              // "/1_Utilities/<executable_name>/"
585 |                                                              // subdir
586 |       "../../../../../2_Graphics/<executable_name>/data/",  // up 5 in tree,
587 |                                                             // "/2_Graphics/<executable_name>/"
588 |                                                             // subdir
589 |       "../../../../../3_Imaging/<executable_name>/data/",  // up 5 in tree,
590 |                                                            // "/3_Imaging/<executable_name>/"
591 |                                                            // subdir
592 |       "../../../../../4_Finance/<executable_name>/data/",  // up 5 in tree,
593 |                                                            // "/4_Finance/<executable_name>/"
594 |                                                            // subdir
595 |       "../../../../../5_Simulations/<executable_name>/data/",  // up 5 in tree,
596 |                                                                // "/5_Simulations/<executable_name>/"
597 |                                                                // subdir
598 |       "../../../../../6_Advanced/<executable_name>/data/",  // up 5 in tree,
599 |                                                             // "/6_Advanced/<executable_name>/"
600 |                                                             // subdir
601 |       "../../../../../7_CUDALibraries/<executable_name>/data/",  // up 5 in
602 |                                                                  // tree,
603 |                                                                  // "/7_CUDALibraries/<executable_name>/"
604 |                                                                  // subdir
605 |       "../../../../../8_Android/<executable_name>/data/",  // up 5 in tree,
606 |                                                            // "/8_Android/<executable_name>/"
607 |                                                            // subdir
608 |       "../../../../../samples/<executable_name>/data/",  // up 5 in tree,
609 |                                                          // "/samples/<executable_name>/"
610 |                                                          // subdir
611 |       "../../../../../common/",       // up 5 in tree, "../../../../../common/" subdir
612 |       "../../../../../common/data/",  // up 5 in tree, "../../../../../common/data/"
613 |                                       // subdir
614 |   };
615 | 
616 |   // Extract the executable name
617 |   std::string executable_name;
618 | 
619 |   if (executable_path != 0) {
620 |     executable_name = std::string(executable_path);
621 | 
622 | #if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
623 |     // Windows path delimiter
624 |     size_t delimiter_pos = executable_name.find_last_of('\\');
625 |     executable_name.erase(0, delimiter_pos + 1);
626 | 
627 |     if (executable_name.rfind(".exe") != std::string::npos) {
628 |       // we strip .exe, only if the .exe is found
629 |       executable_name.resize(executable_name.size() - 4);
630 |     }
631 | 
632 | #else
633 |     // Linux & OSX path delimiter
634 |     size_t delimiter_pos = executable_name.find_last_of('/');
635 |     executable_name.erase(0, delimiter_pos + 1);
636 | #endif
637 |   }
638 | 
639 |   // Loop over all search paths and return the first hit
640 |   for (unsigned int i = 0; i < sizeof(searchPath) / sizeof(char *); ++i) {
641 |     std::string path(searchPath[i]);
642 |     size_t executable_name_pos = path.find("<executable_name>");
643 | 
644 |     // If there is executable_name variable in the searchPath
645 |     // replace it with the value
646 |     if (executable_name_pos != std::string::npos) {
647 |       if (executable_path != 0) {
648 |         path.replace(executable_name_pos, strlen("<executable_name>"),
649 |                      executable_name);
650 |       } else {
651 |         // Skip this path entry if no executable argument is given
652 |         continue;
653 |       }
654 |     }
655 | 
656 | #ifdef _DEBUG
657 |     printf("sdkFindFilePath <%s> in %s\n", filename, path.c_str());
658 | #endif
659 | 
660 |     // Test if the file exists
661 |     path.append(filename);
662 |     FILE *fp;
663 |     FOPEN(fp, path.c_str(), "rb");
664 | 
665 |     if (fp != NULL) {
666 |       fclose(fp);
667 |       // File found
668 |       // returning an allocated array here for backwards compatibility reasons
669 |       char *file_path = reinterpret_cast<char *>(malloc(path.length() + 1));
670 |       STRCPY(file_path, path.length() + 1, path.c_str());
671 |       return file_path;
672 |     }
673 | 
674 |     if (fp) {
675 |       fclose(fp);
676 |     }
677 |   }
678 | 
679 |   // File not found
680 |   return 0;
681 | }
682 | 
683 | #endif  // COMMON_HELPER_STRING_H_
684 | 


--------------------------------------------------------------------------------