├── .gitignore
├── script.sh
├── Makefile
├── LICENSE
├── README.md
├── compare.cu
└── gpu_burn-drv.cpp


/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | gpu_burn-drv\.o
3 | 
4 | compare\.ptx
5 | 
6 | gpu_burn
7 | 


--------------------------------------------------------------------------------
/script.sh:
--------------------------------------------------------------------------------
1 | xterm -e stress --cpu 8 &  # CPU load: stress with 8 CPU workers, in its own xterm window
2 | xterm -e ./gpu_burn 100000 &  # GPU load: gpu_burn with a run length of 100000 seconds, in a second xterm
3 | tegrastats  # runs in the foreground and prints the Jetson stats while both loads are active
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | CUDAPATH=/usr/local/cuda
 2 | 
 3 | # Have this point to an old enough gcc (for nvcc)
 4 | GCCPATH=/usr
 5 | 
 6 | NVCC=${CUDAPATH}/bin/nvcc
 7 | CCPATH=${GCCPATH}/bin
 8 | # drv (default target): compile the compare kernels to PTX, then compile and link the gpu_burn host binary
 9 | .PHONY: drv
10 | drv:
11 | 	PATH=${PATH}:.:${CCPATH} ${NVCC} -I${CUDAPATH}/include -arch=compute_50 -ptx compare.cu -o compare.ptx
12 | 	g++ -O3 -Wno-unused-result -I${CUDAPATH}/include -c gpu_burn-drv.cpp
13 | 	g++ -o gpu_burn gpu_burn-drv.o -O3 -lcuda -L${CUDAPATH}/lib64 -L${CUDAPATH}/lib -Wl,-rpath=${CUDAPATH}/lib64 -Wl,-rpath=${CUDAPATH}/lib -lcublas -lcudart


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | BSD 2-Clause License
 2 | 
 3 | Copyright (c) 2020, Ville Timonen
 4 | All rights reserved.
 5 | 
 6 | Redistribution and use in source and binary forms, with or without
 7 | modification, are permitted provided that the following conditions are met:
 8 | 
 9 | * Redistributions of source code must retain the above copyright notice, this
10 |   list of conditions and the following disclaimer.
11 | 
12 | * Redistributions in binary form must reproduce the above copyright notice,
13 |   this list of conditions and the following disclaimer in the documentation
14 |   and/or other materials provided with the distribution.
15 | 
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # jetson-gpu-burn forked from the initial gpu-burn
 2 | Multi-GPU CUDA stress test - http://wili.cc/blog/gpu-burn.html
 3 | 
 4 | The original gpu-burn has a temperature readout (based on nvidia-smi) that is not configured for Jetson GPU systems. This repository contains a modified gpu-burn that works on Jetson systems, as well as a script to stress the GPU and the CPU at the same time.
 5 | 
 6 | The script requires CUDA and stress. If you are using a Connect Tech BSP, then the nvidia sources are commented out initially and you may not be able to find cuda-toolkit-10-2.
 7 | 
 8 | 
 9 | To enable the NVIDIA sources again, uncomment the nvidia repo in the following file:
10 | 
11 | /etc/apt/sources.list.d/nvidia-l4t-apt-source.list
12 | 
13 | sudo apt-get update
14 | 
15 | sudo apt-get install cuda-toolkit-10-2
16 | 
17 | 
18 | If you want to stress test the CPU then you need to install stress:
19 | 
20 | sudo apt-get install stress
21 | 
22 | 
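23 | Finally, run make and then run script.sh to stress the GPU and the CPUs. The script opens stress and gpu_burn in separate xterm windows, so xterm and a graphical session are required.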
24 | 
25 | To view the stats, you can run tegrastats
26 | 
27 | 
28 | 
29 | ## Installing jtop (graphical CPU/GPU usage command)
30 | 
31 | sudo apt-get -y install pip
32 | 
33 | sudo apt-get -y install python-pip
34 | 
35 | sudo -H pip install -U jetson-stats
36 | 
37 | jtop
38 | 
39 | 
40 | 
41 | ## Common problems:
42 | 
43 | If you get this error:
44 | 
45 | ./gpu_burn: error while loading shared libraries: libcublasLt.so.10: cannot open shared object file: No such file or directory
46 | 
47 | Use this to fix it:
48 | 
49 | sudo find / -name "libcublasLt.so.10.2.3.300" -exec ln -s '{}' /usr/lib/aarch64-linux-gnu/libcublasLt.so.10 ';'
50 | 


--------------------------------------------------------------------------------
/compare.cu:
--------------------------------------------------------------------------------
 1 | /* 
 2 |  * Copyright (c) 2016, Ville Timonen
 3 |  * All rights reserved.
 4 |  * 
 5 |  * Redistribution and use in source and binary forms, with or without
 6 |  * modification, are permitted provided that the following conditions are met:
 7 |  * 
 8 |  * 1. Redistributions of source code must retain the above copyright notice, this
 9 |  *    list of conditions and the following disclaimer.
10 |  * 2. Redistributions in binary form must reproduce the above copyright notice,
11 |  *    this list of conditions and the following disclaimer in the documentation
12 |  *    and/or other materials provided with the distribution.
13 |  * 
14 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17 |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
18 |  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
19 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
20 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
21 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
23 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 |  * 
25 |  * The views and conclusions contained in the software and documentation are those
26 |  * of the authors and should not be interpreted as representing official policies,
27 |  * either expressed or implied, of the FreeBSD Project.
28 |  */
29 | 
30 | // Author's note: the results are not accumulated in an arbitrary order, so repeated GEMMs show no rounding
31 | // differences and EPSILON = 0.0f would be OK.  The tolerances defined below are nevertheless non-zero.
32 | #define EPSILON 0.001f
33 | #define EPSILOND 0.0000001
34 | /* Counts, for this thread's element, how many of the result matrices 1..iters-1 in C differ from matrix 0
35 |  * by more than EPSILON (or compare as NaN) and adds that count to *faultyElems.  The launch's total thread
36 |  * count is taken as the element count of one matrix; there is no bounds check against the matrix size. */
37 | extern "C" __global__ void compare(float *C, int *faultyElems, size_t iters) {
38 | 	const size_t rowWidth = (size_t)gridDim.x*blockDim.x; // W, widened before multiplying
39 | 	const size_t iterStep = rowWidth*gridDim.y*blockDim.y; // elements per result matrix
40 | 	const size_t myIndex = ((size_t)blockIdx.y*blockDim.y + threadIdx.y)*rowWidth + (size_t)blockIdx.x*blockDim.x + threadIdx.x;
41 | 	int myFaulty = 0;
42 | 	for (size_t i = 1; i < iters; ++i)
43 | 		if (!(fabsf(C[myIndex] - C[myIndex + i*iterStep]) <= EPSILON)) // negated <= so that NaN counts as faulty
44 | 			myFaulty++;
45 | 	if (myFaulty) // skip the atomic on the shared counter when there is nothing to add
46 | 		atomicAdd(faultyElems, myFaulty);
47 | }
48 | /* Double-precision variant: counts, for this thread's element, how many of the result matrices 1..iters-1
49 |  * in C differ from matrix 0 by more than EPSILOND (or compare as NaN) and adds that count to *faultyElems.
50 |  * The launch's total thread count is taken as the element count of one matrix; there is no bounds check. */
51 | extern "C" __global__ void compareD(double *C, int *faultyElems, size_t iters) {
52 | 	const size_t rowWidth = (size_t)gridDim.x*blockDim.x; // W, widened before multiplying
53 | 	const size_t iterStep = rowWidth*gridDim.y*blockDim.y; // elements per result matrix
54 | 	const size_t myIndex = ((size_t)blockIdx.y*blockDim.y + threadIdx.y)*rowWidth + (size_t)blockIdx.x*blockDim.x + threadIdx.x;
55 | 	int myFaulty = 0;
56 | 	for (size_t i = 1; i < iters; ++i)
57 | 		if (!(fabs(C[myIndex] - C[myIndex + i*iterStep]) <= EPSILOND)) // negated <= so that NaN counts as faulty
58 | 			myFaulty++;
59 | 	if (myFaulty) // skip the atomic on the shared counter when there is nothing to add
60 | 		atomicAdd(faultyElems, myFaulty);
61 | }
62 | 


--------------------------------------------------------------------------------
/gpu_burn-drv.cpp:
--------------------------------------------------------------------------------
  1 | /* 
  2 |  * Copyright (c) 2016, Ville Timonen
  3 |  * All rights reserved.
  4 |  * 
  5 |  * Redistribution and use in source and binary forms, with or without
  6 |  * modification, are permitted provided that the following conditions are met:
  7 |  * 
  8 |  * 1. Redistributions of source code must retain the above copyright notice, this
  9 |  *    list of conditions and the following disclaimer.
 10 |  * 2. Redistributions in binary form must reproduce the above copyright notice,
 11 |  *    this list of conditions and the following disclaimer in the documentation
 12 |  *    and/or other materials provided with the distribution.
 13 |  * 
 14 |  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 15 |  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 16 |  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 17 |  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 18 |  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 19 |  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 20 |  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 21 |  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 22 |  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 23 |  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 24 |  * 
 25 |  * The views and conclusions contained in the software and documentation are those
 26 |  * of the authors and should not be interpreted as representing official policies,
 27 |  * either expressed or implied, of the FreeBSD Project.
 28 |  */
 29 | 
 30 | #define SIZE 2048ul // Matrices are SIZE*SIZE..  2048^2 should be efficiently implemented in CUBLAS
 31 | #define USEMEM 0.9 // Fraction of the currently free device memory that initBuffers tries to use (90%)
 32 | 
 33 | // Used to report op/s, measured through Visual Profiler, CUBLAS from CUDA 7.5
 34 | // (Seems that they indeed take the naive dim^3 approach); the value equals 2*SIZE^3 + 2*SIZE^2 for SIZE 2048
 35 | #define OPS_PER_MUL 17188257792ul
 36 | 
 37 | #include <cstdio>
 38 | #include <cstdlib>
 39 | #include <string>
 40 | #include <map>
 41 | #include <vector>
 42 | #include <sys/types.h>
 43 | #include <signal.h>
 44 | #include <sys/wait.h>
 45 | #include <sys/time.h>
 46 | #include <string.h>
 47 | #include <unistd.h>
 48 | #include <time.h>
 49 | #include <fstream>
 50 | 
 51 | #include <cuda.h>
 52 | #include "cublas_v2.h"
 53 | void checkError(int rCode, std::string desc = "") {
 54 | 	static std::map<int, std::string> g_errorStrings;
 55 | 	if (!g_errorStrings.size()) {
 56 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_VALUE, "CUDA_ERROR_INVALID_VALUE"));
 57 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_OUT_OF_MEMORY, "CUDA_ERROR_OUT_OF_MEMORY"));
 58 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_INITIALIZED, "CUDA_ERROR_NOT_INITIALIZED"));
 59 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_DEINITIALIZED, "CUDA_ERROR_DEINITIALIZED"));
 60 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NO_DEVICE, "CUDA_ERROR_NO_DEVICE"));
 61 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_DEVICE, "CUDA_ERROR_INVALID_DEVICE"));
 62 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_IMAGE, "CUDA_ERROR_INVALID_IMAGE"));
 63 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_CONTEXT, "CUDA_ERROR_INVALID_CONTEXT"));
 64 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_MAP_FAILED, "CUDA_ERROR_MAP_FAILED"));
 65 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNMAP_FAILED, "CUDA_ERROR_UNMAP_FAILED"));
 66 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ARRAY_IS_MAPPED, "CUDA_ERROR_ARRAY_IS_MAPPED"));
 67 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ALREADY_MAPPED, "CUDA_ERROR_ALREADY_MAPPED"));
 68 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NO_BINARY_FOR_GPU, "CUDA_ERROR_NO_BINARY_FOR_GPU"));
 69 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_ALREADY_ACQUIRED, "CUDA_ERROR_ALREADY_ACQUIRED"));
 70 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED, "CUDA_ERROR_NOT_MAPPED"));
 71 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "CUDA_ERROR_NOT_MAPPED_AS_ARRAY"));
 72 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_MAPPED_AS_POINTER, "CUDA_ERROR_NOT_MAPPED_AS_POINTER"));
 73 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNSUPPORTED_LIMIT, "CUDA_ERROR_UNSUPPORTED_LIMIT"));
 74 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "CUDA_ERROR_CONTEXT_ALREADY_IN_USE"));
 75 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_SOURCE, "CUDA_ERROR_INVALID_SOURCE"));
 76 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_FILE_NOT_FOUND, "CUDA_ERROR_FILE_NOT_FOUND"));
 77 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND"));
 78 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED"));
 79 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_OPERATING_SYSTEM, "CUDA_ERROR_OPERATING_SYSTEM"));
 80 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_INVALID_HANDLE, "CUDA_ERROR_INVALID_HANDLE"));
 81 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_FOUND, "CUDA_ERROR_NOT_FOUND"));
 82 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_NOT_READY, "CUDA_ERROR_NOT_READY"));
 83 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_FAILED, "CUDA_ERROR_LAUNCH_FAILED"));
 84 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES"));
 85 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_TIMEOUT, "CUDA_ERROR_LAUNCH_TIMEOUT"));
 86 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING"));
 87 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE"));
 88 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_CONTEXT_IS_DESTROYED, "CUDA_ERROR_CONTEXT_IS_DESTROYED"));
 89 | 		g_errorStrings.insert(std::pair<int, std::string>(CUDA_ERROR_UNKNOWN, "CUDA_ERROR_UNKNOWN"));
 90 | 	}
 91 | 
 92 | 	if (rCode != CUDA_SUCCESS)
 93 | 		throw ((desc == "") ? 
 94 | 				std::string("Error: ") : 
 95 | 				(std::string("Error in \"") + desc + std::string("\": "))) + 
 96 | 			g_errorStrings[rCode];
 97 | }
 98 | 
 99 | void checkError(cublasStatus_t rCode, std::string desc = "") {
100 | 	static std::map<cublasStatus_t, std::string> g_errorStrings;
101 | 	if (!g_errorStrings.size()) {
102 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS_STATUS_NOT_INITIALIZED"));
103 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_ALLOC_FAILED, "CUBLAS_STATUS_ALLOC_FAILED"));
104 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS_STATUS_INVALID_VALUE"));
105 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_ARCH_MISMATCH, "CUBLAS_STATUS_ARCH_MISMATCH"));
106 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_MAPPING_ERROR, "CUBLAS_STATUS_MAPPING_ERROR"));
107 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_EXECUTION_FAILED, "CUBLAS_STATUS_EXECUTION_FAILED"));
108 | 		g_errorStrings.insert(std::pair<cublasStatus_t, std::string>(CUBLAS_STATUS_INTERNAL_ERROR, "CUBLAS_STATUS_INTERNAL_ERROR"));
109 | 	}
110 | 
111 | 	if (rCode != CUBLAS_STATUS_SUCCESS)
112 | 		throw ((desc == "") ? 
113 | 				std::string("Error: ") : 
114 | 				(std::string("Error in \"") + desc + std::string("\": "))) + 
115 | 			g_errorStrings[rCode];
116 | }
117 | // Current wall-clock time from gettimeofday(), in seconds with microsecond resolution.
118 | double getTime() {
119 | 	struct timeval now;
120 | 	gettimeofday(&now, NULL);
121 | 	const double fraction = (double)now.tv_usec / 1e6; // microseconds -> seconds
122 | 	return (double)now.tv_sec + fraction;
123 | }
124 | // Run flag cleared by the SIGTERM handler; volatile sig_atomic_t is the type a handler may portably write.
125 | volatile sig_atomic_t g_running = 0;
126 | 
127 | // Burn-in test for one CUDA device; T is the matrix element type (float or double).
128 | // Usage: construct (creates the context and the cuBLAS handle), call initBuffers() once, then alternate
129 | // compute() and compare(); getErrors() returns the faulty elements counted since its previous call.
130 | // Failed CUDA/cuBLAS calls surface through checkError(), i.e. as a thrown std::string.
131 | template <class T> class GPU_Test {
132 | 	public:
133 | 	GPU_Test(int dev, bool doubles, bool tensors) :
134 | 			d_doubles(doubles), d_tensors(tensors), d_devNumber(dev), d_iters(0), d_resultSize(0),
135 | 			d_error(0), d_Cdata(0), d_Adata(0), d_Bdata(0), d_faultyElemData(0), d_faultyElemsHost(NULL) {
136 | 		checkError(cuDeviceGet(&d_dev, d_devNumber));
137 | 		checkError(cuCtxCreate(&d_ctx, 0, d_dev));
138 | 		bind();
139 | 
140 | 		checkError(cublasCreate(&d_cublas), "init");
141 | 		if(d_tensors)
142 | 			checkError(cublasSetMathMode(d_cublas, CUBLAS_TENSOR_OP_MATH));
143 | 
144 | 		checkError(cuMemAllocHost((void**)&d_faultyElemsHost, sizeof(int)));
145 | 		*d_faultyElemsHost = 0; // defined value until the first copy from the device arrives
146 | 		g_running = 1;
147 | 		struct sigaction action;
148 | 		memset(&action, 0, sizeof(struct sigaction));
149 | 		action.sa_handler = termHandler;
150 | 		sigaction(SIGTERM, &action, NULL);
151 | 	}
152 | 
153 | 	// Releases the device buffers, the pinned counter and the cuBLAS handle.  Failures are only
154 | 	// reported, because an exception leaving a destructor would terminate the process.
155 | 	~GPU_Test() {
156 | 		warnOnFailure(cuCtxSetCurrent(d_ctx), "Bind CTX");
157 | 		if (d_Cdata) warnOnFailure(cuMemFree(d_Cdata), "Free C");
158 | 		if (d_Adata) warnOnFailure(cuMemFree(d_Adata), "Free A");
159 | 		if (d_Bdata) warnOnFailure(cuMemFree(d_Bdata), "Free B");
160 | 		if (d_faultyElemData) warnOnFailure(cuMemFree(d_faultyElemData), "Free faulty data");
161 | 		cuMemFreeHost(d_faultyElemsHost);
162 | 		printf("Freed memory for dev %d\n", d_devNumber);
163 | 
164 | 		cublasDestroy(d_cublas);
165 | 		printf("Uninitted cublas\n");
166 | 	}
167 | 
168 | 	// SIGTERM handler: asks the burn loop to stop (polled through shouldRun)
169 | 	static void termHandler(int signum) { g_running = 0; }
170 | 
171 | 	// Returns the faulty-element count accumulated since the previous call and resets it
172 | 	unsigned long long int getErrors() {
173 | 		if (*d_faultyElemsHost)
174 | 			d_error += (long long int)*d_faultyElemsHost;
175 | 		unsigned long long int tempErrs = d_error;
176 | 		d_error = 0;
177 | 		return tempErrs;
178 | 	}
179 | 
180 | 	// Number of result matrices produced and compared per compute()/compare() round
181 | 	size_t getIters() { return d_iters; }
182 | 
183 | 	// Makes this object's CUDA context current for the calling thread
184 | 	void bind() { checkError(cuCtxSetCurrent(d_ctx), "Bind CTX"); }
185 | 
186 | 	// Total device memory in bytes, as reported by cuMemGetInfo
187 | 	size_t totalMemory() {
188 | 		bind();
189 | 		size_t freeMem, totalMem;
190 | 		checkError(cuMemGetInfo(&freeMem, &totalMem));
191 | 		return totalMem;
192 | 	}
193 | 
194 | 	// Free device memory in bytes, as reported by cuMemGetInfo
195 | 	size_t availMemory() {
196 | 		bind();
197 | 		size_t freeMem, totalMem;
198 | 		checkError(cuMemGetInfo(&freeMem, &totalMem));
199 | 		return freeMem;
200 | 	}
201 | 
202 | 	// Allocates the device buffers (USEMEM of the free memory, split into A, B and d_iters result
203 | 	// matrices), uploads the host matrices A and B (SIZE*SIZE elements each) and loads the compare kernel.
204 | 	void initBuffers(T *A, T *B) {
205 | 		bind();
206 | 
207 | 		size_t useBytes = (size_t)((double)availMemory()*USEMEM);
208 | 		printf("Initialized device %d with %lu MB of memory (%lu MB available, using %lu MB of it), %s%s\n",
209 | 				d_devNumber, totalMemory()/1024ul/1024ul, availMemory()/1024ul/1024ul, useBytes/1024ul/1024ul,
210 | 				d_doubles ? "using DOUBLES" : "using FLOATS", d_tensors ? ", using Tensor Cores" : "");
211 | 		d_resultSize = sizeof(T)*SIZE*SIZE; // the member itself; a same-named local used to shadow it
212 | 		// A, B and at least one result matrix must fit, otherwise the unsigned subtraction below wraps
213 | 		if (useBytes < 3*d_resultSize)
214 | 			throw std::string("Not enough free GPU memory for A, B and one result matrix");
215 | 		d_iters = (useBytes - 2*d_resultSize)/d_resultSize; // We remove A and B sizes
216 | 		checkError(cuMemAlloc(&d_Cdata, d_iters*d_resultSize), "C alloc");
217 | 		checkError(cuMemAlloc(&d_Adata, d_resultSize), "A alloc");
218 | 		checkError(cuMemAlloc(&d_Bdata, d_resultSize), "B alloc");
219 | 		checkError(cuMemAlloc(&d_faultyElemData, sizeof(int)), "faulty data");
220 | 
221 | 		// Populating matrices A and B
222 | 		checkError(cuMemcpyHtoD(d_Adata, A, d_resultSize), "A -> device");
223 | 		checkError(cuMemcpyHtoD(d_Bdata, B, d_resultSize), "B -> device");
224 | 
225 | 		initCompareKernel();
226 | 	}
227 | 
228 | 	// Queues d_iters GEMMs on the cuBLAS handle; iteration i writes A*B into the i-th result matrix of C
229 | 	void compute() {
230 | 		bind();
231 | 		static const float alpha = 1.0f;
232 | 		static const float beta = 0.0f;
233 | 		static const double alphaD = 1.0;
234 | 		static const double betaD = 0.0;
235 | 
236 | 		for (size_t i = 0; i < d_iters; ++i) {
237 | 			if (d_doubles)
238 | 				checkError(cublasDgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N, SIZE, SIZE, SIZE, &alphaD,
239 | 							(const double*)d_Adata, SIZE, (const double*)d_Bdata, SIZE, &betaD,
240 | 							(double*)d_Cdata + i*SIZE*SIZE, SIZE), "DGEMM");
241 | 			else
242 | 				checkError(cublasSgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N, SIZE, SIZE, SIZE, &alpha,
243 | 							(const float*)d_Adata, SIZE, (const float*)d_Bdata, SIZE, &beta,
244 | 							(float*)d_Cdata + i*SIZE*SIZE, SIZE), "SGEMM");
245 | 		}
246 | 	}
247 | 
248 | 	// Loads compare.ptx from the working directory and looks up the kernel for the element type in use
249 | 	void initCompareKernel() {
250 | 		const char *kernelFile = "compare.ptx";
251 | 		{
252 | 			std::ifstream f(kernelFile);
253 | 			checkError(f.good() ? CUDA_SUCCESS : CUDA_ERROR_NOT_FOUND, std::string("couldn't find file \"") + kernelFile + "\" from working directory");
254 | 		}
255 | 		checkError(cuModuleLoad(&d_module, kernelFile), "load module");
256 | 		checkError(cuModuleGetFunction(&d_function, d_module,
257 | 					d_doubles ? "compareD" : "compare"), "get func");
258 | 		checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_L1), "L1 config");
259 | 	}
260 | 
261 | 	// Queues on the default stream: reset of the device fault counter, the compare kernel over all result
262 | 	// matrices, and the copy of the counter to pinned host memory.  cuLaunchKernel replaces the deprecated
263 | 	// cuParamSet*/cuLaunchGrid* calls, which had been given alignments where sizes were expected.
264 | 	void compare() {
265 | 		void *kernelArgs[] = { &d_Cdata, &d_faultyElemData, &d_iters };
266 | 		const unsigned int blocksPerSide = SIZE/g_blockSize;
267 | 		checkError(cuMemsetD32Async(d_faultyElemData, 0, 1, 0), "memset");
268 | 		checkError(cuLaunchKernel(d_function, blocksPerSide, blocksPerSide, 1,
269 | 					g_blockSize, g_blockSize, 1, 0, 0, kernelArgs, NULL), "Launch grid");
270 | 		checkError(cuMemcpyDtoHAsync(d_faultyElemsHost, d_faultyElemData, sizeof(int), 0), "Read faultyelemdata");
271 | 	}
272 | 
273 | 	// True from the construction of a GPU_Test until SIGTERM has been received
274 | 	bool shouldRun() { return g_running != 0; }
275 | 
276 | 	private:
277 | 	// Reports, without throwing, a driver call that failed during cleanup
278 | 	static void warnOnFailure(CUresult rCode, const char *what) {
279 | 		if (rCode != CUDA_SUCCESS)
280 | 			fprintf(stderr, "Warning: \"%s\" failed with CUDA error %d\n", what, (int)rCode);
281 | 	}
282 | 
283 | 	bool d_doubles;
284 | 	bool d_tensors;
285 | 	int d_devNumber;
286 | 	size_t d_iters;
287 | 	size_t d_resultSize;
288 | 	long long int d_error;
289 | 	static const int g_blockSize = 16;
290 | 
291 | 	CUdevice d_dev;
292 | 	CUcontext d_ctx;
293 | 	CUmodule d_module;
294 | 	CUfunction d_function;
295 | 	CUdeviceptr d_Cdata;
296 | 	CUdeviceptr d_Adata;
297 | 	CUdeviceptr d_Bdata;
298 | 	CUdeviceptr d_faultyElemData;
299 | 	int *d_faultyElemsHost;
300 | 	cublasHandle_t d_cublas;
301 | };
302 | 
303 | // Initialises the CUDA driver API and returns how many CUDA devices it reports.
304 | // Throws std::string when there is no device or, if USEDEV is defined, when USEDEV is out of range.
305 | int initCuda() {
306 | 	int found = 0;
307 | 	checkError(cuInit(0));
308 | 	checkError(cuDeviceGetCount(&found));
309 | 	if (found == 0)
310 | 		throw std::string("No CUDA devices");
311 | 
312 | 	#ifdef USEDEV
313 | 	// USEDEV has to be smaller than the number of devices found
314 | 	if (found <= USEDEV)
315 | 		throw std::string("Not enough devices for USEDEV");
316 | 	#endif
317 | 	return found;
318 | }
319 | // Runs the burn loop for device `index` in the calling process until SIGTERM is received.  From the second
320 | // round on, two ints are written to writeFd per round: the GEMMs done, then the faulty elements found
321 | // (-1, -1 signals failure).  Exits with status 124 when setup fails and with 111 when the loop fails.
322 | template<class T> void startBurn(int index, int writeFd, T *A, T *B, bool doubles, bool tensors) {
323 | 	GPU_Test<T> *our = NULL;
324 | 	try {
325 | 		our = new GPU_Test<T>(index, doubles, tensors);
326 | 		our->initBuffers(A, B);
327 | 	} catch (const std::string &e) {
328 | 		fprintf(stderr, "Couldn't init a GPU test: %s\n", e.c_str());
329 | 		exit(124);
330 | 	}
331 | 
332 | 	// The actual work
333 | 	try {
334 | 		int eventIndex = 0;
335 | 		const int maxEvents = 2;
336 | 		CUevent events[maxEvents];
337 | 		for (int i = 0; i < maxEvents; ++i)
338 | 			checkError(cuEventCreate(events + i, 0), "Create event");
339 | 		int nonWorkIters = maxEvents;
340 | 		while (our->shouldRun()) {
341 | 			our->compute();
342 | 			our->compare();
343 | 			checkError(cuEventRecord(events[eventIndex], 0), "Record event");
344 | 
345 | 			// Wait for the other (older) event; any result but "not ready" ends the wait, errors are thrown
346 | 			eventIndex = (eventIndex + 1) % maxEvents;
347 | 			CUresult state;
348 | 			while ((state = cuEventQuery(events[eventIndex])) == CUDA_ERROR_NOT_READY) usleep(1000);
349 | 			checkError(state, "Query event");
350 | 			if (--nonWorkIters > 0) continue;
351 | 
352 | 			int ops = (int)our->getIters();
353 | 			write(writeFd, &ops, sizeof(int));
354 | 			ops = (int)our->getErrors();
355 | 			write(writeFd, &ops, sizeof(int));
356 | 		}
357 | 		for (int i = 0; i < maxEvents; ++i)
358 | 			cuEventSynchronize(events[i]);
359 | 		delete our;
360 | 	} catch (const std::string &e) {
361 | 		fprintf(stderr, "Failure during compute: %s\n", e.c_str());
362 | 		int ops = -1;
363 | 		// Signalling that we failed
364 | 		write(writeFd, &ops, sizeof(int));
365 | 		write(writeFd, &ops, sizeof(int));
366 | 		exit(111);
367 | 	}
368 | }
369 | // Starts "nvidia-smi -l 5 -q -d TEMPERATURE" as a child process whose stdout is redirected into a pipe.
370 | // Returns the read end of that pipe and stores the child's pid in *p.  If the pipe or the child cannot
371 | // be created, returns -1 and stores -1 in *p.
372 | int pollTemp(pid_t *p) {
373 | 	int tempPipe[2];
374 | 	*p = -1;
375 | 	if (pipe(tempPipe) != 0) return -1;
376 | 	pid_t myPid = fork();
377 | 	if (!myPid) {
378 | 		close(tempPipe[0]);
379 | 		dup2(tempPipe[1], STDOUT_FILENO); // Stdout
380 | 		if (tempPipe[1] != STDOUT_FILENO) close(tempPipe[1]); // stdout now refers to the pipe
381 | 		execlp("nvidia-smi", "nvidia-smi", "-l", "5", "-q", "-d", "TEMPERATURE", (char*)NULL);
382 | 		fprintf(stderr, "Could not invoke nvidia-smi, no temps available\n");
383 | 		_exit(0); // exec failed; leave without running the parent's atexit/stdio cleanup in the child
384 | 	}
385 | 	*p = myPid; // stays -1 when fork() failed
386 | 	close(tempPipe[1]);
387 | 	if (myPid < 0) { close(tempPipe[0]); return -1; }
388 | 	return tempPipe[0];
389 | }
390 | // Reads one line from handle (nvidia-smi output) and, if it carries a temperature, stores the value in
391 | // the next slot of *temps (slots are used round-robin).  The line is truncated to the buffer size and
392 | // ends at EOF or on a read error, so a closed pipe can neither spin forever nor overrun the buffer.
393 | void updateTemps(int handle, std::vector<int> *temps) {
394 | 	const int readSize = 10240;
395 | 	static int gpuIter = 0;
396 | 	char data[readSize+1];
397 | 	int curPos = 0;
398 | 	char c;
399 | 	while (read(handle, &c, sizeof(char)) == 1 && c != '\n')
400 | 		if (curPos < readSize)
401 | 			data[curPos++] = c;
402 | 	data[curPos] = 0;
403 | 	if (temps->empty()) return; // no slot to fill, and no valid modulus for the rotation below
404 | 	int tempValue;
405 | 	// FIXME: The syntax of this print might change in the future..
406 | 	if (sscanf(data, "        GPU Current Temp            : %d C", &tempValue) == 1) {
407 | 		temps->at(gpuIter) = tempValue;
408 | 		gpuIter = (gpuIter+1)%(temps->size());
409 | 	} else if (!strcmp(data, "        Gpu                     : N/A"))
410 | 		gpuIter = (gpuIter+1)%(temps->size()); // We rotate the iterator for N/A values as well
411 | }
412 | // Parent-side monitor.  Reads the (processed, errors) int pairs the burn children write to their pipes,
413 | // prints a status line for every update and a summary every 10% of the run, and after runTime seconds
414 | // sends SIGTERM to the children, waits for them and prints one verdict per GPU.
415 | //   clientFd   read ends of the children's pipes
416 | //   clientPid  pids of the children, in the same order as clientFd
417 | //   runTime    length of the test in seconds
418 | // No temperature poller is started in this Jetson port, so tempHandle/tempPid stay at -1 and every use of
419 | // them is guarded (they used to be 0/uninitialised: select() watched stdin and kill() got a random pid).
420 | void listenClients(std::vector<int> clientFd, std::vector<pid_t> clientPid, int runTime) {
421 | 	fd_set waitHandles;
422 | 	pid_t tempPid = -1;
423 | 	int tempHandle = -1;
424 | 	int maxHandle = tempHandle;
425 | 
426 | 	FD_ZERO(&waitHandles);
427 | 	if (tempHandle >= 0)
428 | 		FD_SET(tempHandle, &waitHandles);
429 | 	for (size_t i = 0; i < clientFd.size(); ++i) {
430 | 		if (clientFd.at(i) > maxHandle)
431 | 			maxHandle = clientFd.at(i);
432 | 		FD_SET(clientFd.at(i), &waitHandles);
433 | 	}
434 | 
435 | 	struct timespec initTime;
436 | 	clock_gettime(CLOCK_REALTIME, &initTime);
437 | 	const size_t clients = clientFd.size();
438 | 	std::vector<int> clientTemp(clients, 0);
439 | 	std::vector<int> clientErrors(clients, 0);
440 | 	std::vector<int> clientCalcs(clients, 0); // -1 marks a client that died
441 | 	std::vector<struct timespec> clientUpdateTime(clients, initTime);
442 | 	std::vector<float> clientGflops(clients, 0.0f);
443 | 	std::vector<bool> clientFaulty(clients, false);
444 | 
445 | 	time_t startTime = time(0);
446 | 
447 | 	float nextReport = 10.0f;
448 | 	bool childReport = false;
449 | 	while (select(maxHandle+1, &waitHandles, NULL, NULL, NULL)) {
450 | 		size_t thisTime = time(0);
451 | 		struct timespec thisTimeSpec;
452 | 		clock_gettime(CLOCK_REALTIME, &thisTimeSpec);
453 | 
454 | 		// Going through all descriptors
455 | 		for (size_t i = 0; i < clientFd.size(); ++i)
456 | 			if (FD_ISSET(clientFd.at(i), &waitHandles)) {
457 | 				// First processed, then errors.  A short read means the child closed its pipe
458 | 				// (it died); that is handled like the explicit -1 failure marker.
459 | 				int processed = -1, errors = 0;
460 | 				if (read(clientFd.at(i), &processed, sizeof(int)) != (ssize_t)sizeof(int) ||
461 | 						read(clientFd.at(i), &errors, sizeof(int)) != (ssize_t)sizeof(int))
462 | 					processed = -1;
463 | 				if (processed == -1)
464 | 					clientCalcs.at(i) = -1;
465 | 				else
466 | 				{
467 | 					struct timespec clientPrevTime = clientUpdateTime.at(i);
468 | 					double clientTimeDelta = (double)thisTimeSpec.tv_sec + (double)thisTimeSpec.tv_nsec / 1000000000.0 - ((double)clientPrevTime.tv_sec + (double)clientPrevTime.tv_nsec / 1000000000.0);
469 | 					clientUpdateTime.at(i) = thisTimeSpec;
470 | 					clientErrors.at(i) += errors;
471 | 					clientGflops.at(i) = (double)((unsigned long long int)processed * OPS_PER_MUL) / clientTimeDelta / 1000.0 / 1000.0 / 1000.0;
472 | 					clientCalcs.at(i) += processed;
473 | 				}
474 | 				childReport = true;
475 | 			}
476 | 
477 | 		if (tempHandle >= 0 && FD_ISSET(tempHandle, &waitHandles))
478 | 			updateTemps(tempHandle, &clientTemp);
479 | 
480 | 		// Resetting the listeners.  Dead clients are left out: their pipe is at EOF, which would make
481 | 		// select() return immediately on every call.
482 | 		FD_ZERO(&waitHandles);
483 | 		if (tempHandle >= 0)
484 | 			FD_SET(tempHandle, &waitHandles);
485 | 		for (size_t i = 0; i < clientFd.size(); ++i)
486 | 			if (clientCalcs.at(i) != -1)
487 | 				FD_SET(clientFd.at(i), &waitHandles);
488 | 
489 | 		// Printing progress (if a child has initted already)
490 | 		if (childReport) {
491 | 			float elapsed = fminf((float)(thisTime-startTime)/(float)runTime*100.0f, 100.0f);
492 | 			printf("\r%.1f%%  ", elapsed);
493 | 			printf("proc'd: ");
494 | 			for (size_t i = 0; i < clientCalcs.size(); ++i) {
495 | 				printf("%d (%.0f Gflop/s) ", clientCalcs.at(i), clientGflops.at(i));
496 | 				if (i != clientCalcs.size() - 1)
497 | 					printf("- ");
498 | 			}
499 | 			printf("  errors: ");
500 | 			for (size_t i = 0; i < clientErrors.size(); ++i) {
501 | 				const char *note = "";
502 | 				if (clientCalcs.at(i) == -1)
503 | 					note = " (DIED!)";
504 | 				else if (clientErrors.at(i))
505 | 					note = " (WARNING!)";
506 | 				printf("%d %s", clientErrors.at(i), note);
507 | 				if (i != clientCalcs.size() - 1)
508 | 					printf("- ");
509 | 			}
510 | 			printf("  temps: ");
511 | 			for (size_t i = 0; i < clientTemp.size(); ++i) {
512 | 				if (clientTemp.at(i) != 0)
513 | 					printf("%d C ", clientTemp.at(i));
514 | 				else
515 | 					printf("-- ");
516 | 				if (i != clientCalcs.size() - 1)
517 | 					printf("- ");
518 | 			}
519 | 			fflush(stdout);
520 | 
521 | 			if (nextReport < elapsed) {
522 | 				nextReport = elapsed + 10.0f;
523 | 				printf("\n\tSummary at:   ");
524 | 				fflush(stdout);
525 | 				system("date"); // Printing a date
526 | 				fflush(stdout);
527 | 				printf("\n");
528 | 				for (size_t i = 0; i < clientErrors.size(); ++i) {
529 | 					if (clientErrors.at(i))
530 | 						clientFaulty.at(i) = true;
531 | 					clientErrors.at(i) = 0;
532 | 				}
533 | 			}
534 | 		}
535 | 
536 | 		// Checking whether all clients are dead
537 | 		bool oneAlive = false;
538 | 		for (size_t i = 0; i < clientCalcs.size(); ++i)
539 | 			if (clientCalcs.at(i) != -1)
540 | 				oneAlive = true;
541 | 		if (!oneAlive) {
542 | 			fprintf(stderr, "\n\nNo clients are alive!  Aborting\n");
543 | 			exit(123);
544 | 		}
545 | 
546 | 		if (startTime + runTime < thisTime)
547 | 			break;
548 | 	}
549 | 
550 | 	printf("\nKilling processes.. ");
551 | 	fflush(stdout);
552 | 	for (size_t i = 0; i < clientPid.size(); ++i)
553 | 		kill(clientPid.at(i), 15);
554 | 	if (tempPid > 0)
555 | 		kill(tempPid, 15);
556 | 	if (tempHandle >= 0)
557 | 		close(tempHandle);
558 | 
559 | 	while (wait(NULL) != -1);
560 | 	printf("done\n");
561 | 
562 | 	printf("\nTested %d GPUs:\n", (int)clientPid.size());
563 | 	// Errors counted since the last summary and clients that died also make a GPU faulty
564 | 	for (size_t i = 0; i < clientPid.size(); ++i) {
565 | 		bool faulty = clientFaulty.at(i) || clientErrors.at(i) != 0 || clientCalcs.at(i) == -1;
566 | 		printf("\tGPU %d: %s\n", (int)i, faulty ? "FAULTY" : "OK");
567 | 	}
568 | }
569 | // Starts one burn process per CUDA device and supervises them for runLength seconds.  The first child
570 | // initialises CUDA, reports the device count through its pipe and burns device 0; the parent forks one
571 | // more child per remaining device and runs listenClients().  A child returns when its burn loop ends.
572 | template<class T> void launch(int runLength, bool useDoubles, bool useTensorCores) {
573 | 	system("nvidia-smi -L");
574 | 
575 | 	// Initting A and B with random data
576 | 	T *A = (T*) malloc(sizeof(T)*SIZE*SIZE);
577 | 	T *B = (T*) malloc(sizeof(T)*SIZE*SIZE);
578 | 	if (!A || !B) { fprintf(stderr, "Couldn't allocate the host matrices\n"); exit(EXIT_FAILURE); }
579 | 	srand(10);
580 | 	for (size_t i = 0; i < SIZE*SIZE; ++i) {
581 | 		A[i] = (T)((double)(rand()%1000000)/100000.0);
582 | 		B[i] = (T)((double)(rand()%1000000)/100000.0);
583 | 	}
584 | 
585 | 	// Forking a process..  This one checks the number of devices to use,
586 | 	// returns the value, and continues to use the first one.
587 | 	int mainPipe[2];
588 | 	if (pipe(mainPipe) != 0) { perror("pipe"); exit(EXIT_FAILURE); }
589 | 	int readMain = mainPipe[0];
590 | 	std::vector<int> clientPipes;
591 | 	std::vector<pid_t> clientPids;
592 | 	clientPipes.push_back(readMain);
593 | 
594 | 	pid_t myPid = fork();
595 | 	if (myPid < 0) { perror("fork"); exit(EXIT_FAILURE); } // a pid of -1 must never reach kill()
596 | 	if (!myPid) {
597 | 		// Child
598 | 		close(mainPipe[0]);
599 | 		int writeFd = mainPipe[1];
600 | 		int devCount = 0;
601 | 		try {
602 | 			devCount = initCuda();
603 | 		} catch (const std::string &e) {
604 | 			fprintf(stderr, "Couldn't init CUDA: %s\n", e.c_str());
605 | 		}
606 | 		// Always report a count (0 on failure) so that the parent never acts on an unset value
607 | 		write(writeFd, &devCount, sizeof(int));
608 | 		if (!devCount)
609 | 			exit(EXIT_FAILURE);
610 | 		startBurn<T>(0, writeFd, A, B, useDoubles, useTensorCores);
611 | 		close(writeFd);
612 | 		return;
613 | 	}
614 | 	clientPids.push_back(myPid);
615 | 	close(mainPipe[1]);
616 | 	int devCount = 0;
617 | 	if (read(readMain, &devCount, sizeof(int)) != (ssize_t)sizeof(int))
618 | 		devCount = 0; // the child ended without reporting
619 | 	if (devCount <= 0) {
620 | 		fprintf(stderr, "No CUDA devices\n");
621 | 		exit(EXIT_FAILURE);
622 | 	}
623 | 	for (int i = 1; i < devCount; ++i) {
624 | 		int slavePipe[2];
625 | 		if (pipe(slavePipe) != 0) { perror("pipe"); break; } // keep supervising the children started so far
626 | 		pid_t slavePid = fork();
627 | 		if (slavePid < 0) { perror("fork"); close(slavePipe[0]); close(slavePipe[1]); break; }
628 | 		if (!slavePid) {
629 | 			// Child
630 | 			close(slavePipe[0]);
631 | 			initCuda();
632 | 			startBurn<T>(i, slavePipe[1], A, B, useDoubles, useTensorCores);
633 | 			close(slavePipe[1]);
634 | 			return;
635 | 		}
636 | 		clientPipes.push_back(slavePipe[0]);
637 | 		clientPids.push_back(slavePid);
638 | 		close(slavePipe[1]);
639 | 	}
640 | 	listenClients(clientPipes, clientPids, runLength);
641 | 	for (size_t i = 0; i < clientPipes.size(); ++i)
642 | 		close(clientPipes.at(i));
643 | 	free(A);
644 | 	free(B);
645 | }
646 | // Usage: gpu_burn [-d] [-tc] [SECONDS], arguments in any order.  -d selects double precision, -tc enables
647 | // Tensor Core math, SECONDS is the run length (default 10).  Anything else prints the usage and returns 1.
648 | int main(int argc, char **argv) {
649 | 	int runLength = 10;
650 | 	bool useDoubles = false;
651 | 	bool useTensorCores = false;
652 | 	bool lengthGiven = false;
653 | 
654 | 	for (int i = 1; i < argc; ++i) {
655 | 		const std::string arg(argv[i]);
656 | 		if (arg == "-d")
657 | 			useDoubles = true;
658 | 		else if (arg == "-tc")
659 | 			useTensorCores = true;
660 | 		else {
661 | 			// Not a flag, so it has to be the run length: a positive whole number that fits an int
662 | 			char *end = NULL;
663 | 			long seconds = strtol(argv[i], &end, 10);
664 | 			if (end == argv[i] || *end != '\0' || seconds <= 0 || seconds > 0x7fffffffL) {
665 | 				fprintf(stderr, "Usage: %s [-d] [-tc] [SECONDS]\n", argv[0]);
666 | 				return 1;
667 | 			}
668 | 			runLength = (int)seconds;
669 | 			lengthGiven = true;
670 | 		}
671 | 	}
672 | 	if (!lengthGiven)
673 | 		printf("Run length not specified in the command line.  Burning for 10 secs\n");
674 | 	if (useDoubles)
675 | 		launch<double>(runLength, useDoubles, useTensorCores);
676 | 	else
677 | 		launch<float>(runLength, useDoubles, useTensorCores);
678 | 	return 0;
679 | }
680 | 


--------------------------------------------------------------------------------