├── .gitignore ├── script.sh ├── Makefile ├── LICENSE ├── README.md ├── compare.cu └── gpu_burn-drv.cpp /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | gpu_burn-drv\.o 3 | 4 | compare\.ptx 5 | 6 | gpu_burn 7 | -------------------------------------------------------------------------------- /script.sh: -------------------------------------------------------------------------------- 1 | xterm -e stress --cpu 8 & 2 | xterm -e ./gpu_burn 100000 & 3 | tegrastats 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | CUDAPATH=/usr/local/cuda 2 | 3 | # Have this point to an old enough gcc (for nvcc) 4 | GCCPATH=/usr 5 | 6 | NVCC=${CUDAPATH}/bin/nvcc 7 | CCPATH=${GCCPATH}/bin 8 | 9 | drv: 10 | PATH=${PATH}:.:${CCPATH}:${PATH} ${NVCC} -I${CUDAPATH}/include -arch=compute_50 -ptx compare.cu -o compare.ptx 11 | g++ -O3 -Wno-unused-result -I${CUDAPATH}/include -c gpu_burn-drv.cpp 12 | g++ -o gpu_burn gpu_burn-drv.o -O3 -lcuda -L${CUDAPATH}/lib64 -L${CUDAPATH}/lib -Wl,-rpath=${CUDAPATH}/lib64 -Wl,-rpath=${CUDAPATH}/lib -lcublas -lcudart -o gpu_burn 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2020, Ville Timonen 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | * Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | * Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # jetson-gpu-burn forked from the initial gpu-burn 2 | Multi-GPU CUDA stress test - http://wili.cc/blog/gpu-burn.html 3 | 4 | The initial GPU burn has a temperature property that is not configured for the Jetson GPU systems. This repository contains the modified gpu-burn to work with Jetson systems as well as a script to stress the GPU and CPU. 5 | 6 | The script requires CUDA and stress. If you are using a Connect Tech BSP, then the nvidia sources are commented out initially and you may not be able to find cuda-toolkit-10-2. 7 | 8 | 9 | To enable the NVIDIA sources again, uncomment the nvidia repo in the following file: 10 | 11 | /etc/apt/sources.list.d/nvidia-l4t-apt-source.list 12 | 13 | sudo apt-get update 14 | 15 | sudo apt-get install cuda-toolkit-10-2 16 | 17 | 18 | If you want to stress test the CPU then you need to install stress: 19 | 20 | sudo apt-get install stress 21 | 22 | 23 | Last, you can run make, and then run script.sh to stress the GPU and CPUs 24 | 25 | To view the stats, you can run tegrastats 26 | 27 | 28 | 29 | ## Installing jtop (graphical CPU/GPU usage command) 30 | 31 | sudo apt-get -y install pip 32 | 33 | sudo apt-get -y install python-pip 34 | 35 | sudo -H pip install -U jetson-stats 36 | 37 | jtop 38 | 39 | 40 | 41 | ## Common problems: 42 | 43 | If you get this error: 44 | 45 | ./gpu_burn: error while loading shared libraries: libcublasLt.so.10: cannot open shared object file: No such file or directory 46 | 47 | Use this to fix it: 48 | 49 | sudo find / -name "libcublasLt.so.10.2.3.300" -exec ln -s '{}' /usr/lib/aarch64-linux-gnu/libcublasLt.so.10 ';' 50 | -------------------------------------------------------------------------------- /compare.cu: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016, Ville Timonen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * The views and conclusions contained in the software and documentation are those 26 | * of the authors and should not be interpreted as representing official policies, 27 | * either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | // Actually, there are no rounding errors due to results being accumulated in an arbitrary order.. 31 | // Therefore EPSILON = 0.0f is OK 32 | #define EPSILON 0.001f 33 | #define EPSILOND 0.0000001 34 | 35 | extern "C" __global__ void compare(float *C, int *faultyElems, size_t iters) { 36 | size_t iterStep = blockDim.x*blockDim.y*gridDim.x*gridDim.y; 37 | size_t myIndex = (blockIdx.y*blockDim.y + threadIdx.y)* // Y 38 | gridDim.x*blockDim.x + // W 39 | blockIdx.x*blockDim.x + threadIdx.x; // X 40 | 41 | int myFaulty = 0; 42 | for (size_t i = 1; i < iters; ++i) 43 | if (fabsf(C[myIndex] - C[myIndex + i*iterStep]) > EPSILON) 44 | myFaulty++; 45 | 46 | atomicAdd(faultyElems, myFaulty); 47 | } 48 | 49 | extern "C" __global__ void compareD(double *C, int *faultyElems, size_t iters) { 50 | size_t iterStep = blockDim.x*blockDim.y*gridDim.x*gridDim.y; 51 | size_t myIndex = (blockIdx.y*blockDim.y + threadIdx.y)* // Y 52 | gridDim.x*blockDim.x + // W 53 | blockIdx.x*blockDim.x + threadIdx.x; // X 54 | 55 | int myFaulty = 0; 56 | for (size_t i = 1; i < iters; ++i) 57 | if (fabs(C[myIndex] - C[myIndex + i*iterStep]) > EPSILOND) 58 | myFaulty++; 59 | 60 | atomicAdd(faultyElems, myFaulty); 61 | } 62 | -------------------------------------------------------------------------------- /gpu_burn-drv.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2016, Ville Timonen 3 | * All rights reserved. 4 | * 5 | * Redistribution and use in source and binary forms, with or without 6 | * modification, are permitted provided that the following conditions are met: 7 | * 8 | * 1. Redistributions of source code must retain the above copyright notice, this 9 | * list of conditions and the following disclaimer. 10 | * 2. Redistributions in binary form must reproduce the above copyright notice, 11 | * this list of conditions and the following disclaimer in the documentation 12 | * and/or other materials provided with the distribution. 13 | * 14 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 16 | * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 17 | * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 18 | * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 19 | * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 20 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 21 | * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 22 | * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 23 | * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 24 | * 25 | * The views and conclusions contained in the software and documentation are those 26 | * of the authors and should not be interpreted as representing official policies, 27 | * either expressed or implied, of the FreeBSD Project. 28 | */ 29 | 30 | #define SIZE 2048ul // Matrices are SIZE*SIZE.. 2048^2 should be efficiently implemented in CUBLAS 31 | #define USEMEM 0.9 // Try to allocate 90% of memory 32 | 33 | // Used to report op/s, measured through Visual Profiler, CUBLAS from CUDA 7.5 34 | // (Seems that they indeed take the naive dim^3 approach) 35 | #define OPS_PER_MUL 17188257792ul 36 | 37 | #include 38 | #include 39 | #include 40 | #include 41 | #include 42 | #include 43 | #include 44 | #include 45 | #include 46 | #include 47 | #include 48 | #include 49 | 50 | #include 51 | #include "cublas_v2.h" 52 | 53 | void checkError(int rCode, std::string desc = "") { 54 | static std::map g_errorStrings; 55 | if (!g_errorStrings.size()) { 56 | g_errorStrings.insert(std::pair(CUDA_ERROR_INVALID_VALUE, "CUDA_ERROR_INVALID_VALUE")); 57 | g_errorStrings.insert(std::pair(CUDA_ERROR_OUT_OF_MEMORY, "CUDA_ERROR_OUT_OF_MEMORY")); 58 | g_errorStrings.insert(std::pair(CUDA_ERROR_NOT_INITIALIZED, "CUDA_ERROR_NOT_INITIALIZED")); 59 | g_errorStrings.insert(std::pair(CUDA_ERROR_DEINITIALIZED, "CUDA_ERROR_DEINITIALIZED")); 60 | g_errorStrings.insert(std::pair(CUDA_ERROR_NO_DEVICE, "CUDA_ERROR_NO_DEVICE")); 61 | g_errorStrings.insert(std::pair(CUDA_ERROR_INVALID_DEVICE, "CUDA_ERROR_INVALID_DEVICE")); 62 | g_errorStrings.insert(std::pair(CUDA_ERROR_INVALID_IMAGE, "CUDA_ERROR_INVALID_IMAGE")); 63 | g_errorStrings.insert(std::pair(CUDA_ERROR_INVALID_CONTEXT, "CUDA_ERROR_INVALID_CONTEXT")); 64 | g_errorStrings.insert(std::pair(CUDA_ERROR_MAP_FAILED, "CUDA_ERROR_MAP_FAILED")); 65 | g_errorStrings.insert(std::pair(CUDA_ERROR_UNMAP_FAILED, "CUDA_ERROR_UNMAP_FAILED")); 66 | g_errorStrings.insert(std::pair(CUDA_ERROR_ARRAY_IS_MAPPED, "CUDA_ERROR_ARRAY_IS_MAPPED")); 67 | g_errorStrings.insert(std::pair(CUDA_ERROR_ALREADY_MAPPED, "CUDA_ERROR_ALREADY_MAPPED")); 68 | g_errorStrings.insert(std::pair(CUDA_ERROR_NO_BINARY_FOR_GPU, "CUDA_ERROR_NO_BINARY_FOR_GPU")); 69 | g_errorStrings.insert(std::pair(CUDA_ERROR_ALREADY_ACQUIRED, "CUDA_ERROR_ALREADY_ACQUIRED")); 70 | g_errorStrings.insert(std::pair(CUDA_ERROR_NOT_MAPPED, "CUDA_ERROR_NOT_MAPPED")); 71 | g_errorStrings.insert(std::pair(CUDA_ERROR_NOT_MAPPED_AS_ARRAY, "CUDA_ERROR_NOT_MAPPED_AS_ARRAY")); 72 | g_errorStrings.insert(std::pair(CUDA_ERROR_NOT_MAPPED_AS_POINTER, "CUDA_ERROR_NOT_MAPPED_AS_POINTER")); 73 | g_errorStrings.insert(std::pair(CUDA_ERROR_UNSUPPORTED_LIMIT, "CUDA_ERROR_UNSUPPORTED_LIMIT")); 74 | g_errorStrings.insert(std::pair(CUDA_ERROR_CONTEXT_ALREADY_IN_USE, "CUDA_ERROR_CONTEXT_ALREADY_IN_USE")); 75 | g_errorStrings.insert(std::pair(CUDA_ERROR_INVALID_SOURCE, "CUDA_ERROR_INVALID_SOURCE")); 76 | g_errorStrings.insert(std::pair(CUDA_ERROR_FILE_NOT_FOUND, "CUDA_ERROR_FILE_NOT_FOUND")); 77 | g_errorStrings.insert(std::pair(CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND, "CUDA_ERROR_SHARED_OBJECT_SYMBOL_NOT_FOUND")); 78 | g_errorStrings.insert(std::pair(CUDA_ERROR_SHARED_OBJECT_INIT_FAILED, "CUDA_ERROR_SHARED_OBJECT_INIT_FAILED")); 79 | g_errorStrings.insert(std::pair(CUDA_ERROR_OPERATING_SYSTEM, "CUDA_ERROR_OPERATING_SYSTEM")); 80 | g_errorStrings.insert(std::pair(CUDA_ERROR_INVALID_HANDLE, "CUDA_ERROR_INVALID_HANDLE")); 81 | g_errorStrings.insert(std::pair(CUDA_ERROR_NOT_FOUND, "CUDA_ERROR_NOT_FOUND")); 82 | g_errorStrings.insert(std::pair(CUDA_ERROR_NOT_READY, "CUDA_ERROR_NOT_READY")); 83 | g_errorStrings.insert(std::pair(CUDA_ERROR_LAUNCH_FAILED, "CUDA_ERROR_LAUNCH_FAILED")); 84 | g_errorStrings.insert(std::pair(CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES, "CUDA_ERROR_LAUNCH_OUT_OF_RESOURCES")); 85 | g_errorStrings.insert(std::pair(CUDA_ERROR_LAUNCH_TIMEOUT, "CUDA_ERROR_LAUNCH_TIMEOUT")); 86 | g_errorStrings.insert(std::pair(CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING, "CUDA_ERROR_LAUNCH_INCOMPATIBLE_TEXTURING")); 87 | g_errorStrings.insert(std::pair(CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE, "CUDA_ERROR_PRIMARY_CONTEXT_ACTIVE")); 88 | g_errorStrings.insert(std::pair(CUDA_ERROR_CONTEXT_IS_DESTROYED, "CUDA_ERROR_CONTEXT_IS_DESTROYED")); 89 | g_errorStrings.insert(std::pair(CUDA_ERROR_UNKNOWN, "CUDA_ERROR_UNKNOWN")); 90 | } 91 | 92 | if (rCode != CUDA_SUCCESS) 93 | throw ((desc == "") ? 94 | std::string("Error: ") : 95 | (std::string("Error in \"") + desc + std::string("\": "))) + 96 | g_errorStrings[rCode]; 97 | } 98 | 99 | void checkError(cublasStatus_t rCode, std::string desc = "") { 100 | static std::map g_errorStrings; 101 | if (!g_errorStrings.size()) { 102 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_NOT_INITIALIZED, "CUBLAS_STATUS_NOT_INITIALIZED")); 103 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_ALLOC_FAILED, "CUBLAS_STATUS_ALLOC_FAILED")); 104 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_INVALID_VALUE, "CUBLAS_STATUS_INVALID_VALUE")); 105 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_ARCH_MISMATCH, "CUBLAS_STATUS_ARCH_MISMATCH")); 106 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_MAPPING_ERROR, "CUBLAS_STATUS_MAPPING_ERROR")); 107 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_EXECUTION_FAILED, "CUBLAS_STATUS_EXECUTION_FAILED")); 108 | g_errorStrings.insert(std::pair(CUBLAS_STATUS_INTERNAL_ERROR, "CUBLAS_STATUS_INTERNAL_ERROR")); 109 | } 110 | 111 | if (rCode != CUBLAS_STATUS_SUCCESS) 112 | throw ((desc == "") ? 113 | std::string("Error: ") : 114 | (std::string("Error in \"") + desc + std::string("\": "))) + 115 | g_errorStrings[rCode]; 116 | } 117 | 118 | double getTime() 119 | { 120 | struct timeval t; 121 | gettimeofday(&t, NULL); 122 | return (double)t.tv_sec + (double)t.tv_usec / 1e6; 123 | } 124 | 125 | bool g_running = false; 126 | 127 | template class GPU_Test { 128 | public: 129 | GPU_Test(int dev, bool doubles, bool tensors) : 130 | d_devNumber(dev), d_doubles(doubles), d_tensors(tensors) { 131 | checkError(cuDeviceGet(&d_dev, d_devNumber)); 132 | checkError(cuCtxCreate(&d_ctx, 0, d_dev)); 133 | 134 | bind(); 135 | 136 | //checkError(cublasInit()); 137 | checkError(cublasCreate(&d_cublas), "init"); 138 | 139 | if(d_tensors) 140 | checkError(cublasSetMathMode(d_cublas, CUBLAS_TENSOR_OP_MATH)); 141 | 142 | checkError(cuMemAllocHost((void**)&d_faultyElemsHost, sizeof(int))); 143 | d_error = 0; 144 | 145 | g_running = true; 146 | 147 | struct sigaction action; 148 | memset(&action, 0, sizeof(struct sigaction)); 149 | action.sa_handler = termHandler; 150 | sigaction(SIGTERM, &action, NULL); 151 | } 152 | ~GPU_Test() { 153 | bind(); 154 | checkError(cuMemFree(d_Cdata), "Free A"); 155 | checkError(cuMemFree(d_Adata), "Free B"); 156 | checkError(cuMemFree(d_Bdata), "Free C"); 157 | cuMemFreeHost(d_faultyElemsHost); 158 | printf("Freed memory for dev %d\n", d_devNumber); 159 | 160 | cublasDestroy(d_cublas); 161 | printf("Uninitted cublas\n"); 162 | } 163 | 164 | static void termHandler(int signum) 165 | { 166 | g_running = false; 167 | } 168 | 169 | unsigned long long int getErrors() { 170 | if (*d_faultyElemsHost) { 171 | d_error += (long long int)*d_faultyElemsHost; 172 | } 173 | unsigned long long int tempErrs = d_error; 174 | d_error = 0; 175 | return tempErrs; 176 | } 177 | 178 | size_t getIters() { 179 | return d_iters; 180 | } 181 | 182 | void bind() { 183 | checkError(cuCtxSetCurrent(d_ctx), "Bind CTX"); 184 | } 185 | 186 | size_t totalMemory() { 187 | bind(); 188 | size_t freeMem, totalMem; 189 | checkError(cuMemGetInfo(&freeMem, &totalMem)); 190 | return totalMem; 191 | } 192 | 193 | size_t availMemory() { 194 | bind(); 195 | size_t freeMem, totalMem; 196 | checkError(cuMemGetInfo(&freeMem, &totalMem)); 197 | return freeMem; 198 | } 199 | 200 | void initBuffers(T *A, T *B) { 201 | bind(); 202 | 203 | size_t useBytes = (size_t)((double)availMemory()*USEMEM); 204 | printf("Initialized device %d with %lu MB of memory (%lu MB available, using %lu MB of it), %s%s\n", 205 | d_devNumber, totalMemory()/1024ul/1024ul, availMemory()/1024ul/1024ul, useBytes/1024ul/1024ul, 206 | d_doubles ? "using DOUBLES" : "using FLOATS", d_tensors ? ", using Tensor Cores" : ""); 207 | size_t d_resultSize = sizeof(T)*SIZE*SIZE; 208 | d_iters = (useBytes - 2*d_resultSize)/d_resultSize; // We remove A and B sizes 209 | //printf("Results are %d bytes each, thus performing %d iterations\n", d_resultSize, d_iters); 210 | checkError(cuMemAlloc(&d_Cdata, d_iters*d_resultSize), "C alloc"); 211 | checkError(cuMemAlloc(&d_Adata, d_resultSize), "A alloc"); 212 | checkError(cuMemAlloc(&d_Bdata, d_resultSize), "B alloc"); 213 | 214 | checkError(cuMemAlloc(&d_faultyElemData, sizeof(int)), "faulty data"); 215 | 216 | // Populating matrices A and B 217 | checkError(cuMemcpyHtoD(d_Adata, A, d_resultSize), "A -> device"); 218 | checkError(cuMemcpyHtoD(d_Bdata, B, d_resultSize), "A -> device"); 219 | 220 | initCompareKernel(); 221 | } 222 | 223 | void compute() { 224 | bind(); 225 | static const float alpha = 1.0f; 226 | static const float beta = 0.0f; 227 | static const double alphaD = 1.0; 228 | static const double betaD = 0.0; 229 | 230 | for (size_t i = 0; i < d_iters; ++i) { 231 | if (d_doubles) 232 | checkError(cublasDgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N, 233 | SIZE, SIZE, SIZE, &alphaD, 234 | (const double*)d_Adata, SIZE, 235 | (const double*)d_Bdata, SIZE, 236 | &betaD, 237 | (double*)d_Cdata + i*SIZE*SIZE, SIZE), "DGEMM"); 238 | else 239 | checkError(cublasSgemm(d_cublas, CUBLAS_OP_N, CUBLAS_OP_N, 240 | SIZE, SIZE, SIZE, &alpha, 241 | (const float*)d_Adata, SIZE, 242 | (const float*)d_Bdata, SIZE, 243 | &beta, 244 | (float*)d_Cdata + i*SIZE*SIZE, SIZE), "SGEMM"); 245 | } 246 | } 247 | 248 | void initCompareKernel() { 249 | const char *kernelFile = "compare.ptx"; 250 | { 251 | std::ifstream f(kernelFile); 252 | checkError(f.good() ? CUDA_SUCCESS : CUDA_ERROR_NOT_FOUND, std::string("couldn't find file \"") + kernelFile + "\" from working directory"); 253 | } 254 | checkError(cuModuleLoad(&d_module, kernelFile), "load module"); 255 | checkError(cuModuleGetFunction(&d_function, d_module, 256 | d_doubles ? "compareD" : "compare"), "get func"); 257 | 258 | checkError(cuFuncSetCacheConfig(d_function, CU_FUNC_CACHE_PREFER_L1), "L1 config"); 259 | checkError(cuParamSetSize(d_function, __alignof(T*) + __alignof(int*) + __alignof(size_t)), "set param size"); 260 | checkError(cuParamSetv(d_function, 0, &d_Cdata, sizeof(T*)), "set param"); 261 | checkError(cuParamSetv(d_function, __alignof(T*), &d_faultyElemData, sizeof(T*)), "set param"); 262 | checkError(cuParamSetv(d_function, __alignof(T*) + __alignof(int*), &d_iters, sizeof(size_t)), "set param"); 263 | 264 | checkError(cuFuncSetBlockShape(d_function, g_blockSize, g_blockSize, 1), "set block size"); 265 | } 266 | 267 | void compare() { 268 | checkError(cuMemsetD32Async(d_faultyElemData, 0, 1, 0), "memset"); 269 | checkError(cuLaunchGridAsync(d_function, SIZE/g_blockSize, SIZE/g_blockSize, 0), "Launch grid"); 270 | checkError(cuMemcpyDtoHAsync(d_faultyElemsHost, d_faultyElemData, sizeof(int), 0), "Read faultyelemdata"); 271 | } 272 | 273 | bool shouldRun() 274 | { 275 | return g_running; 276 | } 277 | 278 | private: 279 | bool d_doubles; 280 | bool d_tensors; 281 | int d_devNumber; 282 | size_t d_iters; 283 | size_t d_resultSize; 284 | 285 | long long int d_error; 286 | 287 | static const int g_blockSize = 16; 288 | 289 | CUdevice d_dev; 290 | CUcontext d_ctx; 291 | CUmodule d_module; 292 | CUfunction d_function; 293 | 294 | CUdeviceptr d_Cdata; 295 | CUdeviceptr d_Adata; 296 | CUdeviceptr d_Bdata; 297 | CUdeviceptr d_faultyElemData; 298 | int *d_faultyElemsHost; 299 | 300 | cublasHandle_t d_cublas; 301 | }; 302 | 303 | // Returns the number of devices 304 | int initCuda() { 305 | checkError(cuInit(0)); 306 | int deviceCount = 0; 307 | checkError(cuDeviceGetCount(&deviceCount)); 308 | 309 | if (!deviceCount) 310 | throw std::string("No CUDA devices"); 311 | 312 | #ifdef USEDEV 313 | if (USEDEV >= deviceCount) 314 | throw std::string("Not enough devices for USEDEV"); 315 | #endif 316 | 317 | return deviceCount; 318 | } 319 | 320 | template void startBurn(int index, int writeFd, T *A, T *B, bool doubles, bool tensors) { 321 | GPU_Test *our; 322 | try { 323 | our = new GPU_Test(index, doubles, tensors); 324 | our->initBuffers(A, B); 325 | } catch (std::string e) { 326 | fprintf(stderr, "Couldn't init a GPU test: %s\n", e.c_str()); 327 | exit(124); 328 | } 329 | 330 | // The actual work 331 | try { 332 | int eventIndex = 0; 333 | const int maxEvents = 2; 334 | CUevent events[maxEvents]; 335 | for (int i = 0; i < maxEvents; ++i) 336 | cuEventCreate(events + i, 0); 337 | 338 | int nonWorkIters = maxEvents; 339 | 340 | while (our->shouldRun()) { 341 | our->compute(); 342 | our->compare(); 343 | checkError(cuEventRecord(events[eventIndex], 0), "Record event"); 344 | 345 | eventIndex = ++eventIndex % maxEvents; 346 | 347 | while (cuEventQuery(events[eventIndex]) != CUDA_SUCCESS) usleep(1000); 348 | 349 | if (--nonWorkIters > 0) continue; 350 | 351 | int ops = our->getIters(); 352 | write(writeFd, &ops, sizeof(int)); 353 | ops = our->getErrors(); 354 | write(writeFd, &ops, sizeof(int)); 355 | } 356 | 357 | for (int i = 0; i < maxEvents; ++i) 358 | cuEventSynchronize(events[i]); 359 | delete our; 360 | } catch (std::string e) { 361 | fprintf(stderr, "Failure during compute: %s\n", e.c_str()); 362 | int ops = -1; 363 | // Signalling that we failed 364 | write(writeFd, &ops, sizeof(int)); 365 | write(writeFd, &ops, sizeof(int)); 366 | exit(111); 367 | } 368 | } 369 | 370 | int pollTemp(pid_t *p) { 371 | int tempPipe[2]; 372 | pipe(tempPipe); 373 | 374 | pid_t myPid = fork(); 375 | 376 | if (!myPid) { 377 | close(tempPipe[0]); 378 | dup2(tempPipe[1], STDOUT_FILENO); // Stdout 379 | execlp("nvidia-smi", "nvidia-smi", "-l", "5", "-q", "-d", "TEMPERATURE", NULL); 380 | fprintf(stderr, "Could not invoke nvidia-smi, no temps available\n"); 381 | 382 | exit(0); 383 | } 384 | 385 | *p = myPid; 386 | close(tempPipe[1]); 387 | 388 | return tempPipe[0]; 389 | } 390 | 391 | void updateTemps(int handle, std::vector *temps) { 392 | const int readSize = 10240; 393 | static int gpuIter = 0; 394 | char data[readSize+1]; 395 | 396 | int curPos = 0; 397 | do { 398 | read(handle, data+curPos, sizeof(char)); 399 | } while (data[curPos++] != '\n'); 400 | 401 | data[curPos-1] = 0; 402 | 403 | int tempValue; 404 | // FIXME: The syntax of this print might change in the future.. 405 | if (sscanf(data, " GPU Current Temp : %d C", &tempValue) == 1) { 406 | //printf("read temp val %d\n", tempValue); 407 | temps->at(gpuIter) = tempValue; 408 | gpuIter = (gpuIter+1)%(temps->size()); 409 | } else if (!strcmp(data, " Gpu : N/A")) 410 | gpuIter = (gpuIter+1)%(temps->size()); // We rotate the iterator for N/A values as well 411 | } 412 | 413 | void listenClients(std::vector clientFd, std::vector clientPid, int runTime) { 414 | fd_set waitHandles; 415 | 416 | pid_t tempPid; 417 | int tempHandle = 0; 418 | int maxHandle = tempHandle; 419 | 420 | FD_ZERO(&waitHandles); 421 | FD_SET(tempHandle, &waitHandles); 422 | 423 | for (size_t i = 0; i < clientFd.size(); ++i) { 424 | if (clientFd.at(i) > maxHandle) 425 | maxHandle = clientFd.at(i); 426 | FD_SET(clientFd.at(i), &waitHandles); 427 | } 428 | 429 | std::vector clientTemp; 430 | std::vector clientErrors; 431 | std::vector clientCalcs; 432 | std::vector clientUpdateTime; 433 | std::vector clientGflops; 434 | std::vector clientFaulty; 435 | 436 | time_t startTime = time(0); 437 | 438 | for (size_t i = 0; i < clientFd.size(); ++i) { 439 | clientTemp.push_back(0); 440 | clientErrors.push_back(0); 441 | clientCalcs.push_back(0); 442 | struct timespec thisTime; 443 | clock_gettime(CLOCK_REALTIME, &thisTime); 444 | clientUpdateTime.push_back(thisTime); 445 | clientGflops.push_back(0.0f); 446 | clientFaulty.push_back(false); 447 | } 448 | 449 | int changeCount; 450 | float nextReport = 10.0f; 451 | bool childReport = false; 452 | while ((changeCount = select(maxHandle+1, &waitHandles, NULL, NULL, NULL))) { 453 | size_t thisTime = time(0); 454 | struct timespec thisTimeSpec; 455 | clock_gettime(CLOCK_REALTIME, &thisTimeSpec); 456 | 457 | //printf("got new data! %d\n", changeCount); 458 | // Going through all descriptors 459 | for (size_t i = 0; i < clientFd.size(); ++i) 460 | if (FD_ISSET(clientFd.at(i), &waitHandles)) { 461 | // First, reading processed 462 | int processed, errors; 463 | read(clientFd.at(i), &processed, sizeof(int)); 464 | // Then errors 465 | read(clientFd.at(i), &errors, sizeof(int)); 466 | 467 | clientErrors.at(i) += errors; 468 | if (processed == -1) 469 | clientCalcs.at(i) = -1; 470 | else 471 | { 472 | double flops = (double)processed * (double)OPS_PER_MUL; 473 | struct timespec clientPrevTime = clientUpdateTime.at(i); 474 | double clientTimeDelta = (double)thisTimeSpec.tv_sec + (double)thisTimeSpec.tv_nsec / 1000000000.0 - ((double)clientPrevTime.tv_sec + (double)clientPrevTime.tv_nsec / 1000000000.0); 475 | clientUpdateTime.at(i) = thisTimeSpec; 476 | 477 | clientGflops.at(i) = (double)((unsigned long long int)processed * OPS_PER_MUL) / clientTimeDelta / 1000.0 / 1000.0 / 1000.0; 478 | clientCalcs.at(i) += processed; 479 | } 480 | 481 | childReport = true; 482 | } 483 | 484 | if (FD_ISSET(tempHandle, &waitHandles)) 485 | updateTemps(tempHandle, &clientTemp); 486 | 487 | // Resetting the listeners 488 | FD_ZERO(&waitHandles); 489 | FD_SET(tempHandle, &waitHandles); 490 | for (size_t i = 0; i < clientFd.size(); ++i) 491 | FD_SET(clientFd.at(i), &waitHandles); 492 | 493 | // Printing progress (if a child has initted already) 494 | if (childReport) { 495 | float elapsed = fminf((float)(thisTime-startTime)/(float)runTime*100.0f, 100.0f); 496 | printf("\r%.1f%% ", elapsed); 497 | printf("proc'd: "); 498 | for (size_t i = 0; i < clientCalcs.size(); ++i) { 499 | printf("%d (%.0f Gflop/s) ", clientCalcs.at(i), clientGflops.at(i)); 500 | if (i != clientCalcs.size() - 1) 501 | printf("- "); 502 | } 503 | printf(" errors: "); 504 | for (size_t i = 0; i < clientErrors.size(); ++i) { 505 | std::string note = "%d "; 506 | if (clientCalcs.at(i) == -1) 507 | note += " (DIED!)"; 508 | else if (clientErrors.at(i)) 509 | note += " (WARNING!)"; 510 | 511 | printf(note.c_str(), clientErrors.at(i)); 512 | if (i != clientCalcs.size() - 1) 513 | printf("- "); 514 | } 515 | printf(" temps: "); 516 | for (size_t i = 0; i < clientTemp.size(); ++i) { 517 | printf(clientTemp.at(i) != 0 ? "%d C " : "-- ", clientTemp.at(i)); 518 | if (i != clientCalcs.size() - 1) 519 | printf("- "); 520 | } 521 | 522 | fflush(stdout); 523 | 524 | if (nextReport < elapsed) { 525 | nextReport = elapsed + 10.0f; 526 | printf("\n\tSummary at: "); 527 | fflush(stdout); 528 | system("date"); // Printing a date 529 | fflush(stdout); 530 | printf("\n"); 531 | //printf("\t(checkpoint)\n"); 532 | for (size_t i = 0; i < clientErrors.size(); ++i) { 533 | if (clientErrors.at(i)) 534 | clientFaulty.at(i) = true; 535 | clientErrors.at(i) = 0; 536 | } 537 | } 538 | } 539 | 540 | // Checking whether all clients are dead 541 | bool oneAlive = false; 542 | for (size_t i = 0; i < clientCalcs.size(); ++i) 543 | if (clientCalcs.at(i) != -1) 544 | oneAlive = true; 545 | if (!oneAlive) { 546 | fprintf(stderr, "\n\nNo clients are alive! Aborting\n"); 547 | exit(123); 548 | } 549 | 550 | if (startTime + runTime < thisTime) 551 | break; 552 | } 553 | 554 | printf("\nKilling processes.. "); 555 | fflush(stdout); 556 | for (size_t i = 0; i < clientPid.size(); ++i) 557 | kill(clientPid.at(i), 15); 558 | 559 | kill(tempPid, 15); 560 | close(tempHandle); 561 | 562 | while (wait(NULL) != -1); 563 | printf("done\n"); 564 | 565 | printf("\nTested %d GPUs:\n", (int)clientPid.size()); 566 | for (size_t i = 0; i < clientPid.size(); ++i) 567 | printf("\tGPU %d: %s\n", (int)i, clientFaulty.at(i) ? "FAULTY" : "OK"); 568 | } 569 | 570 | template void launch(int runLength, bool useDoubles, bool useTensorCores) { 571 | system("nvidia-smi -L"); 572 | 573 | // Initting A and B with random data 574 | T *A = (T*) malloc(sizeof(T)*SIZE*SIZE); 575 | T *B = (T*) malloc(sizeof(T)*SIZE*SIZE); 576 | srand(10); 577 | for (size_t i = 0; i < SIZE*SIZE; ++i) { 578 | A[i] = (T)((double)(rand()%1000000)/100000.0); 579 | B[i] = (T)((double)(rand()%1000000)/100000.0); 580 | } 581 | 582 | // Forking a process.. This one checks the number of devices to use, 583 | // returns the value, and continues to use the first one. 584 | int mainPipe[2]; 585 | pipe(mainPipe); 586 | int readMain = mainPipe[0]; 587 | std::vector clientPipes; 588 | std::vector clientPids; 589 | clientPipes.push_back(readMain); 590 | 591 | pid_t myPid = fork(); 592 | if (!myPid) { 593 | // Child 594 | close(mainPipe[0]); 595 | int writeFd = mainPipe[1]; 596 | int devCount = initCuda(); 597 | write(writeFd, &devCount, sizeof(int)); 598 | 599 | startBurn(0, writeFd, A, B, useDoubles, useTensorCores); 600 | 601 | close(writeFd); 602 | return; 603 | } else { 604 | clientPids.push_back(myPid); 605 | 606 | close(mainPipe[1]); 607 | int devCount; 608 | read(readMain, &devCount, sizeof(int)); 609 | 610 | if (!devCount) { 611 | fprintf(stderr, "No CUDA devices\n"); 612 | exit(EXIT_FAILURE); 613 | } else { 614 | 615 | for (int i = 1; i < devCount; ++i) { 616 | int slavePipe[2]; 617 | pipe(slavePipe); 618 | clientPipes.push_back(slavePipe[0]); 619 | 620 | pid_t slavePid = fork(); 621 | 622 | if (!slavePid) { 623 | // Child 624 | close(slavePipe[0]); 625 | initCuda(); 626 | startBurn(i, slavePipe[1], A, B, useDoubles, useTensorCores); 627 | 628 | close(slavePipe[1]); 629 | return; 630 | } else { 631 | clientPids.push_back(slavePid); 632 | close(slavePipe[1]); 633 | } 634 | } 635 | 636 | listenClients(clientPipes, clientPids, runLength); 637 | } 638 | } 639 | 640 | for (size_t i = 0; i < clientPipes.size(); ++i) 641 | close(clientPipes.at(i)); 642 | 643 | free(A); 644 | free(B); 645 | } 646 | 647 | int main(int argc, char **argv) { 648 | int runLength = 10; 649 | bool useDoubles = false; 650 | bool useTensorCores = false; 651 | int thisParam = 0; 652 | 653 | std::vector args(argv, argv + argc); 654 | for (size_t i = 1; i < args.size(); ++i) 655 | { 656 | if (argc >= 2 && std::string(argv[i]).find("-d") != std::string::npos) 657 | { 658 | useDoubles = true; 659 | thisParam++; 660 | } 661 | if (argc >= 2 && std::string(argv[i]).find("-tc") != std::string::npos) 662 | { 663 | useTensorCores = true; 664 | thisParam++; 665 | } 666 | } 667 | 668 | if (argc-thisParam < 2) 669 | printf("Run length not specified in the command line. Burning for 10 secs\n"); 670 | else 671 | runLength = atoi(argv[1+thisParam]); 672 | 673 | if (useDoubles) 674 | launch(runLength, useDoubles, useTensorCores); 675 | else 676 | launch(runLength, useDoubles, useTensorCores); 677 | 678 | return 0; 679 | } 680 | --------------------------------------------------------------------------------