├── cuda-cpp
│   ├── file
│   ├── version
│   ├── example1
│   ├── file.cu
│   ├── version.cu
│   ├── example1.cu
│   └── Untitled.ipynb
├── cuda-c
│   └── src
│       ├── 05-julia-set
│       │   ├── julia_set
│       │   ├── README.md
│       │   └── julia_set.cu
│       ├── 01-hello_world
│       │   ├── hello_world
│       │   ├── hello_world_gpu
│       │   ├── README.md
│       │   ├── hello_world_gpu.cu
│       │   ├── hello_world.cu
│       │   └── hello-world-from-gpu.ipynb
│       ├── 03-device-query
│       │   ├── device_query
│       │   └── device_query.cu
│       ├── cuda-programming-model
│       │   ├── sum
│       │   ├── sumgpu
│       │   ├── addvector
│       │   ├── checkdims
│       │   ├── sumArraysOnHost.c
│       │   ├── checkDimensions.cu
│       │   ├── sumArraysOnDevice.cu
│       │   ├── 02-organizing-threads.ipynb
│       │   ├── sumArraysOnGPU.cu
│       │   ├── 03-compiling-and-executing.ipynb
│       │   ├── 01-memory-management.ipynb
│       │   └── 04-timing-kernel.ipynb
│       ├── 02-passing-params
│       │   ├── passing_params
│       │   ├── passing_params.cu
│       │   └── README.md
│       ├── 04-gpu-vector-sums
│       │   ├── gpu_vector_sums
│       │   └── gpu_vector_sums.cu
│       ├── 06-gpu-vector-sums-redux
│       │   ├── gpu_vector_sums_redux
│       │   └── gpu_vector_sums_redux.cu
│       └── utils
│           ├── gl_helper.h
│           ├── cpu_bitmap.h
│           ├── cpu_anim.h
│           ├── gpu_anim.h
│           ├── common.h
│           └── GL
│               └── glut.h
├── README.md
├── pycuda
│   ├── 02-hello_world.py
│   ├── hello_world.py
│   └── notebooks
│       ├── 02-hello_world.ipynb
│       └── 01-hello-world.ipynb
├── caldera-cheyenne.md
├── notes
│   └── parallel-communication-patterns.md
├── LICENSE
├── .gitignore
└── getting_started_on_colab.ipynb

--------------------------------------------------------------------------------
/cuda-cpp/file:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-cpp/file

--------------------------------------------------------------------------------
/cuda-cpp/version:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-cpp/version

--------------------------------------------------------------------------------
/cuda-cpp/example1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-cpp/example1

--------------------------------------------------------------------------------
/cuda-c/src/05-julia-set/julia_set:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/05-julia-set/julia_set

--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/01-hello_world/hello_world

--------------------------------------------------------------------------------
/cuda-c/src/03-device-query/device_query:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/03-device-query/device_query

--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sum:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/sum

--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world_gpu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/01-hello_world/hello_world_gpu

--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sumgpu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/sumgpu

--------------------------------------------------------------------------------
/cuda-c/src/02-passing-params/passing_params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/02-passing-params/passing_params

--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/addvector:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/addvector

--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/checkdims:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/checkdims

--------------------------------------------------------------------------------
/cuda-c/src/04-gpu-vector-sums/gpu_vector_sums:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/04-gpu-vector-sums/gpu_vector_sums

--------------------------------------------------------------------------------
/cuda-c/src/06-gpu-vector-sums-redux/gpu_vector_sums_redux:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/06-gpu-vector-sums-redux/gpu_vector_sums_redux

--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/README.md:
--------------------------------------------------------------------------------
# Hello, World

To compile the program, run the following:

`$ nvcc -o hello_world hello_world.cu`

To run the compiled program:

`$ ./hello_world`

--------------------------------------------------------------------------------
/cuda-cpp/file.cu:
--------------------------------------------------------------------------------
#include <thrust/version.h>
#include <iostream>

int main(void)
{
    int major = THRUST_MAJOR_VERSION;
    int minor = THRUST_MINOR_VERSION;

    std::cout << "Thrust v" << major << "." << minor << std::endl;

    return 0;
}

--------------------------------------------------------------------------------
/cuda-cpp/version.cu:
--------------------------------------------------------------------------------
#include <thrust/version.h>
#include <iostream>

int main(void)
{
    int major = THRUST_MAJOR_VERSION;
    int minor = THRUST_MINOR_VERSION;

    std::cout << "Thrust v" << major << "." << minor << std::endl;

    return 0;
}
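Once the version check compiles, a natural next step is an actual device-side computation. The following is a minimal sketch (not part of this repository) that sums the integers 1..100 on the GPU with `thrust::reduce`; it assumes only the Thrust headers that ship with the CUDA toolkit, and the file name `reduce.cu` is hypothetical:

```cuda
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
#include <thrust/reduce.h>
#include <iostream>

int main(void)
{
    // fill a device vector with 1, 2, ..., 100 directly in GPU memory
    thrust::device_vector<int> d_vec(100);
    thrust::sequence(d_vec.begin(), d_vec.end(), 1);

    // reduce on the device; the result comes back to the host as a plain int
    int sum = thrust::reduce(d_vec.begin(), d_vec.end(), 0);

    std::cout << "sum = " << sum << std::endl;   // prints: sum = 5050

    return 0;
}
```

It compiles the same way as the other Thrust examples here: `nvcc reduce.cu -o reduce && ./reduce`.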
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# CUDA Programming

This project contains the source code and notes from my journey of learning Compute Unified Device Architecture (CUDA) programming.

There is a **Python (PyCUDA)** version and a **CUDA C** version. The files for each version live in independent folders.

--------------------------------------------------------------------------------
/pycuda/02-hello_world.py:
--------------------------------------------------------------------------------
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
    #include <stdio.h>
    __global__ void say_hi(){
        printf("I am %d.%d\\n", threadIdx.x, threadIdx.y);
    }""")

func = mod.get_function("say_hi")
func(block=(4, 2, 1))

--------------------------------------------------------------------------------
/pycuda/hello_world.py:
--------------------------------------------------------------------------------
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule

mod = SourceModule("""
    #include <stdio.h>

    __global__ void kernel()
    {
        printf("Hello, World!\\n");
    }""")

func = mod.get_function("kernel")

func(block=(4, 1, 1))

--------------------------------------------------------------------------------
/caldera-cheyenne.md:
--------------------------------------------------------------------------------
## Useful commands

- List your current jobs:

  `$ squeue -u $USER`

- Examine a job in detail:

  `$ scontrol show job <jobid>`

- Kill a job:

  `$ scancel <jobid>`

## Scripts to start interactive jobs on Caldera/Cheyenne

`$ execca -a <project>` -> Run on a Caldera node

--------------------------------------------------------------------------------
/cuda-c/src/05-julia-set/README.md:
--------------------------------------------------------------------------------
# Julia Set

## To compile the application:

In some cases, we need to add `-lglut -lGLU -lGL` to the link line:

`$ nvcc -o julia_set julia_set.cu -lglut -lGLU -lGL`

## To profile and run the code:

`$ nvprof --unified-memory-profiling off ./julia_set`

![](https://i.imgur.com/kocLDtn.gif)

--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world_gpu.cu:
--------------------------------------------------------------------------------
#include <stdio.h>

__global__ void helloFromGPU(void)
{
    if (threadIdx.x == 5)
        printf(".............Hello World from GPU thread %d!.............\n", threadIdx.x);
}

int main(void){
    // hello from cpu
    printf("<------------Hello World from CPU!-------------->\n");

    helloFromGPU <<<1, 10>>>();

    cudaDeviceSynchronize();
    return 0;
}

--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world.cu:
--------------------------------------------------------------------------------
#include <stdio.h>

/* __global__ alerts the compiler that a function should be compiled to run
   on a device instead of the host.
*/
__global__ void kernel(void) {

}

int main(void){

    kernel<<<1,1>>>();
    /* The angle brackets denote arguments we plan to pass to the runtime system.
       These are not arguments to the device code. */
    printf("Hello, World!\n");
    return 0;

}
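The launch configuration inside the angle brackets is where the parallelism lives: `<<<gridDim, blockDim>>>` asks the runtime for that many blocks of that many threads. A small sketch (hypothetical `whoAmI` kernel, not in this repo) that makes the two numbers visible:

```cuda
#include <stdio.h>

// Each thread derives a globally unique id from the built-in variables.
__global__ void whoAmI(void)
{
    int globalId = blockIdx.x * blockDim.x + threadIdx.x;
    printf("block %d, thread %d -> global id %d\n",
           blockIdx.x, threadIdx.x, globalId);
}

int main(void)
{
    whoAmI<<<2, 4>>>();        // 2 blocks per grid, 4 threads per block
    cudaDeviceSynchronize();   // wait for the device-side printf to flush
    return 0;
}
```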
--------------------------------------------------------------------------------
/notes/parallel-communication-patterns.md:
--------------------------------------------------------------------------------
# Parallel Communication Patterns

- Parallel computing is all about many threads solving a problem by working together.
- Working together requires communication.
- In CUDA, this communication takes place through memory.

## Map and Gather

- With map, you have many data elements (e.g., elements of an array, entries in a matrix, or pixels in an image).
- You apply the same function, or computational task, to each piece of data.
- There is a one-to-one correspondence between input and output, so map runs very efficiently on GPUs (see the sketch below).
- With gather, by contrast, each output element reads from one or more arbitrary input locations.
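To make the map pattern concrete, here is a minimal sketch (hypothetical `square` kernel, not part of the repo): one thread per element, the same operation everywhere, output *i* depending only on input *i*:

```cuda
#include <stdio.h>

// Map: thread i reads in[i] and writes out[i] -- a 1:1 correspondence.
__global__ void square(const float *in, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)                      // guard against extra threads in the last block
        out[i] = in[i] * in[i];
}

int main(void)
{
    const int n = 8;
    float h_in[n] = {0, 1, 2, 3, 4, 5, 6, 7}, h_out[n];
    float *d_in, *d_out;

    cudaMalloc((void**)&d_in,  n * sizeof(float));
    cudaMalloc((void**)&d_out, n * sizeof(float));
    cudaMemcpy(d_in, h_in, n * sizeof(float), cudaMemcpyHostToDevice);

    square<<<1, n>>>(d_in, d_out, n);

    cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
    for (int i = 0; i < n; i++)
        printf("%g^2 = %g\n", h_in[i], h_out[i]);

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}
```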
--------------------------------------------------------------------------------
/cuda-c/src/02-passing-params/passing_params.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "../utils/common.h"

__global__ void add(int a, int b, int *c) {
    *c = a + b;
}

int main(){
    int c;
    int *device_c;

    HANDLE_ERROR(cudaMalloc((void**)&device_c, sizeof(int)));

    add<<<1,1>>>(2, 7, device_c);

    HANDLE_ERROR(cudaMemcpy(&c,
                            device_c,
                            sizeof(int),
                            cudaMemcpyDeviceToHost));

    printf(" 2 + 7 = %d\n", c);
    cudaFree(device_c);

    return 0;
}

--------------------------------------------------------------------------------
/cuda-cpp/example1.cu:
--------------------------------------------------------------------------------
#include <thrust/host_vector.h>
#include <thrust/device_vector.h>
#include <thrust/generate.h>
#include <thrust/sort.h>
#include <thrust/copy.h>
#include <algorithm>
#include <cstdlib>

int main(void)
{
    // generate 32M random numbers serially
    thrust::host_vector<int> h_vec(32 << 20);
    std::generate(h_vec.begin(), h_vec.end(), rand);

    // transfer data to the device
    thrust::device_vector<int> d_vec = h_vec;

    // sort data on the device (846M keys per second on GeForce GTX 480)
    thrust::sort(d_vec.begin(), d_vec.end());

    // transfer data back to host
    thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());

    return 0;
}

--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sumArraysOnHost.c:
--------------------------------------------------------------------------------
#include <stdlib.h>
#include <string.h>
#include <time.h>

void sumArraysOnHost(float *A, float *B, float *C, const int N){
    for (int idx=0; idx<N; idx++)
        C[idx] = A[idx] + B[idx];
}

--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/checkDimensions.cu:
--------------------------------------------------------------------------------
#include <cuda_runtime.h>
#include <stdio.h>

__global__
void checkIndex(void){
    printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) "
           "gridDim:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z, blockIdx.x,
           blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z,
           gridDim.x, gridDim.y, gridDim.z);
}

int main(int argc, char **argv) {
    // define total data elements
    int nElem = 6;
    // define grid and block structure
    dim3 block (3);
    dim3 grid ((nElem+block.x-1)/block.x);
    // check grid and block dimensions from the host side
    printf("grid.x %d grid.y %d grid.z %d\n", grid.x, grid.y, grid.z);
    printf("block.x %d block.y %d block.z %d\n", block.x, block.y, block.z);
    // check grid and block dimensions from the device side
    checkIndex <<<grid, block>>> ();
    // reset device before you leave
    cudaDeviceReset();
    return(0);
}

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2018 Anderson Banihirwe

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/cuda-c/src/02-passing-params/README.md:
--------------------------------------------------------------------------------
# Passing Parameters

To compile the program, run the following:

`$ nvcc -o passing_params passing_params.cu`

To run the compiled program:

`$ ./passing_params`

To run the profiler:

`$ nvprof --unified-memory-profiling off ./passing_params`

[![asciicast](https://asciinema.org/a/mIFzam2aaqraUV6NxtWH7zpPc.png)](https://asciinema.org/a/mIFzam2aaqraUV6NxtWH7zpPc)

# Summary

- We can pass parameters to a kernel as we would with any C function.
- We need to allocate memory to do anything useful on a device, such as returning values to the host.

Restrictions on the usage of device pointers (illustrated in the sketch after this list):

- We **can** pass pointers allocated with `cudaMalloc()` to functions that execute on the device.
- We **can** use pointers allocated with `cudaMalloc()` to read or write memory from code that executes on the device.
- We **can** pass pointers allocated with `cudaMalloc()` to functions that execute on the host.
- We **cannot** use pointers allocated with `cudaMalloc()` to read or write memory from code that executes on the host.
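A minimal sketch (not in the repo) of the last two rules: the host may hold and pass a device pointer, but must never dereference it; host-side reads go through `cudaMemcpy`:

```cuda
#include <stdio.h>

int main(void)
{
    int host_value = 0;
    int *device_ptr;

    cudaMalloc((void**)&device_ptr, sizeof(int));

    // OK: the host may pass a device pointer to host-callable API functions.
    cudaMemset(device_ptr, 0, sizeof(int));

    // NOT OK: dereferencing a device pointer on the host.
    // *device_ptr = 42;   // undefined behavior -- typically a crash

    // The host reads device memory only through an explicit copy:
    cudaMemcpy(&host_value, device_ptr, sizeof(int), cudaMemcpyDeviceToHost);
    printf("value on device = %d\n", host_value);

    cudaFree(device_ptr);
    return 0;
}
```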
--------------------------------------------------------------------------------
/cuda-c/src/04-gpu-vector-sums/gpu_vector_sums.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "../utils/common.h"

#define N 100

__global__ void add(int *a, int *b, int *c){
    int tid = blockIdx.x;   // handle the data at this index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main(void){
    int a[N], b[N], c[N];
    int *device_a, *device_b, *device_c;

    // Allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&device_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&device_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&device_c, N * sizeof(int)));

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++){
        a[i] = -i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(device_a, a, N * sizeof(int), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(device_b, b, N * sizeof(int), cudaMemcpyHostToDevice));

    // launch N blocks of one thread each
    add<<<N,1>>>(device_a, device_b, device_c);

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(c, device_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost));

    // display the results
    for(int i=0; i<N; i++){
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // free the memory allocated on the GPU
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    return 0;
}

--------------------------------------------------------------------------------
/cuda-c/src/06-gpu-vector-sums-redux/gpu_vector_sums_redux.cu:
--------------------------------------------------------------------------------
#include <stdio.h>
#include "../utils/common.h"

#define N 100

__global__ void add(int *a, int *b, int *c){
    int tid = threadIdx.x;   // handle the data at this index
    if (tid < N)
        c[tid] = a[tid] + b[tid];
}

int main(void){
    int a[N], b[N], c[N];
    int *device_a, *device_b, *device_c;

    // Allocate the memory on the GPU
    HANDLE_ERROR(cudaMalloc((void**)&device_a, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&device_b, N * sizeof(int)));
    HANDLE_ERROR(cudaMalloc((void**)&device_c, N * sizeof(int)));

    // fill the arrays 'a' and 'b' on the CPU
    for (int i=0; i<N; i++){
        a[i] = i;
        b[i] = i * i;
    }

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR(cudaMemcpy(device_a, a, N * sizeof(int), cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(device_b, b, N * sizeof(int), cudaMemcpyHostToDevice));

    // the redux: launch one block of N threads instead of N blocks
    add<<<1,N>>>(device_a, device_b, device_c);

    // copy the array 'c' back from the GPU to the CPU
    HANDLE_ERROR(cudaMemcpy(c, device_c, N * sizeof(int),
                            cudaMemcpyDeviceToHost));

    // display the results
    for(int i=0; i<N; i++){
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // free the memory allocated on the GPU
    cudaFree(device_a);
    cudaFree(device_b);
    cudaFree(device_c);

    return 0;
}
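The two versions differ only in which built-in index they use: `04` launches N blocks of one thread (`blockIdx.x`), while the redux launches one block of N threads (`threadIdx.x`). Since both blocks per grid and threads per block are limited, vectors longer than either limit need both; a sketch of the combined form (the same pattern used later in `sumArraysOnGPU.cu`):

```cuda
__global__ void add(int *a, int *b, int *c, int n)
{
    // combine block and thread indices into one global index
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < n)
        c[tid] = a[tid] + b[tid];
}

// launch enough 128-thread blocks to cover all n elements:
//     add<<<(n + 127) / 128, 128>>>(device_a, device_b, device_c, n);
```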
--------------------------------------------------------------------------------
/cuda-c/src/utils/gl_helper.h:
--------------------------------------------------------------------------------
/*
 * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 *
 * NVIDIA Corporation and its licensors retain all intellectual property and
 * proprietary rights in and to this software and related documentation.
 * Any use, reproduction, disclosure, or distribution of this software
 * and related documentation without an express license agreement from
 * NVIDIA Corporation is strictly prohibited.
 *
 * Please refer to the applicable NVIDIA end user license agreement (EULA)
 * associated with this source code for terms and conditions that govern
 * your use of this NVIDIA software.
 *
 */


#ifndef __GL_HELPER_H__
#define __GL_HELPER_H__

#ifndef _WIN32

#include <GL/glut.h>
#include <GL/glx.h>
#include <GL/gl.h>

#define GET_PROC_ADDRESS( str ) glXGetProcAddress( (const GLubyte *)str )

#endif //_WIN32

#endif //__GL_HELPER_H__

--------------------------------------------------------------------------------
/pycuda/notebooks/02-hello_world.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting ../02-hello_world.py\n"
     ]
    }
   ],
   "source": [
    "%%writefile ../02-hello_world.py\n",
    "\n",
    "\n",
    "import pycuda.driver as cuda\n",
    "import pycuda.autoinit\n",
    "from pycuda.compiler import SourceModule\n",
    "\n",
    "mod = SourceModule(\"\"\"\n",
    "    #include <stdio.h>\n",
    "    __global__ void say_hi(){\n",
    "        printf(\"I am %d.%d\\\\n\", threadIdx.x, threadIdx.y);\n",
    "    }\"\"\")\n",
    "\n",
    "func = mod.get_function(\"say_hi\")\n",
    "func(block=(4, 2, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I am 0.0\r\n",
      "I am 1.0\r\n",
      "I am 2.0\r\n",
      "I am 3.0\r\n",
      "I am 0.1\r\n",
      "I am 1.1\r\n",
      "I am 2.1\r\n",
      "I am 3.1\r\n"
     ]
    }
   ],
   "source": [
    "!python ../02-hello_world.py"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}

--------------------------------------------------------------------------------
/cuda-c/src/03-device-query/device_query.cu:
--------------------------------------------------------------------------------
#include "../utils/common.h"

int main(void){
    cudaDeviceProp prop;

    int count;

    HANDLE_ERROR(cudaGetDeviceCount(&count));

    for (int i=0; i < count; i++) {
        HANDLE_ERROR(cudaGetDeviceProperties(&prop, i));

        printf(" ----- General Information for device %d -----\n", i);
        printf("Name: %s\n", prop.name);
        printf("Compute capability: %d.%d\n", prop.major, prop.minor);
        printf("Clock rate: %d\n", prop.clockRate);
        printf("Device copy overlap: ");
        if(prop.deviceOverlap)
            printf("Enabled\n");
        else
            printf("Disabled\n");

        printf("Kernel execution timeout : ");
        if(prop.kernelExecTimeoutEnabled)
            printf("Enabled\n");
        else
            printf("Disabled\n");

        printf(" ----- Memory Information for device %d -----\n", i);
        printf("Total global Mem: %ld\n", prop.totalGlobalMem);
        printf("Total constant Mem: %ld\n", prop.totalConstMem);
        printf("Max Mem pitch: %ld\n", prop.memPitch);
        printf("Texture Alignment: %ld\n", prop.textureAlignment);

        printf(" ----- MP Information for device %d -----\n", i);
        printf("Multiprocessor count: %d\n", prop.multiProcessorCount);
        printf("Shared mem per mp: %ld\n", prop.sharedMemPerBlock);
        printf("Registers per mp: %d\n", prop.regsPerBlock);
        printf("Threads in warp: %d\n", prop.warpSize);
        printf("Max Threads per block: %d\n", prop.maxThreadsPerBlock);
        printf("Max thread dimensions: (%d, %d, %d)\n", prop.maxThreadsDim[0],
               prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
        printf("Max grid dimensions: (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1],
               prop.maxGridSize[2]);

        printf("\n");
    }

    return 0;
}

--------------------------------------------------------------------------------
/cuda-cpp/Untitled.ipynb:
--------------------------------------------------------------------------------
{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Overwriting file.cu\n"
     ]
    }
   ],
   "source": [
    "%%file file.cu\n",
    "#include <thrust/version.h>\n",
    "#include <iostream>\n",
    "\n",
    "int main(void)\n",
    "{\n",
    "    int major = THRUST_MAJOR_VERSION;\n",
    "    int minor = THRUST_MINOR_VERSION;\n",
    "\n",
    "    std::cout << \"Thrust v\" << major << \".\" << minor << std::endl;\n",
    "\n",
    "    return 0;\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
"Thrust v1.9\n" 42 | ] 43 | } 44 | ], 45 | "source": [ 46 | "%%bash\n", 47 | "nvcc file.cu -o file\n", 48 | "./file" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "Python 3", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | "codemirror_mode": { 67 | "name": "ipython", 68 | "version": 3 69 | }, 70 | "file_extension": ".py", 71 | "mimetype": "text/x-python", 72 | "name": "python", 73 | "nbconvert_exporter": "python", 74 | "pygments_lexer": "ipython3", 75 | "version": "3.6.5" 76 | }, 77 | "toc": { 78 | "base_numbering": 1, 79 | "nav_menu": {}, 80 | "number_sections": true, 81 | "sideBar": true, 82 | "skip_h1_title": false, 83 | "title_cell": "Table of Contents", 84 | "title_sidebar": "Contents", 85 | "toc_cell": false, 86 | "toc_position": {}, 87 | "toc_section_display": true, 88 | "toc_window_display": false 89 | } 90 | }, 91 | "nbformat": 4, 92 | "nbformat_minor": 2 93 | } 94 | -------------------------------------------------------------------------------- /pycuda/notebooks/01-hello-world.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## Import and initialize PyCUDA" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Overwriting ../hello_world.py\n" 20 | ] 21 | } 22 | ], 23 | "source": [ 24 | "%%writefile ../hello_world.py\n", 25 | "\n", 26 | "import pycuda.driver as cuda\n", 27 | "import pycuda.autoinit\n", 28 | "from pycuda.compiler import SourceModule\n", 29 | "\n", 30 | "mod = SourceModule(\"\"\"\n", 31 | " #include \n", 32 | "\n", 33 | " __global__ void kernel()\n", 34 | " {\n", 35 | " printf(\"Hello, World!\\\\n\");\n", 36 | " }\"\"\")\n", 37 | "\n", 38 | "func = mod.get_function(\"kernel\")\n", 39 | "\n", 40 | "\n", 41 | "func(block=(4, 1, 1))\n" 42 | ] 43 | }, 44 | { 45 | "cell_type": "markdown", 46 | "metadata": {}, 47 | "source": [ 48 | "Here,\n", 49 | "`pycuda.autoinit`\n", 50 | "serves for automatic initialization, context creation, and cleanup. The\n", 51 | "`SourceModule`\n", 52 | "is where a (usually short) C-like code for the GPU is to be written." 
53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 2, 58 | "metadata": {}, 59 | "outputs": [ 60 | { 61 | "name": "stdout", 62 | "output_type": "stream", 63 | "text": [ 64 | "Hello, World!\r\n", 65 | "Hello, World!\r\n", 66 | "Hello, World!\r\n", 67 | "Hello, World!\r\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "!python ../hello_world.py" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.5" 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 2 97 | } 98 | -------------------------------------------------------------------------------- /cuda-c/src/cuda-programming-model/sumArraysOnDevice.cu: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | __global__ void sumArraysOnDevice(float *A, float *B, float *C){ 8 | int idx = threadIdx.x; 9 | C[idx] = A[idx] + B[idx]; 10 | 11 | } 12 | 13 | 14 | void initialData(float *ip, int size){ 15 | // generate different seed for random number 16 | time_t t; 17 | srand((unsigned int) time (&t)); 18 | 19 | for (int i=0; i epsilon){ 38 | match = 0; 39 | printf("Arrays do not match!\n"); 40 | printf("host %5.2f gpu %5.2f at current %d\n", 41 | h_C[i], result[i], i); 42 | break; 43 | } 44 | } 45 | if (match) printf("Arrays match. \n\n"); 46 | } 47 | 48 | 49 | int main(int argc, char **argv){ 50 | int nElem = 1024; 51 | size_t nBytes = nElem * sizeof(float); 52 | 53 | float *h_A, *h_B, *h_C, *result; 54 | h_A = (float *)malloc(nBytes); 55 | h_B = (float *)malloc(nBytes); 56 | h_C = (float *)malloc(nBytes); 57 | result = (float *)malloc(nBytes); 58 | 59 | initialData(h_A, nElem); 60 | initialData(h_B, nElem); 61 | 62 | float *d_A, *d_B, *d_C; 63 | cudaMalloc((float**)&d_A, nBytes); 64 | cudaMalloc((float**)&d_B, nBytes); 65 | cudaMalloc((float**)&d_C, nBytes); 66 | 67 | // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the 68 | // parameter cudaMemcpyHostToDevice specifying the transfer direction. 69 | 70 | cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice); 71 | cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice); 72 | 73 | 74 | 75 | sumArraysOnDevice<<<1, nElem>>>(d_A, d_B, d_C); 76 | sumArraysOnHost(h_A, h_B, result, nElem); 77 | 78 | cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost); 79 | 80 | for (int i=0; i<10; i++){ 81 | printf("%f + %f = %f \n", h_A[i], h_B[i], h_C[i]); 82 | 83 | } 84 | 85 | checkResult(h_C, result, nElem); 86 | 87 | free(h_A); 88 | free(h_B); 89 | free(h_C); 90 | free(result); 91 | 92 | // use cudaFree to release the memory used on the GPU 93 | cudaFree(d_A); 94 | cudaFree(d_B); 95 | cudaFree(d_C); 96 | cudaDeviceReset(); 97 | 98 | return (0); 99 | } -------------------------------------------------------------------------------- /cuda-c/src/utils/cpu_bitmap.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 
6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | 17 | #ifndef __CPU_BITMAP_H__ 18 | #define __CPU_BITMAP_H__ 19 | 20 | #include "gl_helper.h" 21 | 22 | struct CPUBitmap { 23 | unsigned char *pixels; 24 | int x, y; 25 | void *dataBlock; 26 | void (*bitmapExit)(void*); 27 | 28 | CPUBitmap( int width, int height, void *d = NULL ) { 29 | pixels = new unsigned char[width * height * 4]; 30 | x = width; 31 | y = height; 32 | dataBlock = d; 33 | } 34 | 35 | ~CPUBitmap() { 36 | delete [] pixels; 37 | } 38 | 39 | unsigned char* get_ptr( void ) const { return pixels; } 40 | long image_size( void ) const { return x * y * 4; } 41 | 42 | void display_and_exit( void(*e)(void*) = NULL ) { 43 | CPUBitmap** bitmap = get_bitmap_ptr(); 44 | *bitmap = this; 45 | bitmapExit = e; 46 | // a bug in the Windows GLUT implementation prevents us from 47 | // passing zero arguments to glutInit() 48 | int c=1; 49 | char* dummy = ""; 50 | glutInit( &c, &dummy ); 51 | glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA ); 52 | glutInitWindowSize( x, y ); 53 | glutCreateWindow( "bitmap" ); 54 | glutKeyboardFunc(Key); 55 | glutDisplayFunc(Draw); 56 | glutMainLoop(); 57 | } 58 | 59 | // static method used for glut callbacks 60 | static CPUBitmap** get_bitmap_ptr( void ) { 61 | static CPUBitmap *gBitmap; 62 | return &gBitmap; 63 | } 64 | 65 | // static method used for glut callbacks 66 | static void Key(unsigned char key, int x, int y) { 67 | switch (key) { 68 | case 27: 69 | CPUBitmap* bitmap = *(get_bitmap_ptr()); 70 | if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL) 71 | bitmap->bitmapExit( bitmap->dataBlock ); 72 | exit(0); 73 | } 74 | } 75 | 76 | // static method used for glut callbacks 77 | static void Draw( void ) { 78 | CPUBitmap* bitmap = *(get_bitmap_ptr()); 79 | glClearColor( 0.0, 0.0, 0.0, 1.0 ); 80 | glClear( GL_COLOR_BUFFER_BIT ); 81 | glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels ); 82 | glFlush(); 83 | } 84 | }; 85 | 86 | #endif // __CPU_BITMAP_H__ 87 | -------------------------------------------------------------------------------- /cuda-c/src/05-julia-set/julia_set.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include "../utils/common.h" 3 | #include "../utils/cpu_bitmap.h" 4 | 5 | #define DIM 1000 6 | 7 | 8 | /* cuComplex structure that defines a method for storing a complex number 9 | with single precision floating-point components. The structure also defines 10 | addition and multiplication operators as well as a function to return 11 | the magnitude of the complex value. 12 | */ 13 | 14 | struct cuComplex { 15 | float r; 16 | float i; 17 | __device__ cuComplex( float a, float b ) : r(a), i(b) {} 18 | __device__ float magnitude2( void ) { 19 | return r * r + i * i; 20 | } 21 | __device__ cuComplex operator*(const cuComplex& a) { 22 | return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i); 23 | } 24 | __device__ cuComplex operator+(const cuComplex& a) { 25 | return cuComplex(r+a.r, i+a.i); 26 | } 27 | }; 28 | 29 | 30 | // Code that determines whether a point is in or out of the 31 | // Julia Set. 
32 | 
33 | __device__ int julia(int x, int y){
34 |     const float scale = 1.5;
35 |     float jx = scale * (float)(DIM/2 - x) / (DIM/2);
36 |     float jy = scale * (float)(DIM/2 - y) / (DIM/2);
37 | 
38 |     cuComplex c(-0.8, 0.156);
39 |     cuComplex a(jx, jy);
40 | 
41 |     int i = 0;
42 |     for (i=0; i<200; i++){
43 |         a = a * a + c;
44 |         if (a.magnitude2() > 1000)
45 |             return 0;
46 |     }
47 | 
48 |     return 1;
49 | }
50 | 
51 | __global__ void kernel(unsigned char *ptr){
52 |     // map from blockIdx to pixel position
53 |     int x = blockIdx.x;
54 |     int y = blockIdx.y;
55 |     // compute linear offset with help of built-in variable, gridDim.
56 |     // This variable is a constant across all blocks and simply holds the
57 |     // dimensions of the grid that was launched.
58 |     // In this example, it will always be the value (DIM, DIM).
59 |     int offset = x + y * gridDim.x;
60 | 
61 |     // now calculate the value at that position
62 |     int juliaValue = julia(x, y);
63 |     ptr[offset*4 + 0] = 255 * juliaValue;
64 |     ptr[offset*4 + 1] = 0;
65 |     ptr[offset*4 + 2] = 0;
66 |     ptr[offset*4 + 3] = 255;
67 | }
68 | 
69 | 
70 | // globals needed by the update routine
71 | struct DataBlock {
72 |     unsigned char *dev_bitmap;
73 | };
74 | 
75 | 
76 | int main(void){
77 | 
78 |     // Create DIM x DIM bitmap image using utility library
79 |     DataBlock data;
80 |     CPUBitmap bitmap( DIM, DIM, &data );
81 | 
82 | 
83 |     // Because the computation will be done on a GPU, declare a pointer to hold a copy
84 |     // of the data on the device
85 |     unsigned char *device_bitmap;
86 | 
87 |     HANDLE_ERROR(cudaMalloc((void**)&device_bitmap, bitmap.image_size()));
88 | 
89 |     // type dim3 is a CUDA runtime type that represents a 3-D (with z=1)
90 |     // tuple that will be used to specify the size of our launch
91 |     dim3 grid(DIM, DIM);
92 | 
93 |     kernel<<<grid,1>>>(device_bitmap);
94 | 
95 |     HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), device_bitmap,
96 |                             bitmap.image_size(), cudaMemcpyDeviceToHost));
97 | 
98 | 
99 |     cudaFree(device_bitmap);
100 |     bitmap.display_and_exit();
101 | 
102 | }
103 | 
--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/02-organizing-threads.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "code",
5 |    "execution_count": 5,
6 |    "metadata": {},
7 |    "outputs": [
8 |     {
9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Overwriting checkDimensions.cu\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "%%file checkDimensions.cu\n",
18 |     "\n",
19 |     "#include <cuda_runtime.h>\n",
20 |     "#include <stdio.h>\n",
21 |     "\n",
22 |     "__global__\n",
23 |     "void checkIndex(void){\n",
24 |     "    printf(\"threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) \" \n",
25 |     "           \"gridDim:(%d, %d, %d)\\n\", threadIdx.x, threadIdx.y, threadIdx.z, blockIdx.x, \n",
26 |     "           blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z,\n",
27 |     "           gridDim.x,gridDim.y,gridDim.z);\n",
28 |     "}\n",
29 |     "\n",
30 |     "\n",
31 |     "int main(int argc, char **argv) { \n",
32 |     "    // define total data elements \n",
33 |     "    int nElem = 6;\n",
34 |     "    // define grid and block structure \n",
35 |     "    dim3 block (3); \n",
36 |     "    dim3 grid ((nElem+block.x-1)/block.x);\n",
37 |     "    // check grid and block dimensions from the host side \n",
38 |     "    printf(\"grid.x %d grid.y %d grid.z %d\\n\",grid.x, grid.y, grid.z); \n",
39 |     "    printf(\"block.x %d block.y %d block.z %d\\n\",block.x, block.y, block.z);\n",
40 |     "    // check grid and block dimensions from the device side \n",
41 |     "    checkIndex <<<grid, block>>> ();\n",
42 |     "    // 
reset device before you leave \n", 43 | " cudaDeviceReset();\n", 44 | " return(0);\n", 45 | " \n", 46 | "}\n" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 6, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "name": "stdout", 56 | "output_type": "stream", 57 | "text": [ 58 | "grid.x 2 grid.y 1 grid.z 1\n", 59 | "block.x 3 block.y 1 block.z 1\n", 60 | "threadIdx:(0, 0, 0) blockIdx:(1, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n", 61 | "threadIdx:(1, 0, 0) blockIdx:(1, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n", 62 | "threadIdx:(2, 0, 0) blockIdx:(1, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n", 63 | "threadIdx:(0, 0, 0) blockIdx:(0, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n", 64 | "threadIdx:(1, 0, 0) blockIdx:(0, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n", 65 | "threadIdx:(2, 0, 0) blockIdx:(0, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "%%bash\n", 71 | "nvcc checkDimensions.cu -o checkdims\n", 72 | "./checkdims" 73 | ] 74 | } 75 | ], 76 | "metadata": { 77 | "kernelspec": { 78 | "display_name": "Python 3", 79 | "language": "python", 80 | "name": "python3" 81 | }, 82 | "language_info": { 83 | "codemirror_mode": { 84 | "name": "ipython", 85 | "version": 3 86 | }, 87 | "file_extension": ".py", 88 | "mimetype": "text/x-python", 89 | "name": "python", 90 | "nbconvert_exporter": "python", 91 | "pygments_lexer": "ipython3", 92 | "version": "3.6.5" 93 | }, 94 | "toc": { 95 | "base_numbering": 1, 96 | "nav_menu": {}, 97 | "number_sections": true, 98 | "sideBar": true, 99 | "skip_h1_title": false, 100 | "title_cell": "Table of Contents", 101 | "title_sidebar": "Contents", 102 | "toc_cell": false, 103 | "toc_position": {}, 104 | "toc_section_display": true, 105 | "toc_window_display": false 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | -------------------------------------------------------------------------------- /cuda-c/src/utils/cpu_anim.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 
13 | * 14 | */ 15 | 16 | 17 | #ifndef __CPU_ANIM_H__ 18 | #define __CPU_ANIM_H__ 19 | 20 | #include "gl_helper.h" 21 | 22 | #include 23 | 24 | 25 | struct CPUAnimBitmap { 26 | unsigned char *pixels; 27 | int width, height; 28 | void *dataBlock; 29 | void (*fAnim)(void*,int); 30 | void (*animExit)(void*); 31 | void (*clickDrag)(void*,int,int,int,int); 32 | int dragStartX, dragStartY; 33 | 34 | CPUAnimBitmap( int w, int h, void *d = NULL ) { 35 | width = w; 36 | height = h; 37 | pixels = new unsigned char[width * height * 4]; 38 | dataBlock = d; 39 | clickDrag = NULL; 40 | } 41 | 42 | ~CPUAnimBitmap() { 43 | delete [] pixels; 44 | } 45 | 46 | unsigned char* get_ptr( void ) const { return pixels; } 47 | long image_size( void ) const { return width * height * 4; } 48 | 49 | void click_drag( void (*f)(void*,int,int,int,int)) { 50 | clickDrag = f; 51 | } 52 | 53 | void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) { 54 | CPUAnimBitmap** bitmap = get_bitmap_ptr(); 55 | *bitmap = this; 56 | fAnim = f; 57 | animExit = e; 58 | // a bug in the Windows GLUT implementation prevents us from 59 | // passing zero arguments to glutInit() 60 | int c=1; 61 | char* dummy = ""; 62 | glutInit( &c, &dummy ); 63 | glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA ); 64 | glutInitWindowSize( width, height ); 65 | glutCreateWindow( "bitmap" ); 66 | glutKeyboardFunc(Key); 67 | glutDisplayFunc(Draw); 68 | if (clickDrag != NULL) 69 | glutMouseFunc( mouse_func ); 70 | glutIdleFunc( idle_func ); 71 | glutMainLoop(); 72 | } 73 | 74 | // static method used for glut callbacks 75 | static CPUAnimBitmap** get_bitmap_ptr( void ) { 76 | static CPUAnimBitmap* gBitmap; 77 | return &gBitmap; 78 | } 79 | 80 | // static method used for glut callbacks 81 | static void mouse_func( int button, int state, 82 | int mx, int my ) { 83 | if (button == GLUT_LEFT_BUTTON) { 84 | CPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 85 | if (state == GLUT_DOWN) { 86 | bitmap->dragStartX = mx; 87 | bitmap->dragStartY = my; 88 | } else if (state == GLUT_UP) { 89 | bitmap->clickDrag( bitmap->dataBlock, 90 | bitmap->dragStartX, 91 | bitmap->dragStartY, 92 | mx, my ); 93 | } 94 | } 95 | } 96 | 97 | // static method used for glut callbacks 98 | static void idle_func( void ) { 99 | static int ticks = 1; 100 | CPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 101 | bitmap->fAnim( bitmap->dataBlock, ticks++ ); 102 | glutPostRedisplay(); 103 | } 104 | 105 | // static method used for glut callbacks 106 | static void Key(unsigned char key, int x, int y) { 107 | switch (key) { 108 | case 27: 109 | CPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 110 | bitmap->animExit( bitmap->dataBlock ); 111 | //delete bitmap; 112 | exit(0); 113 | } 114 | } 115 | 116 | // static method used for glut callbacks 117 | static void Draw( void ) { 118 | CPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 119 | glClearColor( 0.0, 0.0, 0.0, 1.0 ); 120 | glClear( GL_COLOR_BUFFER_BIT ); 121 | glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels ); 122 | glutSwapBuffers(); 123 | } 124 | }; 125 | 126 | 127 | #endif // __CPU_ANIM_H__ 128 | 129 | -------------------------------------------------------------------------------- /cuda-c/src/cuda-programming-model/sumArraysOnGPU.cu: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | double cpuSecond(){ 10 | struct timeval tp; 11 | gettimeofday(&tp, NULL); 12 | return ((double)tp.tv_sec + 
(double)tp.tv_usec*1.e-6); 13 | } 14 | 15 | #define CHECK(call) \ 16 | { \ 17 | const cudaError_t error = call; \ 18 | if (error != cudaSuccess) \ 19 | { \ 20 | fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \ 21 | fprintf(stderr, "code: %d, reason: %s\n", error, \ 22 | cudaGetErrorString(error)); \ 23 | exit(1); \ 24 | } \ 25 | } 26 | 27 | 28 | __global__ void sumArraysOnDevice(float *A, float *B, float *C, const int N){ 29 | int idx = blockIdx.x * blockDim.x + threadIdx.x; 30 | if (idx < N) C[idx] = A[idx] + B[idx]; 31 | 32 | } 33 | 34 | 35 | void initialData(float *ip, int size){ 36 | // generate different seed for random number 37 | time_t t; 38 | srand((unsigned int) time (&t)); 39 | 40 | for (int i=0; i epsilon){ 59 | match = 0; 60 | printf("Arrays do not match!\n"); 61 | printf("host %5.2f gpu %5.2f at current %d\n", 62 | hostRef[i], gpuRef[i], i); 63 | break; 64 | } 65 | } 66 | if (match) printf("Arrays match. \n\n"); 67 | } 68 | 69 | 70 | int main(int argc, char **argv){ 71 | 72 | printf("%s Starting...\n", argv[0]); 73 | 74 | // malloc host memory 75 | int nElem = 1 <<24; 76 | size_t nBytes = nElem * sizeof(float); 77 | 78 | 79 | // initialize data at host side 80 | float *h_A, *h_B, *hostRef, *gpuRef; 81 | h_A = (float *)malloc(nBytes); 82 | h_B = (float *)malloc(nBytes); 83 | hostRef = (float *)malloc(nBytes); 84 | gpuRef = (float *)malloc(nBytes); 85 | 86 | // initialize data at host side 87 | initialData(h_A, nElem); 88 | initialData(h_B, nElem); 89 | 90 | memset(hostRef, 0, nBytes); 91 | memset(gpuRef, 0, nBytes); 92 | 93 | // malloc device global memory 94 | float *d_A, *d_B, *d_C; 95 | cudaMalloc((float**)&d_A, nBytes); 96 | cudaMalloc((float**)&d_B, nBytes); 97 | cudaMalloc((float**)&d_C, nBytes); 98 | 99 | // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the 100 | // parameter cudaMemcpyHostToDevice specifying the transfer direction. 101 | 102 | CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice)); 103 | CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice)); 104 | 105 | // invoke kernel at host side 106 | int iLen = 128; 107 | dim3 block(iLen); 108 | dim3 grid((nElem+block.x-1)/block.x); 109 | 110 | double iStart = cpuSecond(); 111 | sumArraysOnDevice<<>>(d_A, d_B, d_C, nElem); 112 | CHECK(cudaDeviceSynchronize()); 113 | double iElaps = cpuSecond() - iStart; 114 | printf("sumArraysOnGPU <<<%d,%d>>> Time elapsed %f sec\n", grid.x, block.x, iElaps); 115 | //printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x); 116 | 117 | // copy kernel result back to host side 118 | cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost); 119 | 120 | // add vector at host side for result checks 121 | sumArraysOnHost(h_A, h_B, hostRef, nElem); 122 | 123 | for (int i=0; i<10; i++){ 124 | printf("%f + %f = %f \n", h_A[i], h_B[i], hostRef[i]); 125 | 126 | } 127 | 128 | // check device results 129 | checkResult(hostRef, gpuRef, nElem); 130 | 131 | free(h_A); 132 | free(h_B); 133 | free(hostRef); 134 | free(gpuRef); 135 | 136 | // use cudaFree to release the memory used on the GPU 137 | cudaFree(d_A); 138 | cudaFree(d_B); 139 | cudaFree(d_C); 140 | cudaDeviceReset(); 141 | 142 | return (0); 143 | } -------------------------------------------------------------------------------- /cuda-c/src/utils/gpu_anim.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 
3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 13 | * 14 | */ 15 | 16 | 17 | #ifndef __GPU_ANIM_H__ 18 | #define __GPU_ANIM_H__ 19 | 20 | #include "gl_helper.h" 21 | 22 | #include "cuda.h" 23 | #include "cuda_gl_interop.h" 24 | #include 25 | 26 | 27 | PFNGLBINDBUFFERARBPROC glBindBuffer = NULL; 28 | PFNGLDELETEBUFFERSARBPROC glDeleteBuffers = NULL; 29 | PFNGLGENBUFFERSARBPROC glGenBuffers = NULL; 30 | PFNGLBUFFERDATAARBPROC glBufferData = NULL; 31 | 32 | 33 | struct GPUAnimBitmap { 34 | GLuint bufferObj; 35 | cudaGraphicsResource *resource; 36 | int width, height; 37 | void *dataBlock; 38 | void (*fAnim)(uchar4*,void*,int); 39 | void (*animExit)(void*); 40 | void (*clickDrag)(void*,int,int,int,int); 41 | int dragStartX, dragStartY; 42 | 43 | GPUAnimBitmap( int w, int h, void *d = NULL ) { 44 | width = w; 45 | height = h; 46 | dataBlock = d; 47 | clickDrag = NULL; 48 | 49 | // first, find a CUDA device and set it to graphic interop 50 | cudaDeviceProp prop; 51 | int dev; 52 | memset( &prop, 0, sizeof( cudaDeviceProp ) ); 53 | prop.major = 1; 54 | prop.minor = 0; 55 | HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) ); 56 | cudaGLSetGLDevice( dev ); 57 | 58 | // a bug in the Windows GLUT implementation prevents us from 59 | // passing zero arguments to glutInit() 60 | int c=1; 61 | char* dummy = ""; 62 | glutInit( &c, &dummy ); 63 | glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA ); 64 | glutInitWindowSize( width, height ); 65 | glutCreateWindow( "bitmap" ); 66 | 67 | glBindBuffer = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer"); 68 | glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers"); 69 | glGenBuffers = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers"); 70 | glBufferData = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData"); 71 | 72 | glGenBuffers( 1, &bufferObj ); 73 | glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj ); 74 | glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, width * height * 4, 75 | NULL, GL_DYNAMIC_DRAW_ARB ); 76 | 77 | HANDLE_ERROR( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) ); 78 | } 79 | 80 | ~GPUAnimBitmap() { 81 | free_resources(); 82 | } 83 | 84 | void free_resources( void ) { 85 | HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) ); 86 | 87 | glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 ); 88 | glDeleteBuffers( 1, &bufferObj ); 89 | } 90 | 91 | 92 | long image_size( void ) const { return width * height * 4; } 93 | 94 | void click_drag( void (*f)(void*,int,int,int,int)) { 95 | clickDrag = f; 96 | } 97 | 98 | void anim_and_exit( void (*f)(uchar4*,void*,int), void(*e)(void*) ) { 99 | GPUAnimBitmap** bitmap = get_bitmap_ptr(); 100 | *bitmap = this; 101 | fAnim = f; 102 | animExit = e; 103 | 104 | glutKeyboardFunc( Key ); 105 | glutDisplayFunc( Draw ); 106 | if (clickDrag != NULL) 107 | glutMouseFunc( mouse_func ); 108 | glutIdleFunc( idle_func ); 109 | glutMainLoop(); 110 | } 111 | 112 | // static method used for glut callbacks 113 | static GPUAnimBitmap** get_bitmap_ptr( void ) { 
114 | static GPUAnimBitmap* gBitmap; 115 | return &gBitmap; 116 | } 117 | 118 | // static method used for glut callbacks 119 | static void mouse_func( int button, int state, 120 | int mx, int my ) { 121 | if (button == GLUT_LEFT_BUTTON) { 122 | GPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 123 | if (state == GLUT_DOWN) { 124 | bitmap->dragStartX = mx; 125 | bitmap->dragStartY = my; 126 | } else if (state == GLUT_UP) { 127 | bitmap->clickDrag( bitmap->dataBlock, 128 | bitmap->dragStartX, 129 | bitmap->dragStartY, 130 | mx, my ); 131 | } 132 | } 133 | } 134 | 135 | // static method used for glut callbacks 136 | static void idle_func( void ) { 137 | static int ticks = 1; 138 | GPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 139 | uchar4* devPtr; 140 | size_t size; 141 | 142 | HANDLE_ERROR( cudaGraphicsMapResources( 1, &(bitmap->resource), NULL ) ); 143 | HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, bitmap->resource) ); 144 | 145 | bitmap->fAnim( devPtr, bitmap->dataBlock, ticks++ ); 146 | 147 | HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &(bitmap->resource), NULL ) ); 148 | 149 | glutPostRedisplay(); 150 | } 151 | 152 | // static method used for glut callbacks 153 | static void Key(unsigned char key, int x, int y) { 154 | switch (key) { 155 | case 27: 156 | GPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 157 | if (bitmap->animExit) 158 | bitmap->animExit( bitmap->dataBlock ); 159 | bitmap->free_resources(); 160 | exit(0); 161 | } 162 | } 163 | 164 | // static method used for glut callbacks 165 | static void Draw( void ) { 166 | GPUAnimBitmap* bitmap = *(get_bitmap_ptr()); 167 | glClearColor( 0.0, 0.0, 0.0, 1.0 ); 168 | glClear( GL_COLOR_BUFFER_BIT ); 169 | glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, 170 | GL_UNSIGNED_BYTE, 0 ); 171 | glutSwapBuffers(); 172 | } 173 | }; 174 | 175 | 176 | #endif // __GPU_ANIM_H__ 177 | 178 | -------------------------------------------------------------------------------- /cuda-c/src/utils/common.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved. 3 | * 4 | * NVIDIA Corporation and its licensors retain all intellectual property and 5 | * proprietary rights in and to this software and related documentation. 6 | * Any use, reproduction, disclosure, or distribution of this software 7 | * and related documentation without an express license agreement from 8 | * NVIDIA Corporation is strictly prohibited. 9 | * 10 | * Please refer to the applicable NVIDIA end user license agreement (EULA) 11 | * associated with this source code for terms and conditions that govern 12 | * your use of this NVIDIA software. 
13 | * 14 | */ 15 | 16 | 17 | #ifndef __COMMON_H__ 18 | #define __COMMON_H__ 19 | #include 20 | 21 | static void HandleError( cudaError_t err, 22 | const char *file, 23 | int line ) { 24 | if (err != cudaSuccess) { 25 | printf( "%s in %s at line %d\n", cudaGetErrorString( err ), 26 | file, line ); 27 | exit( EXIT_FAILURE ); 28 | } 29 | } 30 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ )) 31 | 32 | 33 | #define HANDLE_NULL( a ) {if (a == NULL) { \ 34 | printf( "Host memory failed in %s at line %d\n", \ 35 | __FILE__, __LINE__ ); \ 36 | exit( EXIT_FAILURE );}} 37 | 38 | template< typename T > 39 | void swap( T& a, T& b ) { 40 | T t = a; 41 | a = b; 42 | b = t; 43 | } 44 | 45 | 46 | void* big_random_block( int size ) { 47 | unsigned char *data = (unsigned char*)malloc( size ); 48 | HANDLE_NULL( data ); 49 | for (int i=0; i 360) hue -= 360; 69 | else if (hue < 0) hue += 360; 70 | 71 | if (hue < 60) 72 | return (unsigned char)(255 * (n1 + (n2-n1)*hue/60)); 73 | if (hue < 180) 74 | return (unsigned char)(255 * n2); 75 | if (hue < 240) 76 | return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60)); 77 | return (unsigned char)(255 * n1); 78 | } 79 | 80 | __global__ void float_to_color( unsigned char *optr, 81 | const float *outSrc ) { 82 | // map from threadIdx/BlockIdx to pixel position 83 | int x = threadIdx.x + blockIdx.x * blockDim.x; 84 | int y = threadIdx.y + blockIdx.y * blockDim.y; 85 | int offset = x + y * blockDim.x * gridDim.x; 86 | 87 | float l = outSrc[offset]; 88 | float s = 1; 89 | int h = (180 + (int)(360.0f * outSrc[offset])) % 360; 90 | float m1, m2; 91 | 92 | if (l <= 0.5f) 93 | m2 = l * (1 + s); 94 | else 95 | m2 = l + s - l * s; 96 | m1 = 2 * l - m2; 97 | 98 | optr[offset*4 + 0] = value( m1, m2, h+120 ); 99 | optr[offset*4 + 1] = value( m1, m2, h ); 100 | optr[offset*4 + 2] = value( m1, m2, h -120 ); 101 | optr[offset*4 + 3] = 255; 102 | } 103 | 104 | __global__ void float_to_color( uchar4 *optr, 105 | const float *outSrc ) { 106 | // map from threadIdx/BlockIdx to pixel position 107 | int x = threadIdx.x + blockIdx.x * blockDim.x; 108 | int y = threadIdx.y + blockIdx.y * blockDim.y; 109 | int offset = x + y * blockDim.x * gridDim.x; 110 | 111 | float l = outSrc[offset]; 112 | float s = 1; 113 | int h = (180 + (int)(360.0f * outSrc[offset])) % 360; 114 | float m1, m2; 115 | 116 | if (l <= 0.5f) 117 | m2 = l * (1 + s); 118 | else 119 | m2 = l + s - l * s; 120 | m1 = 2 * l - m2; 121 | 122 | optr[offset].x = value( m1, m2, h+120 ); 123 | optr[offset].y = value( m1, m2, h ); 124 | optr[offset].z = value( m1, m2, h -120 ); 125 | optr[offset].w = 255; 126 | } 127 | 128 | 129 | #if _WIN32 130 | //Windows threads. 131 | #include 132 | 133 | typedef HANDLE CUTThread; 134 | typedef unsigned (WINAPI *CUT_THREADROUTINE)(void *); 135 | 136 | #define CUT_THREADPROC unsigned WINAPI 137 | #define CUT_THREADEND return 0 138 | 139 | #else 140 | //POSIX threads. 141 | #include 142 | 143 | typedef pthread_t CUTThread; 144 | typedef void *(*CUT_THREADROUTINE)(void *); 145 | 146 | #define CUT_THREADPROC void 147 | #define CUT_THREADEND 148 | #endif 149 | 150 | //Create thread. 151 | CUTThread start_thread( CUT_THREADROUTINE, void *data ); 152 | 153 | //Wait for thread to finish. 154 | void end_thread( CUTThread thread ); 155 | 156 | //Destroy thread. 157 | void destroy_thread( CUTThread thread ); 158 | 159 | //Wait for multiple threads. 
160 | void wait_for_threads( const CUTThread *threads, int num ); 161 | 162 | #if _WIN32 163 | //Create thread 164 | CUTThread start_thread(CUT_THREADROUTINE func, void *data){ 165 | return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL); 166 | } 167 | 168 | //Wait for thread to finish 169 | void end_thread(CUTThread thread){ 170 | WaitForSingleObject(thread, INFINITE); 171 | CloseHandle(thread); 172 | } 173 | 174 | //Destroy thread 175 | void destroy_thread( CUTThread thread ){ 176 | TerminateThread(thread, 0); 177 | CloseHandle(thread); 178 | } 179 | 180 | //Wait for multiple threads 181 | void wait_for_threads(const CUTThread * threads, int num){ 182 | WaitForMultipleObjects(num, threads, true, INFINITE); 183 | 184 | for(int i = 0; i < num; i++) 185 | CloseHandle(threads[i]); 186 | } 187 | 188 | #else 189 | //Create thread 190 | CUTThread start_thread(CUT_THREADROUTINE func, void * data){ 191 | pthread_t thread; 192 | pthread_create(&thread, NULL, func, data); 193 | return thread; 194 | } 195 | 196 | //Wait for thread to finish 197 | void end_thread(CUTThread thread){ 198 | pthread_join(thread, NULL); 199 | } 200 | 201 | //Destroy thread 202 | void destroy_thread( CUTThread thread ){ 203 | pthread_cancel(thread); 204 | } 205 | 206 | //Wait for multiple threads 207 | void wait_for_threads(const CUTThread * threads, int num){ 208 | for(int i = 0; i < num; i++) 209 | end_thread( threads[i] ); 210 | } 211 | 212 | #endif 213 | 214 | 215 | 216 | 217 | #endif // __COMMON_H__ -------------------------------------------------------------------------------- /cuda-c/src/cuda-programming-model/03-compiling-and-executing.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 49, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Overwriting sumArraysOnGPU.cu\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%%file sumArraysOnGPU.cu\n", 18 | "\n", 19 | "#include \n", 20 | "#include \n", 21 | "#include \n", 22 | "#include \n", 23 | "#include \n", 24 | "\n", 25 | "\n", 26 | "#define CHECK(call) \\\n", 27 | "{ \\\n", 28 | " const cudaError_t error = call; \\\n", 29 | " if (error != cudaSuccess) \\\n", 30 | " { \\\n", 31 | " fprintf(stderr, \"Error: %s:%d, \", __FILE__, __LINE__); \\\n", 32 | " fprintf(stderr, \"code: %d, reason: %s\\n\", error, \\\n", 33 | " cudaGetErrorString(error)); \\\n", 34 | " exit(1); \\\n", 35 | " } \\\n", 36 | "}\n", 37 | "\n", 38 | "\n", 39 | "__global__ void sumArraysOnDevice(float *A, float *B, float *C){\n", 40 | " int idx = blockIdx.x * blockDim.x + threadIdx.x;\n", 41 | " C[idx] = A[idx] + B[idx];\n", 42 | "\n", 43 | "}\n", 44 | "\n", 45 | "\n", 46 | "void initialData(float *ip, int size){\n", 47 | " // generate different seed for random number \n", 48 | " time_t t;\n", 49 | " srand((unsigned int) time (&t));\n", 50 | " \n", 51 | " for (int i=0; i epsilon){\n", 70 | " match = 0;\n", 71 | " printf(\"Arrays do not match!\\n\");\n", 72 | " printf(\"host %5.2f gpu %5.2f at current %d\\n\",\n", 73 | " hostRef[i], gpuRef[i], i);\n", 74 | " break;\n", 75 | " }\n", 76 | " }\n", 77 | " if (match) printf(\"Arrays match. 
\\n\\n\");\n", 78 | "}\n", 79 | "\n", 80 | "\n", 81 | "int main(int argc, char **argv){\n", 82 | " \n", 83 | " printf(\"%s Starting...\\n\", argv[0]);\n", 84 | " \n", 85 | " // malloc host memory\n", 86 | " int nElem = 10000;\n", 87 | " size_t nBytes = nElem * sizeof(float);\n", 88 | " \n", 89 | " \n", 90 | " // initialize data at host side\n", 91 | " float *h_A, *h_B, *hostRef, *gpuRef;\n", 92 | " h_A = (float *)malloc(nBytes);\n", 93 | " h_B = (float *)malloc(nBytes);\n", 94 | " hostRef = (float *)malloc(nBytes);\n", 95 | " gpuRef = (float *)malloc(nBytes);\n", 96 | " \n", 97 | " // initialize data at host side\n", 98 | " initialData(h_A, nElem);\n", 99 | " initialData(h_B, nElem);\n", 100 | " \n", 101 | " memset(hostRef, 0, nBytes);\n", 102 | " memset(gpuRef, 0, nBytes);\n", 103 | " \n", 104 | " // malloc device global memory \n", 105 | " float *d_A, *d_B, *d_C;\n", 106 | " cudaMalloc((float**)&d_A, nBytes);\n", 107 | " cudaMalloc((float**)&d_B, nBytes);\n", 108 | " cudaMalloc((float**)&d_C, nBytes);\n", 109 | " \n", 110 | " // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the\n", 111 | " // parameter cudaMemcpyHostToDevice specifying the transfer direction.\n", 112 | " \n", 113 | " CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));\n", 114 | " CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));\n", 115 | " \n", 116 | " // invoke kernel at host side\n", 117 | " dim3 block(100);\n", 118 | " dim3 grid(nElem / block.x);\n", 119 | " \n", 120 | " sumArraysOnDevice<<>>(d_A, d_B, d_C);\n", 121 | " printf(\"Execution configuration <<<%d, %d>>>\\n\", grid.x, block.x);\n", 122 | " \n", 123 | " // copy kernel result back to host side \n", 124 | " cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);\n", 125 | " \n", 126 | " // add vector at host side for result checks\n", 127 | " sumArraysOnHost(h_A, h_B, hostRef, nElem);\n", 128 | " \n", 129 | " for (int i=0; i<10; i++){\n", 130 | " printf(\"%f + %f = %f \\n\", h_A[i], h_B[i], hostRef[i]);\n", 131 | "\n", 132 | " }\n", 133 | " \n", 134 | " // check device results\n", 135 | " checkResult(hostRef, gpuRef, nElem);\n", 136 | " \n", 137 | " free(h_A);\n", 138 | " free(h_B);\n", 139 | " free(hostRef);\n", 140 | " free(gpuRef);\n", 141 | " \n", 142 | " // use cudaFree to release the memory used on the GPU\n", 143 | " cudaFree(d_A);\n", 144 | " cudaFree(d_B);\n", 145 | " cudaFree(d_C);\n", 146 | " cudaDeviceReset();\n", 147 | " \n", 148 | " return (0);\n", 149 | "}\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 50, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "./addvector Starting...\n", 162 | "Execution configuration <<<100, 100>>>\n", 163 | "17.600000 + 17.600000 = 35.200001 \n", 164 | "16.299999 + 16.299999 = 32.599998 \n", 165 | "0.600000 + 0.600000 = 1.200000 \n", 166 | "23.200001 + 23.200001 = 46.400002 \n", 167 | "16.799999 + 16.799999 = 33.599998 \n", 168 | "15.600000 + 15.600000 = 31.200001 \n", 169 | "2.200000 + 2.200000 = 4.400000 \n", 170 | "19.700001 + 19.700001 = 39.400002 \n", 171 | "4.300000 + 4.300000 = 8.600000 \n", 172 | "3.200000 + 3.200000 = 6.400000 \n", 173 | "Arrays match. 
\n", 174 | "\n" 175 | ] 176 | }, 177 | { 178 | "name": "stderr", 179 | "output_type": "stream", 180 | "text": [ 181 | "==26284== NVPROF is profiling process 26284, command: ./addvector\n", 182 | "==26284== Profiling application: ./addvector\n", 183 | "==26284== Profiling result:\n", 184 | " Type Time(%) Time Calls Avg Min Max Name\n", 185 | " GPU activities: 61.39% 41.056us 2 20.528us 19.840us 21.216us [CUDA memcpy HtoD]\n", 186 | " 32.87% 21.984us 1 21.984us 21.984us 21.984us [CUDA memcpy DtoH]\n", 187 | " 5.74% 3.8400us 1 3.8400us 3.8400us 3.8400us sumArraysOnDevice(float*, float*, float*)\n", 188 | " API calls: 67.64% 108.27ms 3 36.090ms 6.2490us 108.25ms cudaMalloc\n", 189 | " 31.70% 50.742ms 1 50.742ms 50.742ms 50.742ms cudaDeviceReset\n", 190 | " 0.37% 586.92us 94 6.2430us 177ns 259.83us cuDeviceGetAttribute\n", 191 | " 0.10% 166.18us 3 55.392us 6.7450us 147.89us cudaFree\n", 192 | " 0.07% 117.21us 3 39.069us 22.571us 54.160us cudaMemcpy\n", 193 | " 0.05% 80.415us 1 80.415us 80.415us 80.415us cuDeviceTotalMem\n", 194 | " 0.05% 75.864us 1 75.864us 75.864us 75.864us cuDeviceGetName\n", 195 | " 0.02% 25.566us 1 25.566us 25.566us 25.566us cudaLaunch\n", 196 | " 0.00% 2.8060us 2 1.4030us 1.2850us 1.5210us cuDeviceGetCount\n", 197 | " 0.00% 2.7890us 3 929ns 221ns 2.1490us cudaSetupArgument\n", 198 | " 0.00% 1.0910us 2 545ns 450ns 641ns cuDeviceGet\n", 199 | " 0.00% 788ns 1 788ns 788ns 788ns cudaConfigureCall\n" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "%%bash\n", 205 | "nvcc sumArraysOnGPU.cu -o addvector\n", 206 | "nvprof --unified-memory-profiling off ./addvector\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": null, 219 | "metadata": {}, 220 | "outputs": [], 221 | "source": [] 222 | } 223 | ], 224 | "metadata": { 225 | "kernelspec": { 226 | "display_name": "Python 3", 227 | "language": "python", 228 | "name": "python3" 229 | }, 230 | "language_info": { 231 | "codemirror_mode": { 232 | "name": "ipython", 233 | "version": 3 234 | }, 235 | "file_extension": ".py", 236 | "mimetype": "text/x-python", 237 | "name": "python", 238 | "nbconvert_exporter": "python", 239 | "pygments_lexer": "ipython3", 240 | "version": "3.6.5" 241 | }, 242 | "toc": { 243 | "base_numbering": 1, 244 | "nav_menu": {}, 245 | "number_sections": true, 246 | "sideBar": true, 247 | "skip_h1_title": false, 248 | "title_cell": "Table of Contents", 249 | "title_sidebar": "Contents", 250 | "toc_cell": false, 251 | "toc_position": {}, 252 | "toc_section_display": true, 253 | "toc_window_display": false 254 | } 255 | }, 256 | "nbformat": 4, 257 | "nbformat_minor": 2 258 | } 259 | -------------------------------------------------------------------------------- /cuda-c/src/cuda-programming-model/01-memory-management.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Overwriting sumArraysOnHost.c\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%%file sumArraysOnHost.c\n", 18 | "\n", 19 | "#include \n", 20 | "#include \n", 21 | "#include \n", 22 | "\n", 23 | "void sumArraysOnHost(float *A, float *B, float *C, const int N){\n", 24 | " for (int idx=0; idx\n", 115 | "#include \n", 116 | "#include \n", 117 | "#include \n", 118 | "\n", 
119 | "__global__ void sumArraysOnDevice(float *A, float *B, float *C){\n", 120 | " int idx = threadIdx.x;\n", 121 | " C[idx] = A[idx] + B[idx];\n", 122 | "\n", 123 | "}\n", 124 | "\n", 125 | "\n", 126 | "void initialData(float *ip, int size){\n", 127 | " // generate different seed for random number \n", 128 | " time_t t;\n", 129 | " srand((unsigned int) time (&t));\n", 130 | " \n", 131 | " for (int i=0; i epsilon){\n", 150 | " match = 0;\n", 151 | " printf(\"Arrays do not match!\\n\");\n", 152 | " printf(\"host %5.2f gpu %5.2f at current %d\\n\",\n", 153 | " h_C[i], result[i], i);\n", 154 | " break;\n", 155 | " }\n", 156 | " }\n", 157 | " if (match) printf(\"Arrays match. \\n\\n\");\n", 158 | "}\n", 159 | "\n", 160 | "\n", 161 | "int main(int argc, char **argv){\n", 162 | " int nElem = 1024;\n", 163 | " size_t nBytes = nElem * sizeof(float);\n", 164 | " \n", 165 | " float *h_A, *h_B, *h_C, *result;\n", 166 | " h_A = (float *)malloc(nBytes);\n", 167 | " h_B = (float *)malloc(nBytes);\n", 168 | " h_C = (float *)malloc(nBytes);\n", 169 | " result = (float *)malloc(nBytes);\n", 170 | " \n", 171 | " initialData(h_A, nElem);\n", 172 | " initialData(h_B, nElem);\n", 173 | " \n", 174 | " float *d_A, *d_B, *d_C;\n", 175 | " cudaMalloc((float**)&d_A, nBytes);\n", 176 | " cudaMalloc((float**)&d_B, nBytes);\n", 177 | " cudaMalloc((float**)&d_C, nBytes);\n", 178 | " \n", 179 | " // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the\n", 180 | " // parameter cudaMemcpyHostToDevice specifying the transfer direction.\n", 181 | " \n", 182 | " cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);\n", 183 | " cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);\n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " sumArraysOnDevice<<<1, nElem>>>(d_A, d_B, d_C);\n", 188 | " sumArraysOnHost(h_A, h_B, result, nElem);\n", 189 | " \n", 190 | " cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);\n", 191 | " \n", 192 | " for (int i=0; i<10; i++){\n", 193 | " printf(\"%f + %f = %f \\n\", h_A[i], h_B[i], h_C[i]);\n", 194 | "\n", 195 | " }\n", 196 | " \n", 197 | " checkResult(h_C, result, nElem);\n", 198 | " \n", 199 | " free(h_A);\n", 200 | " free(h_B);\n", 201 | " free(h_C);\n", 202 | " free(result);\n", 203 | " \n", 204 | " // use cudaFree to release the memory used on the GPU\n", 205 | " cudaFree(d_A);\n", 206 | " cudaFree(d_B);\n", 207 | " cudaFree(d_C);\n", 208 | " cudaDeviceReset();\n", 209 | " \n", 210 | " return (0);\n", 211 | "}" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 21, 217 | "metadata": {}, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "21.600000 + 21.600000 = 43.200001 \n", 224 | "12.200000 + 12.200000 = 24.400000 \n", 225 | "3.300000 + 3.300000 = 6.600000 \n", 226 | "6.400000 + 6.400000 = 12.800000 \n", 227 | "8.600000 + 8.600000 = 17.200001 \n", 228 | "11.400000 + 11.400000 = 22.799999 \n", 229 | "23.299999 + 23.299999 = 46.599998 \n", 230 | "2.700000 + 2.700000 = 5.400000 \n", 231 | "2.600000 + 2.600000 = 5.200000 \n", 232 | "24.100000 + 24.100000 = 48.200001 \n", 233 | "Arrays match. 
\n", 234 | "\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "%%bash\n", 240 | "nvcc sumArraysOnDevice.cu -o sumgpu\n", 241 | "./sumgpu" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 11, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "==12294== NVPROF is profiling process 12294, command: ./sumgpu\n", 254 | "11.300000 + 11.300000 = 22.600000 \n", 255 | "23.200001 + 23.200001 = 0.000000 \n", 256 | "23.500000 + 23.500000 = 0.000000 \n", 257 | "21.500000 + 21.500000 = 0.000000 \n", 258 | "16.700001 + 16.700001 = 0.000000 \n", 259 | "23.000000 + 23.000000 = 0.000000 \n", 260 | "5.900000 + 5.900000 = 0.000000 \n", 261 | "3.200000 + 3.200000 = 0.000000 \n", 262 | "13.900000 + 13.900000 = 0.000000 \n", 263 | "8.200000 + 8.200000 = 0.000000 \n", 264 | "Arrays do not match!\n", 265 | "host 0.00 gpu 46.40 at current 1\n", 266 | "==12294== Profiling application: ./sumgpu\n", 267 | "==12294== Profiling result:\n", 268 | " Type Time(%) Time Calls Avg Min Max Name\n", 269 | " GPU activities: 38.31% 3.7760us 1 3.7760us 3.7760us 3.7760us sumArraysOnDevice(float*, float*, float*)\n", 270 | " 37.34% 3.6800us 2 1.8400us 1.8240us 1.8560us [CUDA memcpy HtoD]\n", 271 | " 24.35% 2.4000us 1 2.4000us 2.4000us 2.4000us [CUDA memcpy DtoH]\n", 272 | " API calls: 70.29% 116.93ms 3 38.978ms 4.1810us 116.92ms cudaMalloc\n", 273 | " 29.12% 48.439ms 1 48.439ms 48.439ms 48.439ms cudaDeviceReset\n", 274 | " 0.34% 564.20us 94 6.0020us 164ns 248.93us cuDeviceGetAttribute\n", 275 | " 0.11% 187.10us 3 62.365us 9.9380us 159.64us cudaFree\n", 276 | " 0.05% 83.126us 1 83.126us 83.126us 83.126us cuDeviceTotalMem\n", 277 | " 0.03% 57.601us 1 57.601us 57.601us 57.601us cuDeviceGetName\n", 278 | " 0.03% 52.785us 3 17.595us 10.601us 24.990us cudaMemcpy\n", 279 | " 0.01% 24.804us 1 24.804us 24.804us 24.804us cudaLaunch\n", 280 | " 0.00% 3.5350us 3 1.1780us 225ns 2.7900us cudaSetupArgument\n", 281 | " 0.00% 1.5750us 2 787ns 240ns 1.3350us cuDeviceGetCount\n", 282 | " 0.00% 849ns 1 849ns 849ns 849ns cudaConfigureCall\n", 283 | " 0.00% 483ns 2 241ns 198ns 285ns cuDeviceGet\n" 284 | ] 285 | } 286 | ], 287 | "source": [ 288 | "!nvprof --unified-memory-profiling off ./sumgpu" 289 | ] 290 | }, 291 | { 292 | "cell_type": "code", 293 | "execution_count": null, 294 | "metadata": {}, 295 | "outputs": [], 296 | "source": [] 297 | } 298 | ], 299 | "metadata": { 300 | "kernelspec": { 301 | "display_name": "Python 3", 302 | "language": "python", 303 | "name": "python3" 304 | }, 305 | "language_info": { 306 | "codemirror_mode": { 307 | "name": "ipython", 308 | "version": 3 309 | }, 310 | "file_extension": ".py", 311 | "mimetype": "text/x-python", 312 | "name": "python", 313 | "nbconvert_exporter": "python", 314 | "pygments_lexer": "ipython3", 315 | "version": "3.6.5" 316 | }, 317 | "toc": { 318 | "base_numbering": 1, 319 | "nav_menu": {}, 320 | "number_sections": true, 321 | "sideBar": true, 322 | "skip_h1_title": false, 323 | "title_cell": "Table of Contents", 324 | "title_sidebar": "Contents", 325 | "toc_cell": false, 326 | "toc_position": {}, 327 | "toc_section_display": true, 328 | "toc_window_display": false 329 | } 330 | }, 331 | "nbformat": 4, 332 | "nbformat_minor": 2 333 | } 334 | -------------------------------------------------------------------------------- /cuda-c/src/01-hello_world/hello-world-from-gpu.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | 
"cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Hello World From GPU" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "The best way to learn a new programming language is by writing programs using the new language. In this section, we are going to write our first kernel code running on the GPU.\n", 15 | "\n", 16 | "First, let's check that the CUDA compiler is installed properly with the following command on a Linux system:" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": 1, 22 | "metadata": {}, 23 | "outputs": [ 24 | { 25 | "name": "stdout", 26 | "output_type": "stream", 27 | "text": [ 28 | "/usr/local/cuda/bin/nvcc\n", 29 | "nvcc: NVIDIA (R) Cuda compiler driver\n", 30 | "Copyright (c) 2005-2017 NVIDIA Corporation\n", 31 | "Built on Fri_Nov__3_21:07:56_CDT_2017\n", 32 | "Cuda compilation tools, release 9.1, V9.1.85\n" 33 | ] 34 | } 35 | ], 36 | "source": [ 37 | "!which nvcc\n", 38 | "!nvcc --version" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Let's check if a GPU accelerator card is attached in our machine:" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": "stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "crw-rw-rw- 1 root root 195, 0 Jun 26 08:27 /dev/nvidia0\r\n", 58 | "crw-rw-rw- 1 root root 195, 255 Jun 26 08:27 /dev/nvidiactl\r\n", 59 | "crw-rw-rw- 1 root root 195, 254 Jun 26 08:28 /dev/nvidia-modeset\r\n", 60 | "crw-rw-rw- 1 root root 240, 0 Jun 26 08:28 /dev/nvidia-uvm\r\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "!ls -l /dev/nv*" 66 | ] 67 | }, 68 | { 69 | "cell_type": "markdown", 70 | "metadata": {}, 71 | "source": [ 72 | "Now we are ready to write your fi rst CUDA C code. To write a CUDA C program, we need to:\n", 73 | "1. Create a source code fi le with the special fi le name extension of .cu. \n", 74 | "2. Compile the program using the CUDA nvcc compiler.\n", 75 | "3. Run the executable file from the command line, which contains the kernel code executable on the GPU.\n" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 3, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "Overwriting hello_world_gpu.cu\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "%%file hello_world_gpu.cu \n", 93 | "#include \n", 94 | "\n", 95 | "// The qualifier __global__ tells the compiler that the function will be called \n", 96 | "// from the CPU and executed on the GPU.\n", 97 | "\n", 98 | "__global__ void helloFromGPU(void)\n", 99 | "{\n", 100 | " printf(\".............Hello World from GPU!.............\\n\");\n", 101 | "}\n", 102 | "\n", 103 | "int main(void){\n", 104 | " // hello from cpu\n", 105 | " printf(\"<------------Hello World from CPU!-------------->\\n\");\n", 106 | " \n", 107 | " // Launch the kernel\n", 108 | " // The parameters within the triple angle brackets are the execution configuration, \n", 109 | " // which specifi es how many threads will execute the kernel. 
In this example, we will run 10 GPU threads.\n", 110 | " helloFromGPU <<<1, 10>>>();\n", 111 | " \n", 112 | " \n", 113 | " // explicitly destroy and clean up all resources associated with the current\n", 114 | " // device in the current process\n", 115 | " cudaDeviceReset();\n", 116 | " return 0;\n", 117 | "}" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 4, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "<------------Hello World from CPU!-------------->\n", 130 | ".............Hello World from GPU!.............\n", 131 | ".............Hello World from GPU!.............\n", 132 | ".............Hello World from GPU!.............\n", 133 | ".............Hello World from GPU!.............\n", 134 | ".............Hello World from GPU!.............\n", 135 | ".............Hello World from GPU!.............\n", 136 | ".............Hello World from GPU!.............\n", 137 | ".............Hello World from GPU!.............\n", 138 | ".............Hello World from GPU!.............\n", 139 | ".............Hello World from GPU!.............\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "%%bash\n", 145 | "nvcc hello_world_gpu.cu -o hello_world_gpu\n", 146 | "./hello_world_gpu" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "## CUDA PROGRAM STRUCTURE \n", 154 | "\n", 155 | "A typical CUDA program structure consists of five main steps: \n", 156 | "1. Allocate GPU memories. \n", 157 | "2. Copy data from CPU memory to GPU memory. \n", 158 | "3. Invoke the CUDA kernel to perform program-specific computation. \n", 159 | "4. Copy data back from GPU memory to CPU memory. \n", 160 | "5. Destroy GPU memories.\n", 161 | "\n", 162 | "In the simple program `hello_world_gpu.cu`, you only see the third step: Invoke the kernel. 
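As a rough sketch of how all five steps fit together (the kernel, sizes, and names below are illustrative additions, not code from this chapter):

```c
#include <stdio.h>

__global__ void scaleByTwo(float *data, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) data[i] *= 2.0f;   // bounds check for partially filled blocks
}

int main(void) {
    const int n = 256;
    const size_t nBytes = n * sizeof(float);

    float h_data[256];
    for (int i = 0; i < n; i++) h_data[i] = (float)i;

    float *d_data;
    cudaMalloc((void **)&d_data, nBytes);                        // 1. allocate GPU memory
    cudaMemcpy(d_data, h_data, nBytes, cudaMemcpyHostToDevice);  // 2. copy CPU -> GPU
    scaleByTwo<<<1, n>>>(d_data, n);                             // 3. invoke the kernel
    cudaMemcpy(h_data, d_data, nBytes, cudaMemcpyDeviceToHost);  // 4. copy GPU -> CPU
    cudaFree(d_data);                                            // 5. free GPU memory

    printf("h_data[10] = %f\n", h_data[10]);  // expect 20.000000
    return 0;
}
```

The synchronous `cudaMemcpy` in step 4 also acts as the wait for the kernel to finish, so no explicit `cudaDeviceSynchronize` is needed before reading the results back.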
" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "- Remove the [cudaDeviceReset function](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1gef69dd5c6d0206c2b8d099abac61f217)" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": 5, 175 | "metadata": {}, 176 | "outputs": [ 177 | { 178 | "name": "stdout", 179 | "output_type": "stream", 180 | "text": [ 181 | "Overwriting hello_world_gpu.cu\n" 182 | ] 183 | } 184 | ], 185 | "source": [ 186 | "%%file hello_world_gpu.cu \n", 187 | "#include \n", 188 | "\n", 189 | "__global__ void helloFromGPU(void)\n", 190 | "{\n", 191 | " printf(\".............Hello World from GPU!.............\\n\");\n", 192 | "}\n", 193 | "\n", 194 | "int main(void){\n", 195 | " // hello from cpu\n", 196 | " printf(\"<------------Hello World from CPU!-------------->\\n\");\n", 197 | " \n", 198 | " helloFromGPU <<<1, 10>>>();\n", 199 | " // explicitly destroy and clean up all resources associated with the current\n", 200 | " // device in the current process\n", 201 | " //cudaDeviceReset();\n", 202 | " return 0;\n", 203 | "}" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 6, 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "<------------Hello World from CPU!-------------->\n" 216 | ] 217 | } 218 | ], 219 | "source": [ 220 | "%%bash\n", 221 | "nvcc hello_world_gpu.cu -o hello_world_gpu\n", 222 | "./hello_world_gpu" 223 | ] 224 | }, 225 | { 226 | "cell_type": "markdown", 227 | "metadata": {}, 228 | "source": [ 229 | "- Replace the function `cudaDeviceRest` with `cudaDeviceSynchronize`" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 7, 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "Overwriting hello_world_gpu.cu\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "%%file hello_world_gpu.cu \n", 247 | "#include \n", 248 | "\n", 249 | "__global__ void helloFromGPU(void)\n", 250 | "{\n", 251 | " printf(\".............Hello World from GPU!.............\\n\");\n", 252 | "}\n", 253 | "\n", 254 | "int main(void){\n", 255 | " // hello from cpu\n", 256 | " printf(\"<------------Hello World from CPU!-------------->\\n\");\n", 257 | " \n", 258 | " helloFromGPU <<<1, 10>>>();\n", 259 | " \n", 260 | " cudaDeviceSynchronize();\n", 261 | " return 0;\n", 262 | "}" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 8, 268 | "metadata": {}, 269 | "outputs": [ 270 | { 271 | "name": "stdout", 272 | "output_type": "stream", 273 | "text": [ 274 | "<------------Hello World from CPU!-------------->\n", 275 | ".............Hello World from GPU!.............\n", 276 | ".............Hello World from GPU!.............\n", 277 | ".............Hello World from GPU!.............\n", 278 | ".............Hello World from GPU!.............\n", 279 | ".............Hello World from GPU!.............\n", 280 | ".............Hello World from GPU!.............\n", 281 | ".............Hello World from GPU!.............\n", 282 | ".............Hello World from GPU!.............\n", 283 | ".............Hello World from GPU!.............\n", 284 | ".............Hello World from GPU!.............\n" 285 | ] 286 | } 287 | ], 288 | "source": [ 289 | "%%bash\n", 290 | "nvcc hello_world_gpu.cu -o hello_world_gpu\n", 291 | "./hello_world_gpu" 292 | ] 293 | }, 294 | { 
295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "- Each thread that executes the kernel is given a unique thread ID that is accessible within the kernel through the built-in `threadIdx.x` variable." 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 9, 304 | "metadata": {}, 305 | "outputs": [ 306 | { 307 | "name": "stdout", 308 | "output_type": "stream", 309 | "text": [ 310 | "Overwriting hello_world_gpu.cu\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "%%file hello_world_gpu.cu \n", 316 | "#include \n", 317 | "\n", 318 | "__global__ void helloFromGPU(void)\n", 319 | "{ \n", 320 | " if (threadIdx.x == 5)\n", 321 | " printf(\".............Hello World from GPU thread %d!.............\\n\", threadIdx.x);\n", 322 | "}\n", 323 | "\n", 324 | "int main(void){\n", 325 | " // hello from cpu\n", 326 | " printf(\"<------------Hello World from CPU!-------------->\\n\");\n", 327 | " \n", 328 | " helloFromGPU <<<1, 10>>>();\n", 329 | " \n", 330 | " cudaDeviceSynchronize();\n", 331 | " return 0;\n", 332 | "}" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 10, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stdout", 342 | "output_type": "stream", 343 | "text": [ 344 | "<------------Hello World from CPU!-------------->\n", 345 | ".............Hello World from GPU thread 5!.............\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "%%bash\n", 351 | "nvcc hello_world_gpu.cu -o hello_world_gpu\n", 352 | "./hello_world_gpu" 353 | ] 354 | }, 355 | { 356 | "cell_type": "markdown", 357 | "metadata": {}, 358 | "source": [ 359 | "## IS CUDA C PROGRAMMING DIFFICULT?\n", 360 | "\n", 361 | "The main difference between CPU programming and GPU programming is the level of programmer exposure to GPU architectural features. Thinking in parallel and having a basic understanding of GPU architecture enables you to write parallel programs that scale to hundreds of cores as easily as you write a sequential program.\n", 362 | "\n", 363 | "\n", 364 | "If you want to write efficient code as a parallel programmer, you need a basic knowledge of CPU architectures. For example, **locality** is a very important concept in parallel programming. \n", 365 | "- **Locality** refers to the reuse of data so as to reduce memory access latency. \n", 366 | "\n", 367 | "There are two basic types of reference locality:\n", 368 | "\n", 369 | "- Temporal locality refers to the reuse of data and/or resources within relatively small time durations.\n", 370 | "- Spatial locality refers to the use of data elements within relatively close storage locations. \n", 371 | "\n", 372 | "Modern CPU architectures use large caches to optimize for applications with good spatial and temporal locality. It is the programmer’s responsibility to design their algorithm to effi ciently use CPU cache. Programmers must handle low-level cache optimizations, but have no introspection into how threads are being scheduled on the underlying architecture because the CPU does not expose that information.\n", 373 | "CUDA exposes you to the concepts of both memory hierarchy and thread hierarchy, extending your ability to control thread execution and scheduling to a greater degree, using: \n", 374 | "- ➤ Memory hierarchy structure\n", 375 | "- ➤ Thread hierarchy structure\n", 376 | "\n", 377 | "For example, a special memory, called shared memory, is exposed by the CUDA programming model. 
Shared memory can be thought of as a software-managed cache, which provides great speed-up by conserving bandwidth to main memory. With shared memory, you can control the locality of your code directly.\n", 378 | "\n", 379 | "CUDA abstracts away the hardware details and does not require applications to be mapped to traditional graphics APIs. \n", 380 | "At its core are three key abstractions: \n", 381 | "- a hierarchy of thread groups, \n", 382 | "- a hierarchy of memory groups, \n", 383 | "- and barrier synchronization, \n", 384 | "\n", 385 | "which are exposed to us as a minimal set of language extensions. " 386 | ] 387 | } 388 | ], 389 | "metadata": { 390 | "kernelspec": { 391 | "display_name": "Python 3", 392 | "language": "python", 393 | "name": "python3" 394 | }, 395 | "language_info": { 396 | "codemirror_mode": { 397 | "name": "ipython", 398 | "version": 3 399 | }, 400 | "file_extension": ".py", 401 | "mimetype": "text/x-python", 402 | "name": "python", 403 | "nbconvert_exporter": "python", 404 | "pygments_lexer": "ipython3", 405 | "version": "3.6.5" 406 | }, 407 | "toc": { 408 | "base_numbering": 1, 409 | "nav_menu": {}, 410 | "number_sections": false, 411 | "sideBar": true, 412 | "skip_h1_title": false, 413 | "title_cell": "Table of Contents", 414 | "title_sidebar": "Contents", 415 | "toc_cell": false, 416 | "toc_position": {}, 417 | "toc_section_display": true, 418 | "toc_window_display": false 419 | } 420 | }, 421 | "nbformat": 4, 422 | "nbformat_minor": 2 423 | } 424 | -------------------------------------------------------------------------------- /getting_started_on_colab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "name": "getting-started-on-colab.ipynb", 7 | "version": "0.3.2", 8 | "provenance": [], 9 | "include_colab_link": true 10 | }, 11 | "kernelspec": { 12 | "name": "python3", 13 | "display_name": "Python 3" 14 | }, 15 | "accelerator": "GPU" 16 | }, 17 | "cells": [ 18 | { 19 | "cell_type": "markdown", 20 | "metadata": { 21 | "id": "view-in-github", 22 | "colab_type": "text" 23 | }, 24 | "source": [ 25 | "[View in Colaboratory](https://colab.research.google.com/github/andersy005/cuda-programming/blob/master/getting_started_on_colab.ipynb)" 26 | ] 27 | }, 28 | { 29 | "metadata": { 30 | "id": "Azf91OtajTab", 31 | "colab_type": "code", 32 | "colab": { 33 | "base_uri": "https://localhost:8080/", 34 | "height": 34 35 | }, 36 | "outputId": "dfe6f1dc-edac-466c-95ba-b62736d8d3e2" 37 | }, 38 | "cell_type": "code", 39 | "source": [ 40 | "!apt update -qq" 41 | ], 42 | "execution_count": 1, 43 | "outputs": [ 44 | { 45 | "output_type": "stream", 46 | "text": [ 47 | "6 packages can be upgraded. 
Run 'apt list --upgradable' to see them.\r\n" 48 | ], 49 | "name": "stdout" 50 | } 51 | ] 52 | }, 53 | { 54 | "metadata": { 55 | "id": "PKBV7iXgjdfJ", 56 | "colab_type": "code", 57 | "colab": { 58 | "base_uri": "https://localhost:8080/", 59 | "height": 309 60 | }, 61 | "outputId": "5e307c44-8abe-40f3-947f-48d7cf2f41b2" 62 | }, 63 | "cell_type": "code", 64 | "source": [ 65 | "!wget https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb" 66 | ], 67 | "execution_count": 2, 68 | "outputs": [ 69 | { 70 | "output_type": "stream", 71 | "text": [ 72 | "--2018-06-27 18:46:54-- https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb\r\n", 73 | "Resolving developer.nvidia.com (developer.nvidia.com)... 192.229.162.216\r\n", 74 | "Connecting to developer.nvidia.com (developer.nvidia.com)|192.229.162.216|:443... connected.\n", 75 | "HTTP request sent, awaiting response... 302 Found\n", 76 | "Location: https://developer.download.nvidia.com/compute/cuda/8.0/secure/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb?Oy-tuxd8APgk75C-6ni3GsZYC8MKSx8lCk8BAhMEzptmEKWqiBU80Z9TWX4lJfCr-9n4M6xR8eAQcu5bANJUkw92M88T3sQSG2Q5CzCeAhG3ye37lu2a4s6ej_RdyKJ5nHPAmPPd3wAoF-hVGKyZghC3EKpAvO4xKIEOrqItL1bQbfbUalWFWE6JB5e5i2kZ38Qeu_Hz2HpSo1htmYiBXUHogg [following]\n", 77 | "--2018-06-27 18:46:54-- https://developer.download.nvidia.com/compute/cuda/8.0/secure/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb?Oy-tuxd8APgk75C-6ni3GsZYC8MKSx8lCk8BAhMEzptmEKWqiBU80Z9TWX4lJfCr-9n4M6xR8eAQcu5bANJUkw92M88T3sQSG2Q5CzCeAhG3ye37lu2a4s6ej_RdyKJ5nHPAmPPd3wAoF-hVGKyZghC3EKpAvO4xKIEOrqItL1bQbfbUalWFWE6JB5e5i2kZ38Qeu_Hz2HpSo1htmYiBXUHogg\n", 78 | "Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 192.229.211.70, 2606:2800:21f:3aa:dcf:37b:1ed6:1fb\n", 79 | "Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|192.229.211.70|:443... connected.\n", 80 | "HTTP request sent, awaiting response... 
200 OK\n", 81 | "Length: 1913589814 (1.8G) [application/x-deb]\n", 82 | "Saving to: ‘cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb.4’\n", 83 | "\n" 84 | ], 85 | "name": "stdout" 86 | }, 87 | { 88 | "output_type": "stream", 89 | "text": [ 90 | "deb.4 99%[==================> ] 1.77G 157MB/s eta 0s \rcuda-repo-ubuntu160 100%[===================>] 1.78G 155MB/s in 13s \r\n", 91 | "\r\n", 92 | "2018-06-27 18:47:07 (139 MB/s) - ‘cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb.4’ saved [1913589814/1913589814]\r\n", 93 | "\r\n" 94 | ], 95 | "name": "stdout" 96 | } 97 | ] 98 | }, 99 | { 100 | "metadata": { 101 | "id": "vrfKbLekdPDO", 102 | "colab_type": "code", 103 | "colab": {} 104 | }, 105 | "cell_type": "code", 106 | "source": [ 107 | "!dpkg -i cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb 2> /dev/null" 108 | ], 109 | "execution_count": 0, 110 | "outputs": [] 111 | }, 112 | { 113 | "metadata": { 114 | "id": "A9Lo4FoydC7q", 115 | "colab_type": "code", 116 | "colab": { 117 | "base_uri": "https://localhost:8080/", 118 | "height": 34 119 | }, 120 | "outputId": "a12cf818-1adf-40a5-8a4c-1f1e2dfd5423" 121 | }, 122 | "cell_type": "code", 123 | "source": [ 124 | "!apt-key add /var/cuda-repo-8-0-local-ga2/7fa2af80.pub" 125 | ], 126 | "execution_count": 4, 127 | "outputs": [ 128 | { 129 | "output_type": "stream", 130 | "text": [ 131 | "OK\r\n" 132 | ], 133 | "name": "stdout" 134 | } 135 | ] 136 | }, 137 | { 138 | "metadata": { 139 | "id": "vuIFPr-leAFV", 140 | "colab_type": "code", 141 | "colab": {} 142 | }, 143 | "cell_type": "code", 144 | "source": [ 145 | "!apt-get update -qq" 146 | ], 147 | "execution_count": 0, 148 | "outputs": [] 149 | }, 150 | { 151 | "metadata": { 152 | "id": "5LCW1tnxj-pk", 153 | "colab_type": "code", 154 | "colab": { 155 | "base_uri": "https://localhost:8080/", 156 | "height": 85 157 | }, 158 | "outputId": "76f7e7ae-1e54-41b1-c877-6b528a691ade" 159 | }, 160 | "cell_type": "code", 161 | "source": [ 162 | "!apt --fix-broken install\n", 163 | "!apt-get install cuda gcc-5 g++-5 -y -qq;" 164 | ], 165 | "execution_count": 6, 166 | "outputs": [ 167 | { 168 | "output_type": "stream", 169 | "text": [ 170 | "Reading package lists... Done\n", 171 | "Building dependency tree \n", 172 | "Reading state information... Done\n", 173 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 174 | ], 175 | "name": "stdout" 176 | } 177 | ] 178 | }, 179 | { 180 | "metadata": { 181 | "id": "zoqglXEBj-02", 182 | "colab_type": "code", 183 | "colab": { 184 | "base_uri": "https://localhost:8080/", 185 | "height": 170 186 | }, 187 | "outputId": "3078ead4-f9d8-446c-dc8f-c4774369992a" 188 | }, 189 | "cell_type": "code", 190 | "source": [ 191 | "!ln -s /usr/bin/gcc-5 /usr/local/cuda/bin/gcc;\n", 192 | "!ln -s /usr/bin/g++-5 /usr/local/cuda/bin/g++;\n", 193 | "!apt install cuda-8.0;" 194 | ], 195 | "execution_count": 7, 196 | "outputs": [ 197 | { 198 | "output_type": "stream", 199 | "text": [ 200 | "Reading package lists... Done\n", 201 | "Building dependency tree \n", 202 | "Reading state information... 
Done\n", 203 | "Note, selecting 'cuda-8-0' for regex 'cuda-8.0'\n", 204 | "Note, selecting 'libcuda-8.0-1' for regex 'cuda-8.0'\n", 205 | "Note, selecting 'libcuda1-384' instead of 'libcuda-8.0-1'\n", 206 | "libcuda1-384 is already the newest version (384.130-0ubuntu0.17.10.1).\n", 207 | "cuda-8-0 is already the newest version (8.0.61-1).\n", 208 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 209 | ], 210 | "name": "stdout" 211 | } 212 | ] 213 | }, 214 | { 215 | "metadata": { 216 | "id": "IzaD9e62epuL", 217 | "colab_type": "code", 218 | "colab": {} 219 | }, 220 | "cell_type": "code", 221 | "source": [ 222 | "import os\n", 223 | "os.environ['PATH'] += ':/usr/local/cuda/bin'" 224 | ], 225 | "execution_count": 0, 226 | "outputs": [] 227 | }, 228 | { 229 | "metadata": { 230 | "id": "ayx2gqk8iICV", 231 | "colab_type": "code", 232 | "colab": { 233 | "base_uri": "https://localhost:8080/", 234 | "height": 119 235 | }, 236 | "outputId": "da46bca7-2c02-4b64-9363-31e7efe437df" 237 | }, 238 | "cell_type": "code", 239 | "source": [ 240 | "!apt install gcc-5 g++-5 -y" 241 | ], 242 | "execution_count": 9, 243 | "outputs": [ 244 | { 245 | "output_type": "stream", 246 | "text": [ 247 | "Reading package lists... Done\n", 248 | "Building dependency tree \n", 249 | "Reading state information... Done\n", 250 | "gcc-5 is already the newest version (5.5.0-1ubuntu2).\n", 251 | "g++-5 is already the newest version (5.5.0-1ubuntu2).\n", 252 | "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n" 253 | ], 254 | "name": "stdout" 255 | } 256 | ] 257 | }, 258 | { 259 | "metadata": { 260 | "id": "MFe_6wdBiLcy", 261 | "colab_type": "code", 262 | "colab": {} 263 | }, 264 | "cell_type": "code", 265 | "source": [ 266 | "import os\n", 267 | "os.environ['PATH'] += ':/usr/local/cuda/bin'" 268 | ], 269 | "execution_count": 0, 270 | "outputs": [] 271 | }, 272 | { 273 | "metadata": { 274 | "id": "1UT9EIGZiN12", 275 | "colab_type": "code", 276 | "colab": { 277 | "base_uri": "https://localhost:8080/", 278 | "height": 85 279 | }, 280 | "outputId": "8935d529-d8bf-4063-f95f-18e86ef49e22" 281 | }, 282 | "cell_type": "code", 283 | "source": [ 284 | "!nvcc --version" 285 | ], 286 | "execution_count": 11, 287 | "outputs": [ 288 | { 289 | "output_type": "stream", 290 | "text": [ 291 | "nvcc: NVIDIA (R) Cuda compiler driver\r\n", 292 | "Copyright (c) 2005-2016 NVIDIA Corporation\r\n", 293 | "Built on Tue_Jan_10_13:22:03_CST_2017\r\n", 294 | "Cuda compilation tools, release 8.0, V8.0.61\r\n" 295 | ], 296 | "name": "stdout" 297 | } 298 | ] 299 | }, 300 | { 301 | "metadata": { 302 | "id": "zjr4TsIFk21Z", 303 | "colab_type": "code", 304 | "colab": { 305 | "base_uri": "https://localhost:8080/", 306 | "height": 34 307 | }, 308 | "outputId": "a4d6d568-c783-408b-ea2d-fdda4b2139b1" 309 | }, 310 | "cell_type": "code", 311 | "source": [ 312 | "%%file version.cu\n", 313 | "#include \n", 314 | "#include \n", 315 | "\n", 316 | "int main(void)\n", 317 | "{\n", 318 | " int major = THRUST_MAJOR_VERSION;\n", 319 | " int minor = THRUST_MINOR_VERSION;\n", 320 | "\n", 321 | " std::cout << \"Thrust v\" << major << \".\" << minor << std::endl;\n", 322 | "\n", 323 | " return 0;\n", 324 | "}" 325 | ], 326 | "execution_count": 12, 327 | "outputs": [ 328 | { 329 | "output_type": "stream", 330 | "text": [ 331 | "Writing version.cu\n" 332 | ], 333 | "name": "stdout" 334 | } 335 | ] 336 | }, 337 | { 338 | "metadata": { 339 | "id": "7lVHw-Ezlf_L", 340 | "colab_type": "code", 341 | "colab": { 342 | "base_uri": "https://localhost:8080/", 
343 | "height": 51 344 | }, 345 | "outputId": "ae831aac-1584-4e94-ff77-ed686be5346e" 346 | }, 347 | "cell_type": "code", 348 | "source": [ 349 | "!nvcc version.cu -o version\n", 350 | "!./version" 351 | ], 352 | "execution_count": 13, 353 | "outputs": [ 354 | { 355 | "output_type": "stream", 356 | "text": [ 357 | "nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n", 358 | "Thrust v1.8\n" 359 | ], 360 | "name": "stdout" 361 | } 362 | ] 363 | }, 364 | { 365 | "metadata": { 366 | "id": "DXERV5wRlqX3", 367 | "colab_type": "code", 368 | "colab": { 369 | "base_uri": "https://localhost:8080/", 370 | "height": 34 371 | }, 372 | "outputId": "619b324c-c970-467c-9dc1-087a1490670b" 373 | }, 374 | "cell_type": "code", 375 | "source": [ 376 | "%%file thrust_example.cu\n", 377 | "#include \n", 378 | "#include \n", 379 | "#include \n", 380 | "#include \n", 381 | "#include \n", 382 | "#include \n", 383 | "#include \n", 384 | "\n", 385 | "int main(void)\n", 386 | "{\n", 387 | " // generate 32M random numbers serially\n", 388 | " thrust::host_vector h_vec(32 << 20);\n", 389 | " std::generate(h_vec.begin(), h_vec.end(), rand);\n", 390 | "\n", 391 | " // transfer data to the device\n", 392 | " thrust::device_vector d_vec = h_vec;\n", 393 | "\n", 394 | " // sort data on the device \n", 395 | " thrust::sort(d_vec.begin(), d_vec.end());\n", 396 | "\n", 397 | " // transfer data back to host\n", 398 | " thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());\n", 399 | "\n", 400 | " return 0;\n", 401 | "}" 402 | ], 403 | "execution_count": 14, 404 | "outputs": [ 405 | { 406 | "output_type": "stream", 407 | "text": [ 408 | "Writing thrust_example.cu\n" 409 | ], 410 | "name": "stdout" 411 | } 412 | ] 413 | }, 414 | { 415 | "metadata": { 416 | "id": "PSqPaE0amBwB", 417 | "colab_type": "code", 418 | "colab": { 419 | "base_uri": "https://localhost:8080/", 420 | "height": 34 421 | }, 422 | "outputId": "4be9eef0-eee1-4c04-b916-1b4559c0894b" 423 | }, 424 | "cell_type": "code", 425 | "source": [ 426 | "!nvcc thrust_example.cu -o thrust_example\n", 427 | "!./thrust_example\n" 428 | ], 429 | "execution_count": 15, 430 | "outputs": [ 431 | { 432 | "output_type": "stream", 433 | "text": [ 434 | "nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\r\n" 435 | ], 436 | "name": "stdout" 437 | } 438 | ] 439 | }, 440 | { 441 | "metadata": { 442 | "id": "v6gCqNHwmJp1", 443 | "colab_type": "code", 444 | "colab": { 445 | "base_uri": "https://localhost:8080/", 446 | "height": 581 447 | }, 448 | "outputId": "65f32a73-9ad6-4b27-cb72-c1aa5a3fec9f" 449 | }, 450 | "cell_type": "code", 451 | "source": [ 452 | "!nvprof ./thrust_example" 453 | ], 454 | "execution_count": 16, 455 | "outputs": [ 456 | { 457 | "output_type": "stream", 458 | "text": [ 459 | "==1986== NVPROF is profiling process 1986, command: ./thrust_example\n", 460 | "==1986== Profiling application: ./thrust_example\n", 461 | "==1986== Profiling result:\n", 462 | "Time(%) Time Calls Avg Min Max Name\n", 463 | " 30.98% 26.865ms 1 26.865ms 26.865ms 26.865ms [CUDA memcpy HtoD]\n", 464 | " 22.54% 19.542ms 4 4.8854ms 3.8770ms 5.2403ms void thrust::system::cuda::detail::cub_::DeviceRadixSortDownsweepKernel::PtxDownsweepPolicy, bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>(thrust::system::cuda::detail::cub_::NullType*, 
thrust::system::cuda::detail::cub_::NullType, int*, int, thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch::PtxDownsweepPolicy*, int*, int, int, bool, bool, thrust::system::cuda::detail::cub_::GridEvenShare)\n", 465 | " 21.37% 18.534ms 1 18.534ms 18.534ms 18.534ms [CUDA memcpy DtoH]\n", 466 | " 13.21% 11.459ms 3 3.8197ms 3.7991ms 3.8498ms void thrust::system::cuda::detail::cub_::DeviceRadixSortDownsweepKernel::PtxAltDownsweepPolicy, bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>(thrust::system::cuda::detail::cub_::NullType*, thrust::system::cuda::detail::cub_::NullType, int*, int, thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch::PtxAltDownsweepPolicy*, int*, int, int, bool, bool, thrust::system::cuda::detail::cub_::GridEvenShare)\n", 467 | " 5.08% 4.4069ms 4 1.1017ms 1.0984ms 1.1063ms void thrust::system::cuda::detail::cub_::DeviceRadixSortUpsweepKernel::PtxUpsweepPolicy, bool=0, int, int>(thrust::system::cuda::detail::cub_::NullType*, int*, thrust::system::cuda::detail::cub_::NullType*, int, int, bool, thrust::system::cuda::detail::cub_::GridEvenShare)\n", 468 | " 4.12% 3.5706ms 3 1.1902ms 1.1881ms 1.1933ms void thrust::system::cuda::detail::cub_::DeviceRadixSortUpsweepKernel::PtxAltUpsweepPolicy, bool=0, int, int>(thrust::system::cuda::detail::cub_::NullType*, int*, thrust::system::cuda::detail::cub_::NullType*, int, int, bool, thrust::system::cuda::detail::cub_::GridEvenShare)\n", 469 | " 2.35% 2.0400ms 1 2.0400ms 2.0400ms 2.0400ms [CUDA memcpy DtoD]\n", 470 | " 0.34% 298.43us 7 42.632us 32.160us 50.464us void thrust::system::cuda::detail::cub_::RadixSortScanBinsKernel::PtxScanPolicy, int>(int*, int)\n", 471 | "\n", 472 | "==1986== API calls:\n", 473 | "Time(%) Time Calls Avg Min Max Name\n", 474 | " 68.70% 197.26ms 2 98.628ms 481.23us 196.77ms cudaMalloc\n", 475 | " 15.94% 45.770ms 3 15.257ms 34.923us 26.979ms cudaMemcpyAsync\n", 476 | " 14.52% 41.688ms 2 20.844ms 515.99us 41.172ms cudaFree\n", 477 | " 0.28% 794.29us 91 8.7280us 2.6300us 267.81us cuDeviceGetAttribute\n", 478 | " 0.18% 512.13us 21 24.387us 18.936us 61.082us cudaLaunch\n", 479 | " 0.17% 485.34us 10 48.534us 44.333us 68.555us cudaFuncGetAttributes\n", 480 | " 0.11% 316.36us 1 316.36us 316.36us 316.36us cuDeviceTotalMem\n", 481 | " 0.05% 135.60us 2 67.798us 24.341us 111.26us cudaStreamSynchronize\n", 482 | " 0.01% 37.730us 1 37.730us 37.730us 37.730us cuDeviceGetName\n", 483 | " 0.01% 33.288us 6 5.5480us 5.3300us 6.3190us cudaDeviceGetAttribute\n", 484 | " 0.01% 31.534us 140 225ns 178ns 1.0860us cudaSetupArgument\n", 485 | " 0.01% 17.206us 2 8.6030us 8.4720us 8.7340us cudaDeviceGetSharedMemConfig\n", 486 | " 0.00% 12.342us 2 6.1710us 5.9050us 6.4370us cudaGetDevice\n", 487 | " 0.00% 11.925us 3 3.9750us 2.7300us 5.2650us cuDeviceGetCount\n", 488 | " 0.00% 10.324us 3 3.4410us 2.9730us 4.2770us cuDeviceGet\n", 489 | " 0.00% 8.4490us 21 402ns 321ns 926ns cudaPeekAtLastError\n", 490 | " 0.00% 6.8050us 21 324ns 203ns 2.0050us cudaConfigureCall\n" 491 | ], 492 | "name": "stdout" 493 | } 494 | ] 495 | }, 496 | { 497 | "metadata": { 498 | "id": "nKymLb9hnFIZ", 499 | "colab_type": "code", 500 | "colab": {} 501 | }, 502 | "cell_type": "code", 503 | "source": [ 504 | "" 505 | ], 506 | "execution_count": 0, 507 | "outputs": [] 508 | } 509 | ] 510 | } -------------------------------------------------------------------------------- /cuda-c/src/utils/GL/glut.h: -------------------------------------------------------------------------------- 1 | #ifndef __glut_h__ 2 | #define __glut_h__ 3 | 4 | /* 
Copyright (c) Mark J. Kilgard, 1994, 1995, 1996, 1998. */ 5 | 6 | /* This program is freely distributable without licensing fees and is 7 | provided without guarantee or warrantee expressed or implied. This 8 | program is -not- in the public domain. */ 9 | 10 | #if defined(_WIN32) 11 | 12 | /* GLUT 3.7 now tries to avoid including 13 | to avoid name space pollution, but Win32's 14 | needs APIENTRY and WINGDIAPI defined properly. */ 15 | # if 0 16 | /* This would put tons of macros and crap in our clean name space. */ 17 | # define WIN32_LEAN_AND_MEAN 18 | # include 19 | # else 20 | /* XXX This is from Win32's */ 21 | # ifndef APIENTRY 22 | # define GLUT_APIENTRY_DEFINED 23 | # if (_MSC_VER >= 800) || defined(_STDCALL_SUPPORTED) || defined(__BORLANDC__) || defined(__LCC__) 24 | # define APIENTRY __stdcall 25 | # else 26 | # define APIENTRY 27 | # endif 28 | # endif 29 | /* XXX This is from Win32's */ 30 | # ifndef CALLBACK 31 | # if (defined(_M_MRX000) || defined(_M_IX86) || defined(_M_ALPHA) || defined(_M_PPC)) && !defined(MIDL_PASS) || defined(__LCC__) 32 | # define CALLBACK __stdcall 33 | # else 34 | # define CALLBACK 35 | # endif 36 | # endif 37 | /* XXX Hack for lcc compiler. It doesn't support __declspec(dllimport), just __stdcall. */ 38 | # if defined( __LCC__ ) 39 | # undef WINGDIAPI 40 | # define WINGDIAPI __stdcall 41 | # else 42 | /* XXX This is from Win32's and */ 43 | # ifndef WINGDIAPI 44 | # define GLUT_WINGDIAPI_DEFINED 45 | # define WINGDIAPI __declspec(dllimport) 46 | # endif 47 | # endif 48 | /* XXX This is from Win32's */ 49 | # ifndef _WCHAR_T_DEFINED 50 | typedef unsigned short wchar_t; 51 | # define _WCHAR_T_DEFINED 52 | # endif 53 | # endif 54 | 55 | /* To disable automatic library usage for GLUT, define GLUT_NO_LIB_PRAGMA 56 | in your compile preprocessor options. */ 57 | # if !defined(GLUT_BUILDING_LIB) && !defined(GLUT_NO_LIB_PRAGMA) 58 | # pragma comment (lib, "winmm.lib") /* link with Windows MultiMedia lib */ 59 | /* To enable automatic SGI OpenGL for Windows library usage for GLUT, 60 | define GLUT_USE_SGI_OPENGL in your compile preprocessor options. */ 61 | # ifdef GLUT_USE_SGI_OPENGL 62 | # pragma comment (lib, "opengl.lib") /* link with SGI OpenGL for Windows lib */ 63 | # pragma comment (lib, "glu.lib") /* link with SGI OpenGL Utility lib */ 64 | # pragma comment (lib, "glut.lib") /* link with Win32 GLUT for SGI OpenGL lib */ 65 | # else 66 | # pragma comment (lib, "opengl32.lib") /* link with Microsoft OpenGL lib */ 67 | # pragma comment (lib, "glu32.lib") /* link with Microsoft OpenGL Utility lib */ 68 | # pragma comment (lib, "glut32.lib") /* link with Win32 GLUT lib */ 69 | # endif 70 | # endif 71 | 72 | /* To disable supression of annoying warnings about floats being promoted 73 | to doubles, define GLUT_NO_WARNING_DISABLE in your compile preprocessor 74 | options. */ 75 | # ifndef GLUT_NO_WARNING_DISABLE 76 | # pragma warning (disable:4244) /* Disable bogus VC++ 4.2 conversion warnings. */ 77 | # pragma warning (disable:4305) /* VC++ 5.0 version of above warning. */ 78 | # endif 79 | 80 | /* Win32 has an annoying issue where there are multiple C run-time 81 | libraries (CRTs). If the executable is linked with a different CRT 82 | from the GLUT DLL, the GLUT DLL will not share the same CRT static 83 | data seen by the executable. In particular, atexit callbacks registered 84 | in the executable will not be called if GLUT calls its (different) 85 | exit routine). 
GLUT is typically built with the 86 | "/MD" option (the CRT with multithreading DLL support), but the Visual 87 | C++ linker default is "/ML" (the single threaded CRT). 88 | 89 | One workaround to this issue is requiring users to always link with 90 | the same CRT as GLUT is compiled with. That requires users supply a 91 | non-standard option. GLUT 3.7 has its own built-in workaround where 92 | the executable's "exit" function pointer is covertly passed to GLUT. 93 | GLUT then calls the executable's exit function pointer to ensure that 94 | any "atexit" calls registered by the application are called if GLUT 95 | needs to exit. 96 | 97 | Note that the __glut*WithExit routines should NEVER be called directly. 98 | To avoid the atexit workaround, #define GLUT_DISABLE_ATEXIT_HACK. */ 99 | 100 | /* XXX This is from Win32's */ 101 | # if !defined(_MSC_VER) && !defined(__cdecl) 102 | /* Define __cdecl for non-Microsoft compilers. */ 103 | # define __cdecl 104 | # define GLUT_DEFINED___CDECL 105 | # endif 106 | # ifndef _CRTIMP 107 | # ifdef _NTSDK 108 | /* Definition compatible with NT SDK */ 109 | # define _CRTIMP 110 | # else 111 | /* Current definition */ 112 | # ifdef _DLL 113 | # define _CRTIMP __declspec(dllimport) 114 | # else 115 | # define _CRTIMP 116 | # endif 117 | # endif 118 | # define GLUT_DEFINED__CRTIMP 119 | # endif 120 | 121 | /* GLUT API entry point declarations for Win32. */ 122 | # ifdef GLUT_BUILDING_LIB 123 | # define GLUTAPI __declspec(dllexport) 124 | # else 125 | # ifdef _DLL 126 | # define GLUTAPI __declspec(dllimport) 127 | # else 128 | # define GLUTAPI extern 129 | # endif 130 | # endif 131 | 132 | /* GLUT callback calling convention for Win32. */ 133 | # define GLUTCALLBACK __cdecl 134 | 135 | #endif /* _WIN32 */ 136 | 137 | #include 138 | #include 139 | 140 | #ifdef __cplusplus 141 | extern "C" { 142 | #endif 143 | 144 | #if defined(_WIN32) 145 | # ifndef GLUT_BUILDING_LIB 146 | extern _CRTIMP void __cdecl exit(int); 147 | # endif 148 | #else 149 | /* non-Win32 case. */ 150 | /* Define APIENTRY and CALLBACK to nothing if we aren't on Win32. */ 151 | # define APIENTRY 152 | # define GLUT_APIENTRY_DEFINED 153 | # define CALLBACK 154 | /* Define GLUTAPI and GLUTCALLBACK as below if we aren't on Win32. */ 155 | # define GLUTAPI extern 156 | # define GLUTCALLBACK 157 | /* Prototype exit for the non-Win32 case (see above). */ 158 | extern void exit(int); 159 | #endif 160 | 161 | /** 162 | GLUT API revision history: 163 | 164 | GLUT_API_VERSION is updated to reflect incompatible GLUT 165 | API changes (interface changes, semantic changes, deletions, 166 | or additions). 167 | 168 | GLUT_API_VERSION=1 First public release of GLUT. 11/29/94 169 | 170 | GLUT_API_VERSION=2 Added support for OpenGL/GLX multisampling, 171 | extension. Supports new input devices like tablet, dial and button 172 | box, and Spaceball. Easy to query OpenGL extensions. 173 | 174 | GLUT_API_VERSION=3 glutMenuStatus added. 175 | 176 | GLUT_API_VERSION=4 glutInitDisplayString, glutWarpPointer, 177 | glutBitmapLength, glutStrokeLength, glutWindowStatusFunc, dynamic 178 | video resize subAPI, glutPostWindowRedisplay, glutKeyboardUpFunc, 179 | glutSpecialUpFunc, glutIgnoreKeyRepeat, glutSetKeyRepeat, 180 | glutJoystickFunc, glutForceJoystickFunc (NOT FINALIZED!). 
181 | **/ 182 | #ifndef GLUT_API_VERSION /* allow this to be overriden */ 183 | #define GLUT_API_VERSION 3 184 | #endif 185 | 186 | /** 187 | GLUT implementation revision history: 188 | 189 | GLUT_XLIB_IMPLEMENTATION is updated to reflect both GLUT 190 | API revisions and implementation revisions (ie, bug fixes). 191 | 192 | GLUT_XLIB_IMPLEMENTATION=1 mjk's first public release of 193 | GLUT Xlib-based implementation. 11/29/94 194 | 195 | GLUT_XLIB_IMPLEMENTATION=2 mjk's second public release of 196 | GLUT Xlib-based implementation providing GLUT version 2 197 | interfaces. 198 | 199 | GLUT_XLIB_IMPLEMENTATION=3 mjk's GLUT 2.2 images. 4/17/95 200 | 201 | GLUT_XLIB_IMPLEMENTATION=4 mjk's GLUT 2.3 images. 6/?/95 202 | 203 | GLUT_XLIB_IMPLEMENTATION=5 mjk's GLUT 3.0 images. 10/?/95 204 | 205 | GLUT_XLIB_IMPLEMENTATION=7 mjk's GLUT 3.1+ with glutWarpPoitner. 7/24/96 206 | 207 | GLUT_XLIB_IMPLEMENTATION=8 mjk's GLUT 3.1+ with glutWarpPoitner 208 | and video resize. 1/3/97 209 | 210 | GLUT_XLIB_IMPLEMENTATION=9 mjk's GLUT 3.4 release with early GLUT 4 routines. 211 | 212 | GLUT_XLIB_IMPLEMENTATION=11 Mesa 2.5's GLUT 3.6 release. 213 | 214 | GLUT_XLIB_IMPLEMENTATION=12 mjk's GLUT 3.6 release with early GLUT 4 routines + signal handling. 215 | 216 | GLUT_XLIB_IMPLEMENTATION=13 mjk's GLUT 3.7 beta with GameGLUT support. 217 | 218 | GLUT_XLIB_IMPLEMENTATION=14 mjk's GLUT 3.7 beta with f90gl friend interface. 219 | 220 | GLUT_XLIB_IMPLEMENTATION=15 mjk's GLUT 3.7 beta sync'ed with Mesa 221 | **/ 222 | #ifndef GLUT_XLIB_IMPLEMENTATION /* Allow this to be overriden. */ 223 | #define GLUT_XLIB_IMPLEMENTATION 15 224 | #endif 225 | 226 | /* Display mode bit masks. */ 227 | #define GLUT_RGB 0 228 | #define GLUT_RGBA GLUT_RGB 229 | #define GLUT_INDEX 1 230 | #define GLUT_SINGLE 0 231 | #define GLUT_DOUBLE 2 232 | #define GLUT_ACCUM 4 233 | #define GLUT_ALPHA 8 234 | #define GLUT_DEPTH 16 235 | #define GLUT_STENCIL 32 236 | #if (GLUT_API_VERSION >= 2) 237 | #define GLUT_MULTISAMPLE 128 238 | #define GLUT_STEREO 256 239 | #endif 240 | #if (GLUT_API_VERSION >= 3) 241 | #define GLUT_LUMINANCE 512 242 | #endif 243 | 244 | /* Mouse buttons. */ 245 | #define GLUT_LEFT_BUTTON 0 246 | #define GLUT_MIDDLE_BUTTON 1 247 | #define GLUT_RIGHT_BUTTON 2 248 | 249 | /* Mouse button state. */ 250 | #define GLUT_DOWN 0 251 | #define GLUT_UP 1 252 | 253 | #if (GLUT_API_VERSION >= 2) 254 | /* function keys */ 255 | #define GLUT_KEY_F1 1 256 | #define GLUT_KEY_F2 2 257 | #define GLUT_KEY_F3 3 258 | #define GLUT_KEY_F4 4 259 | #define GLUT_KEY_F5 5 260 | #define GLUT_KEY_F6 6 261 | #define GLUT_KEY_F7 7 262 | #define GLUT_KEY_F8 8 263 | #define GLUT_KEY_F9 9 264 | #define GLUT_KEY_F10 10 265 | #define GLUT_KEY_F11 11 266 | #define GLUT_KEY_F12 12 267 | /* directional keys */ 268 | #define GLUT_KEY_LEFT 100 269 | #define GLUT_KEY_UP 101 270 | #define GLUT_KEY_RIGHT 102 271 | #define GLUT_KEY_DOWN 103 272 | #define GLUT_KEY_PAGE_UP 104 273 | #define GLUT_KEY_PAGE_DOWN 105 274 | #define GLUT_KEY_HOME 106 275 | #define GLUT_KEY_END 107 276 | #define GLUT_KEY_INSERT 108 277 | #endif 278 | 279 | /* Entry/exit state. */ 280 | #define GLUT_LEFT 0 281 | #define GLUT_ENTERED 1 282 | 283 | /* Menu usage state. */ 284 | #define GLUT_MENU_NOT_IN_USE 0 285 | #define GLUT_MENU_IN_USE 1 286 | 287 | /* Visibility state. */ 288 | #define GLUT_NOT_VISIBLE 0 289 | #define GLUT_VISIBLE 1 290 | 291 | /* Window status state. 
*/ 292 | #define GLUT_HIDDEN 0 293 | #define GLUT_FULLY_RETAINED 1 294 | #define GLUT_PARTIALLY_RETAINED 2 295 | #define GLUT_FULLY_COVERED 3 296 | 297 | /* Color index component selection values. */ 298 | #define GLUT_RED 0 299 | #define GLUT_GREEN 1 300 | #define GLUT_BLUE 2 301 | 302 | #if defined(_WIN32) 303 | /* Stroke font constants (use these in GLUT program). */ 304 | #define GLUT_STROKE_ROMAN ((void*)0) 305 | #define GLUT_STROKE_MONO_ROMAN ((void*)1) 306 | 307 | /* Bitmap font constants (use these in GLUT program). */ 308 | #define GLUT_BITMAP_9_BY_15 ((void*)2) 309 | #define GLUT_BITMAP_8_BY_13 ((void*)3) 310 | #define GLUT_BITMAP_TIMES_ROMAN_10 ((void*)4) 311 | #define GLUT_BITMAP_TIMES_ROMAN_24 ((void*)5) 312 | #if (GLUT_API_VERSION >= 3) 313 | #define GLUT_BITMAP_HELVETICA_10 ((void*)6) 314 | #define GLUT_BITMAP_HELVETICA_12 ((void*)7) 315 | #define GLUT_BITMAP_HELVETICA_18 ((void*)8) 316 | #endif 317 | #else 318 | /* Stroke font opaque addresses (use constants instead in source code). */ 319 | GLUTAPI void *glutStrokeRoman; 320 | GLUTAPI void *glutStrokeMonoRoman; 321 | 322 | /* Stroke font constants (use these in GLUT program). */ 323 | #define GLUT_STROKE_ROMAN (&glutStrokeRoman) 324 | #define GLUT_STROKE_MONO_ROMAN (&glutStrokeMonoRoman) 325 | 326 | /* Bitmap font opaque addresses (use constants instead in source code). */ 327 | GLUTAPI void *glutBitmap9By15; 328 | GLUTAPI void *glutBitmap8By13; 329 | GLUTAPI void *glutBitmapTimesRoman10; 330 | GLUTAPI void *glutBitmapTimesRoman24; 331 | GLUTAPI void *glutBitmapHelvetica10; 332 | GLUTAPI void *glutBitmapHelvetica12; 333 | GLUTAPI void *glutBitmapHelvetica18; 334 | 335 | /* Bitmap font constants (use these in GLUT program). */ 336 | #define GLUT_BITMAP_9_BY_15 (&glutBitmap9By15) 337 | #define GLUT_BITMAP_8_BY_13 (&glutBitmap8By13) 338 | #define GLUT_BITMAP_TIMES_ROMAN_10 (&glutBitmapTimesRoman10) 339 | #define GLUT_BITMAP_TIMES_ROMAN_24 (&glutBitmapTimesRoman24) 340 | #if (GLUT_API_VERSION >= 3) 341 | #define GLUT_BITMAP_HELVETICA_10 (&glutBitmapHelvetica10) 342 | #define GLUT_BITMAP_HELVETICA_12 (&glutBitmapHelvetica12) 343 | #define GLUT_BITMAP_HELVETICA_18 (&glutBitmapHelvetica18) 344 | #endif 345 | #endif 346 | 347 | /* glutGet parameters. 
*/ 348 | #define GLUT_WINDOW_X ((GLenum) 100) 349 | #define GLUT_WINDOW_Y ((GLenum) 101) 350 | #define GLUT_WINDOW_WIDTH ((GLenum) 102) 351 | #define GLUT_WINDOW_HEIGHT ((GLenum) 103) 352 | #define GLUT_WINDOW_BUFFER_SIZE ((GLenum) 104) 353 | #define GLUT_WINDOW_STENCIL_SIZE ((GLenum) 105) 354 | #define GLUT_WINDOW_DEPTH_SIZE ((GLenum) 106) 355 | #define GLUT_WINDOW_RED_SIZE ((GLenum) 107) 356 | #define GLUT_WINDOW_GREEN_SIZE ((GLenum) 108) 357 | #define GLUT_WINDOW_BLUE_SIZE ((GLenum) 109) 358 | #define GLUT_WINDOW_ALPHA_SIZE ((GLenum) 110) 359 | #define GLUT_WINDOW_ACCUM_RED_SIZE ((GLenum) 111) 360 | #define GLUT_WINDOW_ACCUM_GREEN_SIZE ((GLenum) 112) 361 | #define GLUT_WINDOW_ACCUM_BLUE_SIZE ((GLenum) 113) 362 | #define GLUT_WINDOW_ACCUM_ALPHA_SIZE ((GLenum) 114) 363 | #define GLUT_WINDOW_DOUBLEBUFFER ((GLenum) 115) 364 | #define GLUT_WINDOW_RGBA ((GLenum) 116) 365 | #define GLUT_WINDOW_PARENT ((GLenum) 117) 366 | #define GLUT_WINDOW_NUM_CHILDREN ((GLenum) 118) 367 | #define GLUT_WINDOW_COLORMAP_SIZE ((GLenum) 119) 368 | #if (GLUT_API_VERSION >= 2) 369 | #define GLUT_WINDOW_NUM_SAMPLES ((GLenum) 120) 370 | #define GLUT_WINDOW_STEREO ((GLenum) 121) 371 | #endif 372 | #if (GLUT_API_VERSION >= 3) 373 | #define GLUT_WINDOW_CURSOR ((GLenum) 122) 374 | #endif 375 | #define GLUT_SCREEN_WIDTH ((GLenum) 200) 376 | #define GLUT_SCREEN_HEIGHT ((GLenum) 201) 377 | #define GLUT_SCREEN_WIDTH_MM ((GLenum) 202) 378 | #define GLUT_SCREEN_HEIGHT_MM ((GLenum) 203) 379 | #define GLUT_MENU_NUM_ITEMS ((GLenum) 300) 380 | #define GLUT_DISPLAY_MODE_POSSIBLE ((GLenum) 400) 381 | #define GLUT_INIT_WINDOW_X ((GLenum) 500) 382 | #define GLUT_INIT_WINDOW_Y ((GLenum) 501) 383 | #define GLUT_INIT_WINDOW_WIDTH ((GLenum) 502) 384 | #define GLUT_INIT_WINDOW_HEIGHT ((GLenum) 503) 385 | #define GLUT_INIT_DISPLAY_MODE ((GLenum) 504) 386 | #if (GLUT_API_VERSION >= 2) 387 | #define GLUT_ELAPSED_TIME ((GLenum) 700) 388 | #endif 389 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13) 390 | #define GLUT_WINDOW_FORMAT_ID ((GLenum) 123) 391 | #endif 392 | 393 | #if (GLUT_API_VERSION >= 2) 394 | /* glutDeviceGet parameters. */ 395 | #define GLUT_HAS_KEYBOARD ((GLenum) 600) 396 | #define GLUT_HAS_MOUSE ((GLenum) 601) 397 | #define GLUT_HAS_SPACEBALL ((GLenum) 602) 398 | #define GLUT_HAS_DIAL_AND_BUTTON_BOX ((GLenum) 603) 399 | #define GLUT_HAS_TABLET ((GLenum) 604) 400 | #define GLUT_NUM_MOUSE_BUTTONS ((GLenum) 605) 401 | #define GLUT_NUM_SPACEBALL_BUTTONS ((GLenum) 606) 402 | #define GLUT_NUM_BUTTON_BOX_BUTTONS ((GLenum) 607) 403 | #define GLUT_NUM_DIALS ((GLenum) 608) 404 | #define GLUT_NUM_TABLET_BUTTONS ((GLenum) 609) 405 | #endif 406 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13) 407 | #define GLUT_DEVICE_IGNORE_KEY_REPEAT ((GLenum) 610) 408 | #define GLUT_DEVICE_KEY_REPEAT ((GLenum) 611) 409 | #define GLUT_HAS_JOYSTICK ((GLenum) 612) 410 | #define GLUT_OWNS_JOYSTICK ((GLenum) 613) 411 | #define GLUT_JOYSTICK_BUTTONS ((GLenum) 614) 412 | #define GLUT_JOYSTICK_AXES ((GLenum) 615) 413 | #define GLUT_JOYSTICK_POLL_RATE ((GLenum) 616) 414 | #endif 415 | 416 | #if (GLUT_API_VERSION >= 3) 417 | /* glutLayerGet parameters. 
*/ 418 | #define GLUT_OVERLAY_POSSIBLE ((GLenum) 800) 419 | #define GLUT_LAYER_IN_USE ((GLenum) 801) 420 | #define GLUT_HAS_OVERLAY ((GLenum) 802) 421 | #define GLUT_TRANSPARENT_INDEX ((GLenum) 803) 422 | #define GLUT_NORMAL_DAMAGED ((GLenum) 804) 423 | #define GLUT_OVERLAY_DAMAGED ((GLenum) 805) 424 | 425 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9) 426 | /* glutVideoResizeGet parameters. */ 427 | #define GLUT_VIDEO_RESIZE_POSSIBLE ((GLenum) 900) 428 | #define GLUT_VIDEO_RESIZE_IN_USE ((GLenum) 901) 429 | #define GLUT_VIDEO_RESIZE_X_DELTA ((GLenum) 902) 430 | #define GLUT_VIDEO_RESIZE_Y_DELTA ((GLenum) 903) 431 | #define GLUT_VIDEO_RESIZE_WIDTH_DELTA ((GLenum) 904) 432 | #define GLUT_VIDEO_RESIZE_HEIGHT_DELTA ((GLenum) 905) 433 | #define GLUT_VIDEO_RESIZE_X ((GLenum) 906) 434 | #define GLUT_VIDEO_RESIZE_Y ((GLenum) 907) 435 | #define GLUT_VIDEO_RESIZE_WIDTH ((GLenum) 908) 436 | #define GLUT_VIDEO_RESIZE_HEIGHT ((GLenum) 909) 437 | #endif 438 | 439 | /* glutUseLayer parameters. */ 440 | #define GLUT_NORMAL ((GLenum) 0) 441 | #define GLUT_OVERLAY ((GLenum) 1) 442 | 443 | /* glutGetModifiers return mask. */ 444 | #define GLUT_ACTIVE_SHIFT 1 445 | #define GLUT_ACTIVE_CTRL 2 446 | #define GLUT_ACTIVE_ALT 4 447 | 448 | /* glutSetCursor parameters. */ 449 | /* Basic arrows. */ 450 | #define GLUT_CURSOR_RIGHT_ARROW 0 451 | #define GLUT_CURSOR_LEFT_ARROW 1 452 | /* Symbolic cursor shapes. */ 453 | #define GLUT_CURSOR_INFO 2 454 | #define GLUT_CURSOR_DESTROY 3 455 | #define GLUT_CURSOR_HELP 4 456 | #define GLUT_CURSOR_CYCLE 5 457 | #define GLUT_CURSOR_SPRAY 6 458 | #define GLUT_CURSOR_WAIT 7 459 | #define GLUT_CURSOR_TEXT 8 460 | #define GLUT_CURSOR_CROSSHAIR 9 461 | /* Directional cursors. */ 462 | #define GLUT_CURSOR_UP_DOWN 10 463 | #define GLUT_CURSOR_LEFT_RIGHT 11 464 | /* Sizing cursors. */ 465 | #define GLUT_CURSOR_TOP_SIDE 12 466 | #define GLUT_CURSOR_BOTTOM_SIDE 13 467 | #define GLUT_CURSOR_LEFT_SIDE 14 468 | #define GLUT_CURSOR_RIGHT_SIDE 15 469 | #define GLUT_CURSOR_TOP_LEFT_CORNER 16 470 | #define GLUT_CURSOR_TOP_RIGHT_CORNER 17 471 | #define GLUT_CURSOR_BOTTOM_RIGHT_CORNER 18 472 | #define GLUT_CURSOR_BOTTOM_LEFT_CORNER 19 473 | /* Inherit from parent window. */ 474 | #define GLUT_CURSOR_INHERIT 100 475 | /* Blank cursor. */ 476 | #define GLUT_CURSOR_NONE 101 477 | /* Fullscreen crosshair (if available). */ 478 | #define GLUT_CURSOR_FULL_CROSSHAIR 102 479 | #endif 480 | 481 | /* GLUT initialization sub-API. */ 482 | GLUTAPI void APIENTRY glutInit(int *argcp, char **argv); 483 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK) 484 | GLUTAPI void APIENTRY __glutInitWithExit(int *argcp, char **argv, void (__cdecl *exitfunc)(int)); 485 | #ifndef GLUT_BUILDING_LIB 486 | static void APIENTRY glutInit_ATEXIT_HACK(int *argcp, char **argv) { __glutInitWithExit(argcp, argv, exit); } 487 | #define glutInit glutInit_ATEXIT_HACK 488 | #endif 489 | #endif 490 | GLUTAPI void APIENTRY glutInitDisplayMode(unsigned int mode); 491 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9) 492 | GLUTAPI void APIENTRY glutInitDisplayString(const char *string); 493 | #endif 494 | GLUTAPI void APIENTRY glutInitWindowPosition(int x, int y); 495 | GLUTAPI void APIENTRY glutInitWindowSize(int width, int height); 496 | GLUTAPI void APIENTRY glutMainLoop(void); 497 | 498 | /* GLUT window sub-API. 
*/ 499 | GLUTAPI int APIENTRY glutCreateWindow(const char *title); 500 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK) 501 | GLUTAPI int APIENTRY __glutCreateWindowWithExit(const char *title, void (__cdecl *exitfunc)(int)); 502 | #ifndef GLUT_BUILDING_LIB 503 | static int APIENTRY glutCreateWindow_ATEXIT_HACK(const char *title) { return __glutCreateWindowWithExit(title, exit); } 504 | #define glutCreateWindow glutCreateWindow_ATEXIT_HACK 505 | #endif 506 | #endif 507 | GLUTAPI int APIENTRY glutCreateSubWindow(int win, int x, int y, int width, int height); 508 | GLUTAPI void APIENTRY glutDestroyWindow(int win); 509 | GLUTAPI void APIENTRY glutPostRedisplay(void); 510 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11) 511 | GLUTAPI void APIENTRY glutPostWindowRedisplay(int win); 512 | #endif 513 | GLUTAPI void APIENTRY glutSwapBuffers(void); 514 | GLUTAPI int APIENTRY glutGetWindow(void); 515 | GLUTAPI void APIENTRY glutSetWindow(int win); 516 | GLUTAPI void APIENTRY glutSetWindowTitle(const char *title); 517 | GLUTAPI void APIENTRY glutSetIconTitle(const char *title); 518 | GLUTAPI void APIENTRY glutPositionWindow(int x, int y); 519 | GLUTAPI void APIENTRY glutReshapeWindow(int width, int height); 520 | GLUTAPI void APIENTRY glutPopWindow(void); 521 | GLUTAPI void APIENTRY glutPushWindow(void); 522 | GLUTAPI void APIENTRY glutIconifyWindow(void); 523 | GLUTAPI void APIENTRY glutShowWindow(void); 524 | GLUTAPI void APIENTRY glutHideWindow(void); 525 | #if (GLUT_API_VERSION >= 3) 526 | GLUTAPI void APIENTRY glutFullScreen(void); 527 | GLUTAPI void APIENTRY glutSetCursor(int cursor); 528 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9) 529 | GLUTAPI void APIENTRY glutWarpPointer(int x, int y); 530 | #endif 531 | 532 | /* GLUT overlay sub-API. */ 533 | GLUTAPI void APIENTRY glutEstablishOverlay(void); 534 | GLUTAPI void APIENTRY glutRemoveOverlay(void); 535 | GLUTAPI void APIENTRY glutUseLayer(GLenum layer); 536 | GLUTAPI void APIENTRY glutPostOverlayRedisplay(void); 537 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11) 538 | GLUTAPI void APIENTRY glutPostWindowOverlayRedisplay(int win); 539 | #endif 540 | GLUTAPI void APIENTRY glutShowOverlay(void); 541 | GLUTAPI void APIENTRY glutHideOverlay(void); 542 | #endif 543 | 544 | /* GLUT menu sub-API. */ 545 | GLUTAPI int APIENTRY glutCreateMenu(void (GLUTCALLBACK *func)(int)); 546 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK) 547 | GLUTAPI int APIENTRY __glutCreateMenuWithExit(void (GLUTCALLBACK *func)(int), void (__cdecl *exitfunc)(int)); 548 | #ifndef GLUT_BUILDING_LIB 549 | static int APIENTRY glutCreateMenu_ATEXIT_HACK(void (GLUTCALLBACK *func)(int)) { return __glutCreateMenuWithExit(func, exit); } 550 | #define glutCreateMenu glutCreateMenu_ATEXIT_HACK 551 | #endif 552 | #endif 553 | GLUTAPI void APIENTRY glutDestroyMenu(int menu); 554 | GLUTAPI int APIENTRY glutGetMenu(void); 555 | GLUTAPI void APIENTRY glutSetMenu(int menu); 556 | GLUTAPI void APIENTRY glutAddMenuEntry(const char *label, int value); 557 | GLUTAPI void APIENTRY glutAddSubMenu(const char *label, int submenu); 558 | GLUTAPI void APIENTRY glutChangeToMenuEntry(int item, const char *label, int value); 559 | GLUTAPI void APIENTRY glutChangeToSubMenu(int item, const char *label, int submenu); 560 | GLUTAPI void APIENTRY glutRemoveMenuItem(int item); 561 | GLUTAPI void APIENTRY glutAttachMenu(int button); 562 | GLUTAPI void APIENTRY glutDetachMenu(int button); 563 | 564 | /* GLUT window callback sub-API. 
*/ 565 | GLUTAPI void APIENTRY glutDisplayFunc(void (GLUTCALLBACK *func)(void)); 566 | GLUTAPI void APIENTRY glutReshapeFunc(void (GLUTCALLBACK *func)(int width, int height)); 567 | GLUTAPI void APIENTRY glutKeyboardFunc(void (GLUTCALLBACK *func)(unsigned char key, int x, int y)); 568 | GLUTAPI void APIENTRY glutMouseFunc(void (GLUTCALLBACK *func)(int button, int state, int x, int y)); 569 | GLUTAPI void APIENTRY glutMotionFunc(void (GLUTCALLBACK *func)(int x, int y)); 570 | GLUTAPI void APIENTRY glutPassiveMotionFunc(void (GLUTCALLBACK *func)(int x, int y)); 571 | GLUTAPI void APIENTRY glutEntryFunc(void (GLUTCALLBACK *func)(int state)); 572 | GLUTAPI void APIENTRY glutVisibilityFunc(void (GLUTCALLBACK *func)(int state)); 573 | GLUTAPI void APIENTRY glutIdleFunc(void (GLUTCALLBACK *func)(void)); 574 | GLUTAPI void APIENTRY glutTimerFunc(unsigned int millis, void (GLUTCALLBACK *func)(int value), int value); 575 | GLUTAPI void APIENTRY glutMenuStateFunc(void (GLUTCALLBACK *func)(int state)); 576 | #if (GLUT_API_VERSION >= 2) 577 | GLUTAPI void APIENTRY glutSpecialFunc(void (GLUTCALLBACK *func)(int key, int x, int y)); 578 | GLUTAPI void APIENTRY glutSpaceballMotionFunc(void (GLUTCALLBACK *func)(int x, int y, int z)); 579 | GLUTAPI void APIENTRY glutSpaceballRotateFunc(void (GLUTCALLBACK *func)(int x, int y, int z)); 580 | GLUTAPI void APIENTRY glutSpaceballButtonFunc(void (GLUTCALLBACK *func)(int button, int state)); 581 | GLUTAPI void APIENTRY glutButtonBoxFunc(void (GLUTCALLBACK *func)(int button, int state)); 582 | GLUTAPI void APIENTRY glutDialsFunc(void (GLUTCALLBACK *func)(int dial, int value)); 583 | GLUTAPI void APIENTRY glutTabletMotionFunc(void (GLUTCALLBACK *func)(int x, int y)); 584 | GLUTAPI void APIENTRY glutTabletButtonFunc(void (GLUTCALLBACK *func)(int button, int state, int x, int y)); 585 | #if (GLUT_API_VERSION >= 3) 586 | GLUTAPI void APIENTRY glutMenuStatusFunc(void (GLUTCALLBACK *func)(int status, int x, int y)); 587 | GLUTAPI void APIENTRY glutOverlayDisplayFunc(void (GLUTCALLBACK *func)(void)); 588 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9) 589 | GLUTAPI void APIENTRY glutWindowStatusFunc(void (GLUTCALLBACK *func)(int state)); 590 | #endif 591 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13) 592 | GLUTAPI void APIENTRY glutKeyboardUpFunc(void (GLUTCALLBACK *func)(unsigned char key, int x, int y)); 593 | GLUTAPI void APIENTRY glutSpecialUpFunc(void (GLUTCALLBACK *func)(int key, int x, int y)); 594 | GLUTAPI void APIENTRY glutJoystickFunc(void (GLUTCALLBACK *func)(unsigned int buttonMask, int x, int y, int z), int pollInterval); 595 | #endif 596 | #endif 597 | #endif 598 | 599 | /* GLUT color index sub-API. */ 600 | GLUTAPI void APIENTRY glutSetColor(int, GLfloat red, GLfloat green, GLfloat blue); 601 | GLUTAPI GLfloat APIENTRY glutGetColor(int ndx, int component); 602 | GLUTAPI void APIENTRY glutCopyColormap(int win); 603 | 604 | /* GLUT state retrieval sub-API. 
*/ 605 | GLUTAPI int APIENTRY glutGet(GLenum type); 606 | GLUTAPI int APIENTRY glutDeviceGet(GLenum type); 607 | #if (GLUT_API_VERSION >= 2) 608 | /* GLUT extension support sub-API */ 609 | GLUTAPI int APIENTRY glutExtensionSupported(const char *name); 610 | #endif 611 | #if (GLUT_API_VERSION >= 3) 612 | GLUTAPI int APIENTRY glutGetModifiers(void); 613 | GLUTAPI int APIENTRY glutLayerGet(GLenum type); 614 | #endif 615 | 616 | /* GLUT font sub-API */ 617 | GLUTAPI void APIENTRY glutBitmapCharacter(void *font, int character); 618 | GLUTAPI int APIENTRY glutBitmapWidth(void *font, int character); 619 | GLUTAPI void APIENTRY glutStrokeCharacter(void *font, int character); 620 | GLUTAPI int APIENTRY glutStrokeWidth(void *font, int character); 621 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9) 622 | GLUTAPI int APIENTRY glutBitmapLength(void *font, const unsigned char *string); 623 | GLUTAPI int APIENTRY glutStrokeLength(void *font, const unsigned char *string); 624 | #endif 625 | 626 | /* GLUT pre-built models sub-API */ 627 | GLUTAPI void APIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks); 628 | GLUTAPI void APIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks); 629 | GLUTAPI void APIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks); 630 | GLUTAPI void APIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks); 631 | GLUTAPI void APIENTRY glutWireCube(GLdouble size); 632 | GLUTAPI void APIENTRY glutSolidCube(GLdouble size); 633 | GLUTAPI void APIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings); 634 | GLUTAPI void APIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings); 635 | GLUTAPI void APIENTRY glutWireDodecahedron(void); 636 | GLUTAPI void APIENTRY glutSolidDodecahedron(void); 637 | GLUTAPI void APIENTRY glutWireTeapot(GLdouble size); 638 | GLUTAPI void APIENTRY glutSolidTeapot(GLdouble size); 639 | GLUTAPI void APIENTRY glutWireOctahedron(void); 640 | GLUTAPI void APIENTRY glutSolidOctahedron(void); 641 | GLUTAPI void APIENTRY glutWireTetrahedron(void); 642 | GLUTAPI void APIENTRY glutSolidTetrahedron(void); 643 | GLUTAPI void APIENTRY glutWireIcosahedron(void); 644 | GLUTAPI void APIENTRY glutSolidIcosahedron(void); 645 | 646 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9) 647 | /* GLUT video resize sub-API. */ 648 | GLUTAPI int APIENTRY glutVideoResizeGet(GLenum param); 649 | GLUTAPI void APIENTRY glutSetupVideoResizing(void); 650 | GLUTAPI void APIENTRY glutStopVideoResizing(void); 651 | GLUTAPI void APIENTRY glutVideoResize(int x, int y, int width, int height); 652 | GLUTAPI void APIENTRY glutVideoPan(int x, int y, int width, int height); 653 | 654 | /* GLUT debugging sub-API. */ 655 | GLUTAPI void APIENTRY glutReportErrors(void); 656 | #endif 657 | 658 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13) 659 | /* GLUT device control sub-API. */ 660 | /* glutSetKeyRepeat modes. */ 661 | #define GLUT_KEY_REPEAT_OFF 0 662 | #define GLUT_KEY_REPEAT_ON 1 663 | #define GLUT_KEY_REPEAT_DEFAULT 2 664 | 665 | /* Joystick button masks. 
*/ 666 | #define GLUT_JOYSTICK_BUTTON_A 1 667 | #define GLUT_JOYSTICK_BUTTON_B 2 668 | #define GLUT_JOYSTICK_BUTTON_C 4 669 | #define GLUT_JOYSTICK_BUTTON_D 8 670 | 671 | GLUTAPI void APIENTRY glutIgnoreKeyRepeat(int ignore); 672 | GLUTAPI void APIENTRY glutSetKeyRepeat(int repeatMode); 673 | GLUTAPI void APIENTRY glutForceJoystickFunc(void); 674 | 675 | /* GLUT game mode sub-API. */ 676 | /* glutGameModeGet. */ 677 | #define GLUT_GAME_MODE_ACTIVE ((GLenum) 0) 678 | #define GLUT_GAME_MODE_POSSIBLE ((GLenum) 1) 679 | #define GLUT_GAME_MODE_WIDTH ((GLenum) 2) 680 | #define GLUT_GAME_MODE_HEIGHT ((GLenum) 3) 681 | #define GLUT_GAME_MODE_PIXEL_DEPTH ((GLenum) 4) 682 | #define GLUT_GAME_MODE_REFRESH_RATE ((GLenum) 5) 683 | #define GLUT_GAME_MODE_DISPLAY_CHANGED ((GLenum) 6) 684 | 685 | GLUTAPI void APIENTRY glutGameModeString(const char *string); 686 | GLUTAPI int APIENTRY glutEnterGameMode(void); 687 | GLUTAPI void APIENTRY glutLeaveGameMode(void); 688 | GLUTAPI int APIENTRY glutGameModeGet(GLenum mode); 689 | #endif 690 | 691 | #ifdef __cplusplus 692 | } 693 | 694 | #endif 695 | 696 | #ifdef GLUT_APIENTRY_DEFINED 697 | # undef GLUT_APIENTRY_DEFINED 698 | # undef APIENTRY 699 | #endif 700 | 701 | #ifdef GLUT_WINGDIAPI_DEFINED 702 | # undef GLUT_WINGDIAPI_DEFINED 703 | # undef WINGDIAPI 704 | #endif 705 | 706 | #ifdef GLUT_DEFINED___CDECL 707 | # undef GLUT_DEFINED___CDECL 708 | # undef __cdecl 709 | #endif 710 | 711 | #ifdef GLUT_DEFINED__CRTIMP 712 | # undef GLUT_DEFINED__CRTIMP 713 | # undef _CRTIMP 714 | #endif 715 | 716 | #endif /* __glut_h__ */ 717 | -------------------------------------------------------------------------------- /cuda-c/src/cuda-programming-model/04-timing-kernel.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 24, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stdout", 10 | "output_type": "stream", 11 | "text": [ 12 | "Overwriting sumArraysOnGPU.cu\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "%%file sumArraysOnGPU.cu\n", 18 | "\n", 19 | "#include <cuda_runtime.h>\n", 20 | "#include <stdio.h>\n", 21 | "#include <stdlib.h>\n", 22 | "#include <string.h>\n", 23 | "#include <time.h>\n", 24 | "#include <sys/time.h>\n", 25 | "\n", 26 | "double cpuSecond(){\n", 27 | "    struct timeval tp;\n", 28 | "    gettimeofday(&tp, NULL);\n", 29 | "    return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);\n", 30 | "}\n", 31 | "\n", 32 | "#define CHECK(call) \\\n", 33 | "{ \\\n", 34 | "    const cudaError_t error = call; \\\n", 35 | "    if (error != cudaSuccess) \\\n", 36 | "    { \\\n", 37 | "        fprintf(stderr, \"Error: %s:%d, \", __FILE__, __LINE__); \\\n", 38 | "        fprintf(stderr, \"code: %d, reason: %s\\n\", error, \\\n", 39 | "                cudaGetErrorString(error)); \\\n", 40 | "        exit(1); \\\n", 41 | "    } \\\n", 42 | "}\n", 43 | "\n", 44 | "\n", 45 | "__global__ void sumArraysOnDevice(float *A, float *B, float *C, const int N){\n", 46 | "    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n", 47 | "    if (idx < N) C[idx] = A[idx] + B[idx];\n", 48 | "\n", 49 | "}\n", 50 | "\n", 51 | "\n", 52 | "void initialData(float *ip, int size){\n", 53 | "    // generate different seed for random number \n", 54 | "    time_t t;\n", 55 | "    srand((unsigned int) time (&t));\n", 56 | "    \n", 57 | "    for (int i=0; i<size; i++){\n", 58 | "        ip[i] = (float)(rand() & 0xFF)/10.0f;\n", 59 | "    }\n", 60 | "}\n", 61 | "\n", 62 | "\n", 63 | "void sumArraysOnHost(float *A, float *B, float *C, const int N){\n", 64 | "    for (int idx=0; idx<N; idx++){\n", 65 | "        C[idx] = A[idx] + B[idx];\n", 66 | "    }\n", 67 | "}\n", 68 | "\n", 69 | "\n", 70 | "void checkResult(float *hostRef, float *gpuRef, const int N){\n", 71 | "    double epsilon = 1.0E-8;\n", 72 | "    int match = 1;\n", 73 | "    \n", 74 | "    for (int i=0; i<N; i++){\n", 75 | "    if (abs(hostRef[i] - gpuRef[i]) > epsilon){\n", 76 | "        match = 0;\n", 77 | "        printf(\"Arrays do not match!\\n\");\n", 78 | "        printf(\"host %5.2f gpu %5.2f at current %d\\n\",\n", 79 | "            hostRef[i], gpuRef[i], i);\n", 80 | "        break;\n", 81 | "    }\n", 82 | "    }\n", 83 | "    if (match) printf(\"Arrays match. \n\n\");\n", 84 | "}\n", 85 | "\n", 86 | "\n",
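"// NOTE: cpuSecond() above is a host-side wall-clock timer built on gettimeofday;\n", "// a device-side alternative is CUDA events (a sketch, not used in main below):\n", "//     cudaEvent_t start, stop;\n", "//     cudaEventCreate(&start); cudaEventCreate(&stop);\n", "//     cudaEventRecord(start, 0);\n", "//     sumArraysOnDevice<<<grid, block>>>(d_A, d_B, d_C, nElem);\n", "//     cudaEventRecord(stop, 0);\n", "//     cudaEventSynchronize(stop);              // wait for the kernel to finish\n", "//     float ms = 0.0f;\n", "//     cudaEventElapsedTime(&ms, start, stop);  // elapsed GPU time in milliseconds\n", "//     cudaEventDestroy(start); cudaEventDestroy(stop);\n", "\n",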
\\n\\n\");\n", 84 | "}\n", 85 | "\n", 86 | "\n", 87 | "int main(int argc, char **argv){\n", 88 | " \n", 89 | " printf(\"%s Starting...\\n\", argv[0]);\n", 90 | " \n", 91 | " // malloc host memory\n", 92 | " int nElem = 1 <<24;\n", 93 | " size_t nBytes = nElem * sizeof(float);\n", 94 | " \n", 95 | " \n", 96 | " // initialize data at host side\n", 97 | " float *h_A, *h_B, *hostRef, *gpuRef;\n", 98 | " h_A = (float *)malloc(nBytes);\n", 99 | " h_B = (float *)malloc(nBytes);\n", 100 | " hostRef = (float *)malloc(nBytes);\n", 101 | " gpuRef = (float *)malloc(nBytes);\n", 102 | " \n", 103 | " // initialize data at host side\n", 104 | " initialData(h_A, nElem);\n", 105 | " initialData(h_B, nElem);\n", 106 | " \n", 107 | " memset(hostRef, 0, nBytes);\n", 108 | " memset(gpuRef, 0, nBytes);\n", 109 | " \n", 110 | " // malloc device global memory \n", 111 | " float *d_A, *d_B, *d_C;\n", 112 | " cudaMalloc((float**)&d_A, nBytes);\n", 113 | " cudaMalloc((float**)&d_B, nBytes);\n", 114 | " cudaMalloc((float**)&d_C, nBytes);\n", 115 | " \n", 116 | " // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the\n", 117 | " // parameter cudaMemcpyHostToDevice specifying the transfer direction.\n", 118 | " \n", 119 | " CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));\n", 120 | " CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));\n", 121 | " \n", 122 | " // invoke kernel at host side\n", 123 | " int iLen = 128;\n", 124 | " dim3 block(iLen);\n", 125 | " dim3 grid((nElem+block.x-1)/block.x);\n", 126 | " \n", 127 | " double iStart = cpuSecond();\n", 128 | " sumArraysOnDevice<<>>(d_A, d_B, d_C, nElem);\n", 129 | " CHECK(cudaDeviceSynchronize());\n", 130 | " double iElaps = cpuSecond() - iStart;\n", 131 | " printf(\"sumArraysOnGPU <<<%d,%d>>> Time elapsed %f sec\\n\", grid.x, block.x, iElaps);\n", 132 | " //printf(\"Execution configuration <<<%d, %d>>>\\n\", grid.x, block.x);\n", 133 | " \n", 134 | " // copy kernel result back to host side \n", 135 | " cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);\n", 136 | " \n", 137 | " // add vector at host side for result checks\n", 138 | " sumArraysOnHost(h_A, h_B, hostRef, nElem);\n", 139 | " \n", 140 | " for (int i=0; i<10; i++){\n", 141 | " printf(\"%f + %f = %f \\n\", h_A[i], h_B[i], hostRef[i]);\n", 142 | "\n", 143 | " }\n", 144 | " \n", 145 | " // check device results\n", 146 | " checkResult(hostRef, gpuRef, nElem);\n", 147 | " \n", 148 | " free(h_A);\n", 149 | " free(h_B);\n", 150 | " free(hostRef);\n", 151 | " free(gpuRef);\n", 152 | " \n", 153 | " // use cudaFree to release the memory used on the GPU\n", 154 | " cudaFree(d_A);\n", 155 | " cudaFree(d_B);\n", 156 | " cudaFree(d_C);\n", 157 | " cudaDeviceReset();\n", 158 | " \n", 159 | " return (0);\n", 160 | "}\n" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 26, 166 | "metadata": {}, 167 | "outputs": [ 168 | { 169 | "name": "stdout", 170 | "output_type": "stream", 171 | "text": [ 172 | "./addvector Starting...\n", 173 | "sumArraysOnGPU <<<131072,128>>> Time elapsed 0.016467 sec\n", 174 | "2.800000 + 2.800000 = 5.600000 \n", 175 | "10.000000 + 10.000000 = 20.000000 \n", 176 | "2.600000 + 2.600000 = 5.200000 \n", 177 | "22.299999 + 22.299999 = 44.599998 \n", 178 | "11.000000 + 11.000000 = 22.000000 \n", 179 | "9.900000 + 9.900000 = 19.799999 \n", 180 | "14.600000 + 14.600000 = 29.200001 \n", 181 | "22.299999 + 22.299999 = 44.599998 \n", 182 | "21.100000 + 21.100000 = 42.200001 \n", 183 | "8.600000 + 8.600000 = 17.200001 \n", 
184 | "Arrays match. \n", 185 | "\n" 186 | ] 187 | } 188 | ], 189 | "source": [ 190 | "%%bash\n", 191 | "nvcc sumArraysOnGPU.cu -o addvector\n", 192 | "./addvector" 193 | ] 194 | }, 195 | { 196 | "cell_type": "markdown", 197 | "metadata": {}, 198 | "source": [ 199 | "## Timing with nvprof" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 27, 205 | "metadata": {}, 206 | "outputs": [ 207 | { 208 | "name": "stdout", 209 | "output_type": "stream", 210 | "text": [ 211 | "./addvector Starting...\n", 212 | "==19639== NVPROF is profiling process 19639, command: ./addvector\n", 213 | "sumArraysOnGPU <<<131072,128>>> Time elapsed 0.014515 sec\n", 214 | "24.600000 + 24.600000 = 49.200001 \n", 215 | "11.400000 + 11.400000 = 22.799999 \n", 216 | "9.800000 + 9.800000 = 19.600000 \n", 217 | "15.000000 + 15.000000 = 30.000000 \n", 218 | "0.800000 + 0.800000 = 1.600000 \n", 219 | "22.700001 + 22.700001 = 45.400002 \n", 220 | "8.800000 + 8.800000 = 17.600000 \n", 221 | "17.700001 + 17.700001 = 35.400002 \n", 222 | "5.100000 + 5.100000 = 10.200000 \n", 223 | "3.800000 + 3.800000 = 7.600000 \n", 224 | "Arrays match. \n", 225 | "\n", 226 | "==19639== Profiling application: ./addvector\n", 227 | "==19639== Profiling result:\n", 228 | " Type Time(%) Time Calls Avg Min Max Name\n", 229 | " GPU activities: 61.59% 86.326ms 2 43.163ms 43.142ms 43.184ms [CUDA memcpy HtoD]\n", 230 | " 28.89% 40.487ms 1 40.487ms 40.487ms 40.487ms [CUDA memcpy DtoH]\n", 231 | " 9.52% 13.347ms 1 13.347ms 13.347ms 13.347ms sumArraysOnDevice(float*, float*, float*, int)\n", 232 | " API calls: 40.42% 166.13ms 3 55.378ms 263.99us 165.59ms cudaMalloc\n", 233 | " 30.71% 126.23ms 3 42.076ms 40.685ms 43.332ms cudaMemcpy\n", 234 | " 16.15% 66.370ms 1 66.370ms 66.370ms 66.370ms cudaDeviceReset\n", 235 | " 8.37% 34.394ms 3 11.465ms 364.77us 26.767ms cudaFree\n", 236 | " 3.52% 14.469ms 1 14.469ms 14.469ms 14.469ms cudaDeviceSynchronize\n", 237 | " 0.65% 2.6564ms 94 28.259us 256ns 1.1985ms cuDeviceGetAttribute\n", 238 | " 0.10% 402.69us 1 402.69us 402.69us 402.69us cuDeviceGetName\n", 239 | " 0.07% 279.80us 1 279.80us 279.80us 279.80us cuDeviceTotalMem\n", 240 | " 0.01% 35.872us 1 35.872us 35.872us 35.872us cudaLaunch\n", 241 | " 0.00% 3.0730us 4 768ns 181ns 2.1630us cudaSetupArgument\n", 242 | " 0.00% 2.3970us 2 1.1980us 572ns 1.8250us cuDeviceGetCount\n", 243 | " 0.00% 1.6430us 1 1.6430us 1.6430us 1.6430us cudaConfigureCall\n", 244 | " 0.00% 1.0380us 2 519ns 269ns 769ns cuDeviceGet\n" 245 | ] 246 | } 247 | ], 248 | "source": [ 249 | "!nvprof --unified-memory-profiling off ./addvector" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 28, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "Usage: nvprof [options] [application] [application-arguments]\r\n", 262 | "Options:\r\n", 263 | " --aggregate-mode \r\n", 264 | " Turn on/off aggregate mode for events and metrics specified\r\n", 265 | " by subsequent \"--events\" and \"--metrics\" options. Those\r\n", 266 | " event/metric values will be collected for each domain instance,\r\n", 267 | " instead of the whole device. Allowed values:\r\n", 268 | " \ton - turn on aggregate mode (default)\r\n", 269 | " \toff - turn off aggregate mode\r\n", 270 | "\r\n", 271 | " --analysis-metrics\r\n", 272 | " Collect profiling data that can be imported to Visual Profiler's\r\n", 273 | " \"analysis\" mode. 
Note: Use \"--export-profile\" to specify\r\n", 274 | " an export file.\r\n", 275 | "\r\n", 276 | " --annotate-mpi \r\n", 277 | " Automatically annotate MPI calls with NVTX markers. Specify\r\n", 278 | " the MPI implementation installed on your machine. Currently,\r\n", 279 | " Open MPI and MPICH implementations are supported. By default,\r\n", 280 | " this option is off.\r\n", 281 | "\r\n", 282 | " --concurrent-kernels \r\n", 283 | " Turn on/off concurrent kernel execution. If concurrent kernel\r\n", 284 | " execution is off, all kernels running on one device will\r\n", 285 | " be serialized. Allowed values:\r\n", 286 | " \ton - turn on concurrent kernel execution (default)\r\n", 287 | " \toff - turn off concurrent kernel execution\r\n", 288 | "\r\n", 289 | " --continuous-sampling-interval \r\n", 290 | " Set the continuous mode sampling interval in milliseconds.\r\n", 291 | " Minimum is 1 ms. Default is 2 ms.\r\n", 292 | "\r\n", 293 | " --cpu-thread-tracing \r\n", 294 | " Collect information about CPU thread API activity.\r\n", 295 | " Allowed values:\r\n", 296 | " \ton - turn on CPU thread API tracing\r\n", 297 | " \toff - turn off CPU thread API tracing (default)\r\n", 298 | "\r\n", 299 | " --dependency-analysis\r\n", 300 | " Generate event dependency graph for host and device activities\r\n", 301 | " and run dependency analysis.\r\n", 302 | "\r\n", 303 | " --device-buffer-size \r\n", 304 | " Set the device memory size (in MBs) reserved for storing\r\n", 305 | " profiling data for non-CDP operations, especially for concurrent\r\n", 306 | " kernel tracing, for each buffer on a context. The default\r\n", 307 | " value is 8MB. The size should be a positive integer.\r\n", 308 | "\r\n", 309 | " --device-cdp-buffer-size \r\n", 310 | " Set the device memory size (in MBs) reserved for storing\r\n", 311 | " profiling data for CDP operations for each buffer on a context.\r\n", 312 | " The default value is 8MB. The size should be a positive\r\n", 313 | " integer.\r\n", 314 | "\r\n", 315 | " --devices \r\n", 316 | " Change the scope of subsequent \"--events\", \"--metrics\", \"--query-events\"\r\n", 317 | " and \"--query-metrics\" options.\r\n", 318 | " Allowed values:\r\n", 319 | " \tall - change scope to all valid devices\r\n", 320 | " \tcomma-separated device IDs - change scope to specified\r\n", 321 | " devices\r\n", 322 | "\r\n", 323 | " --event-collection-mode \r\n", 324 | " Choose event collection mode for all events/metrics Allowed\r\n", 325 | " values:\r\n", 326 | " \tkernel - events/metrics are collected only for durations\r\n", 327 | " of kernel executions (default)\r\n", 328 | " \tcontinuous - events/metrics are collected for duration\r\n", 329 | " of application. This is not applicable for non-tesla devices.\r\n", 330 | " This mode is compatible only with NVLink events/metrics.\r\n", 331 | " This modeis incompatible with \"--profile-all-processes\"\r\n", 332 | " or \"--profile-child-processes\" or \"--replay-mode kernel\"\r\n", 333 | " or \"--replay-mode application\".\r\n", 334 | "\r\n", 335 | " -e, --events \r\n", 336 | " Specify the events to be profiled on certain device(s). Multiple\r\n", 337 | " event names separated by comma can be specified. Which device(s)\r\n", 338 | " are profiled is controlled by the \"--devices\" option. 
Otherwise\r\n", 339 | " events will be collected on all devices.\r\n", 340 | " For a list of available events, use \"--query-events\".\r\n", 341 | " Use \"--events all\" to profile all events available for each\r\n", 342 | " device.\r\n", 343 | " Use \"--devices\" and \"--kernels\" to select a specific kernel\r\n", 344 | " invocation.\r\n", 345 | "\r\n", 346 | " --kernel-latency-timestamps <on|off>\r\n", 347 | " Turn on/off collection of kernel latency timestamps, namely\r\n", 348 | " queued and submitted. The queued timestamp is captured when\r\n", 349 | " a kernel launch command was queued into the CPU command\r\n", 350 | " buffer. The submitted timestamp denotes when the CPU command\r\n", 351 | " buffer containing this kernel launch was submitted to the\r\n", 352 | " GPU. Turning this option on may incur an overhead during\r\n", 353 | " profiling. Allowed values:\r\n", 354 | " \ton - turn on collection of kernel latency timestamps\r\n", 355 | " \toff - turn off collection of kernel latency timestamps\r\n", 356 | " (default)\r\n", 357 | "\r\n", 358 | " --kernels <kernel path syntax>\r\n", 359 | " Change the scope of subsequent \"--events\", \"--metrics\" options.\r\n", 360 | " The syntax is as follows:\r\n", 361 | " \t<kernel name>\r\n", 362 | " \tLimit scope to given kernel name.\r\n", 363 | " or\r\n", 364 | " \t<context id/name>:<stream id/name>:<kernel name>:<invocation>\r\n", 365 | " The context/stream IDs, names, kernel name and invocation\r\n", 366 | " can be regular expressions. Empty string matches any number\r\n", 367 | " of characters. If <context id/name> or <stream id/name>\r\n", 368 | " is a positive number, it's strictly matched against the\r\n", 369 | " CUDA context/stream ID. Otherwise it's treated as a regular\r\n", 370 | " expression and matched against the context/stream name specified\r\n", 371 | " by the NVTX library. If the invocation count is a positive\r\n", 372 | " number, it's strictly matched against the invocation of\r\n", 373 | " the kernel. Otherwise it's treated as a regular expression.\r\n", 374 | " Example: --kernels \"1:foo:bar:2\" will profile any kernel\r\n", 375 | " whose name contains \"bar\" and is the 2nd instance on context\r\n", 376 | " 1 and on stream named \"foo\".\r\n", 377 | "\r\n", 378 | " -m, --metrics <metric names>\r\n", 379 | " Specify the metrics to be profiled on certain device(s).\r\n", 380 | " Multiple metric names separated by comma can be specified.\r\n", 381 | " Which device(s) are profiled is controlled by the \"--devices\"\r\n", 382 | " option. Otherwise metrics will be collected on all devices.\r\n", 383 | " For a list of available metrics, use \"--query-metrics\".\r\n", 384 | " Use \"--metrics all\" to profile all metrics available for\r\n", 385 | " each device.\r\n", 386 | " Use \"--devices\" and \"--kernels\" to select a specific kernel\r\n", 387 | " invocation. \r\n", 388 | " Note: \"--metrics all\" does not include some metrics which\r\n", 389 | " are needed for Visual Profiler's source level analysis.\r\n", 390 | " For that, use \"--analysis-metrics\".\r\n", 391 | "\r\n", 392 | " --pc-sampling-period <period>\r\n", 393 | " Specify PC Sampling period in cycles, at which the sampling\r\n", 394 | " records will be dumped. Allowed values for the period are\r\n", 395 | " integers between 5 to 31 both inclusive.\r\n", 396 | " This will set the sampling period to (2^period) cycles.\r\n", 397 | " Default value is a number between 5 and 12 based on the setup. Note:\r\n", 398 | " Only available for GM20X+.\r\n", 399 | " \r\n", 400 | "\r\n", 401 | " --profile-all-processes\r\n", 402 | " Profile all processes launched by the same user who launched\r\n", 403 | " this nvprof instance. 
Note: Only one instance of nvprof\r\n", 404 | " can run with this option at the same time. Under this mode,\r\n", 405 | " there's no need to specify an application to run.\r\n", 406 | "\r\n", 407 | " --profile-api-trace <none|runtime|driver|all>\r\n", 408 | " Turn on/off CUDA runtime/driver API tracing. Allowed values:\r\n", 409 | " \tnone - turn off API tracing\r\n", 410 | " \truntime - only turn on CUDA runtime API tracing\r\n", 411 | " \tdriver - only turn on CUDA driver API tracing\r\n", 412 | " \tall - turn on all API tracing (default)\r\n", 413 | "\r\n", 414 | " --profile-child-processes\r\n", 415 | " Profile the application and all child processes launched\r\n", 416 | " by it.\r\n", 417 | "\r\n", 418 | " --profile-from-start <on|off>\r\n", 419 | " Enable/disable profiling from the start of the application.\r\n", 420 | " If it's disabled, the application can use {cu,cuda}Profiler{Start,Stop}\r\n", 421 | " to turn on/off profiling. Allowed values:\r\n", 422 | " \ton - enable profiling from start (default)\r\n", 423 | " \toff - disable profiling from start\r\n", 424 | "\r\n", 425 | " --profiling-semaphore-pool-size <count>\r\n", 426 | " Set the profiling semaphore pool size reserved for storing\r\n", 427 | " profiling data for serialized kernels and memory operations\r\n", 428 | " for each context. The default value is 65536. The size should\r\n", 429 | " be a positive integer.\r\n", 430 | "\r\n", 431 | " --query-events\r\n", 432 | " List all the events available on the device(s). Device(s)\r\n", 433 | " queried can be controlled by the \"--devices\" option.\r\n", 434 | "\r\n", 435 | " --query-metrics\r\n", 436 | " List all the metrics available on the device(s). Device(s)\r\n", 437 | " queried can be controlled by the \"--devices\" option.\r\n", 438 | "\r\n", 439 | " --replay-mode <mode>\r\n", 440 | " Choose replay mode used when not all events/metrics can be\r\n", 441 | " collected in a single run. Allowed values:\r\n", 442 | " \tdisabled - replay is disabled, events/metrics that couldn't\r\n", 443 | " be profiled will be dropped\r\n", 444 | " \tkernel - each kernel invocation is replayed (default)\r\n", 445 | " \tapplication - the entire application is replayed.\r\n", 446 | " This mode is incompatible with \"--profile-all-processes\"\r\n", 447 | " or \"profile-child-processes\".\r\n", 448 | "\r\n", 449 | " -a, --source-level-analysis <analysis types>\r\n", 450 | " Specify the source level metrics to be profiled on a certain\r\n", 451 | " kernel invocation. Use \"--devices\" and \"--kernels\" to select\r\n", 452 | " a specific kernel invocation. Allowed values: one or more\r\n", 453 | " of the following, separated by commas\r\n", 454 | " \tglobal_access: global access\r\n", 455 | " \tshared_access: shared access\r\n", 456 | " \tbranch: divergent branch\r\n", 457 | " \tinstruction_execution: instruction execution\r\n", 458 | " \tpc_sampling: pc sampling, available only for GM20X+\r\n", 459 | " Note: Use \"--export-profile\" to specify an export file.\r\n", 460 | "\r\n", 461 | " --system-profiling <on|off>\r\n", 462 | " Turn on/off power, clock, and thermal profiling. Allowed\r\n", 463 | " values:\r\n", 464 | " \ton - turn on system profiling\r\n", 465 | " \toff - turn off system profiling (default)\r\n", 466 | "\r\n", 467 | " -t, --timeout <seconds>\r\n", 468 | " Set an execution timeout (in seconds) for the CUDA application.\r\n", 469 | " Note: Timeout starts counting from the moment the CUDA driver\r\n", 470 | " is initialized. 
If the application doesn't call any CUDA\r\n", 471 | " APIs, timeout won't be triggered.\r\n", 472 | "\r\n", 473 | " --track-memory-allocations <on|off>\r\n", 474 | " Turn on/off tracking of memory operations, which involves\r\n", 475 | " recording timestamps, memory size, memory type and program\r\n", 476 | " counters of the memory allocations and frees. Turning this\r\n", 477 | " option on may incur an overhead during profiling. Allowed\r\n", 478 | " values:\r\n", 479 | " \ton - turn on tracking of memory allocations and\r\n", 480 | " frees\r\n", 481 | " \toff - turn off tracking of memory allocations and\r\n", 482 | " frees (default)\r\n", 483 | "\r\n", 484 | " --unified-memory-profiling <per-process-device|off>\r\n", 485 | " Configure unified memory profiling. Allowed values:\r\n", 486 | " \tper-process-device - collect counts for each process\r\n", 487 | " and each device (default)\r\n", 488 | " \toff - turn off unified memory profiling\r\n", 489 | "\r\n", 490 | " --cpu-profiling <on|off>\r\n", 491 | " Turn on CPU profiling. Note: CPU profiling is not supported\r\n", 492 | " in multi-process mode.\r\n", 493 | "\r\n", 494 | " --cpu-profiling-explain-ccff <filename>\r\n", 495 | " Path to a PGI pgexplain.xml file that should be used to interpret\r\n", 496 | " Common Compiler Feedback Format (CCFF) messages.\r\n", 497 | "\r\n", 498 | " --cpu-profiling-frequency <frequency>\r\n", 499 | " Set the CPU profiling frequency in samples per second. Default\r\n", 500 | " is 100Hz. Maximum is 500Hz.\r\n", 501 | "\r\n", 502 | " --cpu-profiling-max-depth <depth>\r\n", 503 | " Set the maximum depth of each call stack. Zero means no limit.\r\n", 504 | " Default is zero.\r\n", 505 | "\r\n", 506 | " --cpu-profiling-mode <flat|top-down|bottom-up>\r\n", 507 | " Set the output mode of CPU profiling. Allowed values:\r\n", 508 | " \tflat - Show flat profile\r\n", 509 | " \ttop-down - Show parent functions at the top\r\n", 510 | " \tbottom-up - Show parent functions at the bottom\r\n", 511 | " (default)\r\n", 512 | "\r\n", 513 | " --cpu-profiling-percentage-threshold <threshold>\r\n", 514 | " Filter out the entries that are below the set percentage\r\n", 515 | " threshold. The limit should be an integer between 0 and\r\n", 516 | " 100, inclusive. Zero means no limit. Default is zero.\r\n", 517 | "\r\n", 518 | " --cpu-profiling-scope <function|instruction>\r\n", 519 | " Choose the profiling scope. Allowed values:\r\n", 520 | " \tfunction - Each level in the stack trace represents\r\n", 521 | " a distinct function (default)\r\n", 522 | " \tinstruction - Each level in the stack trace represents\r\n", 523 | " a distinct instruction address\r\n", 524 | "\r\n", 525 | " --cpu-profiling-show-ccff <on|off>\r\n", 526 | " Choose whether to print Common Compiler Feedback Format (CCFF)\r\n", 527 | " messages embedded in the binary. Note: this option implies\r\n", 528 | " \"--cpu-profiling-scope instruction\". Default is off.\r\n", 529 | "\r\n", 530 | " --cpu-profiling-show-library <on|off>\r\n", 531 | " Choose whether to print the library name for each sample.\r\n", 532 | "\r\n", 533 | " --cpu-profiling-thread-mode <separated|aggregated>\r\n", 534 | " Set the thread mode of CPU profiling. Allowed values:\r\n", 535 | " \tseparated - Show separate profile for each thread\r\n", 536 | " \taggregated - Aggregate data from all threads (default)\r\n", 537 | "\r\n", 538 | " --cpu-profiling-unwind-stack <on|off>\r\n", 539 | " Choose whether to unwind the CPU call-stack at each sample\r\n", 540 | " point. Default is on. \r\n", 541 | "\r\n", 542 | " --openacc-profiling <on|off>\r\n", 543 | " Enable/disable recording information from the OpenACC profiling\r\n", 544 | " interface. 
Note: whether the OpenACC profiling interface is available\r\n", 545 | " depends on the OpenACC runtime. Default is on.\r\n", 546 | "\r\n", 547 | " --context-name <name>\r\n", 548 | " Name of the CUDA context.\r\n", 549 | " \t\"%i\" in the context name string is replaced with\r\n", 550 | " the ID of the context.\r\n", 551 | " \t\"%p\" in the context name string is replaced with\r\n", 552 | " the process ID of the application being profiled.\r\n", 553 | " \t\"%q{<ENV>}\" in the context name string is replaced\r\n", 554 | " with the value of the environment variable \"<ENV>\". If the\r\n", 555 | " environment variable is not set it's an error.\r\n", 556 | " \t\"%h\" in the context name string is replaced with\r\n", 557 | " the hostname of the system.\r\n", 558 | " \t\"%%\" in the context name string is replaced with\r\n", 559 | " \"%\". Any other character following \"%\" is illegal.\r\n", 560 | "\r\n", 561 | " --csv\r\n", 562 | " Use comma-separated values in the output.\r\n", 563 | "\r\n", 564 | " --demangling <on|off>\r\n", 565 | " Turn on/off C++ name demangling of function names. Allowed\r\n", 566 | " values:\r\n", 567 | " \ton - turn on demangling (default)\r\n", 568 | " \toff - turn off demangling\r\n", 569 | "\r\n", 570 | " -u, --normalized-time-unit <unit>\r\n", 571 | " Specify the unit of time that will be used in the output.\r\n", 572 | " Allowed values:\r\n", 573 | " \ts - second, ms - millisecond, us - microsecond,\r\n", 574 | " ns - nanosecond\r\n", 575 | " \tcol - a fixed unit for each column\r\n", 576 | " \tauto (default) - the scale is chosen for each value\r\n", 577 | " based on its length.\r\n", 578 | "\r\n", 579 | " --openacc-summary-mode <exclusive|inclusive>\r\n", 580 | " Set how durations are computed in the OpenACC summary. Allowed\r\n", 581 | " values:\r\n", 582 | " \texclusive: show exclusive times (default)\r\n", 583 | " \tinclusive: show inclusive times\r\n", 584 | "\r\n", 585 | " --print-api-summary\r\n", 586 | " Print a summary of CUDA runtime/driver API calls.\r\n", 587 | "\r\n", 588 | " --print-api-trace\r\n", 589 | " Print CUDA runtime/driver API trace.\r\n", 590 | "\r\n", 591 | " --print-dependency-analysis-trace\r\n", 592 | " Print dependency analysis trace.\r\n", 593 | "\r\n", 594 | " --print-gpu-summary\r\n", 595 | " Print a summary of the activities on the GPU (including CUDA\r\n", 596 | " kernels and memcpy's/memset's).\r\n", 597 | "\r\n", 598 | " --print-gpu-trace\r\n", 599 | " Print individual kernel invocations (including CUDA memcpy's/memset's)\r\n", 600 | " and sort them in chronological order. In event/metric profiling\r\n", 601 | " mode, show events/metrics for each kernel invocation.\r\n", 602 | "\r\n", 603 | " --print-openacc-constructs\r\n", 604 | " Include parent construct names in OpenACC profile.\r\n", 605 | "\r\n", 606 | " --print-openacc-summary\r\n", 607 | " Print a summary of the OpenACC profile.\r\n", 608 | "\r\n", 609 | " --print-openacc-trace\r\n", 610 | " Print a trace of the OpenACC profile.\r\n", 611 | "\r\n", 612 | " -s, --print-summary\r\n", 613 | " Print a summary of the profiling result on screen. 
Note:\r\n", 614 | " This is the default unless \"--export-profile\" or other print\r\n", 615 | " options are used.\r\n", 616 | "\r\n", 617 | " --print-summary-per-gpu\r\n", 618 | " Print a summary of the profiling result for each GPU.\r\n", 619 | "\r\n", 620 | " --process-name <name>\r\n", 621 | " Name of the process.\r\n", 622 | " \t\"%p\" in the process name string is replaced with\r\n", 623 | " the process ID of the application being profiled.\r\n", 624 | " \t\"%q{<ENV>}\" in the process name string is replaced\r\n", 625 | " with the value of the environment variable \"<ENV>\". If the\r\n", 626 | " environment variable is not set it's an error.\r\n", 627 | " \t\"%h\" in the process name string is replaced with\r\n", 628 | " the hostname of the system.\r\n", 629 | " \t\"%%\" in the process name string is replaced with\r\n", 630 | " \"%\". Any other character following \"%\" is illegal.\r\n", 631 | "\r\n", 632 | " --quiet\r\n", 633 | " Suppress all nvprof output.\r\n", 634 | "\r\n", 635 | " --stream-name <name>\r\n", 636 | " Name of the CUDA stream.\r\n", 637 | " \t\"%i\" in the stream name string is replaced with the\r\n", 638 | " ID of the stream.\r\n", 639 | " \t\"%p\" in the stream name string is replaced with\r\n", 640 | " the process ID of the application being profiled.\r\n", 641 | " \t\"%q{<ENV>}\" in the stream name string is replaced\r\n", 642 | " with the value of the environment variable \"<ENV>\". If the\r\n", 643 | " environment variable is not set it's an error.\r\n", 644 | " \t\"%h\" in the stream name string is replaced with\r\n", 645 | " the hostname of the system.\r\n", 646 | " \t\"%%\" in the stream name string is replaced with\r\n", 647 | " \"%\". Any other character following \"%\" is illegal.\r\n", 648 | "\r\n", 649 | " -o, --export-profile <filename>\r\n", 650 | " Export the result file which can be imported later or opened\r\n", 651 | " by the NVIDIA Visual Profiler.\r\n", 652 | " \t\"%p\" in the file name string is replaced with the\r\n", 653 | " process ID of the application being profiled.\r\n", 654 | " \t\"%q{<ENV>}\" in the file name string is replaced\r\n", 655 | " with the value of the environment variable \"<ENV>\". If the\r\n", 656 | " environment variable is not set it's an error.\r\n", 657 | " \t\"%h\" in the file name string is replaced with the\r\n", 658 | " hostname of the system.\r\n", 659 | " \t\"%%\" in the file name string is replaced with \"%\".\r\n", 660 | " \tAny other character following \"%\" is illegal.\r\n", 661 | " By default, this option disables the summary output. Note:\r\n", 662 | " If the application being profiled creates child processes,\r\n", 663 | " or if '--profile-all-processes' is used, the \"%p\" format\r\n", 664 | " is needed to get correct export files for each process.\r\n", 665 | "\r\n", 666 | " -f, --force-overwrite\r\n", 667 | " Force overwriting all output files (any existing files will\r\n", 668 | " be overwritten).\r\n", 669 | "\r\n", 670 | " -i, --import-profile <filename>\r\n", 671 | " Import a result profile from a previous run.\r\n", 672 | "\r\n", 673 | " --log-file <filename>\r\n", 674 | " Make nvprof send all its output to the specified file, or\r\n", 675 | " one of the standard channels. The file will be overwritten.\r\n", 676 | " If the file doesn't exist, a new one will be created.\r\n", 677 | " \t\"%1\" as the whole file name indicates standard output\r\n", 678 | " channel (stdout).\r\n", 679 | " \t\"%2\" as the whole file name indicates standard error\r\n", 680 | " channel (stderr). 
Note: This is the default.\r\n", 681 | " \t\"%p\" in the file name string is replaced with the\r\n", 682 | " process ID of the application being profiled.\r\n", 683 | " \t\"%q{<ENV>}\" in the file name string is replaced\r\n", 684 | " with the value of the environment variable \"<ENV>\". If the\r\n", 685 | " environment variable is not set it's an error.\r\n", 686 | " \t\"%h\" in the file name string is replaced with the\r\n", 687 | " hostname of the system.\r\n", 688 | " \t\"%%\" in the file name is replaced with \"%\".\r\n", 689 | " \tAny other character following \"%\" is illegal.\r\n", 690 | "\r\n", 691 | " --print-nvlink-topology\r\n", 692 | " Print nvlink topology \r\n", 693 | "\r\n", 694 | " -h, --help\r\n", 695 | " Print this help information.\r\n", 696 | "\r\n", 697 | " -V, --version\r\n", 698 | " Print version information of this tool.\r\n", 699 | "\r\n" 700 | ] 701 | } 702 | ], 703 | "source": [ 704 | "!nvprof --help" 705 | ] 706 | }, 707 | { 708 | "cell_type": "code", 709 | "execution_count": null, 710 | "metadata": {}, 711 | "outputs": [], 712 | "source": [] 713 | } 714 | ], 715 | "metadata": { 716 | "kernelspec": { 717 | "display_name": "Python 3", 718 | "language": "python", 719 | "name": "python3" 720 | }, 721 | "language_info": { 722 | "codemirror_mode": { 723 | "name": "ipython", 724 | "version": 3 725 | }, 726 | "file_extension": ".py", 727 | "mimetype": "text/x-python", 728 | "name": "python", 729 | "nbconvert_exporter": "python", 730 | "pygments_lexer": "ipython3", 731 | "version": "3.6.5" 732 | }, 733 | "toc": { 734 | "base_numbering": 1, 735 | "nav_menu": {}, 736 | "number_sections": true, 737 | "sideBar": true, 738 | "skip_h1_title": false, 739 | "title_cell": "Table of Contents", 740 | "title_sidebar": "Contents", 741 | "toc_cell": false, 742 | "toc_position": {}, 743 | "toc_section_display": true, 744 | "toc_window_display": false 745 | } 746 | }, 747 | "nbformat": 4, 748 | "nbformat_minor": 2 749 | } 750 | --------------------------------------------------------------------------------