├── cuda-cpp
    ├── file
    ├── version
    ├── example1
    ├── file.cu
    ├── version.cu
    ├── example1.cu
    └── Untitled.ipynb
├── cuda-c
    └── src
    │   ├── 05-julia-set
    │       ├── julia_set
    │       ├── README.md
    │       └── julia_set.cu
    │   ├── 01-hello_world
    │       ├── hello_world
    │       ├── hello_world_gpu
    │       ├── README.md
    │       ├── hello_world_gpu.cu
    │       ├── hello_world.cu
    │       └── hello-world-from-gpu.ipynb
    │   ├── 03-device-query
    │       ├── device_query
    │       └── device_query.cu
    │   ├── cuda-programming-model
    │       ├── sum
    │       ├── sumgpu
    │       ├── addvector
    │       ├── checkdims
    │       ├── sumArraysOnHost.c
    │       ├── checkDimensions.cu
    │       ├── sumArraysOnDevice.cu
    │       ├── 02-organizing-threads.ipynb
    │       ├── sumArraysOnGPU.cu
    │       ├── 03-compiling-and-executing.ipynb
    │       ├── 01-memory-management.ipynb
    │       └── 04-timing-kernel.ipynb
    │   ├── 02-passing-params
    │       ├── passing_params
    │       ├── passing_params.cu
    │       └── README.md
    │   ├── 04-gpu-vector-sums
    │       ├── gpu_vector_sums
    │       └── gpu_vector_sums.cu
    │   ├── 06-gpu-vector-sums-redux
    │       ├── gpu_vector_sums_redux
    │       └── gpu_vector_sums_redux.cu
    │   └── utils
    │       ├── gl_helper.h
    │       ├── cpu_bitmap.h
    │       ├── cpu_anim.h
    │       ├── gpu_anim.h
    │       ├── common.h
    │       └── GL
    │           └── glut.h
├── README.md
├── pycuda
    ├── 02-hello_world.py
    ├── hello_world.py
    └── notebooks
    │   ├── 02-hello_world.ipynb
    │   └── 01-hello-world.ipynb
├── caldera-cheyenne.md
├── notes
    └── parallel-communication-patterns.md
├── LICENSE
├── .gitignore
└── getting_started_on_colab.ipynb


/cuda-cpp/file:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-cpp/file


--------------------------------------------------------------------------------
/cuda-cpp/version:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-cpp/version


--------------------------------------------------------------------------------
/cuda-cpp/example1:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-cpp/example1


--------------------------------------------------------------------------------
/cuda-c/src/05-julia-set/julia_set:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/05-julia-set/julia_set


--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/01-hello_world/hello_world


--------------------------------------------------------------------------------
/cuda-c/src/03-device-query/device_query:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/03-device-query/device_query


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sum:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/sum


--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world_gpu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/01-hello_world/hello_world_gpu


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sumgpu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/sumgpu


--------------------------------------------------------------------------------
/cuda-c/src/02-passing-params/passing_params:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/02-passing-params/passing_params


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/addvector:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/addvector


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/checkdims:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/cuda-programming-model/checkdims


--------------------------------------------------------------------------------
/cuda-c/src/04-gpu-vector-sums/gpu_vector_sums:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/04-gpu-vector-sums/gpu_vector_sums


--------------------------------------------------------------------------------
/cuda-c/src/06-gpu-vector-sums-redux/gpu_vector_sums_redux:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/andersy005/cuda-programming/master/cuda-c/src/06-gpu-vector-sums-redux/gpu_vector_sums_redux


--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/README.md:
--------------------------------------------------------------------------------
 1 | # Hello, World 
 2 | 
 3 | To compile the program, run the following:
 4 | 
 5 | `$ nvcc -o hello_world hello_world.cu`
 6 | 
 7 | To run the compile program:
 8 | 
 9 | `$ ./hello_world`
10 | 


--------------------------------------------------------------------------------
/cuda-cpp/file.cu:
--------------------------------------------------------------------------------
 1 | #include <thrust/version.h>
 2 | #include <iostream>
 3 | 
 4 | int main(void)
 5 | {
 6 |   int major = THRUST_MAJOR_VERSION;
 7 |   int minor = THRUST_MINOR_VERSION;
 8 | 
 9 |   std::cout << "Thrust v" << major << "." << minor << std::endl;
10 | 
11 |   return 0;
12 | }


--------------------------------------------------------------------------------
/cuda-cpp/version.cu:
--------------------------------------------------------------------------------
 1 | #include <thrust/version.h>
 2 | #include <iostream>
 3 | 
 4 | int main(void)
 5 | {
 6 |   int major = THRUST_MAJOR_VERSION;
 7 |   int minor = THRUST_MINOR_VERSION;
 8 | 
 9 |   std::cout << "Thrust v" << major << "." << minor << std::endl;
10 | 
11 |   return 0;
12 | }


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CUDA Programming
2 | 
3 | 
4 | This project contains source code, notes associated with my journey of learning Compute Unified Device Architecture (CUDA) programming.
5 | 
6 | There is a **Python(PyCuda)** version and a **Cuda-C** version. Files associated with each version are in independent folders. 
7 | 
8 | 


--------------------------------------------------------------------------------
/pycuda/02-hello_world.py:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | import pycuda.driver as cuda
 4 | import pycuda.autoinit
 5 | from pycuda.compiler import SourceModule
 6 | 
 7 | mod = SourceModule("""
 8 |     #include <stdio.h>
 9 |     __global__ void say_hi(){
10 |     printf("I am %d.%d\\n", threadIdx.x, threadIdx.y);
11 |     }""")
12 | 
13 | func = mod.get_function("say_hi")
14 | func(block=(4, 2, 1))


--------------------------------------------------------------------------------
/pycuda/hello_world.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import pycuda.driver as cuda
 3 | import pycuda.autoinit
 4 | from pycuda.compiler import SourceModule
 5 | 
 6 | mod = SourceModule("""
 7 |     #include <stdio.h>
 8 | 
 9 |     __global__ void kernel()
10 |     {
11 |         printf("Hello, World!\\n");
12 |     }""")
13 | 
14 | func = mod.get_function("kernel")
15 | 
16 | 
17 | func(block=(4, 1, 1))


--------------------------------------------------------------------------------
/caldera-cheyenne.md:
--------------------------------------------------------------------------------
 1 | ## Useful commands
 2 | 
 3 | - List your current jobs:
 4 | 
 5 |   `$  squeue -u $USER`
 6 |   
 7 | - Examine a job in detail:
 8 | 
 9 |   `$ scontrol show job <ID>`
10 |   
11 | - Kill a job:
12 | 
13 |   `$ scancel <ID>`
14 | 
15 | 
16 | ## Scripts to start interactive jobs on Caldera/Cheyenne
17 | 
18 | `$ execca -a <project>` -> Run on a caldera node 
19 | 


--------------------------------------------------------------------------------
/cuda-c/src/05-julia-set/README.md:
--------------------------------------------------------------------------------
 1 | # Julia Set 
 2 | 
 3 | ## To compile the application:
 4 | 
 5 | 
 6 | In some cases, we need to add ` -lglut -lGLU -lGL` on the link line
 7 | 
 8 | `$ nvcc -o julia_set julia_set.cu -lglut -lGLU -lGL`
 9 | 
10 | ## To profile and run the code, do the following:
11 | 
12 | `$ nvprof --unified-memory-profiling off ./julia_set`
13 | 
14 | ![](https://i.imgur.com/kocLDtn.gif)


--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world_gpu.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | __global__ void helloFromGPU(void)
 4 | {   
 5 |     if (threadIdx.x == 5)
 6 |         printf(".............Hello World from GPU thread %d!.............\n", threadIdx.x);
 7 | }
 8 | 
 9 | int main(void){
10 |     // hello from cpu
11 |     printf("<------------Hello World from CPU!-------------->\n");
12 |     
13 |     helloFromGPU <<<1, 10>>>();
14 |    
15 |     cudaDeviceSynchronize();
16 |     return 0;
17 | }


--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello_world.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | 
 3 | /* Alerts the compiler that a function should be compiled to run on a device
 4 | instead of the host.
 5 | */
 6 | __global__ void kernel(void) {
 7 | 
 8 | }
 9 | 
10 | int main(void){
11 | 
12 | 	kernel<<<1,1>>>();
13 | 	/* Angle brackets denote arguments we plan to pass to the runtime system. 
14 | 	These are not arguments to the device code. */
15 | 	printf("Hello, World!\n");
16 | 	return 0;
17 | 
18 | }
19 | 
20 | 
21 | 


--------------------------------------------------------------------------------
/notes/parallel-communication-patterns.md:
--------------------------------------------------------------------------------
 1 | # Parallel Communication Patterns
 2 | 
 3 | - Parallel computing is all about many threads solving a problem by working together.
 4 | - This is all about communication.
 5 | - In CUDA, this communication takes place in memory.
 6 | 
 7 | ## Map and Gather
 8 | 
 9 | - With map, you've got many data elements (e.g., elements of an array, entries in a matrix, or pixels in an image).
10 | - And you are going to do the same function, or computational task, on each piece of data. 
11 | - There is a 1 to 1 correspondence between input and output. So, map is very efficient on GPUs


--------------------------------------------------------------------------------
/cuda-c/src/02-passing-params/passing_params.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "../utils/common.h"
 3 | 
 4 | __global__ void add(int a, int b, int *c) {
 5 |     *c = a + b;
 6 | }
 7 | 
 8 | int main(){
 9 |     int c;
10 |     int *device_c;
11 | 
12 |     HANDLE_ERROR(cudaMalloc((void**)&device_c, sizeof(int)));
13 | 
14 |     add<<<1,1>>>(2, 7, device_c);
15 | 
16 |     HANDLE_ERROR(cudaMemcpy(&c,
17 |                            device_c,
18 |                            sizeof(int),
19 |                            cudaMemcpyDeviceToHost));
20 | 
21 |     printf(" 2 + 7 = %d\n", c);
22 |     cudaFree(device_c);
23 | 
24 |     return 0;
25 | }


--------------------------------------------------------------------------------
/cuda-cpp/example1.cu:
--------------------------------------------------------------------------------
 1 | #include <thrust/host_vector.h>
 2 | #include <thrust/device_vector.h>
 3 | #include <thrust/generate.h>
 4 | #include <thrust/sort.h>
 5 | #include <thrust/copy.h>
 6 | #include <algorithm>
 7 | #include <cstdlib>
 8 | 
 9 | int main(void)
10 | {
11 |   // generate 32M random numbers serially
12 |   thrust::host_vector<int> h_vec(32 << 20);
13 |   std::generate(h_vec.begin(), h_vec.end(), rand);
14 | 
15 |   // transfer data to the device
16 |   thrust::device_vector<int> d_vec = h_vec;
17 | 
18 |   // sort data on the device (846M keys per second on GeForce GTX 480)
19 |   thrust::sort(d_vec.begin(), d_vec.end());
20 | 
21 |   // transfer data back to host
22 |   thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());
23 | 
24 |   return 0;
25 | }


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sumArraysOnHost.c:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <time.h>
 5 | 
 6 | void sumArraysOnHost(float *A, float *B, float *C, const int N){
 7 |     for (int idx=0; idx<N; idx++){
 8 |         C[idx] = A[idx] + B[idx];
 9 |     }
10 | }
11 | 
12 | 
13 | void initialData(float *ip, int size){
14 |     // generate different seed for random number 
15 |     time_t t;
16 |     srand((unsigned int) time (&t));
17 |     
18 |     for (int i=0; i<size; i++){
19 |         ip[i] = (float)(rand() & 0xFF) / 10.0f;
20 |     }
21 | }
22 | 
23 | 
24 | int main(int argc, char **argv){
25 |     int nElem = 1024;
26 |     size_t nBytes = nElem * sizeof(float);
27 |     
28 |     float *h_A, *h_B, *h_C;
29 |     h_A = (float *)malloc(nBytes);
30 |     h_B = (float *)malloc(nBytes);
31 |     h_C = (float *)malloc(nBytes);
32 |     
33 |     initialData(h_A, nElem);
34 |     initialData(h_B, nElem);
35 |     
36 |     sumArraysOnHost(h_A, h_B, h_C, nElem);
37 |     
38 |     free(h_A);
39 |     free(h_B);
40 |     free(h_C);
41 |     
42 |     return (0);
43 | }


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/checkDimensions.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <cuda_runtime.h>
 3 | #include <stdio.h>
 4 | 
 5 | __global__
 6 | void checkIndex(void){
 7 |     printf("threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) " 
 8 |            "gridDim:(%d, %d, %d)\n", threadIdx.x, threadIdx.y, threadIdx.z, blockIdx.x, 
 9 |            blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z,
10 |            gridDim.x,gridDim.y,gridDim.z);
11 | }
12 | 
13 | 
14 | int main(int argc, char **argv) { 
15 |     // define total data element 
16 |     int nElem = 6;
17 |     // define grid and block structure 
18 |     dim3 block (3); 
19 |     dim3 grid ((nElem+block.x-1)/block.x);
20 |     // check grid and block dimension from host side 
21 |     printf("grid.x %d grid.y %d grid.z %d\n",grid.x, grid.y, grid.z); 
22 |     printf("block.x %d block.y %d block.z %d\n",block.x, block.y, block.z);
23 |     // check grid and block dimension from device side 
24 |     checkIndex <<<grid, block>>> ();
25 |     // reset device before you leave 
26 |     cudaDeviceReset();
27 |     return(0);
28 |     
29 | }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2018 Anderson Banihirwe
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/cuda-c/src/02-passing-params/README.md:
--------------------------------------------------------------------------------
 1 | # Passing Parameters
 2 | 
 3 | To compile the program, run the following:
 4 | 
 5 | `$ nvcc -o 02-passing-params 02-passing-params.cu`
 6 | 
 7 | To run the compile program:
 8 | 
 9 | `$ ./02-passing-params`
10 | 
11 | To run the profiler:
12 | 
13 | `$ nvprof --unified-memory-profiling off ./passing_params`
14 | 
15 | [![asciicast](https://asciinema.org/a/mIFzam2aaqraUV6NxtWH7zpPc.png)](https://asciinema.org/a/mIFzam2aaqraUV6NxtWH7zpPc)
16 | 
17 | # Summary
18 | 
19 | - We can pass parameters to a kernel as we would with any C function.
20 | - We need to allocate memory to do anything useful on a device, such as return values to the host.
21 | 
22 | Restrictions on the usage of device pointers are as follows:
23 | 
24 | - We **can** pass pointers allocated with `cudaMalloc()` to functions that execute on the device.
25 | - We **can** use pointers allocated with `cudaMalloc()` to read or write memory from code that executes on the device.
26 | - We **can** pass pointers allocated with `cudaMalloc()` to functions that execute on the host.
27 | - We **cannot** use pointers allocated with `cudaMalloc()` to read or write memory from code that executes on the host. 
28 | 


--------------------------------------------------------------------------------
/cuda-c/src/04-gpu-vector-sums/gpu_vector_sums.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "../utils/common.h"
 3 | 
 4 | #define N 100
 5 | 
 6 | __global__ void add(int *a, int *b, int *c){
 7 |     int tid = blockIdx.x;  // handle the data at this index
 8 |     if (tid < N)
 9 |         c[tid] = a[tid] + b[tid];
10 | }
11 | 
12 | 
13 | int main(void){
14 |     int a[N], b[N], c[N];
15 |     int *device_a, *device_b, *device_c;
16 | 
17 |     // Allocate the memory on the GPU
18 |     HANDLE_ERROR(cudaMalloc((void**)&device_a, N * sizeof(int)));
19 |     HANDLE_ERROR(cudaMalloc((void**)&device_b, N * sizeof(int)));
20 |     HANDLE_ERROR(cudaMalloc((void**)&device_c, N * sizeof(int)));
21 | 
22 | 
23 |     // fill the arrays 'a' and 'b' on the CPU
24 |     for (int i=0; i<N; i++){
25 |         a[i] = -i;
26 |         b[i] = i * i;
27 |     }
28 | 
29 | 
30 |     // copy the arrays 'a' and 'b' to the GPU
31 |     HANDLE_ERROR(cudaMemcpy(device_a, a, N * sizeof(int), 
32 |                             cudaMemcpyHostToDevice));
33 |     HANDLE_ERROR(cudaMemcpy(device_b, b, N * sizeof(int),
34 |                             cudaMemcpyHostToDevice));
35 | 
36 |     // Execute the kernel with N parallel blocks
37 |     add<<<N, 1>>>(device_a, device_b, device_c);
38 | 
39 |     // copy the array 'c' back from the GPU to the CPU
40 |     HANDLE_ERROR(cudaMemcpy(c, device_c, N * sizeof(int),
41 |                            cudaMemcpyDeviceToHost));
42 | 
43 |     // display the results
44 |     for(int i=0; i<N; i++){
45 |         printf("%d + %d = % d\n", a[i], b[i], c[i]);
46 |     }
47 | 
48 |     // free the memory allocated on the GPU
49 |     cudaFree(device_a);
50 |     cudaFree(device_b);
51 |     cudaFree(device_c);
52 | 
53 |     return 0;
54 | }


--------------------------------------------------------------------------------
/cuda-c/src/06-gpu-vector-sums-redux/gpu_vector_sums_redux.cu:
--------------------------------------------------------------------------------
 1 | #include <stdio.h>
 2 | #include "../utils/common.h"
 3 | 
 4 | #define N 100
 5 | 
 6 | __global__ void add(int *a, int *b, int *c){
 7 |     int tid = threadIdx.x;  // handle the data at this index
 8 |     if (tid < N)
 9 |         c[tid] = a[tid] + b[tid];
10 | }
11 | 
12 | 
13 | int main(void){
14 |     int a[N], b[N], c[N];
15 |     int *device_a, *device_b, *device_c;
16 | 
17 |     // Allocate the memory on the GPU
18 |     HANDLE_ERROR(cudaMalloc((void**)&device_a, N * sizeof(int)));
19 |     HANDLE_ERROR(cudaMalloc((void**)&device_b, N * sizeof(int)));
20 |     HANDLE_ERROR(cudaMalloc((void**)&device_c, N * sizeof(int)));
21 | 
22 | 
23 |     // fill the arrays 'a' and 'b' on the CPU
24 |     for (int i=0; i<N; i++){
25 |         a[i] = -i;
26 |         b[i] = i * i;
27 |     }
28 | 
29 | 
30 |     // copy the arrays 'a' and 'b' to the GPU
31 |     HANDLE_ERROR(cudaMemcpy(device_a, a, N * sizeof(int), 
32 |                             cudaMemcpyHostToDevice));
33 |     HANDLE_ERROR(cudaMemcpy(device_b, b, N * sizeof(int),
34 |                             cudaMemcpyHostToDevice));
35 | 
36 |     // Execute the kernel with N parallel blocks
37 |     add<<<1, N>>>(device_a, device_b, device_c);
38 | 
39 |     // copy the array 'c' back from the GPU to the CPU
40 |     HANDLE_ERROR(cudaMemcpy(c, device_c, N * sizeof(int),
41 |                            cudaMemcpyDeviceToHost));
42 | 
43 |     // display the results
44 |     for(int i=0; i<N; i++){
45 |         printf("%d + %d = % d\n", a[i], b[i], c[i]);
46 |     }
47 | 
48 |     // free the memory allocated on the GPU
49 |     cudaFree(device_a);
50 |     cudaFree(device_b);
51 |     cudaFree(device_c);
52 | 
53 |     return 0;
54 | }


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | *.i
  2 | *.ii
  3 | *.gpu
  4 | *.ptx
  5 | *.cubin
  6 | *.fatbin
  7 | 
  8 | # Byte-compiled / optimized / DLL files
  9 | __pycache__/
 10 | *.py[cod]
 11 | *$py.class
 12 | 
 13 | # C extensions
 14 | *.so
 15 | 
 16 | # Distribution / packaging
 17 | .Python
 18 | env/
 19 | build/
 20 | develop-eggs/
 21 | dist/
 22 | downloads/
 23 | eggs/
 24 | .eggs/
 25 | lib/
 26 | lib64/
 27 | parts/
 28 | sdist/
 29 | var/
 30 | wheels/
 31 | *.egg-info/
 32 | .installed.cfg
 33 | *.egg
 34 | 
 35 | # PyInstaller
 36 | #  Usually these files are written by a python script from a template
 37 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 38 | *.manifest
 39 | *.spec
 40 | 
 41 | # Installer logs
 42 | pip-log.txt
 43 | pip-delete-this-directory.txt
 44 | 
 45 | # Unit test / coverage reports
 46 | htmlcov/
 47 | .tox/
 48 | .coverage
 49 | .coverage.*
 50 | .cache
 51 | nosetests.xml
 52 | coverage.xml
 53 | *.cover
 54 | .hypothesis/
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # pyenv
 81 | .python-version
 82 | 
 83 | # celery beat schedule file
 84 | celerybeat-schedule
 85 | 
 86 | # SageMath parsed files
 87 | *.sage.py
 88 | 
 89 | # dotenv
 90 | .env
 91 | 
 92 | # virtualenv
 93 | .venv
 94 | venv/
 95 | ENV/
 96 | 
 97 | # Spyder project settings
 98 | .spyderproject
 99 | .spyproject
100 | 
101 | # Rope project settings
102 | .ropeproject
103 | 
104 | # mkdocs documentation
105 | /site
106 | 
107 | # mypy
108 | .mypy_cache/
109 | .pytest_cache/
110 | .vscode/
111 | notes/CodeSamples/
112 | notes/Solutions/
113 | 


--------------------------------------------------------------------------------
/cuda-c/src/utils/gl_helper.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 3 |  *
 4 |  * NVIDIA Corporation and its licensors retain all intellectual property and
 5 |  * proprietary rights in and to this software and related documentation.
 6 |  * Any use, reproduction, disclosure, or distribution of this software
 7 |  * and related documentation without an express license agreement from
 8 |  * NVIDIA Corporation is strictly prohibited.
 9 |  *
10 |  * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 |  * associated with this source code for terms and conditions that govern
12 |  * your use of this NVIDIA software.
13 |  *
14 |  */
15 | 
16 | 
17 | #ifndef __GL_HELPER_H__
18 | #define __GL_HELPER_H__
19 | 
20 | /*
21 |    On 64-bit Windows, we need to prevent GLUT from automatically linking against
22 |    glut32. We do this by defining GLUT_NO_LIB_PRAGMA. This means that we need to
23 |    manually add opengl32.lib and glut64.lib back to the link using pragmas.
24 |    Alternatively, this could be done on the compilation/link command-line, but
25 |    we chose this so that compilation is identical between 32- and 64-bit Windows.
26 | */
27 | #ifdef _WIN64
28 | #define GLUT_NO_LIB_PRAGMA
29 | #pragma comment (lib, "opengl32.lib")  /* link with Microsoft OpenGL lib */
30 | #pragma comment (lib, "glut64.lib")    /* link with Win64 GLUT lib */
31 | #endif //_WIN64
32 | 
33 | 
34 | #ifdef _WIN32
35 | /* On Windows, include the local copy of glut.h and glext.h */
36 | #include "GL/glut.h"
37 | #include "GL/glext.h"
38 | 
39 | #define GET_PROC_ADDRESS( str ) wglGetProcAddress( str )
40 | 
41 | #else
42 | 
43 | /* On Linux, include the system's copy of glut.h, glext.h, and glx.h */
44 | #include <GL/glut.h>
45 | #include <GL/glext.h>
46 | #include <GL/glx.h>
47 | 
48 | #define GET_PROC_ADDRESS( str ) glXGetProcAddress( (const GLubyte *)str )
49 | 
50 | #endif //_WIN32
51 | 
52 | 
53 | #endif //__GL_HELPER_H__'
54 | 


--------------------------------------------------------------------------------
/pycuda/notebooks/02-hello_world.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 1,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Overwriting ../02-hello_world.py\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "%%writefile ../02-hello_world.py\n",
18 |     "\n",
19 |     "\n",
20 |     "import pycuda.driver as cuda\n",
21 |     "import pycuda.autoinit\n",
22 |     "from pycuda.compiler import SourceModule\n",
23 |     "\n",
24 |     "mod = SourceModule(\"\"\"\n",
25 |     "    #include <stdio.h>\n",
26 |     "    __global__ void say_hi(){\n",
27 |     "    printf(\"I am %d.%d\\\\n\", threadIdx.x, threadIdx.y);\n",
28 |     "    }\"\"\")\n",
29 |     "\n",
30 |     "func = mod.get_function(\"say_hi\")\n",
31 |     "func(block=(4, 2, 1))"
32 |    ]
33 |   },
34 |   {
35 |    "cell_type": "code",
36 |    "execution_count": 2,
37 |    "metadata": {},
38 |    "outputs": [
39 |     {
40 |      "name": "stdout",
41 |      "output_type": "stream",
42 |      "text": [
43 |       "I am 0.0\r\n",
44 |       "I am 1.0\r\n",
45 |       "I am 2.0\r\n",
46 |       "I am 3.0\r\n",
47 |       "I am 0.1\r\n",
48 |       "I am 1.1\r\n",
49 |       "I am 2.1\r\n",
50 |       "I am 3.1\r\n"
51 |      ]
52 |     }
53 |    ],
54 |    "source": [
55 |     "!python ../02-hello_world.py"
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "code",
60 |    "execution_count": null,
61 |    "metadata": {},
62 |    "outputs": [],
63 |    "source": []
64 |   }
65 |  ],
66 |  "metadata": {
67 |   "kernelspec": {
68 |    "display_name": "Python 3",
69 |    "language": "python",
70 |    "name": "python3"
71 |   },
72 |   "language_info": {
73 |    "codemirror_mode": {
74 |     "name": "ipython",
75 |     "version": 3
76 |    },
77 |    "file_extension": ".py",
78 |    "mimetype": "text/x-python",
79 |    "name": "python",
80 |    "nbconvert_exporter": "python",
81 |    "pygments_lexer": "ipython3",
82 |    "version": "3.6.5"
83 |   }
84 |  },
85 |  "nbformat": 4,
86 |  "nbformat_minor": 2
87 | }
88 | 


--------------------------------------------------------------------------------
/cuda-c/src/03-device-query/device_query.cu:
--------------------------------------------------------------------------------
 1 | #include "../utils/common.h"
 2 | 
 3 | int main(void){
 4 |     cudaDeviceProp prop;
 5 | 
 6 |     int count;
 7 | 
 8 |     HANDLE_ERROR(cudaGetDeviceCount(&count));
 9 | 
10 |     for (int i=0; i < count; i++) {
11 |         HANDLE_ERROR(cudaGetDeviceProperties(&prop, i));
12 | 
13 |         printf("  ----- General Information for device %d -----\n", i);
14 |         printf("Name:  %s\n", prop.name);
15 |         printf("Compute capability: %d.%d\n", prop.major, prop.minor);
16 |         printf("Clock rate: %d\n", prop.clockRate);
17 |         printf("Device copy overlap: ");
18 |         if(prop.deviceOverlap)
19 |             printf("Enabled\n");
20 | 
21 |         else
22 |             printf("Disabled\n");
23 | 
24 |         printf("Kernel execution timeout : ");
25 |         if(prop.kernelExecTimeoutEnabled)
26 |             printf("Enabled\n");
27 | 
28 |         else
29 |             printf("Disabled\n");
30 | 
31 |         printf("  ----- Memory Information for device %d -----\n", i);
32 |         printf("Total global Mem:  %ld\n", prop.totalGlobalMem);
33 |         printf("Total constant Mem:  %ld\n", prop.totalConstMem);
34 |         printf("Max Mem pitch:  %ld\n", prop.memPitch);
35 |         printf("Texture Alignment:  %ld\n", prop.textureAlignment);
36 |         
37 |         printf("  ----- MP Information for device %d -----\n", i);
38 |         printf("Multiprocessor count:  %d\n", prop.multiProcessorCount);
39 |         printf("Shared mem per mp:  %ld\n", prop.sharedMemPerBlock);
40 |         printf("Registers per mp:  %d\n", prop.regsPerBlock);
41 |         printf("Threads in warp:  %d\n", prop.warpSize);
42 |         printf("Max Threads per block:  %d\n", prop.maxThreadsPerBlock);
43 |         printf("Max thread dimensions:  (%d, %d, %d)\n", prop.maxThreadsDim[0], 
44 |                                   prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
45 |         printf("Max grid dimensions:  (%d, %d, %d)\n", prop.maxGridSize[0], prop.maxGridSize[1],
46 |                                   prop.maxGridSize[2]);
47 | 
48 |         printf("\n");
49 |          
50 |     }
51 | }


--------------------------------------------------------------------------------
/cuda-cpp/Untitled.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "code",
 5 |    "execution_count": 4,
 6 |    "metadata": {},
 7 |    "outputs": [
 8 |     {
 9 |      "name": "stdout",
10 |      "output_type": "stream",
11 |      "text": [
12 |       "Overwriting file.cu\n"
13 |      ]
14 |     }
15 |    ],
16 |    "source": [
17 |     "%%file file.cu\n",
18 |     "#include <thrust/version.h>\n",
19 |     "#include <iostream>\n",
20 |     "\n",
21 |     "int main(void)\n",
22 |     "{\n",
23 |     "  int major = THRUST_MAJOR_VERSION;\n",
24 |     "  int minor = THRUST_MINOR_VERSION;\n",
25 |     "\n",
26 |     "  std::cout << \"Thrust v\" << major << \".\" << minor << std::endl;\n",
27 |     "\n",
28 |     "  return 0;\n",
29 |     "}"
30 |    ]
31 |   },
32 |   {
33 |    "cell_type": "code",
34 |    "execution_count": 5,
35 |    "metadata": {},
36 |    "outputs": [
37 |     {
38 |      "name": "stdout",
39 |      "output_type": "stream",
40 |      "text": [
41 |       "Thrust v1.9\n"
42 |      ]
43 |     }
44 |    ],
45 |    "source": [
46 |     "%%bash\n",
47 |     "nvcc file.cu -o file\n",
48 |     "./file"
49 |    ]
50 |   },
51 |   {
52 |    "cell_type": "code",
53 |    "execution_count": null,
54 |    "metadata": {},
55 |    "outputs": [],
56 |    "source": []
57 |   }
58 |  ],
59 |  "metadata": {
60 |   "kernelspec": {
61 |    "display_name": "Python 3",
62 |    "language": "python",
63 |    "name": "python3"
64 |   },
65 |   "language_info": {
66 |    "codemirror_mode": {
67 |     "name": "ipython",
68 |     "version": 3
69 |    },
70 |    "file_extension": ".py",
71 |    "mimetype": "text/x-python",
72 |    "name": "python",
73 |    "nbconvert_exporter": "python",
74 |    "pygments_lexer": "ipython3",
75 |    "version": "3.6.5"
76 |   },
77 |   "toc": {
78 |    "base_numbering": 1,
79 |    "nav_menu": {},
80 |    "number_sections": true,
81 |    "sideBar": true,
82 |    "skip_h1_title": false,
83 |    "title_cell": "Table of Contents",
84 |    "title_sidebar": "Contents",
85 |    "toc_cell": false,
86 |    "toc_position": {},
87 |    "toc_section_display": true,
88 |    "toc_window_display": false
89 |   }
90 |  },
91 |  "nbformat": 4,
92 |  "nbformat_minor": 2
93 | }
94 | 


--------------------------------------------------------------------------------
/pycuda/notebooks/01-hello-world.ipynb:
--------------------------------------------------------------------------------
 1 | {
 2 |  "cells": [
 3 |   {
 4 |    "cell_type": "markdown",
 5 |    "metadata": {},
 6 |    "source": [
 7 |     "## Import and initialize PyCUDA"
 8 |    ]
 9 |   },
10 |   {
11 |    "cell_type": "code",
12 |    "execution_count": 1,
13 |    "metadata": {},
14 |    "outputs": [
15 |     {
16 |      "name": "stdout",
17 |      "output_type": "stream",
18 |      "text": [
19 |       "Overwriting ../hello_world.py\n"
20 |      ]
21 |     }
22 |    ],
23 |    "source": [
24 |     "%%writefile ../hello_world.py\n",
25 |     "\n",
26 |     "import pycuda.driver as cuda\n",
27 |     "import pycuda.autoinit\n",
28 |     "from pycuda.compiler import SourceModule\n",
29 |     "\n",
30 |     "mod = SourceModule(\"\"\"\n",
31 |     "    #include <stdio.h>\n",
32 |     "\n",
33 |     "    __global__ void kernel()\n",
34 |     "    {\n",
35 |     "        printf(\"Hello, World!\\\\n\");\n",
36 |     "    }\"\"\")\n",
37 |     "\n",
38 |     "func = mod.get_function(\"kernel\")\n",
39 |     "\n",
40 |     "\n",
41 |     "func(block=(4, 1, 1))\n"
42 |    ]
43 |   },
44 |   {
45 |    "cell_type": "markdown",
46 |    "metadata": {},
47 |    "source": [
48 |     "Here,\n",
49 |     "`pycuda.autoinit`\n",
50 |     "serves for automatic initialization, context creation, and cleanup. The\n",
51 |     "`SourceModule`\n",
52 |     "is where a (usually short) C-like code for the GPU is to be written."
53 |    ]
54 |   },
55 |   {
56 |    "cell_type": "code",
57 |    "execution_count": 2,
58 |    "metadata": {},
59 |    "outputs": [
60 |     {
61 |      "name": "stdout",
62 |      "output_type": "stream",
63 |      "text": [
64 |       "Hello, World!\r\n",
65 |       "Hello, World!\r\n",
66 |       "Hello, World!\r\n",
67 |       "Hello, World!\r\n"
68 |      ]
69 |     }
70 |    ],
71 |    "source": [
72 |     "!python ../hello_world.py"
73 |    ]
74 |   }
75 |  ],
76 |  "metadata": {
77 |   "kernelspec": {
78 |    "display_name": "Python 3",
79 |    "language": "python",
80 |    "name": "python3"
81 |   },
82 |   "language_info": {
83 |    "codemirror_mode": {
84 |     "name": "ipython",
85 |     "version": 3
86 |    },
87 |    "file_extension": ".py",
88 |    "mimetype": "text/x-python",
89 |    "name": "python",
90 |    "nbconvert_exporter": "python",
91 |    "pygments_lexer": "ipython3",
92 |    "version": "3.6.5"
93 |   }
94 |  },
95 |  "nbformat": 4,
96 |  "nbformat_minor": 2
97 | }
98 | 


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sumArraysOnDevice.cu:
--------------------------------------------------------------------------------
 1 | 
 2 | #include <stdlib.h>
 3 | #include <string.h>
 4 | #include <time.h>
 5 | #include <stdio.h>
 6 | 
 7 | __global__ void sumArraysOnDevice(float *A, float *B, float *C){
 8 |     int idx = threadIdx.x;
 9 |     C[idx] = A[idx] + B[idx];
10 | 
11 | }
12 | 
13 | 
14 | void initialData(float *ip, int size){
15 |     // generate different seed for random number 
16 |     time_t t;
17 |     srand((unsigned int) time (&t));
18 |     
19 |     for (int i=0; i<size; i++){
20 |         ip[i] = (float)(rand() & 0xFF) / 10.0f;
21 |     }
22 | }
23 | 
24 | 
25 | void sumArraysOnHost(float *A, float *B, float *C, const int N){
26 |     for (int idx=0; idx<N; idx++){
27 |         C[idx] = A[idx] + B[idx];
28 |     }
29 | }
30 | 
31 | 
32 | 
33 | void checkResult(float *h_C, float *result, const int N){
34 |     double epsilon = 1.0E-8;
35 |     int match = 1;
36 |     for (int i = 0; i < N; i++){
37 |         if (abs(h_C[i] - result[i]) > epsilon){
38 |             match = 0;
39 |             printf("Arrays do not match!\n");
40 |             printf("host %5.2f gpu %5.2f at current %d\n",
41 |                    h_C[i], result[i], i);
42 |             break;
43 |         }
44 |     }
45 |     if (match) printf("Arrays match. \n\n");
46 | }
47 | 
48 | 
49 | int main(int argc, char **argv){
50 |     int nElem = 1024;
51 |     size_t nBytes = nElem * sizeof(float);
52 |     
53 |     float *h_A, *h_B, *h_C, *result;
54 |     h_A = (float *)malloc(nBytes);
55 |     h_B = (float *)malloc(nBytes);
56 |     h_C = (float *)malloc(nBytes);
57 |     result = (float *)malloc(nBytes);
58 |     
59 |     initialData(h_A, nElem);
60 |     initialData(h_B, nElem);
61 |     
62 |     float *d_A, *d_B, *d_C;
63 |     cudaMalloc((float**)&d_A, nBytes);
64 |     cudaMalloc((float**)&d_B, nBytes);
65 |     cudaMalloc((float**)&d_C, nBytes);
66 |     
67 |     // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the
68 |     // parameter cudaMemcpyHostToDevice specifying the transfer direction.
69 |     
70 |     cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);
71 |     cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);
72 |     
73 |     
74 |     
75 |     sumArraysOnDevice<<<1, nElem>>>(d_A, d_B, d_C);
76 |     sumArraysOnHost(h_A, h_B, result, nElem);
77 |     
78 |     cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);
79 |     
80 |     for (int i=0; i<10; i++){
81 |          printf("%f + %f = %f \n", h_A[i], h_B[i], h_C[i]);
82 | 
83 |     }
84 |     
85 |     checkResult(h_C, result, nElem);
86 |     
87 |     free(h_A);
88 |     free(h_B);
89 |     free(h_C);
90 |     free(result);
91 |     
92 |     // use cudaFree to release the memory used on the GPU
93 |     cudaFree(d_A);
94 |     cudaFree(d_B);
95 |     cudaFree(d_C);
96 |     cudaDeviceReset();
97 |     
98 |     return (0);
99 | }


--------------------------------------------------------------------------------
/cuda-c/src/utils/cpu_bitmap.h:
--------------------------------------------------------------------------------
 1 | /*
 2 |  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
 3 |  *
 4 |  * NVIDIA Corporation and its licensors retain all intellectual property and
 5 |  * proprietary rights in and to this software and related documentation.
 6 |  * Any use, reproduction, disclosure, or distribution of this software
 7 |  * and related documentation without an express license agreement from
 8 |  * NVIDIA Corporation is strictly prohibited.
 9 |  *
10 |  * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 |  * associated with this source code for terms and conditions that govern
12 |  * your use of this NVIDIA software.
13 |  *
14 |  */
15 | 
16 | 
17 | #ifndef __CPU_BITMAP_H__
18 | #define __CPU_BITMAP_H__
19 | 
20 | #include "gl_helper.h"
21 | 
22 | struct CPUBitmap {
23 |     unsigned char    *pixels;
24 |     int     x, y;
25 |     void    *dataBlock;
26 |     void (*bitmapExit)(void*);
27 | 
28 |     CPUBitmap( int width, int height, void *d = NULL ) {
29 |         pixels = new unsigned char[width * height * 4];
30 |         x = width;
31 |         y = height;
32 |         dataBlock = d;
33 |     }
34 | 
35 |     ~CPUBitmap() {
36 |         delete [] pixels;
37 |     }
38 | 
39 |     unsigned char* get_ptr( void ) const   { return pixels; }
40 |     long image_size( void ) const { return x * y * 4; }
41 | 
42 |     void display_and_exit( void(*e)(void*) = NULL ) {
43 |         CPUBitmap**   bitmap = get_bitmap_ptr();
44 |         *bitmap = this;
45 |         bitmapExit = e;
46 |         // a bug in the Windows GLUT implementation prevents us from
47 |         // passing zero arguments to glutInit()
48 |         int c=1;
49 |         char* dummy = "";
50 |         glutInit( &c, &dummy );
51 |         glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
52 |         glutInitWindowSize( x, y );
53 |         glutCreateWindow( "bitmap" );
54 |         glutKeyboardFunc(Key);
55 |         glutDisplayFunc(Draw);
56 |         glutMainLoop();
57 |     }
58 | 
59 |      // static method used for glut callbacks
60 |     static CPUBitmap** get_bitmap_ptr( void ) {
61 |         static CPUBitmap   *gBitmap;
62 |         return &gBitmap;
63 |     }
64 | 
65 |    // static method used for glut callbacks
66 |     static void Key(unsigned char key, int x, int y) {
67 |         switch (key) {
68 |             case 27:
69 |                 CPUBitmap*   bitmap = *(get_bitmap_ptr());
70 |                 if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
71 |                     bitmap->bitmapExit( bitmap->dataBlock );
72 |                 exit(0);
73 |         }
74 |     }
75 | 
76 |     // static method used for glut callbacks
77 |     static void Draw( void ) {
78 |         CPUBitmap*   bitmap = *(get_bitmap_ptr());
79 |         glClearColor( 0.0, 0.0, 0.0, 1.0 );
80 |         glClear( GL_COLOR_BUFFER_BIT );
81 |         glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
82 |         glFlush();
83 |     }
84 | };
85 | 
86 | #endif  // __CPU_BITMAP_H__
87 | 


--------------------------------------------------------------------------------
/cuda-c/src/05-julia-set/julia_set.cu:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include "../utils/common.h"
  3 | #include "../utils/cpu_bitmap.h"
  4 | 
  5 | #define DIM 1000
  6 | 
  7 | 
  8 | /* cuComplex structure that defines a method for storing a complex number
  9 |  with single precision floating-point components. The structure also defines
 10 |  addition and multiplication operators as well as a function to return 
 11 |  the magnitude of the complex value. 
 12 | */
 13 | 
 14 | struct cuComplex {
 15 |     float   r;
 16 |     float   i;
 17 |     __device__ cuComplex( float a, float b ) : r(a), i(b)  {}
 18 |     __device__ float magnitude2( void ) {
 19 |         return r * r + i * i;
 20 |     }
 21 |     __device__ cuComplex operator*(const cuComplex& a) {
 22 |         return cuComplex(r*a.r - i*a.i, i*a.r + r*a.i);
 23 |     }
 24 |     __device__ cuComplex operator+(const cuComplex& a) {
 25 |         return cuComplex(r+a.r, i+a.i);
 26 |     }
 27 | };
 28 | 
 29 | 
 30 | // Code that determines whether a point is in or out of the 
 31 | // Julia Set. 
 32 | 
 33 | __device__ int julia(int x, int y){
 34 |     const float scale= 1.5;
 35 |     float jx = scale * (float)(DIM/2 - x) / (DIM/2);
 36 |     float jy = scale * (float)(DIM/2 - y) / (DIM/2);
 37 | 
 38 |     cuComplex c(-0.8, 0.156);
 39 |     cuComplex a(jx, jy);
 40 | 
 41 |     int i = 0;
 42 |     for (i=0; i<200; i++){
 43 |         a = a * a + c;
 44 |         if (a.magnitude2() > 1000)
 45 |             return 0;
 46 |     }
 47 | 
 48 |     return 1;
 49 | }
 50 | 
 51 | __global__ void kernel(unsigned char *ptr){
 52 |     // map from blockIdx to pixel position
 53 |     int x = blockIdx.x;
 54 |     int y = blockIdx.y;
 55 |     // compute linear offset with help of built-iin variable, gridDim
 56 |     // This variable is a constant across all blocks and simply holds the 
 57 |     // dimensions of the grid that was launched. 
 58 |     // In this example, it will always be the value (DIM, DIM)
 59 |     int offset = x + y * gridDim.x;
 60 | 
 61 |     // now calculate the value at that position
 62 |     int juliaValue = julia(x, y);
 63 |     ptr[offset*4 + 0] = 255 * juliaValue;
 64 |     ptr[offset*4 + 1] = 0;
 65 |     ptr[offset*4 + 2] = 0;
 66 |     ptr[offset*4 + 3] = 255;
 67 | 
 68 | }
 69 | 
 70 | // globals needed by the update routine
 71 | struct DataBlock {
 72 |     unsigned char   *dev_bitmap;
 73 | };
 74 | 
 75 | 
 76 | int main(void){
 77 | 
 78 |     // Create DIM x DIM bitmap image using utility library
 79 |     DataBlock   data;
 80 |     CPUBitmap bitmap( DIM, DIM, &data );
 81 | 
 82 | 
 83 |     // Because the computation will be done on a GPU, declare a pointer to hold a copy
 84 |     // of data on the device 
 85 |     unsigned char *device_bitmap;
 86 | 
 87 |     HANDLE_ERROR(cudaMalloc((void**)&device_bitmap, bitmap.image_size()));
 88 |  
 89 |     // type dim3 is a CUDA runtime type that represents a 3-D (with z=1)  
 90 |     // tuple that will be used to specify the size of our launch 
 91 |     dim3 grid(DIM, DIM);
 92 | 
 93 |     kernel<<<grid, 1>>>(device_bitmap);
 94 | 
 95 |     HANDLE_ERROR(cudaMemcpy(bitmap.get_ptr(), device_bitmap,
 96 |                             bitmap.image_size(), cudaMemcpyDeviceToHost));
 97 | 
 98 |     
 99 |     cudaFree(device_bitmap);
100 |     bitmap.display_and_exit();
101 | 
102 | }
103 | 


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/02-organizing-threads.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 5,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Overwriting checkDimensions.cu\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "%%file checkDimensions.cu\n",
 18 |     "\n",
 19 |     "#include <cuda_runtime.h>\n",
 20 |     "#include <stdio.h>\n",
 21 |     "\n",
 22 |     "__global__\n",
 23 |     "void checkIndex(void){\n",
 24 |     "    printf(\"threadIdx:(%d, %d, %d) blockIdx:(%d, %d, %d) blockDim:(%d, %d, %d) \" \n",
 25 |     "           \"gridDim:(%d, %d, %d)\\n\", threadIdx.x, threadIdx.y, threadIdx.z, blockIdx.x, \n",
 26 |     "           blockIdx.y, blockIdx.z, blockDim.x, blockDim.y, blockDim.z,\n",
 27 |     "           gridDim.x,gridDim.y,gridDim.z);\n",
 28 |     "}\n",
 29 |     "\n",
 30 |     "\n",
 31 |     "int main(int argc, char **argv) { \n",
 32 |     "    // define total data element \n",
 33 |     "    int nElem = 6;\n",
 34 |     "    // define grid and block structure \n",
 35 |     "    dim3 block (3); \n",
 36 |     "    dim3 grid ((nElem+block.x-1)/block.x);\n",
 37 |     "    // check grid and block dimension from host side \n",
 38 |     "    printf(\"grid.x %d grid.y %d grid.z %d\\n\",grid.x, grid.y, grid.z); \n",
 39 |     "    printf(\"block.x %d block.y %d block.z %d\\n\",block.x, block.y, block.z);\n",
 40 |     "    // check grid and block dimension from device side \n",
 41 |     "    checkIndex <<<grid, block>>> ();\n",
 42 |     "    // reset device before you leave \n",
 43 |     "    cudaDeviceReset();\n",
 44 |     "    return(0);\n",
 45 |     "    \n",
 46 |     "}\n"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 6,
 52 |    "metadata": {},
 53 |    "outputs": [
 54 |     {
 55 |      "name": "stdout",
 56 |      "output_type": "stream",
 57 |      "text": [
 58 |       "grid.x 2 grid.y 1 grid.z 1\n",
 59 |       "block.x 3 block.y 1 block.z 1\n",
 60 |       "threadIdx:(0, 0, 0) blockIdx:(1, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n",
 61 |       "threadIdx:(1, 0, 0) blockIdx:(1, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n",
 62 |       "threadIdx:(2, 0, 0) blockIdx:(1, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n",
 63 |       "threadIdx:(0, 0, 0) blockIdx:(0, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n",
 64 |       "threadIdx:(1, 0, 0) blockIdx:(0, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n",
 65 |       "threadIdx:(2, 0, 0) blockIdx:(0, 0, 0) blockDim:(3, 1, 1) gridDim:(2, 1, 1)\n"
 66 |      ]
 67 |     }
 68 |    ],
 69 |    "source": [
 70 |     "%%bash\n",
 71 |     "nvcc checkDimensions.cu -o checkdims\n",
 72 |     "./checkdims"
 73 |    ]
 74 |   }
 75 |  ],
 76 |  "metadata": {
 77 |   "kernelspec": {
 78 |    "display_name": "Python 3",
 79 |    "language": "python",
 80 |    "name": "python3"
 81 |   },
 82 |   "language_info": {
 83 |    "codemirror_mode": {
 84 |     "name": "ipython",
 85 |     "version": 3
 86 |    },
 87 |    "file_extension": ".py",
 88 |    "mimetype": "text/x-python",
 89 |    "name": "python",
 90 |    "nbconvert_exporter": "python",
 91 |    "pygments_lexer": "ipython3",
 92 |    "version": "3.6.5"
 93 |   },
 94 |   "toc": {
 95 |    "base_numbering": 1,
 96 |    "nav_menu": {},
 97 |    "number_sections": true,
 98 |    "sideBar": true,
 99 |    "skip_h1_title": false,
100 |    "title_cell": "Table of Contents",
101 |    "title_sidebar": "Contents",
102 |    "toc_cell": false,
103 |    "toc_position": {},
104 |    "toc_section_display": true,
105 |    "toc_window_display": false
106 |   }
107 |  },
108 |  "nbformat": 4,
109 |  "nbformat_minor": 2
110 | }
111 | 


--------------------------------------------------------------------------------
/cuda-c/src/utils/cpu_anim.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NVIDIA Corporation and its licensors retain all intellectual property and
  5 |  * proprietary rights in and to this software and related documentation.
  6 |  * Any use, reproduction, disclosure, or distribution of this software
  7 |  * and related documentation without an express license agreement from
  8 |  * NVIDIA Corporation is strictly prohibited.
  9 |  *
 10 |  * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 |  * associated with this source code for terms and conditions that govern
 12 |  * your use of this NVIDIA software.
 13 |  *
 14 |  */
 15 | 
 16 | 
 17 | #ifndef __CPU_ANIM_H__
 18 | #define __CPU_ANIM_H__
 19 | 
 20 | #include "gl_helper.h"
 21 | 
 22 | #include <iostream>
 23 | 
 24 | 
 25 | struct CPUAnimBitmap {
 26 |     unsigned char    *pixels;
 27 |     int     width, height;
 28 |     void    *dataBlock;
 29 |     void (*fAnim)(void*,int);
 30 |     void (*animExit)(void*);
 31 |     void (*clickDrag)(void*,int,int,int,int);
 32 |     int     dragStartX, dragStartY;
 33 | 
 34 |     CPUAnimBitmap( int w, int h, void *d = NULL ) {
 35 |         width = w;
 36 |         height = h;
 37 |         pixels = new unsigned char[width * height * 4];
 38 |         dataBlock = d;
 39 |         clickDrag = NULL;
 40 |     }
 41 | 
 42 |     ~CPUAnimBitmap() {
 43 |         delete [] pixels;
 44 |     }
 45 | 
 46 |     unsigned char* get_ptr( void ) const   { return pixels; }
 47 |     long image_size( void ) const { return width * height * 4; }
 48 | 
 49 |     void click_drag( void (*f)(void*,int,int,int,int)) {
 50 |         clickDrag = f;
 51 |     }
 52 | 
 53 |     void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
 54 |         CPUAnimBitmap**   bitmap = get_bitmap_ptr();
 55 |         *bitmap = this;
 56 |         fAnim = f;
 57 |         animExit = e;
 58 |         // a bug in the Windows GLUT implementation prevents us from
 59 |         // passing zero arguments to glutInit()
 60 |         int c=1;
 61 |         char* dummy = "";
 62 |         glutInit( &c, &dummy );
 63 |         glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
 64 |         glutInitWindowSize( width, height );
 65 |         glutCreateWindow( "bitmap" );
 66 |         glutKeyboardFunc(Key);
 67 |         glutDisplayFunc(Draw);
 68 |         if (clickDrag != NULL)
 69 |             glutMouseFunc( mouse_func );
 70 |         glutIdleFunc( idle_func );
 71 |         glutMainLoop();
 72 |     }
 73 | 
 74 |     // static method used for glut callbacks
 75 |     static CPUAnimBitmap** get_bitmap_ptr( void ) {
 76 |         static CPUAnimBitmap*   gBitmap;
 77 |         return &gBitmap;
 78 |     }
 79 | 
 80 |     // static method used for glut callbacks
 81 |     static void mouse_func( int button, int state,
 82 |                             int mx, int my ) {
 83 |         if (button == GLUT_LEFT_BUTTON) {
 84 |             CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
 85 |             if (state == GLUT_DOWN) {
 86 |                 bitmap->dragStartX = mx;
 87 |                 bitmap->dragStartY = my;
 88 |             } else if (state == GLUT_UP) {
 89 |                 bitmap->clickDrag( bitmap->dataBlock,
 90 |                                    bitmap->dragStartX,
 91 |                                    bitmap->dragStartY,
 92 |                                    mx, my );
 93 |             }
 94 |         }
 95 |     }
 96 | 
 97 |     // static method used for glut callbacks
 98 |     static void idle_func( void ) {
 99 |         static int ticks = 1;
100 |         CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
101 |         bitmap->fAnim( bitmap->dataBlock, ticks++ );
102 |         glutPostRedisplay();
103 |     }
104 | 
105 |     // static method used for glut callbacks
106 |     static void Key(unsigned char key, int x, int y) {
107 |         switch (key) {
108 |             case 27:
109 |                 CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
110 |                 bitmap->animExit( bitmap->dataBlock );
111 |                 //delete bitmap;
112 |                 exit(0);
113 |         }
114 |     }
115 | 
116 |     // static method used for glut callbacks
117 |     static void Draw( void ) {
118 |         CPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
119 |         glClearColor( 0.0, 0.0, 0.0, 1.0 );
120 |         glClear( GL_COLOR_BUFFER_BIT );
121 |         glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
122 |         glutSwapBuffers();
123 |     }
124 | };
125 | 
126 | 
127 | #endif  // __CPU_ANIM_H__
128 | 
129 | 


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/sumArraysOnGPU.cu:
--------------------------------------------------------------------------------
  1 | 
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <time.h>
  5 | #include <stdio.h>
  6 | #include <cuda_runtime.h>
  7 | #include <sys/time.h>
  8 | 
  9 | double cpuSecond(){
 10 |     struct timeval tp;
 11 |     gettimeofday(&tp, NULL);
 12 |     return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);
 13 | }
 14 | 
 15 | #define CHECK(call)                                                            \
 16 | {                                                                              \
 17 |     const cudaError_t error = call;                                            \
 18 |     if (error != cudaSuccess)                                                  \
 19 |     {                                                                          \
 20 |         fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);                 \
 21 |         fprintf(stderr, "code: %d, reason: %s\n", error,                       \
 22 |                 cudaGetErrorString(error));                                    \
 23 |         exit(1);                                                               \
 24 |     }                                                                          \
 25 | }
 26 | 
 27 | 
 28 | __global__ void sumArraysOnDevice(float *A, float *B, float *C, const int N){
 29 |     int idx = blockIdx.x * blockDim.x + threadIdx.x;
 30 |     if (idx < N) C[idx] = A[idx] + B[idx];
 31 | 
 32 | }
 33 | 
 34 | 
 35 | void initialData(float *ip, int size){
 36 |     // generate different seed for random number 
 37 |     time_t t;
 38 |     srand((unsigned int) time (&t));
 39 |     
 40 |     for (int i=0; i<size; i++){
 41 |         ip[i] = (float)(rand() & 0xFF) / 10.0f;
 42 |     }
 43 | }
 44 | 
 45 | 
 46 | void sumArraysOnHost(float *A, float *B, float *C, const int N){
 47 |     for (int idx=0; idx<N; idx++){
 48 |         C[idx] = A[idx] + B[idx];
 49 |     }
 50 | }
 51 | 
 52 | 
 53 | 
 54 | void checkResult(float *hostRef, float *gpuRef, const int N){
 55 |     double epsilon = 1.0E-8;
 56 |     int match = 1;
 57 |     for (int i = 0; i < N; i++){
 58 |         if (abs(hostRef[i] - gpuRef[i]) > epsilon){
 59 |             match = 0;
 60 |             printf("Arrays do not match!\n");
 61 |             printf("host %5.2f gpu %5.2f at current %d\n",
 62 |                    hostRef[i], gpuRef[i], i);
 63 |             break;
 64 |         }
 65 |     }
 66 |     if (match) printf("Arrays match. \n\n");
 67 | }
 68 | 
 69 | 
 70 | int main(int argc, char **argv){
 71 |     
 72 |     printf("%s Starting...\n", argv[0]);
 73 |     
 74 |     // malloc host memory
 75 |     int nElem = 1 <<24;
 76 |     size_t nBytes = nElem * sizeof(float);
 77 |     
 78 |     
 79 |     // initialize data at host side
 80 |     float *h_A, *h_B, *hostRef, *gpuRef;
 81 |     h_A = (float *)malloc(nBytes);
 82 |     h_B = (float *)malloc(nBytes);
 83 |     hostRef = (float *)malloc(nBytes);
 84 |     gpuRef = (float *)malloc(nBytes);
 85 |     
 86 |     // initialize data at host side
 87 |     initialData(h_A, nElem);
 88 |     initialData(h_B, nElem);
 89 |     
 90 |     memset(hostRef, 0, nBytes);
 91 |     memset(gpuRef, 0, nBytes);
 92 |     
 93 |     // malloc device global memory 
 94 |     float *d_A, *d_B, *d_C;
 95 |     cudaMalloc((float**)&d_A, nBytes);
 96 |     cudaMalloc((float**)&d_B, nBytes);
 97 |     cudaMalloc((float**)&d_C, nBytes);
 98 |     
 99 |     // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the
100 |     // parameter cudaMemcpyHostToDevice specifying the transfer direction.
101 |     
102 |     CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
103 |     CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));
104 |     
105 |     // invoke kernel at host side
106 |     int iLen = 128;
107 |     dim3 block(iLen);
108 |     dim3 grid((nElem+block.x-1)/block.x);
109 |     
110 |     double iStart = cpuSecond();
111 |     sumArraysOnDevice<<<grid, block>>>(d_A, d_B, d_C, nElem);
112 |     CHECK(cudaDeviceSynchronize());
113 |     double iElaps = cpuSecond() - iStart;
114 |     printf("sumArraysOnGPU <<<%d,%d>>> Time elapsed %f sec\n", grid.x, block.x, iElaps);
115 |     //printf("Execution configuration <<<%d, %d>>>\n", grid.x, block.x);
116 |     
117 |     // copy kernel result back to host side 
118 |     cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);
119 |     
120 |     // add vector at host side for result checks
121 |     sumArraysOnHost(h_A, h_B, hostRef, nElem);
122 |     
123 |     for (int i=0; i<10; i++){
124 |          printf("%f + %f = %f \n", h_A[i], h_B[i], hostRef[i]);
125 | 
126 |     }
127 |     
128 |     // check device results
129 |     checkResult(hostRef, gpuRef, nElem);
130 |     
131 |     free(h_A);
132 |     free(h_B);
133 |     free(hostRef);
134 |     free(gpuRef);
135 |     
136 |     // use cudaFree to release the memory used on the GPU
137 |     cudaFree(d_A);
138 |     cudaFree(d_B);
139 |     cudaFree(d_C);
140 |     cudaDeviceReset();
141 |     
142 |     return (0);
143 | }


--------------------------------------------------------------------------------
/cuda-c/src/utils/gpu_anim.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NVIDIA Corporation and its licensors retain all intellectual property and
  5 |  * proprietary rights in and to this software and related documentation.
  6 |  * Any use, reproduction, disclosure, or distribution of this software
  7 |  * and related documentation without an express license agreement from
  8 |  * NVIDIA Corporation is strictly prohibited.
  9 |  *
 10 |  * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 |  * associated with this source code for terms and conditions that govern
 12 |  * your use of this NVIDIA software.
 13 |  *
 14 |  */
 15 | 
 16 | 
 17 | #ifndef __GPU_ANIM_H__
 18 | #define __GPU_ANIM_H__
 19 | 
 20 | #include "gl_helper.h"
 21 | 
 22 | #include "cuda.h"
 23 | #include "cuda_gl_interop.h"
 24 | #include <iostream>
 25 | 
 26 | 
 27 | PFNGLBINDBUFFERARBPROC    glBindBuffer     = NULL;
 28 | PFNGLDELETEBUFFERSARBPROC glDeleteBuffers  = NULL;
 29 | PFNGLGENBUFFERSARBPROC    glGenBuffers     = NULL;
 30 | PFNGLBUFFERDATAARBPROC    glBufferData     = NULL;
 31 | 
 32 | 
 33 | struct GPUAnimBitmap {
 34 |     GLuint  bufferObj;
 35 |     cudaGraphicsResource *resource;
 36 |     int     width, height;
 37 |     void    *dataBlock;
 38 |     void (*fAnim)(uchar4*,void*,int);
 39 |     void (*animExit)(void*);
 40 |     void (*clickDrag)(void*,int,int,int,int);
 41 |     int     dragStartX, dragStartY;
 42 | 
 43 |     GPUAnimBitmap( int w, int h, void *d = NULL ) {
 44 |         width = w;
 45 |         height = h;
 46 |         dataBlock = d;
 47 |         clickDrag = NULL;
 48 | 
 49 |         // first, find a CUDA device and set it to graphic interop
 50 |         cudaDeviceProp  prop;
 51 |         int dev;
 52 |         memset( &prop, 0, sizeof( cudaDeviceProp ) );
 53 |         prop.major = 1;
 54 |         prop.minor = 0;
 55 |         HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );
 56 |         cudaGLSetGLDevice( dev );
 57 | 
 58 |         // a bug in the Windows GLUT implementation prevents us from
 59 |         // passing zero arguments to glutInit()
 60 |         int c=1;
 61 |         char* dummy = "";
 62 |         glutInit( &c, &dummy );
 63 |         glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
 64 |         glutInitWindowSize( width, height );
 65 |         glutCreateWindow( "bitmap" );
 66 | 
 67 |         glBindBuffer    = (PFNGLBINDBUFFERARBPROC)GET_PROC_ADDRESS("glBindBuffer");
 68 |         glDeleteBuffers = (PFNGLDELETEBUFFERSARBPROC)GET_PROC_ADDRESS("glDeleteBuffers");
 69 |         glGenBuffers    = (PFNGLGENBUFFERSARBPROC)GET_PROC_ADDRESS("glGenBuffers");
 70 |         glBufferData    = (PFNGLBUFFERDATAARBPROC)GET_PROC_ADDRESS("glBufferData");
 71 | 
 72 |         glGenBuffers( 1, &bufferObj );
 73 |         glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, bufferObj );
 74 |         glBufferData( GL_PIXEL_UNPACK_BUFFER_ARB, width * height * 4,
 75 |                       NULL, GL_DYNAMIC_DRAW_ARB );
 76 | 
 77 |         HANDLE_ERROR( cudaGraphicsGLRegisterBuffer( &resource, bufferObj, cudaGraphicsMapFlagsNone ) );
 78 |     }
 79 | 
 80 |     ~GPUAnimBitmap() {
 81 |         free_resources();
 82 |     }
 83 | 
 84 |     void free_resources( void ) {
 85 |         HANDLE_ERROR( cudaGraphicsUnregisterResource( resource ) );
 86 | 
 87 |         glBindBuffer( GL_PIXEL_UNPACK_BUFFER_ARB, 0 );
 88 |         glDeleteBuffers( 1, &bufferObj );
 89 |     }
 90 | 
 91 | 
 92 |     long image_size( void ) const { return width * height * 4; }
 93 | 
 94 |     void click_drag( void (*f)(void*,int,int,int,int)) {
 95 |         clickDrag = f;
 96 |     }
 97 | 
 98 |     void anim_and_exit( void (*f)(uchar4*,void*,int), void(*e)(void*) ) {
 99 |         GPUAnimBitmap**   bitmap = get_bitmap_ptr();
100 |         *bitmap = this;
101 |         fAnim = f;
102 |         animExit = e;
103 | 
104 |         glutKeyboardFunc( Key );
105 |         glutDisplayFunc( Draw );
106 |         if (clickDrag != NULL)
107 |             glutMouseFunc( mouse_func );
108 |         glutIdleFunc( idle_func );
109 |         glutMainLoop();
110 |     }
111 | 
112 |     // static method used for glut callbacks
113 |     static GPUAnimBitmap** get_bitmap_ptr( void ) {
114 |         static GPUAnimBitmap*   gBitmap;
115 |         return &gBitmap;
116 |     }
117 | 
118 |     // static method used for glut callbacks
119 |     static void mouse_func( int button, int state,
120 |                             int mx, int my ) {
121 |         if (button == GLUT_LEFT_BUTTON) {
122 |             GPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
123 |             if (state == GLUT_DOWN) {
124 |                 bitmap->dragStartX = mx;
125 |                 bitmap->dragStartY = my;
126 |             } else if (state == GLUT_UP) {
127 |                 bitmap->clickDrag( bitmap->dataBlock,
128 |                                    bitmap->dragStartX,
129 |                                    bitmap->dragStartY,
130 |                                    mx, my );
131 |             }
132 |         }
133 |     }
134 | 
135 |     // static method used for glut callbacks
136 |     static void idle_func( void ) {
137 |         static int ticks = 1;
138 |         GPUAnimBitmap*  bitmap = *(get_bitmap_ptr());
139 |         uchar4*         devPtr;
140 |         size_t  size;
141 | 
142 |         HANDLE_ERROR( cudaGraphicsMapResources( 1, &(bitmap->resource), NULL ) );
143 |         HANDLE_ERROR( cudaGraphicsResourceGetMappedPointer( (void**)&devPtr, &size, bitmap->resource) );
144 | 
145 |         bitmap->fAnim( devPtr, bitmap->dataBlock, ticks++ );
146 | 
147 |         HANDLE_ERROR( cudaGraphicsUnmapResources( 1, &(bitmap->resource), NULL ) );
148 | 
149 |         glutPostRedisplay();
150 |     }
151 | 
152 |     // static method used for glut callbacks
153 |     static void Key(unsigned char key, int x, int y) {
154 |         switch (key) {
155 |             case 27:
156 |                 GPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
157 |                 if (bitmap->animExit)
158 |                     bitmap->animExit( bitmap->dataBlock );
159 |                 bitmap->free_resources();
160 |                 exit(0);
161 |         }
162 |     }
163 | 
164 |     // static method used for glut callbacks
165 |     static void Draw( void ) {
166 |         GPUAnimBitmap*   bitmap = *(get_bitmap_ptr());
167 |         glClearColor( 0.0, 0.0, 0.0, 1.0 );
168 |         glClear( GL_COLOR_BUFFER_BIT );
169 |         glDrawPixels( bitmap->width, bitmap->height, GL_RGBA,
170 |                       GL_UNSIGNED_BYTE, 0 );
171 |         glutSwapBuffers();
172 |     }
173 | };
174 | 
175 | 
176 | #endif  // __GPU_ANIM_H__
177 | 
178 | 


--------------------------------------------------------------------------------
/cuda-c/src/utils/common.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  * Copyright 1993-2010 NVIDIA Corporation.  All rights reserved.
  3 |  *
  4 |  * NVIDIA Corporation and its licensors retain all intellectual property and
  5 |  * proprietary rights in and to this software and related documentation.
  6 |  * Any use, reproduction, disclosure, or distribution of this software
  7 |  * and related documentation without an express license agreement from
  8 |  * NVIDIA Corporation is strictly prohibited.
  9 |  *
 10 |  * Please refer to the applicable NVIDIA end user license agreement (EULA)
 11 |  * associated with this source code for terms and conditions that govern
 12 |  * your use of this NVIDIA software.
 13 |  *
 14 |  */
 15 | 
 16 | 
 17 | #ifndef __COMMON_H__
 18 | #define __COMMON_H__
 19 | #include <stdio.h>
 20 | 
 21 | static void HandleError( cudaError_t err,
 22 |                          const char *file,
 23 |                          int line ) {
 24 |     if (err != cudaSuccess) {
 25 |         printf( "%s in %s at line %d\n", cudaGetErrorString( err ),
 26 |                 file, line );
 27 |         exit( EXIT_FAILURE );
 28 |     }
 29 | }
 30 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
 31 | 
 32 | 
 33 | #define HANDLE_NULL( a ) {if (a == NULL) { \
 34 |                             printf( "Host memory failed in %s at line %d\n", \
 35 |                                     __FILE__, __LINE__ ); \
 36 |                             exit( EXIT_FAILURE );}}
 37 | 
 38 | template< typename T >
 39 | void swap( T& a, T& b ) {
 40 |     T t = a;
 41 |     a = b;
 42 |     b = t;
 43 | }
 44 | 
 45 | 
 46 | void* big_random_block( int size ) {
 47 |     unsigned char *data = (unsigned char*)malloc( size );
 48 |     HANDLE_NULL( data );
 49 |     for (int i=0; i<size; i++)
 50 |         data[i] = rand();
 51 | 
 52 |     return data;
 53 | }
 54 | 
 55 | int* big_random_block_int( int size ) {
 56 |     int *data = (int*)malloc( size * sizeof(int) );
 57 |     HANDLE_NULL( data );
 58 |     for (int i=0; i<size; i++)
 59 |         data[i] = rand();
 60 | 
 61 |     return data;
 62 | }
 63 | 
 64 | 
 65 | // a place for common kernels - starts here
 66 | 
 67 | __device__ unsigned char value( float n1, float n2, int hue ) {
 68 |     if (hue > 360)      hue -= 360;
 69 |     else if (hue < 0)   hue += 360;
 70 | 
 71 |     if (hue < 60)
 72 |         return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
 73 |     if (hue < 180)
 74 |         return (unsigned char)(255 * n2);
 75 |     if (hue < 240)
 76 |         return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
 77 |     return (unsigned char)(255 * n1);
 78 | }
 79 | 
 80 | __global__ void float_to_color( unsigned char *optr,
 81 |                               const float *outSrc ) {
 82 |     // map from threadIdx/BlockIdx to pixel position
 83 |     int x = threadIdx.x + blockIdx.x * blockDim.x;
 84 |     int y = threadIdx.y + blockIdx.y * blockDim.y;
 85 |     int offset = x + y * blockDim.x * gridDim.x;
 86 | 
 87 |     float l = outSrc[offset];
 88 |     float s = 1;
 89 |     int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
 90 |     float m1, m2;
 91 | 
 92 |     if (l <= 0.5f)
 93 |         m2 = l * (1 + s);
 94 |     else
 95 |         m2 = l + s - l * s;
 96 |     m1 = 2 * l - m2;
 97 | 
 98 |     optr[offset*4 + 0] = value( m1, m2, h+120 );
 99 |     optr[offset*4 + 1] = value( m1, m2, h );
100 |     optr[offset*4 + 2] = value( m1, m2, h -120 );
101 |     optr[offset*4 + 3] = 255;
102 | }
103 | 
104 | __global__ void float_to_color( uchar4 *optr,
105 |                               const float *outSrc ) {
106 |     // map from threadIdx/BlockIdx to pixel position
107 |     int x = threadIdx.x + blockIdx.x * blockDim.x;
108 |     int y = threadIdx.y + blockIdx.y * blockDim.y;
109 |     int offset = x + y * blockDim.x * gridDim.x;
110 | 
111 |     float l = outSrc[offset];
112 |     float s = 1;
113 |     int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
114 |     float m1, m2;
115 | 
116 |     if (l <= 0.5f)
117 |         m2 = l * (1 + s);
118 |     else
119 |         m2 = l + s - l * s;
120 |     m1 = 2 * l - m2;
121 | 
122 |     optr[offset].x = value( m1, m2, h+120 );
123 |     optr[offset].y = value( m1, m2, h );
124 |     optr[offset].z = value( m1, m2, h -120 );
125 |     optr[offset].w = 255;
126 | }
127 | 
128 | 
129 | #if _WIN32
130 |     //Windows threads.
131 |     #include <windows.h>
132 | 
133 |     typedef HANDLE CUTThread;
134 |     typedef unsigned (WINAPI *CUT_THREADROUTINE)(void *);
135 | 
136 |     #define CUT_THREADPROC unsigned WINAPI
137 |     #define  CUT_THREADEND return 0
138 | 
139 | #else
140 |     //POSIX threads.
141 |     #include <pthread.h>
142 | 
143 |     typedef pthread_t CUTThread;
144 |     typedef void *(*CUT_THREADROUTINE)(void *);
145 | 
146 |     #define CUT_THREADPROC void
147 |     #define  CUT_THREADEND
148 | #endif
149 | 
150 | //Create thread.
151 | CUTThread start_thread( CUT_THREADROUTINE, void *data );
152 | 
153 | //Wait for thread to finish.
154 | void end_thread( CUTThread thread );
155 | 
156 | //Destroy thread.
157 | void destroy_thread( CUTThread thread );
158 | 
159 | //Wait for multiple threads.
160 | void wait_for_threads( const CUTThread *threads, int num );
161 | 
162 | #if _WIN32
163 |     //Create thread
164 |     CUTThread start_thread(CUT_THREADROUTINE func, void *data){
165 |         return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
166 |     }
167 | 
168 |     //Wait for thread to finish
169 |     void end_thread(CUTThread thread){
170 |         WaitForSingleObject(thread, INFINITE);
171 |         CloseHandle(thread);
172 |     }
173 | 
174 |     //Destroy thread
175 |     void destroy_thread( CUTThread thread ){
176 |         TerminateThread(thread, 0);
177 |         CloseHandle(thread);
178 |     }
179 | 
180 |     //Wait for multiple threads
181 |     void wait_for_threads(const CUTThread * threads, int num){
182 |         WaitForMultipleObjects(num, threads, true, INFINITE);
183 | 
184 |         for(int i = 0; i < num; i++)
185 |             CloseHandle(threads[i]);
186 |     }
187 | 
188 | #else
189 |     //Create thread
190 |     CUTThread start_thread(CUT_THREADROUTINE func, void * data){
191 |         pthread_t thread;
192 |         pthread_create(&thread, NULL, func, data);
193 |         return thread;
194 |     }
195 | 
196 |     //Wait for thread to finish
197 |     void end_thread(CUTThread thread){
198 |         pthread_join(thread, NULL);
199 |     }
200 | 
201 |     //Destroy thread
202 |     void destroy_thread( CUTThread thread ){
203 |         pthread_cancel(thread);
204 |     }
205 | 
206 |     //Wait for multiple threads
207 |     void wait_for_threads(const CUTThread * threads, int num){
208 |         for(int i = 0; i < num; i++)
209 |             end_thread( threads[i] );
210 |     }
211 | 
212 | #endif
213 | 
214 | 
215 | 
216 | 
217 | #endif  // __COMMON_H__


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/03-compiling-and-executing.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 49,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Overwriting sumArraysOnGPU.cu\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "%%file sumArraysOnGPU.cu\n",
 18 |     "\n",
 19 |     "#include <stdlib.h>\n",
 20 |     "#include <string.h>\n",
 21 |     "#include <time.h>\n",
 22 |     "#include <stdio.h>\n",
 23 |     "#include <cuda_runtime.h>\n",
 24 |     "\n",
 25 |     "\n",
 26 |     "#define CHECK(call)                                                            \\\n",
 27 |     "{                                                                              \\\n",
 28 |     "    const cudaError_t error = call;                                            \\\n",
 29 |     "    if (error != cudaSuccess)                                                  \\\n",
 30 |     "    {                                                                          \\\n",
 31 |     "        fprintf(stderr, \"Error: %s:%d, \", __FILE__, __LINE__);                 \\\n",
 32 |     "        fprintf(stderr, \"code: %d, reason: %s\\n\", error,                       \\\n",
 33 |     "                cudaGetErrorString(error));                                    \\\n",
 34 |     "        exit(1);                                                               \\\n",
 35 |     "    }                                                                          \\\n",
 36 |     "}\n",
 37 |     "\n",
 38 |     "\n",
 39 |     "__global__ void sumArraysOnDevice(float *A, float *B, float *C){\n",
 40 |     "    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n",
 41 |     "    C[idx] = A[idx] + B[idx];\n",
 42 |     "\n",
 43 |     "}\n",
 44 |     "\n",
 45 |     "\n",
 46 |     "void initialData(float *ip, int size){\n",
 47 |     "    // generate different seed for random number \n",
 48 |     "    time_t t;\n",
 49 |     "    srand((unsigned int) time (&t));\n",
 50 |     "    \n",
 51 |     "    for (int i=0; i<size; i++){\n",
 52 |     "        ip[i] = (float)(rand() & 0xFF) / 10.0f;\n",
 53 |     "    }\n",
 54 |     "}\n",
 55 |     "\n",
 56 |     "\n",
 57 |     "void sumArraysOnHost(float *A, float *B, float *C, const int N){\n",
 58 |     "    for (int idx=0; idx<N; idx++){\n",
 59 |     "        C[idx] = A[idx] + B[idx];\n",
 60 |     "    }\n",
 61 |     "}\n",
 62 |     "\n",
 63 |     "\n",
 64 |     "\n",
 65 |     "void checkResult(float *hostRef, float *gpuRef, const int N){\n",
 66 |     "    double epsilon = 1.0E-8;\n",
 67 |     "    int match = 1;\n",
 68 |     "    for (int i = 0; i < N; i++){\n",
 69 |     "        if (abs(hostRef[i] - gpuRef[i]) > epsilon){\n",
 70 |     "            match = 0;\n",
 71 |     "            printf(\"Arrays do not match!\\n\");\n",
 72 |     "            printf(\"host %5.2f gpu %5.2f at current %d\\n\",\n",
 73 |     "                   hostRef[i], gpuRef[i], i);\n",
 74 |     "            break;\n",
 75 |     "        }\n",
 76 |     "    }\n",
 77 |     "    if (match) printf(\"Arrays match. \\n\\n\");\n",
 78 |     "}\n",
 79 |     "\n",
 80 |     "\n",
 81 |     "int main(int argc, char **argv){\n",
 82 |     "    \n",
 83 |     "    printf(\"%s Starting...\\n\", argv[0]);\n",
 84 |     "    \n",
 85 |     "    // malloc host memory\n",
 86 |     "    int nElem = 10000;\n",
 87 |     "    size_t nBytes = nElem * sizeof(float);\n",
 88 |     "    \n",
 89 |     "    \n",
 90 |     "    // initialize data at host side\n",
 91 |     "    float *h_A, *h_B, *hostRef, *gpuRef;\n",
 92 |     "    h_A = (float *)malloc(nBytes);\n",
 93 |     "    h_B = (float *)malloc(nBytes);\n",
 94 |     "    hostRef = (float *)malloc(nBytes);\n",
 95 |     "    gpuRef = (float *)malloc(nBytes);\n",
 96 |     "    \n",
 97 |     "    // initialize data at host side\n",
 98 |     "    initialData(h_A, nElem);\n",
 99 |     "    initialData(h_B, nElem);\n",
100 |     "    \n",
101 |     "    memset(hostRef, 0, nBytes);\n",
102 |     "    memset(gpuRef, 0, nBytes);\n",
103 |     "    \n",
104 |     "    // malloc device global memory \n",
105 |     "    float *d_A, *d_B, *d_C;\n",
106 |     "    cudaMalloc((float**)&d_A, nBytes);\n",
107 |     "    cudaMalloc((float**)&d_B, nBytes);\n",
108 |     "    cudaMalloc((float**)&d_C, nBytes);\n",
109 |     "    \n",
110 |     "    // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the\n",
111 |     "    // parameter cudaMemcpyHostToDevice specifying the transfer direction.\n",
112 |     "    \n",
113 |     "    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));\n",
114 |     "    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));\n",
115 |     "    \n",
116 |     "    // invoke kernel at host side\n",
117 |     "    dim3 block(100);\n",
118 |     "    dim3 grid(nElem / block.x);\n",
119 |     "    \n",
120 |     "    sumArraysOnDevice<<<grid, block>>>(d_A, d_B, d_C);\n",
121 |     "    printf(\"Execution configuration <<<%d, %d>>>\\n\", grid.x, block.x);\n",
122 |     "    \n",
123 |     "    // copy kernel result back to host side \n",
124 |     "    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);\n",
125 |     "    \n",
126 |     "    // add vector at host side for result checks\n",
127 |     "    sumArraysOnHost(h_A, h_B, hostRef, nElem);\n",
128 |     "    \n",
129 |     "    for (int i=0; i<10; i++){\n",
130 |     "         printf(\"%f + %f = %f \\n\", h_A[i], h_B[i], hostRef[i]);\n",
131 |     "\n",
132 |     "    }\n",
133 |     "    \n",
134 |     "    // check device results\n",
135 |     "    checkResult(hostRef, gpuRef, nElem);\n",
136 |     "    \n",
137 |     "    free(h_A);\n",
138 |     "    free(h_B);\n",
139 |     "    free(hostRef);\n",
140 |     "    free(gpuRef);\n",
141 |     "    \n",
142 |     "    // use cudaFree to release the memory used on the GPU\n",
143 |     "    cudaFree(d_A);\n",
144 |     "    cudaFree(d_B);\n",
145 |     "    cudaFree(d_C);\n",
146 |     "    cudaDeviceReset();\n",
147 |     "    \n",
148 |     "    return (0);\n",
149 |     "}\n"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 50,
155 |    "metadata": {},
156 |    "outputs": [
157 |     {
158 |      "name": "stdout",
159 |      "output_type": "stream",
160 |      "text": [
161 |       "./addvector Starting...\n",
162 |       "Execution configuration <<<100, 100>>>\n",
163 |       "17.600000 + 17.600000 = 35.200001 \n",
164 |       "16.299999 + 16.299999 = 32.599998 \n",
165 |       "0.600000 + 0.600000 = 1.200000 \n",
166 |       "23.200001 + 23.200001 = 46.400002 \n",
167 |       "16.799999 + 16.799999 = 33.599998 \n",
168 |       "15.600000 + 15.600000 = 31.200001 \n",
169 |       "2.200000 + 2.200000 = 4.400000 \n",
170 |       "19.700001 + 19.700001 = 39.400002 \n",
171 |       "4.300000 + 4.300000 = 8.600000 \n",
172 |       "3.200000 + 3.200000 = 6.400000 \n",
173 |       "Arrays match. \n",
174 |       "\n"
175 |      ]
176 |     },
177 |     {
178 |      "name": "stderr",
179 |      "output_type": "stream",
180 |      "text": [
181 |       "==26284== NVPROF is profiling process 26284, command: ./addvector\n",
182 |       "==26284== Profiling application: ./addvector\n",
183 |       "==26284== Profiling result:\n",
184 |       "            Type  Time(%)      Time     Calls       Avg       Min       Max  Name\n",
185 |       " GPU activities:   61.39%  41.056us         2  20.528us  19.840us  21.216us  [CUDA memcpy HtoD]\n",
186 |       "                   32.87%  21.984us         1  21.984us  21.984us  21.984us  [CUDA memcpy DtoH]\n",
187 |       "                    5.74%  3.8400us         1  3.8400us  3.8400us  3.8400us  sumArraysOnDevice(float*, float*, float*)\n",
188 |       "      API calls:   67.64%  108.27ms         3  36.090ms  6.2490us  108.25ms  cudaMalloc\n",
189 |       "                   31.70%  50.742ms         1  50.742ms  50.742ms  50.742ms  cudaDeviceReset\n",
190 |       "                    0.37%  586.92us        94  6.2430us     177ns  259.83us  cuDeviceGetAttribute\n",
191 |       "                    0.10%  166.18us         3  55.392us  6.7450us  147.89us  cudaFree\n",
192 |       "                    0.07%  117.21us         3  39.069us  22.571us  54.160us  cudaMemcpy\n",
193 |       "                    0.05%  80.415us         1  80.415us  80.415us  80.415us  cuDeviceTotalMem\n",
194 |       "                    0.05%  75.864us         1  75.864us  75.864us  75.864us  cuDeviceGetName\n",
195 |       "                    0.02%  25.566us         1  25.566us  25.566us  25.566us  cudaLaunch\n",
196 |       "                    0.00%  2.8060us         2  1.4030us  1.2850us  1.5210us  cuDeviceGetCount\n",
197 |       "                    0.00%  2.7890us         3     929ns     221ns  2.1490us  cudaSetupArgument\n",
198 |       "                    0.00%  1.0910us         2     545ns     450ns     641ns  cuDeviceGet\n",
199 |       "                    0.00%     788ns         1     788ns     788ns     788ns  cudaConfigureCall\n"
200 |      ]
201 |     }
202 |    ],
203 |    "source": [
204 |     "%%bash\n",
205 |     "nvcc sumArraysOnGPU.cu -o addvector\n",
206 |     "nvprof --unified-memory-profiling off ./addvector\n"
207 |    ]
208 |   },
209 |   {
210 |    "cell_type": "code",
211 |    "execution_count": null,
212 |    "metadata": {},
213 |    "outputs": [],
214 |    "source": []
215 |   },
216 |   {
217 |    "cell_type": "code",
218 |    "execution_count": null,
219 |    "metadata": {},
220 |    "outputs": [],
221 |    "source": []
222 |   }
223 |  ],
224 |  "metadata": {
225 |   "kernelspec": {
226 |    "display_name": "Python 3",
227 |    "language": "python",
228 |    "name": "python3"
229 |   },
230 |   "language_info": {
231 |    "codemirror_mode": {
232 |     "name": "ipython",
233 |     "version": 3
234 |    },
235 |    "file_extension": ".py",
236 |    "mimetype": "text/x-python",
237 |    "name": "python",
238 |    "nbconvert_exporter": "python",
239 |    "pygments_lexer": "ipython3",
240 |    "version": "3.6.5"
241 |   },
242 |   "toc": {
243 |    "base_numbering": 1,
244 |    "nav_menu": {},
245 |    "number_sections": true,
246 |    "sideBar": true,
247 |    "skip_h1_title": false,
248 |    "title_cell": "Table of Contents",
249 |    "title_sidebar": "Contents",
250 |    "toc_cell": false,
251 |    "toc_position": {},
252 |    "toc_section_display": true,
253 |    "toc_window_display": false
254 |   }
255 |  },
256 |  "nbformat": 4,
257 |  "nbformat_minor": 2
258 | }
259 | 


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/01-memory-management.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Overwriting sumArraysOnHost.c\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "%%file sumArraysOnHost.c\n",
 18 |     "\n",
 19 |     "#include <stdlib.h>\n",
 20 |     "#include <string.h>\n",
 21 |     "#include <time.h>\n",
 22 |     "\n",
 23 |     "void sumArraysOnHost(float *A, float *B, float *C, const int N){\n",
 24 |     "    for (int idx=0; idx<N; idx++){\n",
 25 |     "        C[idx] = A[idx] + B[idx];\n",
 26 |     "    }\n",
 27 |     "}\n",
 28 |     "\n",
 29 |     "\n",
 30 |     "void initialData(float *ip, int size){\n",
 31 |     "    // generate different seed for random number \n",
 32 |     "    time_t t;\n",
 33 |     "    srand((unsigned int) time (&t));\n",
 34 |     "    \n",
 35 |     "    for (int i=0; i<size; i++){\n",
 36 |     "        ip[i] = (float)(rand() & 0xFF) / 10.0f;\n",
 37 |     "    }\n",
 38 |     "}\n",
 39 |     "\n",
 40 |     "\n",
 41 |     "int main(int argc, char **argv){\n",
 42 |     "    int nElem = 1024;\n",
 43 |     "    size_t nBytes = nElem * sizeof(float);\n",
 44 |     "    \n",
 45 |     "    float *h_A, *h_B, *h_C;\n",
 46 |     "    h_A = (float *)malloc(nBytes);\n",
 47 |     "    h_B = (float *)malloc(nBytes);\n",
 48 |     "    h_C = (float *)malloc(nBytes);\n",
 49 |     "    \n",
 50 |     "    initialData(h_A, nElem);\n",
 51 |     "    initialData(h_B, nElem);\n",
 52 |     "    \n",
 53 |     "    sumArraysOnHost(h_A, h_B, h_C, nElem);\n",
 54 |     "    \n",
 55 |     "    free(h_A);\n",
 56 |     "    free(h_B);\n",
 57 |     "    free(h_C);\n",
 58 |     "    \n",
 59 |     "    return (0);\n",
 60 |     "}"
 61 |    ]
 62 |   },
 63 |   {
 64 |    "cell_type": "code",
 65 |    "execution_count": 2,
 66 |    "metadata": {},
 67 |    "outputs": [
 68 |     {
 69 |      "name": "stdout",
 70 |      "output_type": "stream",
 71 |      "text": [
 72 |       "\u001b[01m\u001b[Kcc1plus:\u001b[m\u001b[K \u001b[01;35m\u001b[Kwarning: \u001b[m\u001b[Kcommand line option ‘\u001b[01m\u001b[K-std=c99\u001b[m\u001b[K’ is valid for C/ObjC but not for C++\r\n"
 73 |      ]
 74 |     }
 75 |    ],
 76 |    "source": [
 77 |     "!nvcc -Xcompiler -std=c99 sumArraysOnHost.c -o sum"
 78 |    ]
 79 |   },
 80 |   {
 81 |    "cell_type": "code",
 82 |    "execution_count": 3,
 83 |    "metadata": {},
 84 |    "outputs": [],
 85 |    "source": [
 86 |     "!./sum"
 87 |    ]
 88 |   },
 89 |   {
 90 |    "cell_type": "code",
 91 |    "execution_count": 4,
 92 |    "metadata": {},
 93 |    "outputs": [],
 94 |    "source": [
 95 |     "!nvprof ./sum"
 96 |    ]
 97 |   },
 98 |   {
 99 |    "cell_type": "code",
100 |    "execution_count": 20,
101 |    "metadata": {},
102 |    "outputs": [
103 |     {
104 |      "name": "stdout",
105 |      "output_type": "stream",
106 |      "text": [
107 |       "Overwriting sumArraysOnDevice.cu\n"
108 |      ]
109 |     }
110 |    ],
111 |    "source": [
112 |     "%%file sumArraysOnDevice.cu\n",
113 |     "\n",
114 |     "#include <stdlib.h>\n",
115 |     "#include <string.h>\n",
116 |     "#include <time.h>\n",
117 |     "#include <stdio.h>\n",
118 |     "\n",
119 |     "__global__ void sumArraysOnDevice(float *A, float *B, float *C){\n",
120 |     "    int idx = threadIdx.x;\n",
121 |     "    C[idx] = A[idx] + B[idx];\n",
122 |     "\n",
123 |     "}\n",
124 |     "\n",
125 |     "\n",
126 |     "void initialData(float *ip, int size){\n",
127 |     "    // generate different seed for random number \n",
128 |     "    time_t t;\n",
129 |     "    srand((unsigned int) time (&t));\n",
130 |     "    \n",
131 |     "    for (int i=0; i<size; i++){\n",
132 |     "        ip[i] = (float)(rand() & 0xFF) / 10.0f;\n",
133 |     "    }\n",
134 |     "}\n",
135 |     "\n",
136 |     "\n",
137 |     "void sumArraysOnHost(float *A, float *B, float *C, const int N){\n",
138 |     "    for (int idx=0; idx<N; idx++){\n",
139 |     "        C[idx] = A[idx] + B[idx];\n",
140 |     "    }\n",
141 |     "}\n",
142 |     "\n",
143 |     "\n",
144 |     "\n",
145 |     "void checkResult(float *h_C, float *result, const int N){\n",
146 |     "    double epsilon = 1.0E-8;\n",
147 |     "    int match = 1;\n",
148 |     "    for (int i = 0; i < N; i++){\n",
149 |     "        if (abs(h_C[i] - result[i]) > epsilon){\n",
150 |     "            match = 0;\n",
151 |     "            printf(\"Arrays do not match!\\n\");\n",
152 |     "            printf(\"host %5.2f gpu %5.2f at current %d\\n\",\n",
153 |     "                   h_C[i], result[i], i);\n",
154 |     "            break;\n",
155 |     "        }\n",
156 |     "    }\n",
157 |     "    if (match) printf(\"Arrays match. \\n\\n\");\n",
158 |     "}\n",
159 |     "\n",
160 |     "\n",
161 |     "int main(int argc, char **argv){\n",
162 |     "    int nElem = 1024;\n",
163 |     "    size_t nBytes = nElem * sizeof(float);\n",
164 |     "    \n",
165 |     "    float *h_A, *h_B, *h_C, *result;\n",
166 |     "    h_A = (float *)malloc(nBytes);\n",
167 |     "    h_B = (float *)malloc(nBytes);\n",
168 |     "    h_C = (float *)malloc(nBytes);\n",
169 |     "    result = (float *)malloc(nBytes);\n",
170 |     "    \n",
171 |     "    initialData(h_A, nElem);\n",
172 |     "    initialData(h_B, nElem);\n",
173 |     "    \n",
174 |     "    float *d_A, *d_B, *d_C;\n",
175 |     "    cudaMalloc((float**)&d_A, nBytes);\n",
176 |     "    cudaMalloc((float**)&d_B, nBytes);\n",
177 |     "    cudaMalloc((float**)&d_C, nBytes);\n",
178 |     "    \n",
179 |     "    // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the\n",
180 |     "    // parameter cudaMemcpyHostToDevice specifying the transfer direction.\n",
181 |     "    \n",
182 |     "    cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice);\n",
183 |     "    cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice);\n",
184 |     "    \n",
185 |     "    \n",
186 |     "    \n",
187 |     "    sumArraysOnDevice<<<1, nElem>>>(d_A, d_B, d_C);\n",
188 |     "    sumArraysOnHost(h_A, h_B, result, nElem);\n",
189 |     "    \n",
190 |     "    cudaMemcpy(h_C, d_C, nBytes, cudaMemcpyDeviceToHost);\n",
191 |     "    \n",
192 |     "    for (int i=0; i<10; i++){\n",
193 |     "         printf(\"%f + %f = %f \\n\", h_A[i], h_B[i], h_C[i]);\n",
194 |     "\n",
195 |     "    }\n",
196 |     "    \n",
197 |     "    checkResult(h_C, result, nElem);\n",
198 |     "    \n",
199 |     "    free(h_A);\n",
200 |     "    free(h_B);\n",
201 |     "    free(h_C);\n",
202 |     "    free(result);\n",
203 |     "    \n",
204 |     "    // use cudaFree to release the memory used on the GPU\n",
205 |     "    cudaFree(d_A);\n",
206 |     "    cudaFree(d_B);\n",
207 |     "    cudaFree(d_C);\n",
208 |     "    cudaDeviceReset();\n",
209 |     "    \n",
210 |     "    return (0);\n",
211 |     "}"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 21,
217 |    "metadata": {},
218 |    "outputs": [
219 |     {
220 |      "name": "stdout",
221 |      "output_type": "stream",
222 |      "text": [
223 |       "21.600000 + 21.600000 = 43.200001 \n",
224 |       "12.200000 + 12.200000 = 24.400000 \n",
225 |       "3.300000 + 3.300000 = 6.600000 \n",
226 |       "6.400000 + 6.400000 = 12.800000 \n",
227 |       "8.600000 + 8.600000 = 17.200001 \n",
228 |       "11.400000 + 11.400000 = 22.799999 \n",
229 |       "23.299999 + 23.299999 = 46.599998 \n",
230 |       "2.700000 + 2.700000 = 5.400000 \n",
231 |       "2.600000 + 2.600000 = 5.200000 \n",
232 |       "24.100000 + 24.100000 = 48.200001 \n",
233 |       "Arrays match. \n",
234 |       "\n"
235 |      ]
236 |     }
237 |    ],
238 |    "source": [
239 |     "%%bash\n",
240 |     "nvcc sumArraysOnDevice.cu -o sumgpu\n",
241 |     "./sumgpu"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 11,
247 |    "metadata": {},
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "==12294== NVPROF is profiling process 12294, command: ./sumgpu\n",
254 |       "11.300000 + 11.300000 = 22.600000 \n",
255 |       "23.200001 + 23.200001 = 0.000000 \n",
256 |       "23.500000 + 23.500000 = 0.000000 \n",
257 |       "21.500000 + 21.500000 = 0.000000 \n",
258 |       "16.700001 + 16.700001 = 0.000000 \n",
259 |       "23.000000 + 23.000000 = 0.000000 \n",
260 |       "5.900000 + 5.900000 = 0.000000 \n",
261 |       "3.200000 + 3.200000 = 0.000000 \n",
262 |       "13.900000 + 13.900000 = 0.000000 \n",
263 |       "8.200000 + 8.200000 = 0.000000 \n",
264 |       "Arrays do not match!\n",
265 |       "host  0.00 gpu 46.40 at current 1\n",
266 |       "==12294== Profiling application: ./sumgpu\n",
267 |       "==12294== Profiling result:\n",
268 |       "            Type  Time(%)      Time     Calls       Avg       Min       Max  Name\n",
269 |       " GPU activities:   38.31%  3.7760us         1  3.7760us  3.7760us  3.7760us  sumArraysOnDevice(float*, float*, float*)\n",
270 |       "                   37.34%  3.6800us         2  1.8400us  1.8240us  1.8560us  [CUDA memcpy HtoD]\n",
271 |       "                   24.35%  2.4000us         1  2.4000us  2.4000us  2.4000us  [CUDA memcpy DtoH]\n",
272 |       "      API calls:   70.29%  116.93ms         3  38.978ms  4.1810us  116.92ms  cudaMalloc\n",
273 |       "                   29.12%  48.439ms         1  48.439ms  48.439ms  48.439ms  cudaDeviceReset\n",
274 |       "                    0.34%  564.20us        94  6.0020us     164ns  248.93us  cuDeviceGetAttribute\n",
275 |       "                    0.11%  187.10us         3  62.365us  9.9380us  159.64us  cudaFree\n",
276 |       "                    0.05%  83.126us         1  83.126us  83.126us  83.126us  cuDeviceTotalMem\n",
277 |       "                    0.03%  57.601us         1  57.601us  57.601us  57.601us  cuDeviceGetName\n",
278 |       "                    0.03%  52.785us         3  17.595us  10.601us  24.990us  cudaMemcpy\n",
279 |       "                    0.01%  24.804us         1  24.804us  24.804us  24.804us  cudaLaunch\n",
280 |       "                    0.00%  3.5350us         3  1.1780us     225ns  2.7900us  cudaSetupArgument\n",
281 |       "                    0.00%  1.5750us         2     787ns     240ns  1.3350us  cuDeviceGetCount\n",
282 |       "                    0.00%     849ns         1     849ns     849ns     849ns  cudaConfigureCall\n",
283 |       "                    0.00%     483ns         2     241ns     198ns     285ns  cuDeviceGet\n"
284 |      ]
285 |     }
286 |    ],
287 |    "source": [
288 |     "!nvprof --unified-memory-profiling off ./sumgpu"
289 |    ]
290 |   },
291 |   {
292 |    "cell_type": "code",
293 |    "execution_count": null,
294 |    "metadata": {},
295 |    "outputs": [],
296 |    "source": []
297 |   }
298 |  ],
299 |  "metadata": {
300 |   "kernelspec": {
301 |    "display_name": "Python 3",
302 |    "language": "python",
303 |    "name": "python3"
304 |   },
305 |   "language_info": {
306 |    "codemirror_mode": {
307 |     "name": "ipython",
308 |     "version": 3
309 |    },
310 |    "file_extension": ".py",
311 |    "mimetype": "text/x-python",
312 |    "name": "python",
313 |    "nbconvert_exporter": "python",
314 |    "pygments_lexer": "ipython3",
315 |    "version": "3.6.5"
316 |   },
317 |   "toc": {
318 |    "base_numbering": 1,
319 |    "nav_menu": {},
320 |    "number_sections": true,
321 |    "sideBar": true,
322 |    "skip_h1_title": false,
323 |    "title_cell": "Table of Contents",
324 |    "title_sidebar": "Contents",
325 |    "toc_cell": false,
326 |    "toc_position": {},
327 |    "toc_section_display": true,
328 |    "toc_window_display": false
329 |   }
330 |  },
331 |  "nbformat": 4,
332 |  "nbformat_minor": 2
333 | }
334 | 


--------------------------------------------------------------------------------
/cuda-c/src/01-hello_world/hello-world-from-gpu.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Hello World From GPU"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "The best way to learn a new programming language is by writing programs using the new language. In this section, we are going to write our first kernel code running on the GPU.\n",
 15 |     "\n",
 16 |     "First, let's check that the CUDA compiler is installed properly with the following command on a Linux system:"
 17 |    ]
 18 |   },
 19 |   {
 20 |    "cell_type": "code",
 21 |    "execution_count": 1,
 22 |    "metadata": {},
 23 |    "outputs": [
 24 |     {
 25 |      "name": "stdout",
 26 |      "output_type": "stream",
 27 |      "text": [
 28 |       "/usr/local/cuda/bin/nvcc\n",
 29 |       "nvcc: NVIDIA (R) Cuda compiler driver\n",
 30 |       "Copyright (c) 2005-2017 NVIDIA Corporation\n",
 31 |       "Built on Fri_Nov__3_21:07:56_CDT_2017\n",
 32 |       "Cuda compilation tools, release 9.1, V9.1.85\n"
 33 |      ]
 34 |     }
 35 |    ],
 36 |    "source": [
 37 |     "!which nvcc\n",
 38 |     "!nvcc --version"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "Let's check if a GPU accelerator card is attached in our machine:"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 2,
 51 |    "metadata": {},
 52 |    "outputs": [
 53 |     {
 54 |      "name": "stdout",
 55 |      "output_type": "stream",
 56 |      "text": [
 57 |       "crw-rw-rw- 1 root root 195,   0 Jun 26 08:27 /dev/nvidia0\r\n",
 58 |       "crw-rw-rw- 1 root root 195, 255 Jun 26 08:27 /dev/nvidiactl\r\n",
 59 |       "crw-rw-rw- 1 root root 195, 254 Jun 26 08:28 /dev/nvidia-modeset\r\n",
 60 |       "crw-rw-rw- 1 root root 240,   0 Jun 26 08:28 /dev/nvidia-uvm\r\n"
 61 |      ]
 62 |     }
 63 |    ],
 64 |    "source": [
 65 |     "!ls -l /dev/nv*"
 66 |    ]
 67 |   },
 68 |   {
 69 |    "cell_type": "markdown",
 70 |    "metadata": {},
 71 |    "source": [
 72 |     "Now we are ready to write your fi rst CUDA C code. To write a CUDA C program, we need to:\n",
 73 |     "1. Create a source code fi le with the special fi le name extension of .cu. \n",
 74 |     "2. Compile the program using the CUDA nvcc compiler.\n",
 75 |     "3. Run the executable file from the command line, which contains the kernel code executable on the GPU.\n"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 3,
 81 |    "metadata": {},
 82 |    "outputs": [
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "Overwriting hello_world_gpu.cu\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "%%file hello_world_gpu.cu \n",
 93 |     "#include <stdio.h>\n",
 94 |     "\n",
 95 |     "// The qualifier __global__ tells the compiler that the function will be called \n",
 96 |     "// from the CPU and executed on the GPU.\n",
 97 |     "\n",
 98 |     "__global__ void helloFromGPU(void)\n",
 99 |     "{\n",
100 |     "    printf(\".............Hello World from GPU!.............\\n\");\n",
101 |     "}\n",
102 |     "\n",
103 |     "int main(void){\n",
104 |     "    // hello from cpu\n",
105 |     "    printf(\"<------------Hello World from CPU!-------------->\\n\");\n",
106 |     "    \n",
107 |     "    // Launch the kernel\n",
108 |     "    // The parameters within the triple angle brackets are the execution configuration, \n",
109 |     "    // which specifi es how many threads will execute the kernel. In this example, we will run 10 GPU threads.\n",
110 |     "    helloFromGPU <<<1, 10>>>();\n",
111 |     "    \n",
112 |     "    \n",
113 |     "    // explicitly destroy and clean up all resources associated with the current\n",
114 |     "    // device in the current process\n",
115 |     "    cudaDeviceReset();\n",
116 |     "    return 0;\n",
117 |     "}"
118 |    ]
119 |   },
120 |   {
121 |    "cell_type": "code",
122 |    "execution_count": 4,
123 |    "metadata": {},
124 |    "outputs": [
125 |     {
126 |      "name": "stdout",
127 |      "output_type": "stream",
128 |      "text": [
129 |       "<------------Hello World from CPU!-------------->\n",
130 |       ".............Hello World from GPU!.............\n",
131 |       ".............Hello World from GPU!.............\n",
132 |       ".............Hello World from GPU!.............\n",
133 |       ".............Hello World from GPU!.............\n",
134 |       ".............Hello World from GPU!.............\n",
135 |       ".............Hello World from GPU!.............\n",
136 |       ".............Hello World from GPU!.............\n",
137 |       ".............Hello World from GPU!.............\n",
138 |       ".............Hello World from GPU!.............\n",
139 |       ".............Hello World from GPU!.............\n"
140 |      ]
141 |     }
142 |    ],
143 |    "source": [
144 |     "%%bash\n",
145 |     "nvcc hello_world_gpu.cu -o hello_world_gpu\n",
146 |     "./hello_world_gpu"
147 |    ]
148 |   },
149 |   {
150 |    "cell_type": "markdown",
151 |    "metadata": {},
152 |    "source": [
153 |     "## CUDA PROGRAM STRUCTURE \n",
154 |     "\n",
155 |     "A typical CUDA program structure consists of five main steps: \n",
156 |     "1. Allocate GPU memories. \n",
157 |     "2. Copy data from CPU memory to GPU memory. \n",
158 |     "3. Invoke the CUDA kernel to perform program-specifi c computation. \n",
159 |     "4. Copy data back from GPU memory to CPU memory. \n",
160 |     "5. Destroy GPU memories.\n",
161 |     "\n",
162 |     "In the simple program `hello_world_gpu.cu`, you only see the third step: Invoke the kernel. "
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "markdown",
167 |    "metadata": {},
168 |    "source": [
169 |     "- Remove the [cudaDeviceReset function](https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__DEVICE.html#group__CUDART__DEVICE_1gef69dd5c6d0206c2b8d099abac61f217)"
170 |    ]
171 |   },
172 |   {
173 |    "cell_type": "code",
174 |    "execution_count": 5,
175 |    "metadata": {},
176 |    "outputs": [
177 |     {
178 |      "name": "stdout",
179 |      "output_type": "stream",
180 |      "text": [
181 |       "Overwriting hello_world_gpu.cu\n"
182 |      ]
183 |     }
184 |    ],
185 |    "source": [
186 |     "%%file hello_world_gpu.cu \n",
187 |     "#include <stdio.h>\n",
188 |     "\n",
189 |     "__global__ void helloFromGPU(void)\n",
190 |     "{\n",
191 |     "    printf(\".............Hello World from GPU!.............\\n\");\n",
192 |     "}\n",
193 |     "\n",
194 |     "int main(void){\n",
195 |     "    // hello from cpu\n",
196 |     "    printf(\"<------------Hello World from CPU!-------------->\\n\");\n",
197 |     "    \n",
198 |     "    helloFromGPU <<<1, 10>>>();\n",
199 |     "    // explicitly destroy and clean up all resources associated with the current\n",
200 |     "    // device in the current process\n",
201 |     "    //cudaDeviceReset();\n",
202 |     "    return 0;\n",
203 |     "}"
204 |    ]
205 |   },
206 |   {
207 |    "cell_type": "code",
208 |    "execution_count": 6,
209 |    "metadata": {},
210 |    "outputs": [
211 |     {
212 |      "name": "stdout",
213 |      "output_type": "stream",
214 |      "text": [
215 |       "<------------Hello World from CPU!-------------->\n"
216 |      ]
217 |     }
218 |    ],
219 |    "source": [
220 |     "%%bash\n",
221 |     "nvcc hello_world_gpu.cu -o hello_world_gpu\n",
222 |     "./hello_world_gpu"
223 |    ]
224 |   },
225 |   {
226 |    "cell_type": "markdown",
227 |    "metadata": {},
228 |    "source": [
229 |     "- Replace the function `cudaDeviceRest` with `cudaDeviceSynchronize`"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 7,
235 |    "metadata": {},
236 |    "outputs": [
237 |     {
238 |      "name": "stdout",
239 |      "output_type": "stream",
240 |      "text": [
241 |       "Overwriting hello_world_gpu.cu\n"
242 |      ]
243 |     }
244 |    ],
245 |    "source": [
246 |     "%%file hello_world_gpu.cu \n",
247 |     "#include <stdio.h>\n",
248 |     "\n",
249 |     "__global__ void helloFromGPU(void)\n",
250 |     "{\n",
251 |     "    printf(\".............Hello World from GPU!.............\\n\");\n",
252 |     "}\n",
253 |     "\n",
254 |     "int main(void){\n",
255 |     "    // hello from cpu\n",
256 |     "    printf(\"<------------Hello World from CPU!-------------->\\n\");\n",
257 |     "    \n",
258 |     "    helloFromGPU <<<1, 10>>>();\n",
259 |     "   \n",
260 |     "    cudaDeviceSynchronize();\n",
261 |     "    return 0;\n",
262 |     "}"
263 |    ]
264 |   },
265 |   {
266 |    "cell_type": "code",
267 |    "execution_count": 8,
268 |    "metadata": {},
269 |    "outputs": [
270 |     {
271 |      "name": "stdout",
272 |      "output_type": "stream",
273 |      "text": [
274 |       "<------------Hello World from CPU!-------------->\n",
275 |       ".............Hello World from GPU!.............\n",
276 |       ".............Hello World from GPU!.............\n",
277 |       ".............Hello World from GPU!.............\n",
278 |       ".............Hello World from GPU!.............\n",
279 |       ".............Hello World from GPU!.............\n",
280 |       ".............Hello World from GPU!.............\n",
281 |       ".............Hello World from GPU!.............\n",
282 |       ".............Hello World from GPU!.............\n",
283 |       ".............Hello World from GPU!.............\n",
284 |       ".............Hello World from GPU!.............\n"
285 |      ]
286 |     }
287 |    ],
288 |    "source": [
289 |     "%%bash\n",
290 |     "nvcc hello_world_gpu.cu -o hello_world_gpu\n",
291 |     "./hello_world_gpu"
292 |    ]
293 |   },
294 |   {
295 |    "cell_type": "markdown",
296 |    "metadata": {},
297 |    "source": [
298 |     "- Each thread that executes the kernel is given a unique thread ID that is accessible within the kernel through the built-in `threadIdx.x` variable."
299 |    ]
300 |   },
301 |   {
302 |    "cell_type": "code",
303 |    "execution_count": 9,
304 |    "metadata": {},
305 |    "outputs": [
306 |     {
307 |      "name": "stdout",
308 |      "output_type": "stream",
309 |      "text": [
310 |       "Overwriting hello_world_gpu.cu\n"
311 |      ]
312 |     }
313 |    ],
314 |    "source": [
315 |     "%%file hello_world_gpu.cu \n",
316 |     "#include <stdio.h>\n",
317 |     "\n",
318 |     "__global__ void helloFromGPU(void)\n",
319 |     "{   \n",
320 |     "    if (threadIdx.x == 5)\n",
321 |     "        printf(\".............Hello World from GPU thread %d!.............\\n\", threadIdx.x);\n",
322 |     "}\n",
323 |     "\n",
324 |     "int main(void){\n",
325 |     "    // hello from cpu\n",
326 |     "    printf(\"<------------Hello World from CPU!-------------->\\n\");\n",
327 |     "    \n",
328 |     "    helloFromGPU <<<1, 10>>>();\n",
329 |     "   \n",
330 |     "    cudaDeviceSynchronize();\n",
331 |     "    return 0;\n",
332 |     "}"
333 |    ]
334 |   },
335 |   {
336 |    "cell_type": "code",
337 |    "execution_count": 10,
338 |    "metadata": {},
339 |    "outputs": [
340 |     {
341 |      "name": "stdout",
342 |      "output_type": "stream",
343 |      "text": [
344 |       "<------------Hello World from CPU!-------------->\n",
345 |       ".............Hello World from GPU thread 5!.............\n"
346 |      ]
347 |     }
348 |    ],
349 |    "source": [
350 |     "%%bash\n",
351 |     "nvcc hello_world_gpu.cu -o hello_world_gpu\n",
352 |     "./hello_world_gpu"
353 |    ]
354 |   },
355 |   {
356 |    "cell_type": "markdown",
357 |    "metadata": {},
358 |    "source": [
359 |     "## IS CUDA C PROGRAMMING DIFFICULT?\n",
360 |     "\n",
361 |     "The main difference between CPU programming and GPU programming is the level of programmer exposure to GPU architectural features. Thinking in parallel and having a basic understanding of GPU architecture enables you to write parallel programs that scale to hundreds of cores as easily as you write a sequential program.\n",
362 |     "\n",
363 |     "\n",
364 |     "If you want to write efficient code as a parallel programmer, you need a basic knowledge of CPU architectures. For example, **locality** is a very important concept in parallel programming. \n",
365 |     "- **Locality** refers to the reuse of data so as to reduce memory access latency. \n",
366 |     "\n",
367 |     "There are two basic types of reference locality:\n",
368 |     "\n",
369 |     "- Temporal locality refers to the reuse of data and/or resources within relatively small time durations.\n",
370 |     "- Spatial locality refers to the use of data elements within relatively close storage locations. \n",
371 |     "\n",
372 |     "Modern CPU architectures use large caches to optimize for applications with good spatial and temporal locality. It is the programmer’s responsibility to design their algorithm to effi ciently use CPU cache. Programmers must handle low-level cache optimizations, but have no introspection into how threads are being scheduled on the underlying architecture because the CPU does not expose that information.\n",
373 |     "CUDA exposes you to the concepts of both memory hierarchy and thread hierarchy, extending your ability to control thread execution and scheduling to a greater degree, using: \n",
374 |     "- ➤ Memory hierarchy structure\n",
375 |     "- ➤ Thread hierarchy structure\n",
376 |     "\n",
377 |     "For example, a special memory, called shared memory, is exposed by the CUDA programming model. Shared memory can be thought of as a software-managed cache, which provides great speed-up by conserving bandwidth to main memory. With shared memory, you can control the locality of your code directly.\n",
378 |     "\n",
379 |     "CUDA abstracts away the hardware details and does not require applications to be mapped to traditional graphics APIs. \n",
380 |     "At its core are three key abstractions: \n",
381 |     "- a hierarchy of thread groups, \n",
382 |     "- a hierarchy of memory groups, \n",
383 |     "- and barrier synchronization, \n",
384 |     "\n",
385 |     "which are exposed to us as a minimal set of language extensions. "
386 |    ]
387 |   }
388 |  ],
389 |  "metadata": {
390 |   "kernelspec": {
391 |    "display_name": "Python 3",
392 |    "language": "python",
393 |    "name": "python3"
394 |   },
395 |   "language_info": {
396 |    "codemirror_mode": {
397 |     "name": "ipython",
398 |     "version": 3
399 |    },
400 |    "file_extension": ".py",
401 |    "mimetype": "text/x-python",
402 |    "name": "python",
403 |    "nbconvert_exporter": "python",
404 |    "pygments_lexer": "ipython3",
405 |    "version": "3.6.5"
406 |   },
407 |   "toc": {
408 |    "base_numbering": 1,
409 |    "nav_menu": {},
410 |    "number_sections": false,
411 |    "sideBar": true,
412 |    "skip_h1_title": false,
413 |    "title_cell": "Table of Contents",
414 |    "title_sidebar": "Contents",
415 |    "toc_cell": false,
416 |    "toc_position": {},
417 |    "toc_section_display": true,
418 |    "toc_window_display": false
419 |   }
420 |  },
421 |  "nbformat": 4,
422 |  "nbformat_minor": 2
423 | }
424 | 


--------------------------------------------------------------------------------
/getting_started_on_colab.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "getting-started-on-colab.ipynb",
  7 |       "version": "0.3.2",
  8 |       "provenance": [],
  9 |       "include_colab_link": true
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     },
 15 |     "accelerator": "GPU"
 16 |   },
 17 |   "cells": [
 18 |     {
 19 |       "cell_type": "markdown",
 20 |       "metadata": {
 21 |         "id": "view-in-github",
 22 |         "colab_type": "text"
 23 |       },
 24 |       "source": [
 25 |         "[View in Colaboratory](https://colab.research.google.com/github/andersy005/cuda-programming/blob/master/getting_started_on_colab.ipynb)"
 26 |       ]
 27 |     },
 28 |     {
 29 |       "metadata": {
 30 |         "id": "Azf91OtajTab",
 31 |         "colab_type": "code",
 32 |         "colab": {
 33 |           "base_uri": "https://localhost:8080/",
 34 |           "height": 34
 35 |         },
 36 |         "outputId": "dfe6f1dc-edac-466c-95ba-b62736d8d3e2"
 37 |       },
 38 |       "cell_type": "code",
 39 |       "source": [
 40 |         "!apt update -qq"
 41 |       ],
 42 |       "execution_count": 1,
 43 |       "outputs": [
 44 |         {
 45 |           "output_type": "stream",
 46 |           "text": [
 47 |             "6 packages can be upgraded. Run 'apt list --upgradable' to see them.\r\n"
 48 |           ],
 49 |           "name": "stdout"
 50 |         }
 51 |       ]
 52 |     },
 53 |     {
 54 |       "metadata": {
 55 |         "id": "PKBV7iXgjdfJ",
 56 |         "colab_type": "code",
 57 |         "colab": {
 58 |           "base_uri": "https://localhost:8080/",
 59 |           "height": 309
 60 |         },
 61 |         "outputId": "5e307c44-8abe-40f3-947f-48d7cf2f41b2"
 62 |       },
 63 |       "cell_type": "code",
 64 |       "source": [
 65 |         "!wget https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb"
 66 |       ],
 67 |       "execution_count": 2,
 68 |       "outputs": [
 69 |         {
 70 |           "output_type": "stream",
 71 |           "text": [
 72 |             "--2018-06-27 18:46:54--  https://developer.nvidia.com/compute/cuda/8.0/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb\r\n",
 73 |             "Resolving developer.nvidia.com (developer.nvidia.com)... 192.229.162.216\r\n",
 74 |             "Connecting to developer.nvidia.com (developer.nvidia.com)|192.229.162.216|:443... connected.\n",
 75 |             "HTTP request sent, awaiting response... 302 Found\n",
 76 |             "Location: https://developer.download.nvidia.com/compute/cuda/8.0/secure/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb?Oy-tuxd8APgk75C-6ni3GsZYC8MKSx8lCk8BAhMEzptmEKWqiBU80Z9TWX4lJfCr-9n4M6xR8eAQcu5bANJUkw92M88T3sQSG2Q5CzCeAhG3ye37lu2a4s6ej_RdyKJ5nHPAmPPd3wAoF-hVGKyZghC3EKpAvO4xKIEOrqItL1bQbfbUalWFWE6JB5e5i2kZ38Qeu_Hz2HpSo1htmYiBXUHogg [following]\n",
 77 |             "--2018-06-27 18:46:54--  https://developer.download.nvidia.com/compute/cuda/8.0/secure/Prod2/local_installers/cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64.deb?Oy-tuxd8APgk75C-6ni3GsZYC8MKSx8lCk8BAhMEzptmEKWqiBU80Z9TWX4lJfCr-9n4M6xR8eAQcu5bANJUkw92M88T3sQSG2Q5CzCeAhG3ye37lu2a4s6ej_RdyKJ5nHPAmPPd3wAoF-hVGKyZghC3EKpAvO4xKIEOrqItL1bQbfbUalWFWE6JB5e5i2kZ38Qeu_Hz2HpSo1htmYiBXUHogg\n",
 78 |             "Resolving developer.download.nvidia.com (developer.download.nvidia.com)... 192.229.211.70, 2606:2800:21f:3aa:dcf:37b:1ed6:1fb\n",
 79 |             "Connecting to developer.download.nvidia.com (developer.download.nvidia.com)|192.229.211.70|:443... connected.\n",
 80 |             "HTTP request sent, awaiting response... 200 OK\n",
 81 |             "Length: 1913589814 (1.8G) [application/x-deb]\n",
 82 |             "Saving to: ‘cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb.4’\n",
 83 |             "\n"
 84 |           ],
 85 |           "name": "stdout"
 86 |         },
 87 |         {
 88 |           "output_type": "stream",
 89 |           "text": [
 90 |             "deb.4                99%[==================> ]   1.77G   157MB/s    eta 0s     \rcuda-repo-ubuntu160 100%[===================>]   1.78G   155MB/s    in 13s     \r\n",
 91 |             "\r\n",
 92 |             "2018-06-27 18:47:07 (139 MB/s) - ‘cuda-repo-ubuntu1604-8-0-local-ga2_8.0.61-1_amd64-deb.4’ saved [1913589814/1913589814]\r\n",
 93 |             "\r\n"
 94 |           ],
 95 |           "name": "stdout"
 96 |         }
 97 |       ]
 98 |     },
 99 |     {
100 |       "metadata": {
101 |         "id": "vrfKbLekdPDO",
102 |         "colab_type": "code",
103 |         "colab": {}
104 |       },
105 |       "cell_type": "code",
106 |       "source": [
107 |         "!dpkg -i cuda-repo-ubuntu1604–8–0-local-ga2_8.0.61–1_amd64-deb 2> /dev/null"
108 |       ],
109 |       "execution_count": 0,
110 |       "outputs": []
111 |     },
112 |     {
113 |       "metadata": {
114 |         "id": "A9Lo4FoydC7q",
115 |         "colab_type": "code",
116 |         "colab": {
117 |           "base_uri": "https://localhost:8080/",
118 |           "height": 34
119 |         },
120 |         "outputId": "a12cf818-1adf-40a5-8a4c-1f1e2dfd5423"
121 |       },
122 |       "cell_type": "code",
123 |       "source": [
124 |         "!apt-key add /var/cuda-repo-8-0-local-ga2/7fa2af80.pub"
125 |       ],
126 |       "execution_count": 4,
127 |       "outputs": [
128 |         {
129 |           "output_type": "stream",
130 |           "text": [
131 |             "OK\r\n"
132 |           ],
133 |           "name": "stdout"
134 |         }
135 |       ]
136 |     },
137 |     {
138 |       "metadata": {
139 |         "id": "vuIFPr-leAFV",
140 |         "colab_type": "code",
141 |         "colab": {}
142 |       },
143 |       "cell_type": "code",
144 |       "source": [
145 |         "!apt-get update -qq"
146 |       ],
147 |       "execution_count": 0,
148 |       "outputs": []
149 |     },
150 |     {
151 |       "metadata": {
152 |         "id": "5LCW1tnxj-pk",
153 |         "colab_type": "code",
154 |         "colab": {
155 |           "base_uri": "https://localhost:8080/",
156 |           "height": 85
157 |         },
158 |         "outputId": "76f7e7ae-1e54-41b1-c877-6b528a691ade"
159 |       },
160 |       "cell_type": "code",
161 |       "source": [
162 |         "!apt --fix-broken install\n",
163 |         "!apt-get install cuda gcc-5 g++-5 -y -qq;"
164 |       ],
165 |       "execution_count": 6,
166 |       "outputs": [
167 |         {
168 |           "output_type": "stream",
169 |           "text": [
170 |             "Reading package lists... Done\n",
171 |             "Building dependency tree       \n",
172 |             "Reading state information... Done\n",
173 |             "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n"
174 |           ],
175 |           "name": "stdout"
176 |         }
177 |       ]
178 |     },
179 |     {
180 |       "metadata": {
181 |         "id": "zoqglXEBj-02",
182 |         "colab_type": "code",
183 |         "colab": {
184 |           "base_uri": "https://localhost:8080/",
185 |           "height": 170
186 |         },
187 |         "outputId": "3078ead4-f9d8-446c-dc8f-c4774369992a"
188 |       },
189 |       "cell_type": "code",
190 |       "source": [
191 |         "!ln -s /usr/bin/gcc-5 /usr/local/cuda/bin/gcc;\n",
192 |         "!ln -s /usr/bin/g++-5 /usr/local/cuda/bin/g++;\n",
193 |         "!apt install cuda-8.0;"
194 |       ],
195 |       "execution_count": 7,
196 |       "outputs": [
197 |         {
198 |           "output_type": "stream",
199 |           "text": [
200 |             "Reading package lists... Done\n",
201 |             "Building dependency tree       \n",
202 |             "Reading state information... Done\n",
203 |             "Note, selecting 'cuda-8-0' for regex 'cuda-8.0'\n",
204 |             "Note, selecting 'libcuda-8.0-1' for regex 'cuda-8.0'\n",
205 |             "Note, selecting 'libcuda1-384' instead of 'libcuda-8.0-1'\n",
206 |             "libcuda1-384 is already the newest version (384.130-0ubuntu0.17.10.1).\n",
207 |             "cuda-8-0 is already the newest version (8.0.61-1).\n",
208 |             "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n"
209 |           ],
210 |           "name": "stdout"
211 |         }
212 |       ]
213 |     },
214 |     {
215 |       "metadata": {
216 |         "id": "IzaD9e62epuL",
217 |         "colab_type": "code",
218 |         "colab": {}
219 |       },
220 |       "cell_type": "code",
221 |       "source": [
222 |         "import os\n",
223 |         "os.environ['PATH'] += ':/usr/local/cuda/bin'"
224 |       ],
225 |       "execution_count": 0,
226 |       "outputs": []
227 |     },
228 |     {
229 |       "metadata": {
230 |         "id": "ayx2gqk8iICV",
231 |         "colab_type": "code",
232 |         "colab": {
233 |           "base_uri": "https://localhost:8080/",
234 |           "height": 119
235 |         },
236 |         "outputId": "da46bca7-2c02-4b64-9363-31e7efe437df"
237 |       },
238 |       "cell_type": "code",
239 |       "source": [
240 |         "!apt install gcc-5 g++-5 -y"
241 |       ],
242 |       "execution_count": 9,
243 |       "outputs": [
244 |         {
245 |           "output_type": "stream",
246 |           "text": [
247 |             "Reading package lists... Done\n",
248 |             "Building dependency tree       \n",
249 |             "Reading state information... Done\n",
250 |             "gcc-5 is already the newest version (5.5.0-1ubuntu2).\n",
251 |             "g++-5 is already the newest version (5.5.0-1ubuntu2).\n",
252 |             "0 upgraded, 0 newly installed, 0 to remove and 6 not upgraded.\n"
253 |           ],
254 |           "name": "stdout"
255 |         }
256 |       ]
257 |     },
258 |     {
259 |       "metadata": {
260 |         "id": "MFe_6wdBiLcy",
261 |         "colab_type": "code",
262 |         "colab": {}
263 |       },
264 |       "cell_type": "code",
265 |       "source": [
266 |         "import os\n",
267 |         "os.environ['PATH'] += ':/usr/local/cuda/bin'"
268 |       ],
269 |       "execution_count": 0,
270 |       "outputs": []
271 |     },
272 |     {
273 |       "metadata": {
274 |         "id": "1UT9EIGZiN12",
275 |         "colab_type": "code",
276 |         "colab": {
277 |           "base_uri": "https://localhost:8080/",
278 |           "height": 85
279 |         },
280 |         "outputId": "8935d529-d8bf-4063-f95f-18e86ef49e22"
281 |       },
282 |       "cell_type": "code",
283 |       "source": [
284 |         "!nvcc --version"
285 |       ],
286 |       "execution_count": 11,
287 |       "outputs": [
288 |         {
289 |           "output_type": "stream",
290 |           "text": [
291 |             "nvcc: NVIDIA (R) Cuda compiler driver\r\n",
292 |             "Copyright (c) 2005-2016 NVIDIA Corporation\r\n",
293 |             "Built on Tue_Jan_10_13:22:03_CST_2017\r\n",
294 |             "Cuda compilation tools, release 8.0, V8.0.61\r\n"
295 |           ],
296 |           "name": "stdout"
297 |         }
298 |       ]
299 |     },
300 |     {
301 |       "metadata": {
302 |         "id": "zjr4TsIFk21Z",
303 |         "colab_type": "code",
304 |         "colab": {
305 |           "base_uri": "https://localhost:8080/",
306 |           "height": 34
307 |         },
308 |         "outputId": "a4d6d568-c783-408b-ea2d-fdda4b2139b1"
309 |       },
310 |       "cell_type": "code",
311 |       "source": [
312 |         "%%file version.cu\n",
313 |         "#include <thrust/version.h>\n",
314 |         "#include <iostream>\n",
315 |         "\n",
316 |         "int main(void)\n",
317 |         "{\n",
318 |         "  int major = THRUST_MAJOR_VERSION;\n",
319 |         "  int minor = THRUST_MINOR_VERSION;\n",
320 |         "\n",
321 |         "  std::cout << \"Thrust v\" << major << \".\" << minor << std::endl;\n",
322 |         "\n",
323 |         "  return 0;\n",
324 |         "}"
325 |       ],
326 |       "execution_count": 12,
327 |       "outputs": [
328 |         {
329 |           "output_type": "stream",
330 |           "text": [
331 |             "Writing version.cu\n"
332 |           ],
333 |           "name": "stdout"
334 |         }
335 |       ]
336 |     },
337 |     {
338 |       "metadata": {
339 |         "id": "7lVHw-Ezlf_L",
340 |         "colab_type": "code",
341 |         "colab": {
342 |           "base_uri": "https://localhost:8080/",
343 |           "height": 51
344 |         },
345 |         "outputId": "ae831aac-1584-4e94-ff77-ed686be5346e"
346 |       },
347 |       "cell_type": "code",
348 |       "source": [
349 |         "!nvcc version.cu -o version\n",
350 |         "!./version"
351 |       ],
352 |       "execution_count": 13,
353 |       "outputs": [
354 |         {
355 |           "output_type": "stream",
356 |           "text": [
357 |             "nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\n",
358 |             "Thrust v1.8\n"
359 |           ],
360 |           "name": "stdout"
361 |         }
362 |       ]
363 |     },
364 |     {
365 |       "metadata": {
366 |         "id": "DXERV5wRlqX3",
367 |         "colab_type": "code",
368 |         "colab": {
369 |           "base_uri": "https://localhost:8080/",
370 |           "height": 34
371 |         },
372 |         "outputId": "619b324c-c970-467c-9dc1-087a1490670b"
373 |       },
374 |       "cell_type": "code",
375 |       "source": [
376 |         "%%file thrust_example.cu\n",
377 |         "#include <thrust/host_vector.h>\n",
378 |         "#include <thrust/device_vector.h>\n",
379 |         "#include <thrust/generate.h>\n",
380 |         "#include <thrust/sort.h>\n",
381 |         "#include <thrust/copy.h>\n",
382 |         "#include <algorithm>\n",
383 |         "#include <cstdlib>\n",
384 |         "\n",
385 |         "int main(void)\n",
386 |         "{\n",
387 |         "  // generate 32M random numbers serially\n",
388 |         "  thrust::host_vector<int> h_vec(32 << 20);\n",
389 |         "  std::generate(h_vec.begin(), h_vec.end(), rand);\n",
390 |         "\n",
391 |         "  // transfer data to the device\n",
392 |         "  thrust::device_vector<int> d_vec = h_vec;\n",
393 |         "\n",
394 |         "  // sort data on the device \n",
395 |         "  thrust::sort(d_vec.begin(), d_vec.end());\n",
396 |         "\n",
397 |         "  // transfer data back to host\n",
398 |         "  thrust::copy(d_vec.begin(), d_vec.end(), h_vec.begin());\n",
399 |         "\n",
400 |         "  return 0;\n",
401 |         "}"
402 |       ],
403 |       "execution_count": 14,
404 |       "outputs": [
405 |         {
406 |           "output_type": "stream",
407 |           "text": [
408 |             "Writing thrust_example.cu\n"
409 |           ],
410 |           "name": "stdout"
411 |         }
412 |       ]
413 |     },
414 |     {
415 |       "metadata": {
416 |         "id": "PSqPaE0amBwB",
417 |         "colab_type": "code",
418 |         "colab": {
419 |           "base_uri": "https://localhost:8080/",
420 |           "height": 34
421 |         },
422 |         "outputId": "4be9eef0-eee1-4c04-b916-1b4559c0894b"
423 |       },
424 |       "cell_type": "code",
425 |       "source": [
426 |         "!nvcc thrust_example.cu -o thrust_example\n",
427 |         "!./thrust_example\n"
428 |       ],
429 |       "execution_count": 15,
430 |       "outputs": [
431 |         {
432 |           "output_type": "stream",
433 |           "text": [
434 |             "nvcc warning : The 'compute_20', 'sm_20', and 'sm_21' architectures are deprecated, and may be removed in a future release (Use -Wno-deprecated-gpu-targets to suppress warning).\r\n"
435 |           ],
436 |           "name": "stdout"
437 |         }
438 |       ]
439 |     },
440 |     {
441 |       "metadata": {
442 |         "id": "v6gCqNHwmJp1",
443 |         "colab_type": "code",
444 |         "colab": {
445 |           "base_uri": "https://localhost:8080/",
446 |           "height": 581
447 |         },
448 |         "outputId": "65f32a73-9ad6-4b27-cb72-c1aa5a3fec9f"
449 |       },
450 |       "cell_type": "code",
451 |       "source": [
452 |         "!nvprof ./thrust_example"
453 |       ],
454 |       "execution_count": 16,
455 |       "outputs": [
456 |         {
457 |           "output_type": "stream",
458 |           "text": [
459 |             "==1986== NVPROF is profiling process 1986, command: ./thrust_example\n",
460 |             "==1986== Profiling application: ./thrust_example\n",
461 |             "==1986== Profiling result:\n",
462 |             "Time(%)      Time     Calls       Avg       Min       Max  Name\n",
463 |             " 30.98%  26.865ms         1  26.865ms  26.865ms  26.865ms  [CUDA memcpy HtoD]\n",
464 |             " 22.54%  19.542ms         4  4.8854ms  3.8770ms  5.2403ms  void thrust::system::cuda::detail::cub_::DeviceRadixSortDownsweepKernel<thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxDownsweepPolicy, bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>(thrust::system::cuda::detail::cub_::NullType*, thrust::system::cuda::detail::cub_::NullType, int*, int, thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxDownsweepPolicy*, int*, int, int, bool, bool, thrust::system::cuda::detail::cub_::GridEvenShare<int*>)\n",
465 |             " 21.37%  18.534ms         1  18.534ms  18.534ms  18.534ms  [CUDA memcpy DtoH]\n",
466 |             " 13.21%  11.459ms         3  3.8197ms  3.7991ms  3.8498ms  void thrust::system::cuda::detail::cub_::DeviceRadixSortDownsweepKernel<thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxAltDownsweepPolicy, bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>(thrust::system::cuda::detail::cub_::NullType*, thrust::system::cuda::detail::cub_::NullType, int*, int, thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxAltDownsweepPolicy*, int*, int, int, bool, bool, thrust::system::cuda::detail::cub_::GridEvenShare<int*>)\n",
467 |             "  5.08%  4.4069ms         4  1.1017ms  1.0984ms  1.1063ms  void thrust::system::cuda::detail::cub_::DeviceRadixSortUpsweepKernel<thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxUpsweepPolicy, bool=0, int, int>(thrust::system::cuda::detail::cub_::NullType*, int*, thrust::system::cuda::detail::cub_::NullType*, int, int, bool, thrust::system::cuda::detail::cub_::GridEvenShare<thrust::system::cuda::detail::cub_::NullType*>)\n",
468 |             "  4.12%  3.5706ms         3  1.1902ms  1.1881ms  1.1933ms  void thrust::system::cuda::detail::cub_::DeviceRadixSortUpsweepKernel<thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxAltUpsweepPolicy, bool=0, int, int>(thrust::system::cuda::detail::cub_::NullType*, int*, thrust::system::cuda::detail::cub_::NullType*, int, int, bool, thrust::system::cuda::detail::cub_::GridEvenShare<thrust::system::cuda::detail::cub_::NullType*>)\n",
469 |             "  2.35%  2.0400ms         1  2.0400ms  2.0400ms  2.0400ms  [CUDA memcpy DtoD]\n",
470 |             "  0.34%  298.43us         7  42.632us  32.160us  50.464us  void thrust::system::cuda::detail::cub_::RadixSortScanBinsKernel<thrust::system::cuda::detail::cub_::DeviceRadixSortDispatch<bool=0, int, thrust::system::cuda::detail::cub_::NullType, int>::PtxScanPolicy, int>(int*, int)\n",
471 |             "\n",
472 |             "==1986== API calls:\n",
473 |             "Time(%)      Time     Calls       Avg       Min       Max  Name\n",
474 |             " 68.70%  197.26ms         2  98.628ms  481.23us  196.77ms  cudaMalloc\n",
475 |             " 15.94%  45.770ms         3  15.257ms  34.923us  26.979ms  cudaMemcpyAsync\n",
476 |             " 14.52%  41.688ms         2  20.844ms  515.99us  41.172ms  cudaFree\n",
477 |             "  0.28%  794.29us        91  8.7280us  2.6300us  267.81us  cuDeviceGetAttribute\n",
478 |             "  0.18%  512.13us        21  24.387us  18.936us  61.082us  cudaLaunch\n",
479 |             "  0.17%  485.34us        10  48.534us  44.333us  68.555us  cudaFuncGetAttributes\n",
480 |             "  0.11%  316.36us         1  316.36us  316.36us  316.36us  cuDeviceTotalMem\n",
481 |             "  0.05%  135.60us         2  67.798us  24.341us  111.26us  cudaStreamSynchronize\n",
482 |             "  0.01%  37.730us         1  37.730us  37.730us  37.730us  cuDeviceGetName\n",
483 |             "  0.01%  33.288us         6  5.5480us  5.3300us  6.3190us  cudaDeviceGetAttribute\n",
484 |             "  0.01%  31.534us       140     225ns     178ns  1.0860us  cudaSetupArgument\n",
485 |             "  0.01%  17.206us         2  8.6030us  8.4720us  8.7340us  cudaDeviceGetSharedMemConfig\n",
486 |             "  0.00%  12.342us         2  6.1710us  5.9050us  6.4370us  cudaGetDevice\n",
487 |             "  0.00%  11.925us         3  3.9750us  2.7300us  5.2650us  cuDeviceGetCount\n",
488 |             "  0.00%  10.324us         3  3.4410us  2.9730us  4.2770us  cuDeviceGet\n",
489 |             "  0.00%  8.4490us        21     402ns     321ns     926ns  cudaPeekAtLastError\n",
490 |             "  0.00%  6.8050us        21     324ns     203ns  2.0050us  cudaConfigureCall\n"
491 |           ],
492 |           "name": "stdout"
493 |         }
494 |       ]
495 |     },
496 |     {
497 |       "metadata": {
498 |         "id": "nKymLb9hnFIZ",
499 |         "colab_type": "code",
500 |         "colab": {}
501 |       },
502 |       "cell_type": "code",
503 |       "source": [
504 |         ""
505 |       ],
506 |       "execution_count": 0,
507 |       "outputs": []
508 |     }
509 |   ]
510 | }


--------------------------------------------------------------------------------
/cuda-c/src/utils/GL/glut.h:
--------------------------------------------------------------------------------
  1 | #ifndef __glut_h__
  2 | #define __glut_h__
  3 | 
  4 | /* Copyright (c) Mark J. Kilgard, 1994, 1995, 1996, 1998. */
  5 | 
  6 | /* This program is freely distributable without licensing fees  and is
  7 |    provided without guarantee or warrantee expressed or  implied. This
  8 |    program is -not- in the public domain. */
  9 | 
 10 | #if defined(_WIN32)
 11 | 
 12 | /* GLUT 3.7 now tries to avoid including <windows.h>
 13 |    to avoid name space pollution, but Win32's <GL/gl.h> 
 14 |    needs APIENTRY and WINGDIAPI defined properly. */
 15 | # if 0
 16 |    /* This would put tons of macros and crap in our clean name space. */
 17 | #  define  WIN32_LEAN_AND_MEAN
 18 | #  include <windows.h>
 19 | # else
 20 |    /* XXX This is from Win32's <windef.h> */
 21 | #  ifndef APIENTRY
 22 | #   define GLUT_APIENTRY_DEFINED
 23 | #   if (_MSC_VER >= 800) || defined(_STDCALL_SUPPORTED) || defined(__BORLANDC__) || defined(__LCC__)
 24 | #    define APIENTRY    __stdcall
 25 | #   else
 26 | #    define APIENTRY
 27 | #   endif
 28 | #  endif
 29 |    /* XXX This is from Win32's <winnt.h> */
 30 | #  ifndef CALLBACK
 31 | #   if (defined(_M_MRX000) || defined(_M_IX86) || defined(_M_ALPHA) || defined(_M_PPC)) && !defined(MIDL_PASS) || defined(__LCC__)
 32 | #    define CALLBACK __stdcall
 33 | #   else
 34 | #    define CALLBACK
 35 | #   endif
 36 | #  endif
 37 |    /* XXX Hack for lcc compiler.  It doesn't support __declspec(dllimport), just __stdcall. */
 38 | #  if defined( __LCC__ )
 39 | #   undef WINGDIAPI
 40 | #   define WINGDIAPI __stdcall
 41 | #  else
 42 |    /* XXX This is from Win32's <wingdi.h> and <winnt.h> */
 43 | #   ifndef WINGDIAPI
 44 | #    define GLUT_WINGDIAPI_DEFINED
 45 | #    define WINGDIAPI __declspec(dllimport)
 46 | #   endif
 47 | #  endif
 48 |    /* XXX This is from Win32's <ctype.h> */
 49 | #  ifndef _WCHAR_T_DEFINED
 50 | typedef unsigned short wchar_t;
 51 | #   define _WCHAR_T_DEFINED
 52 | #  endif
 53 | # endif
 54 | 
 55 | /* To disable automatic library usage for GLUT, define GLUT_NO_LIB_PRAGMA
 56 |    in your compile preprocessor options. */
 57 | # if !defined(GLUT_BUILDING_LIB) && !defined(GLUT_NO_LIB_PRAGMA)
 58 | #  pragma comment (lib, "winmm.lib")      /* link with Windows MultiMedia lib */
 59 | /* To enable automatic SGI OpenGL for Windows library usage for GLUT,
 60 |    define GLUT_USE_SGI_OPENGL in your compile preprocessor options.  */
 61 | #  ifdef GLUT_USE_SGI_OPENGL
 62 | #   pragma comment (lib, "opengl.lib")    /* link with SGI OpenGL for Windows lib */
 63 | #   pragma comment (lib, "glu.lib")       /* link with SGI OpenGL Utility lib */
 64 | #   pragma comment (lib, "glut.lib")      /* link with Win32 GLUT for SGI OpenGL lib */
 65 | #  else
 66 | #   pragma comment (lib, "opengl32.lib")  /* link with Microsoft OpenGL lib */
 67 | #   pragma comment (lib, "glu32.lib")     /* link with Microsoft OpenGL Utility lib */
 68 | #   pragma comment (lib, "glut32.lib")    /* link with Win32 GLUT lib */
 69 | #  endif
 70 | # endif
 71 | 
 72 | /* To disable supression of annoying warnings about floats being promoted
 73 |    to doubles, define GLUT_NO_WARNING_DISABLE in your compile preprocessor
 74 |    options. */
 75 | # ifndef GLUT_NO_WARNING_DISABLE
 76 | #  pragma warning (disable:4244)  /* Disable bogus VC++ 4.2 conversion warnings. */
 77 | #  pragma warning (disable:4305)  /* VC++ 5.0 version of above warning. */
 78 | # endif
 79 | 
 80 | /* Win32 has an annoying issue where there are multiple C run-time
 81 |    libraries (CRTs).  If the executable is linked with a different CRT
 82 |    from the GLUT DLL, the GLUT DLL will not share the same CRT static
 83 |    data seen by the executable.  In particular, atexit callbacks registered
 84 |    in the executable will not be called if GLUT calls its (different)
 85 |    exit routine).  GLUT is typically built with the
 86 |    "/MD" option (the CRT with multithreading DLL support), but the Visual
 87 |    C++ linker default is "/ML" (the single threaded CRT).
 88 | 
 89 |    One workaround to this issue is requiring users to always link with
 90 |    the same CRT as GLUT is compiled with.  That requires users supply a
 91 |    non-standard option.  GLUT 3.7 has its own built-in workaround where
 92 |    the executable's "exit" function pointer is covertly passed to GLUT.
 93 |    GLUT then calls the executable's exit function pointer to ensure that
 94 |    any "atexit" calls registered by the application are called if GLUT
 95 |    needs to exit.
 96 | 
 97 |    Note that the __glut*WithExit routines should NEVER be called directly.
 98 |    To avoid the atexit workaround, #define GLUT_DISABLE_ATEXIT_HACK. */
 99 | 
100 | /* XXX This is from Win32's <process.h> */
101 | # if !defined(_MSC_VER) && !defined(__cdecl)
102 |    /* Define __cdecl for non-Microsoft compilers. */
103 | #  define __cdecl
104 | #  define GLUT_DEFINED___CDECL
105 | # endif
106 | # ifndef _CRTIMP
107 | #  ifdef _NTSDK
108 |     /* Definition compatible with NT SDK */
109 | #   define _CRTIMP
110 | #  else
111 |     /* Current definition */
112 | #   ifdef _DLL
113 | #    define _CRTIMP __declspec(dllimport)
114 | #   else
115 | #    define _CRTIMP
116 | #   endif
117 | #  endif
118 | #  define GLUT_DEFINED__CRTIMP
119 | # endif
120 | 
121 | /* GLUT API entry point declarations for Win32. */
122 | # ifdef GLUT_BUILDING_LIB
123 | #  define GLUTAPI __declspec(dllexport)
124 | # else
125 | #  ifdef _DLL
126 | #   define GLUTAPI __declspec(dllimport)
127 | #  else
128 | #   define GLUTAPI extern
129 | #  endif
130 | # endif
131 | 
132 | /* GLUT callback calling convention for Win32. */
133 | # define GLUTCALLBACK __cdecl
134 | 
135 | #endif  /* _WIN32 */
136 | 
137 | #include <GL/gl.h>
138 | #include <GL/glu.h>
139 | 
140 | #ifdef __cplusplus
141 | extern "C" {
142 | #endif
143 | 
144 | #if defined(_WIN32)
145 | # ifndef GLUT_BUILDING_LIB
146 | extern _CRTIMP void __cdecl exit(int);
147 | # endif
148 | #else
149 | /* non-Win32 case. */
150 | /* Define APIENTRY and CALLBACK to nothing if we aren't on Win32. */
151 | # define APIENTRY
152 | # define GLUT_APIENTRY_DEFINED
153 | # define CALLBACK
154 | /* Define GLUTAPI and GLUTCALLBACK as below if we aren't on Win32. */
155 | # define GLUTAPI extern
156 | # define GLUTCALLBACK
157 | /* Prototype exit for the non-Win32 case (see above). */
158 | extern void exit(int);
159 | #endif
160 | 
161 | /**
162 |  GLUT API revision history:
163 |  
164 |  GLUT_API_VERSION is updated to reflect incompatible GLUT
165 |  API changes (interface changes, semantic changes, deletions,
166 |  or additions).
167 |  
168 |  GLUT_API_VERSION=1  First public release of GLUT.  11/29/94
169 | 
170 |  GLUT_API_VERSION=2  Added support for OpenGL/GLX multisampling,
171 |  extension.  Supports new input devices like tablet, dial and button
172 |  box, and Spaceball.  Easy to query OpenGL extensions.
173 | 
174 |  GLUT_API_VERSION=3  glutMenuStatus added.
175 | 
176 |  GLUT_API_VERSION=4  glutInitDisplayString, glutWarpPointer,
177 |  glutBitmapLength, glutStrokeLength, glutWindowStatusFunc, dynamic
178 |  video resize subAPI, glutPostWindowRedisplay, glutKeyboardUpFunc,
179 |  glutSpecialUpFunc, glutIgnoreKeyRepeat, glutSetKeyRepeat,
180 |  glutJoystickFunc, glutForceJoystickFunc (NOT FINALIZED!).
181 | **/
182 | #ifndef GLUT_API_VERSION  /* allow this to be overriden */
183 | #define GLUT_API_VERSION		3
184 | #endif
185 | 
186 | /**
187 |  GLUT implementation revision history:
188 |  
189 |  GLUT_XLIB_IMPLEMENTATION is updated to reflect both GLUT
190 |  API revisions and implementation revisions (ie, bug fixes).
191 | 
192 |  GLUT_XLIB_IMPLEMENTATION=1  mjk's first public release of
193 |  GLUT Xlib-based implementation.  11/29/94
194 | 
195 |  GLUT_XLIB_IMPLEMENTATION=2  mjk's second public release of
196 |  GLUT Xlib-based implementation providing GLUT version 2 
197 |  interfaces.
198 | 
199 |  GLUT_XLIB_IMPLEMENTATION=3  mjk's GLUT 2.2 images. 4/17/95
200 | 
201 |  GLUT_XLIB_IMPLEMENTATION=4  mjk's GLUT 2.3 images. 6/?/95
202 | 
203 |  GLUT_XLIB_IMPLEMENTATION=5  mjk's GLUT 3.0 images. 10/?/95
204 | 
205 |  GLUT_XLIB_IMPLEMENTATION=7  mjk's GLUT 3.1+ with glutWarpPoitner.  7/24/96
206 | 
207 |  GLUT_XLIB_IMPLEMENTATION=8  mjk's GLUT 3.1+ with glutWarpPoitner
208 |  and video resize.  1/3/97
209 | 
210 |  GLUT_XLIB_IMPLEMENTATION=9 mjk's GLUT 3.4 release with early GLUT 4 routines.
211 | 
212 |  GLUT_XLIB_IMPLEMENTATION=11 Mesa 2.5's GLUT 3.6 release.
213 | 
214 |  GLUT_XLIB_IMPLEMENTATION=12 mjk's GLUT 3.6 release with early GLUT 4 routines + signal handling.
215 | 
216 |  GLUT_XLIB_IMPLEMENTATION=13 mjk's GLUT 3.7 beta with GameGLUT support.
217 | 
218 |  GLUT_XLIB_IMPLEMENTATION=14 mjk's GLUT 3.7 beta with f90gl friend interface.
219 | 
220 |  GLUT_XLIB_IMPLEMENTATION=15 mjk's GLUT 3.7 beta sync'ed with Mesa <GL/glut.h>
221 | **/
222 | #ifndef GLUT_XLIB_IMPLEMENTATION  /* Allow this to be overriden. */
223 | #define GLUT_XLIB_IMPLEMENTATION	15
224 | #endif
225 | 
226 | /* Display mode bit masks. */
227 | #define GLUT_RGB			0
228 | #define GLUT_RGBA			GLUT_RGB
229 | #define GLUT_INDEX			1
230 | #define GLUT_SINGLE			0
231 | #define GLUT_DOUBLE			2
232 | #define GLUT_ACCUM			4
233 | #define GLUT_ALPHA			8
234 | #define GLUT_DEPTH			16
235 | #define GLUT_STENCIL			32
236 | #if (GLUT_API_VERSION >= 2)
237 | #define GLUT_MULTISAMPLE		128
238 | #define GLUT_STEREO			256
239 | #endif
240 | #if (GLUT_API_VERSION >= 3)
241 | #define GLUT_LUMINANCE			512
242 | #endif
243 | 
244 | /* Mouse buttons. */
245 | #define GLUT_LEFT_BUTTON		0
246 | #define GLUT_MIDDLE_BUTTON		1
247 | #define GLUT_RIGHT_BUTTON		2
248 | 
249 | /* Mouse button  state. */
250 | #define GLUT_DOWN			0
251 | #define GLUT_UP				1
252 | 
253 | #if (GLUT_API_VERSION >= 2)
254 | /* function keys */
255 | #define GLUT_KEY_F1			1
256 | #define GLUT_KEY_F2			2
257 | #define GLUT_KEY_F3			3
258 | #define GLUT_KEY_F4			4
259 | #define GLUT_KEY_F5			5
260 | #define GLUT_KEY_F6			6
261 | #define GLUT_KEY_F7			7
262 | #define GLUT_KEY_F8			8
263 | #define GLUT_KEY_F9			9
264 | #define GLUT_KEY_F10			10
265 | #define GLUT_KEY_F11			11
266 | #define GLUT_KEY_F12			12
267 | /* directional keys */
268 | #define GLUT_KEY_LEFT			100
269 | #define GLUT_KEY_UP			101
270 | #define GLUT_KEY_RIGHT			102
271 | #define GLUT_KEY_DOWN			103
272 | #define GLUT_KEY_PAGE_UP		104
273 | #define GLUT_KEY_PAGE_DOWN		105
274 | #define GLUT_KEY_HOME			106
275 | #define GLUT_KEY_END			107
276 | #define GLUT_KEY_INSERT			108
277 | #endif
278 | 
279 | /* Entry/exit  state. */
280 | #define GLUT_LEFT			0
281 | #define GLUT_ENTERED			1
282 | 
283 | /* Menu usage  state. */
284 | #define GLUT_MENU_NOT_IN_USE		0
285 | #define GLUT_MENU_IN_USE		1
286 | 
287 | /* Visibility  state. */
288 | #define GLUT_NOT_VISIBLE		0
289 | #define GLUT_VISIBLE			1
290 | 
291 | /* Window status  state. */
292 | #define GLUT_HIDDEN			0
293 | #define GLUT_FULLY_RETAINED		1
294 | #define GLUT_PARTIALLY_RETAINED		2
295 | #define GLUT_FULLY_COVERED		3
296 | 
297 | /* Color index component selection values. */
298 | #define GLUT_RED			0
299 | #define GLUT_GREEN			1
300 | #define GLUT_BLUE			2
301 | 
302 | #if defined(_WIN32)
303 | /* Stroke font constants (use these in GLUT program). */
304 | #define GLUT_STROKE_ROMAN		((void*)0)
305 | #define GLUT_STROKE_MONO_ROMAN		((void*)1)
306 | 
307 | /* Bitmap font constants (use these in GLUT program). */
308 | #define GLUT_BITMAP_9_BY_15		((void*)2)
309 | #define GLUT_BITMAP_8_BY_13		((void*)3)
310 | #define GLUT_BITMAP_TIMES_ROMAN_10	((void*)4)
311 | #define GLUT_BITMAP_TIMES_ROMAN_24	((void*)5)
312 | #if (GLUT_API_VERSION >= 3)
313 | #define GLUT_BITMAP_HELVETICA_10	((void*)6)
314 | #define GLUT_BITMAP_HELVETICA_12	((void*)7)
315 | #define GLUT_BITMAP_HELVETICA_18	((void*)8)
316 | #endif
317 | #else
318 | /* Stroke font opaque addresses (use constants instead in source code). */
319 | GLUTAPI void *glutStrokeRoman;
320 | GLUTAPI void *glutStrokeMonoRoman;
321 | 
322 | /* Stroke font constants (use these in GLUT program). */
323 | #define GLUT_STROKE_ROMAN		(&glutStrokeRoman)
324 | #define GLUT_STROKE_MONO_ROMAN		(&glutStrokeMonoRoman)
325 | 
326 | /* Bitmap font opaque addresses (use constants instead in source code). */
327 | GLUTAPI void *glutBitmap9By15;
328 | GLUTAPI void *glutBitmap8By13;
329 | GLUTAPI void *glutBitmapTimesRoman10;
330 | GLUTAPI void *glutBitmapTimesRoman24;
331 | GLUTAPI void *glutBitmapHelvetica10;
332 | GLUTAPI void *glutBitmapHelvetica12;
333 | GLUTAPI void *glutBitmapHelvetica18;
334 | 
335 | /* Bitmap font constants (use these in GLUT program). */
336 | #define GLUT_BITMAP_9_BY_15		(&glutBitmap9By15)
337 | #define GLUT_BITMAP_8_BY_13		(&glutBitmap8By13)
338 | #define GLUT_BITMAP_TIMES_ROMAN_10	(&glutBitmapTimesRoman10)
339 | #define GLUT_BITMAP_TIMES_ROMAN_24	(&glutBitmapTimesRoman24)
340 | #if (GLUT_API_VERSION >= 3)
341 | #define GLUT_BITMAP_HELVETICA_10	(&glutBitmapHelvetica10)
342 | #define GLUT_BITMAP_HELVETICA_12	(&glutBitmapHelvetica12)
343 | #define GLUT_BITMAP_HELVETICA_18	(&glutBitmapHelvetica18)
344 | #endif
345 | #endif
346 | 
347 | /* glutGet parameters. */
348 | #define GLUT_WINDOW_X			((GLenum) 100)
349 | #define GLUT_WINDOW_Y			((GLenum) 101)
350 | #define GLUT_WINDOW_WIDTH		((GLenum) 102)
351 | #define GLUT_WINDOW_HEIGHT		((GLenum) 103)
352 | #define GLUT_WINDOW_BUFFER_SIZE		((GLenum) 104)
353 | #define GLUT_WINDOW_STENCIL_SIZE	((GLenum) 105)
354 | #define GLUT_WINDOW_DEPTH_SIZE		((GLenum) 106)
355 | #define GLUT_WINDOW_RED_SIZE		((GLenum) 107)
356 | #define GLUT_WINDOW_GREEN_SIZE		((GLenum) 108)
357 | #define GLUT_WINDOW_BLUE_SIZE		((GLenum) 109)
358 | #define GLUT_WINDOW_ALPHA_SIZE		((GLenum) 110)
359 | #define GLUT_WINDOW_ACCUM_RED_SIZE	((GLenum) 111)
360 | #define GLUT_WINDOW_ACCUM_GREEN_SIZE	((GLenum) 112)
361 | #define GLUT_WINDOW_ACCUM_BLUE_SIZE	((GLenum) 113)
362 | #define GLUT_WINDOW_ACCUM_ALPHA_SIZE	((GLenum) 114)
363 | #define GLUT_WINDOW_DOUBLEBUFFER	((GLenum) 115)
364 | #define GLUT_WINDOW_RGBA		((GLenum) 116)
365 | #define GLUT_WINDOW_PARENT		((GLenum) 117)
366 | #define GLUT_WINDOW_NUM_CHILDREN	((GLenum) 118)
367 | #define GLUT_WINDOW_COLORMAP_SIZE	((GLenum) 119)
368 | #if (GLUT_API_VERSION >= 2)
369 | #define GLUT_WINDOW_NUM_SAMPLES		((GLenum) 120)
370 | #define GLUT_WINDOW_STEREO		((GLenum) 121)
371 | #endif
372 | #if (GLUT_API_VERSION >= 3)
373 | #define GLUT_WINDOW_CURSOR		((GLenum) 122)
374 | #endif
375 | #define GLUT_SCREEN_WIDTH		((GLenum) 200)
376 | #define GLUT_SCREEN_HEIGHT		((GLenum) 201)
377 | #define GLUT_SCREEN_WIDTH_MM		((GLenum) 202)
378 | #define GLUT_SCREEN_HEIGHT_MM		((GLenum) 203)
379 | #define GLUT_MENU_NUM_ITEMS		((GLenum) 300)
380 | #define GLUT_DISPLAY_MODE_POSSIBLE	((GLenum) 400)
381 | #define GLUT_INIT_WINDOW_X		((GLenum) 500)
382 | #define GLUT_INIT_WINDOW_Y		((GLenum) 501)
383 | #define GLUT_INIT_WINDOW_WIDTH		((GLenum) 502)
384 | #define GLUT_INIT_WINDOW_HEIGHT		((GLenum) 503)
385 | #define GLUT_INIT_DISPLAY_MODE		((GLenum) 504)
386 | #if (GLUT_API_VERSION >= 2)
387 | #define GLUT_ELAPSED_TIME		((GLenum) 700)
388 | #endif
389 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
390 | #define GLUT_WINDOW_FORMAT_ID		((GLenum) 123)
391 | #endif
392 | 
393 | #if (GLUT_API_VERSION >= 2)
394 | /* glutDeviceGet parameters. */
395 | #define GLUT_HAS_KEYBOARD		((GLenum) 600)
396 | #define GLUT_HAS_MOUSE			((GLenum) 601)
397 | #define GLUT_HAS_SPACEBALL		((GLenum) 602)
398 | #define GLUT_HAS_DIAL_AND_BUTTON_BOX	((GLenum) 603)
399 | #define GLUT_HAS_TABLET			((GLenum) 604)
400 | #define GLUT_NUM_MOUSE_BUTTONS		((GLenum) 605)
401 | #define GLUT_NUM_SPACEBALL_BUTTONS	((GLenum) 606)
402 | #define GLUT_NUM_BUTTON_BOX_BUTTONS	((GLenum) 607)
403 | #define GLUT_NUM_DIALS			((GLenum) 608)
404 | #define GLUT_NUM_TABLET_BUTTONS		((GLenum) 609)
405 | #endif
406 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
407 | #define GLUT_DEVICE_IGNORE_KEY_REPEAT   ((GLenum) 610)
408 | #define GLUT_DEVICE_KEY_REPEAT          ((GLenum) 611)
409 | #define GLUT_HAS_JOYSTICK		((GLenum) 612)
410 | #define GLUT_OWNS_JOYSTICK		((GLenum) 613)
411 | #define GLUT_JOYSTICK_BUTTONS		((GLenum) 614)
412 | #define GLUT_JOYSTICK_AXES		((GLenum) 615)
413 | #define GLUT_JOYSTICK_POLL_RATE		((GLenum) 616)
414 | #endif
415 | 
416 | #if (GLUT_API_VERSION >= 3)
417 | /* glutLayerGet parameters. */
418 | #define GLUT_OVERLAY_POSSIBLE           ((GLenum) 800)
419 | #define GLUT_LAYER_IN_USE		((GLenum) 801)
420 | #define GLUT_HAS_OVERLAY		((GLenum) 802)
421 | #define GLUT_TRANSPARENT_INDEX		((GLenum) 803)
422 | #define GLUT_NORMAL_DAMAGED		((GLenum) 804)
423 | #define GLUT_OVERLAY_DAMAGED		((GLenum) 805)
424 | 
425 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
426 | /* glutVideoResizeGet parameters. */
427 | #define GLUT_VIDEO_RESIZE_POSSIBLE	((GLenum) 900)
428 | #define GLUT_VIDEO_RESIZE_IN_USE	((GLenum) 901)
429 | #define GLUT_VIDEO_RESIZE_X_DELTA	((GLenum) 902)
430 | #define GLUT_VIDEO_RESIZE_Y_DELTA	((GLenum) 903)
431 | #define GLUT_VIDEO_RESIZE_WIDTH_DELTA	((GLenum) 904)
432 | #define GLUT_VIDEO_RESIZE_HEIGHT_DELTA	((GLenum) 905)
433 | #define GLUT_VIDEO_RESIZE_X		((GLenum) 906)
434 | #define GLUT_VIDEO_RESIZE_Y		((GLenum) 907)
435 | #define GLUT_VIDEO_RESIZE_WIDTH		((GLenum) 908)
436 | #define GLUT_VIDEO_RESIZE_HEIGHT	((GLenum) 909)
437 | #endif
438 | 
439 | /* glutUseLayer parameters. */
440 | #define GLUT_NORMAL			((GLenum) 0)
441 | #define GLUT_OVERLAY			((GLenum) 1)
442 | 
443 | /* glutGetModifiers return mask. */
444 | #define GLUT_ACTIVE_SHIFT               1
445 | #define GLUT_ACTIVE_CTRL                2
446 | #define GLUT_ACTIVE_ALT                 4
447 | 
448 | /* glutSetCursor parameters. */
449 | /* Basic arrows. */
450 | #define GLUT_CURSOR_RIGHT_ARROW		0
451 | #define GLUT_CURSOR_LEFT_ARROW		1
452 | /* Symbolic cursor shapes. */
453 | #define GLUT_CURSOR_INFO		2
454 | #define GLUT_CURSOR_DESTROY		3
455 | #define GLUT_CURSOR_HELP		4
456 | #define GLUT_CURSOR_CYCLE		5
457 | #define GLUT_CURSOR_SPRAY		6
458 | #define GLUT_CURSOR_WAIT		7
459 | #define GLUT_CURSOR_TEXT		8
460 | #define GLUT_CURSOR_CROSSHAIR		9
461 | /* Directional cursors. */
462 | #define GLUT_CURSOR_UP_DOWN		10
463 | #define GLUT_CURSOR_LEFT_RIGHT		11
464 | /* Sizing cursors. */
465 | #define GLUT_CURSOR_TOP_SIDE		12
466 | #define GLUT_CURSOR_BOTTOM_SIDE		13
467 | #define GLUT_CURSOR_LEFT_SIDE		14
468 | #define GLUT_CURSOR_RIGHT_SIDE		15
469 | #define GLUT_CURSOR_TOP_LEFT_CORNER	16
470 | #define GLUT_CURSOR_TOP_RIGHT_CORNER	17
471 | #define GLUT_CURSOR_BOTTOM_RIGHT_CORNER	18
472 | #define GLUT_CURSOR_BOTTOM_LEFT_CORNER	19
473 | /* Inherit from parent window. */
474 | #define GLUT_CURSOR_INHERIT		100
475 | /* Blank cursor. */
476 | #define GLUT_CURSOR_NONE		101
477 | /* Fullscreen crosshair (if available). */
478 | #define GLUT_CURSOR_FULL_CROSSHAIR	102
479 | #endif
480 | 
481 | /* GLUT initialization sub-API. */
482 | GLUTAPI void APIENTRY glutInit(int *argcp, char **argv);
483 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK)
484 | GLUTAPI void APIENTRY __glutInitWithExit(int *argcp, char **argv, void (__cdecl *exitfunc)(int));
485 | #ifndef GLUT_BUILDING_LIB
486 | static void APIENTRY glutInit_ATEXIT_HACK(int *argcp, char **argv) { __glutInitWithExit(argcp, argv, exit); }
487 | #define glutInit glutInit_ATEXIT_HACK
488 | #endif
489 | #endif
490 | GLUTAPI void APIENTRY glutInitDisplayMode(unsigned int mode);
491 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
492 | GLUTAPI void APIENTRY glutInitDisplayString(const char *string);
493 | #endif
494 | GLUTAPI void APIENTRY glutInitWindowPosition(int x, int y);
495 | GLUTAPI void APIENTRY glutInitWindowSize(int width, int height);
496 | GLUTAPI void APIENTRY glutMainLoop(void);
497 | 
498 | /* GLUT window sub-API. */
499 | GLUTAPI int APIENTRY glutCreateWindow(const char *title);
500 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK)
501 | GLUTAPI int APIENTRY __glutCreateWindowWithExit(const char *title, void (__cdecl *exitfunc)(int));
502 | #ifndef GLUT_BUILDING_LIB
503 | static int APIENTRY glutCreateWindow_ATEXIT_HACK(const char *title) { return __glutCreateWindowWithExit(title, exit); }
504 | #define glutCreateWindow glutCreateWindow_ATEXIT_HACK
505 | #endif
506 | #endif
507 | GLUTAPI int APIENTRY glutCreateSubWindow(int win, int x, int y, int width, int height);
508 | GLUTAPI void APIENTRY glutDestroyWindow(int win);
509 | GLUTAPI void APIENTRY glutPostRedisplay(void);
510 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
511 | GLUTAPI void APIENTRY glutPostWindowRedisplay(int win);
512 | #endif
513 | GLUTAPI void APIENTRY glutSwapBuffers(void);
514 | GLUTAPI int APIENTRY glutGetWindow(void);
515 | GLUTAPI void APIENTRY glutSetWindow(int win);
516 | GLUTAPI void APIENTRY glutSetWindowTitle(const char *title);
517 | GLUTAPI void APIENTRY glutSetIconTitle(const char *title);
518 | GLUTAPI void APIENTRY glutPositionWindow(int x, int y);
519 | GLUTAPI void APIENTRY glutReshapeWindow(int width, int height);
520 | GLUTAPI void APIENTRY glutPopWindow(void);
521 | GLUTAPI void APIENTRY glutPushWindow(void);
522 | GLUTAPI void APIENTRY glutIconifyWindow(void);
523 | GLUTAPI void APIENTRY glutShowWindow(void);
524 | GLUTAPI void APIENTRY glutHideWindow(void);
525 | #if (GLUT_API_VERSION >= 3)
526 | GLUTAPI void APIENTRY glutFullScreen(void);
527 | GLUTAPI void APIENTRY glutSetCursor(int cursor);
528 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
529 | GLUTAPI void APIENTRY glutWarpPointer(int x, int y);
530 | #endif
531 | 
532 | /* GLUT overlay sub-API. */
533 | GLUTAPI void APIENTRY glutEstablishOverlay(void);
534 | GLUTAPI void APIENTRY glutRemoveOverlay(void);
535 | GLUTAPI void APIENTRY glutUseLayer(GLenum layer);
536 | GLUTAPI void APIENTRY glutPostOverlayRedisplay(void);
537 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
538 | GLUTAPI void APIENTRY glutPostWindowOverlayRedisplay(int win);
539 | #endif
540 | GLUTAPI void APIENTRY glutShowOverlay(void);
541 | GLUTAPI void APIENTRY glutHideOverlay(void);
542 | #endif
543 | 
544 | /* GLUT menu sub-API. */
545 | GLUTAPI int APIENTRY glutCreateMenu(void (GLUTCALLBACK *func)(int));
546 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK)
547 | GLUTAPI int APIENTRY __glutCreateMenuWithExit(void (GLUTCALLBACK *func)(int), void (__cdecl *exitfunc)(int));
548 | #ifndef GLUT_BUILDING_LIB
549 | static int APIENTRY glutCreateMenu_ATEXIT_HACK(void (GLUTCALLBACK *func)(int)) { return __glutCreateMenuWithExit(func, exit); }
550 | #define glutCreateMenu glutCreateMenu_ATEXIT_HACK
551 | #endif
552 | #endif
553 | GLUTAPI void APIENTRY glutDestroyMenu(int menu);
554 | GLUTAPI int APIENTRY glutGetMenu(void);
555 | GLUTAPI void APIENTRY glutSetMenu(int menu);
556 | GLUTAPI void APIENTRY glutAddMenuEntry(const char *label, int value);
557 | GLUTAPI void APIENTRY glutAddSubMenu(const char *label, int submenu);
558 | GLUTAPI void APIENTRY glutChangeToMenuEntry(int item, const char *label, int value);
559 | GLUTAPI void APIENTRY glutChangeToSubMenu(int item, const char *label, int submenu);
560 | GLUTAPI void APIENTRY glutRemoveMenuItem(int item);
561 | GLUTAPI void APIENTRY glutAttachMenu(int button);
562 | GLUTAPI void APIENTRY glutDetachMenu(int button);
563 | 
564 | /* GLUT window callback sub-API. */
565 | GLUTAPI void APIENTRY glutDisplayFunc(void (GLUTCALLBACK *func)(void));
566 | GLUTAPI void APIENTRY glutReshapeFunc(void (GLUTCALLBACK *func)(int width, int height));
567 | GLUTAPI void APIENTRY glutKeyboardFunc(void (GLUTCALLBACK *func)(unsigned char key, int x, int y));
568 | GLUTAPI void APIENTRY glutMouseFunc(void (GLUTCALLBACK *func)(int button, int state, int x, int y));
569 | GLUTAPI void APIENTRY glutMotionFunc(void (GLUTCALLBACK *func)(int x, int y));
570 | GLUTAPI void APIENTRY glutPassiveMotionFunc(void (GLUTCALLBACK *func)(int x, int y));
571 | GLUTAPI void APIENTRY glutEntryFunc(void (GLUTCALLBACK *func)(int state));
572 | GLUTAPI void APIENTRY glutVisibilityFunc(void (GLUTCALLBACK *func)(int state));
573 | GLUTAPI void APIENTRY glutIdleFunc(void (GLUTCALLBACK *func)(void));
574 | GLUTAPI void APIENTRY glutTimerFunc(unsigned int millis, void (GLUTCALLBACK *func)(int value), int value);
575 | GLUTAPI void APIENTRY glutMenuStateFunc(void (GLUTCALLBACK *func)(int state));
576 | #if (GLUT_API_VERSION >= 2)
577 | GLUTAPI void APIENTRY glutSpecialFunc(void (GLUTCALLBACK *func)(int key, int x, int y));
578 | GLUTAPI void APIENTRY glutSpaceballMotionFunc(void (GLUTCALLBACK *func)(int x, int y, int z));
579 | GLUTAPI void APIENTRY glutSpaceballRotateFunc(void (GLUTCALLBACK *func)(int x, int y, int z));
580 | GLUTAPI void APIENTRY glutSpaceballButtonFunc(void (GLUTCALLBACK *func)(int button, int state));
581 | GLUTAPI void APIENTRY glutButtonBoxFunc(void (GLUTCALLBACK *func)(int button, int state));
582 | GLUTAPI void APIENTRY glutDialsFunc(void (GLUTCALLBACK *func)(int dial, int value));
583 | GLUTAPI void APIENTRY glutTabletMotionFunc(void (GLUTCALLBACK *func)(int x, int y));
584 | GLUTAPI void APIENTRY glutTabletButtonFunc(void (GLUTCALLBACK *func)(int button, int state, int x, int y));
585 | #if (GLUT_API_VERSION >= 3)
586 | GLUTAPI void APIENTRY glutMenuStatusFunc(void (GLUTCALLBACK *func)(int status, int x, int y));
587 | GLUTAPI void APIENTRY glutOverlayDisplayFunc(void (GLUTCALLBACK *func)(void));
588 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
589 | GLUTAPI void APIENTRY glutWindowStatusFunc(void (GLUTCALLBACK *func)(int state));
590 | #endif
591 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
592 | GLUTAPI void APIENTRY glutKeyboardUpFunc(void (GLUTCALLBACK *func)(unsigned char key, int x, int y));
593 | GLUTAPI void APIENTRY glutSpecialUpFunc(void (GLUTCALLBACK *func)(int key, int x, int y));
594 | GLUTAPI void APIENTRY glutJoystickFunc(void (GLUTCALLBACK *func)(unsigned int buttonMask, int x, int y, int z), int pollInterval);
595 | #endif
596 | #endif
597 | #endif
598 | 
599 | /* GLUT color index sub-API. */
600 | GLUTAPI void APIENTRY glutSetColor(int, GLfloat red, GLfloat green, GLfloat blue);
601 | GLUTAPI GLfloat APIENTRY glutGetColor(int ndx, int component);
602 | GLUTAPI void APIENTRY glutCopyColormap(int win);
603 | 
604 | /* GLUT state retrieval sub-API. */
605 | GLUTAPI int APIENTRY glutGet(GLenum type);
606 | GLUTAPI int APIENTRY glutDeviceGet(GLenum type);
607 | #if (GLUT_API_VERSION >= 2)
608 | /* GLUT extension support sub-API */
609 | GLUTAPI int APIENTRY glutExtensionSupported(const char *name);
610 | #endif
611 | #if (GLUT_API_VERSION >= 3)
612 | GLUTAPI int APIENTRY glutGetModifiers(void);
613 | GLUTAPI int APIENTRY glutLayerGet(GLenum type);
614 | #endif
615 | 
616 | /* GLUT font sub-API */
617 | GLUTAPI void APIENTRY glutBitmapCharacter(void *font, int character);
618 | GLUTAPI int APIENTRY glutBitmapWidth(void *font, int character);
619 | GLUTAPI void APIENTRY glutStrokeCharacter(void *font, int character);
620 | GLUTAPI int APIENTRY glutStrokeWidth(void *font, int character);
621 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
622 | GLUTAPI int APIENTRY glutBitmapLength(void *font, const unsigned char *string);
623 | GLUTAPI int APIENTRY glutStrokeLength(void *font, const unsigned char *string);
624 | #endif
625 | 
626 | /* GLUT pre-built models sub-API */
627 | GLUTAPI void APIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks);
628 | GLUTAPI void APIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks);
629 | GLUTAPI void APIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
630 | GLUTAPI void APIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
631 | GLUTAPI void APIENTRY glutWireCube(GLdouble size);
632 | GLUTAPI void APIENTRY glutSolidCube(GLdouble size);
633 | GLUTAPI void APIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
634 | GLUTAPI void APIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
635 | GLUTAPI void APIENTRY glutWireDodecahedron(void);
636 | GLUTAPI void APIENTRY glutSolidDodecahedron(void);
637 | GLUTAPI void APIENTRY glutWireTeapot(GLdouble size);
638 | GLUTAPI void APIENTRY glutSolidTeapot(GLdouble size);
639 | GLUTAPI void APIENTRY glutWireOctahedron(void);
640 | GLUTAPI void APIENTRY glutSolidOctahedron(void);
641 | GLUTAPI void APIENTRY glutWireTetrahedron(void);
642 | GLUTAPI void APIENTRY glutSolidTetrahedron(void);
643 | GLUTAPI void APIENTRY glutWireIcosahedron(void);
644 | GLUTAPI void APIENTRY glutSolidIcosahedron(void);
645 | 
646 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
647 | /* GLUT video resize sub-API. */
648 | GLUTAPI int APIENTRY glutVideoResizeGet(GLenum param);
649 | GLUTAPI void APIENTRY glutSetupVideoResizing(void);
650 | GLUTAPI void APIENTRY glutStopVideoResizing(void);
651 | GLUTAPI void APIENTRY glutVideoResize(int x, int y, int width, int height);
652 | GLUTAPI void APIENTRY glutVideoPan(int x, int y, int width, int height);
653 | 
654 | /* GLUT debugging sub-API. */
655 | GLUTAPI void APIENTRY glutReportErrors(void);
656 | #endif
657 | 
658 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
659 | /* GLUT device control sub-API. */
660 | /* glutSetKeyRepeat modes. */
661 | #define GLUT_KEY_REPEAT_OFF		0
662 | #define GLUT_KEY_REPEAT_ON		1
663 | #define GLUT_KEY_REPEAT_DEFAULT		2
664 | 
665 | /* Joystick button masks. */
666 | #define GLUT_JOYSTICK_BUTTON_A		1
667 | #define GLUT_JOYSTICK_BUTTON_B		2
668 | #define GLUT_JOYSTICK_BUTTON_C		4
669 | #define GLUT_JOYSTICK_BUTTON_D		8
670 | 
671 | GLUTAPI void APIENTRY glutIgnoreKeyRepeat(int ignore);
672 | GLUTAPI void APIENTRY glutSetKeyRepeat(int repeatMode);
673 | GLUTAPI void APIENTRY glutForceJoystickFunc(void);
674 | 
675 | /* GLUT game mode sub-API. */
676 | /* glutGameModeGet. */
677 | #define GLUT_GAME_MODE_ACTIVE           ((GLenum) 0)
678 | #define GLUT_GAME_MODE_POSSIBLE         ((GLenum) 1)
679 | #define GLUT_GAME_MODE_WIDTH            ((GLenum) 2)
680 | #define GLUT_GAME_MODE_HEIGHT           ((GLenum) 3)
681 | #define GLUT_GAME_MODE_PIXEL_DEPTH      ((GLenum) 4)
682 | #define GLUT_GAME_MODE_REFRESH_RATE     ((GLenum) 5)
683 | #define GLUT_GAME_MODE_DISPLAY_CHANGED  ((GLenum) 6)
684 | 
685 | GLUTAPI void APIENTRY glutGameModeString(const char *string);
686 | GLUTAPI int APIENTRY glutEnterGameMode(void);
687 | GLUTAPI void APIENTRY glutLeaveGameMode(void);
688 | GLUTAPI int APIENTRY glutGameModeGet(GLenum mode);
689 | #endif
690 | 
691 | #ifdef __cplusplus
692 | }
693 | 
694 | #endif
695 | 
696 | #ifdef GLUT_APIENTRY_DEFINED
697 | # undef GLUT_APIENTRY_DEFINED
698 | # undef APIENTRY
699 | #endif
700 | 
701 | #ifdef GLUT_WINGDIAPI_DEFINED
702 | # undef GLUT_WINGDIAPI_DEFINED
703 | # undef WINGDIAPI
704 | #endif
705 | 
706 | #ifdef GLUT_DEFINED___CDECL
707 | # undef GLUT_DEFINED___CDECL
708 | # undef __cdecl
709 | #endif
710 | 
711 | #ifdef GLUT_DEFINED__CRTIMP
712 | # undef GLUT_DEFINED__CRTIMP
713 | # undef _CRTIMP
714 | #endif
715 | 
716 | #endif                  /* __glut_h__ */
717 | 


--------------------------------------------------------------------------------
/cuda-c/src/cuda-programming-model/04-timing-kernel.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 24,
  6 |    "metadata": {},
  7 |    "outputs": [
  8 |     {
  9 |      "name": "stdout",
 10 |      "output_type": "stream",
 11 |      "text": [
 12 |       "Overwriting sumArraysOnGPU.cu\n"
 13 |      ]
 14 |     }
 15 |    ],
 16 |    "source": [
 17 |     "%%file sumArraysOnGPU.cu\n",
 18 |     "\n",
 19 |     "#include <stdlib.h>\n",
 20 |     "#include <string.h>\n",
 21 |     "#include <time.h>\n",
 22 |     "#include <stdio.h>\n",
 23 |     "#include <cuda_runtime.h>\n",
 24 |     "#include <sys/time.h>\n",
 25 |     "\n",
 26 |     "double cpuSecond(){\n",
 27 |     "    struct timeval tp;\n",
 28 |     "    gettimeofday(&tp, NULL);\n",
 29 |     "    return ((double)tp.tv_sec + (double)tp.tv_usec*1.e-6);\n",
 30 |     "}\n",
 31 |     "\n",
 32 |     "#define CHECK(call)                                                            \\\n",
 33 |     "{                                                                              \\\n",
 34 |     "    const cudaError_t error = call;                                            \\\n",
 35 |     "    if (error != cudaSuccess)                                                  \\\n",
 36 |     "    {                                                                          \\\n",
 37 |     "        fprintf(stderr, \"Error: %s:%d, \", __FILE__, __LINE__);                 \\\n",
 38 |     "        fprintf(stderr, \"code: %d, reason: %s\\n\", error,                       \\\n",
 39 |     "                cudaGetErrorString(error));                                    \\\n",
 40 |     "        exit(1);                                                               \\\n",
 41 |     "    }                                                                          \\\n",
 42 |     "}\n",
 43 |     "\n",
 44 |     "\n",
 45 |     "__global__ void sumArraysOnDevice(float *A, float *B, float *C, const int N){\n",
 46 |     "    int idx = blockIdx.x * blockDim.x + threadIdx.x;\n",
 47 |     "    if (idx < N) C[idx] = A[idx] + B[idx];\n",
 48 |     "\n",
 49 |     "}\n",
 50 |     "\n",
 51 |     "\n",
 52 |     "void initialData(float *ip, int size){\n",
 53 |     "    // generate different seed for random number \n",
 54 |     "    time_t t;\n",
 55 |     "    srand((unsigned int) time (&t));\n",
 56 |     "    \n",
 57 |     "    for (int i=0; i<size; i++){\n",
 58 |     "        ip[i] = (float)(rand() & 0xFF) / 10.0f;\n",
 59 |     "    }\n",
 60 |     "}\n",
 61 |     "\n",
 62 |     "\n",
 63 |     "void sumArraysOnHost(float *A, float *B, float *C, const int N){\n",
 64 |     "    for (int idx=0; idx<N; idx++){\n",
 65 |     "        C[idx] = A[idx] + B[idx];\n",
 66 |     "    }\n",
 67 |     "}\n",
 68 |     "\n",
 69 |     "\n",
 70 |     "\n",
 71 |     "void checkResult(float *hostRef, float *gpuRef, const int N){\n",
 72 |     "    double epsilon = 1.0E-8;\n",
 73 |     "    int match = 1;\n",
 74 |     "    for (int i = 0; i < N; i++){\n",
 75 |     "        if (abs(hostRef[i] - gpuRef[i]) > epsilon){\n",
 76 |     "            match = 0;\n",
 77 |     "            printf(\"Arrays do not match!\\n\");\n",
 78 |     "            printf(\"host %5.2f gpu %5.2f at current %d\\n\",\n",
 79 |     "                   hostRef[i], gpuRef[i], i);\n",
 80 |     "            break;\n",
 81 |     "        }\n",
 82 |     "    }\n",
 83 |     "    if (match) printf(\"Arrays match. \\n\\n\");\n",
 84 |     "}\n",
 85 |     "\n",
 86 |     "\n",
 87 |     "int main(int argc, char **argv){\n",
 88 |     "    \n",
 89 |     "    printf(\"%s Starting...\\n\", argv[0]);\n",
 90 |     "    \n",
 91 |     "    // malloc host memory\n",
 92 |     "    int nElem = 1 <<24;\n",
 93 |     "    size_t nBytes = nElem * sizeof(float);\n",
 94 |     "    \n",
 95 |     "    \n",
 96 |     "    // initialize data at host side\n",
 97 |     "    float *h_A, *h_B, *hostRef, *gpuRef;\n",
 98 |     "    h_A = (float *)malloc(nBytes);\n",
 99 |     "    h_B = (float *)malloc(nBytes);\n",
100 |     "    hostRef = (float *)malloc(nBytes);\n",
101 |     "    gpuRef = (float *)malloc(nBytes);\n",
102 |     "    \n",
103 |     "    // initialize data at host side\n",
104 |     "    initialData(h_A, nElem);\n",
105 |     "    initialData(h_B, nElem);\n",
106 |     "    \n",
107 |     "    memset(hostRef, 0, nBytes);\n",
108 |     "    memset(gpuRef, 0, nBytes);\n",
109 |     "    \n",
110 |     "    // malloc device global memory \n",
111 |     "    float *d_A, *d_B, *d_C;\n",
112 |     "    cudaMalloc((float**)&d_A, nBytes);\n",
113 |     "    cudaMalloc((float**)&d_B, nBytes);\n",
114 |     "    cudaMalloc((float**)&d_C, nBytes);\n",
115 |     "    \n",
116 |     "    // Use cudaMemcpy to transfer the data from the host memory to the GPU global memory with the\n",
117 |     "    // parameter cudaMemcpyHostToDevice specifying the transfer direction.\n",
118 |     "    \n",
119 |     "    CHECK(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));\n",
120 |     "    CHECK(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));\n",
121 |     "    \n",
122 |     "    // invoke kernel at host side\n",
123 |     "    int iLen = 128;\n",
124 |     "    dim3 block(iLen);\n",
125 |     "    dim3 grid((nElem+block.x-1)/block.x);\n",
126 |     "    \n",
127 |     "    double iStart = cpuSecond();\n",
128 |     "    sumArraysOnDevice<<<grid, block>>>(d_A, d_B, d_C, nElem);\n",
129 |     "    CHECK(cudaDeviceSynchronize());\n",
130 |     "    double iElaps = cpuSecond() - iStart;\n",
131 |     "    printf(\"sumArraysOnGPU <<<%d,%d>>> Time elapsed %f sec\\n\", grid.x, block.x, iElaps);\n",
132 |     "    //printf(\"Execution configuration <<<%d, %d>>>\\n\", grid.x, block.x);\n",
133 |     "    \n",
134 |     "    // copy kernel result back to host side \n",
135 |     "    cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost);\n",
136 |     "    \n",
137 |     "    // add vector at host side for result checks\n",
138 |     "    sumArraysOnHost(h_A, h_B, hostRef, nElem);\n",
139 |     "    \n",
140 |     "    for (int i=0; i<10; i++){\n",
141 |     "         printf(\"%f + %f = %f \\n\", h_A[i], h_B[i], hostRef[i]);\n",
142 |     "\n",
143 |     "    }\n",
144 |     "    \n",
145 |     "    // check device results\n",
146 |     "    checkResult(hostRef, gpuRef, nElem);\n",
147 |     "    \n",
148 |     "    free(h_A);\n",
149 |     "    free(h_B);\n",
150 |     "    free(hostRef);\n",
151 |     "    free(gpuRef);\n",
152 |     "    \n",
153 |     "    // use cudaFree to release the memory used on the GPU\n",
154 |     "    cudaFree(d_A);\n",
155 |     "    cudaFree(d_B);\n",
156 |     "    cudaFree(d_C);\n",
157 |     "    cudaDeviceReset();\n",
158 |     "    \n",
159 |     "    return (0);\n",
160 |     "}\n"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 26,
166 |    "metadata": {},
167 |    "outputs": [
168 |     {
169 |      "name": "stdout",
170 |      "output_type": "stream",
171 |      "text": [
172 |       "./addvector Starting...\n",
173 |       "sumArraysOnGPU <<<131072,128>>> Time elapsed 0.016467 sec\n",
174 |       "2.800000 + 2.800000 = 5.600000 \n",
175 |       "10.000000 + 10.000000 = 20.000000 \n",
176 |       "2.600000 + 2.600000 = 5.200000 \n",
177 |       "22.299999 + 22.299999 = 44.599998 \n",
178 |       "11.000000 + 11.000000 = 22.000000 \n",
179 |       "9.900000 + 9.900000 = 19.799999 \n",
180 |       "14.600000 + 14.600000 = 29.200001 \n",
181 |       "22.299999 + 22.299999 = 44.599998 \n",
182 |       "21.100000 + 21.100000 = 42.200001 \n",
183 |       "8.600000 + 8.600000 = 17.200001 \n",
184 |       "Arrays match. \n",
185 |       "\n"
186 |      ]
187 |     }
188 |    ],
189 |    "source": [
190 |     "%%bash\n",
191 |     "nvcc sumArraysOnGPU.cu -o addvector\n",
192 |     "./addvector"
193 |    ]
194 |   },
195 |   {
196 |    "cell_type": "markdown",
197 |    "metadata": {},
198 |    "source": [
199 |     "## Timing with nvprof"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 27,
205 |    "metadata": {},
206 |    "outputs": [
207 |     {
208 |      "name": "stdout",
209 |      "output_type": "stream",
210 |      "text": [
211 |       "./addvector Starting...\n",
212 |       "==19639== NVPROF is profiling process 19639, command: ./addvector\n",
213 |       "sumArraysOnGPU <<<131072,128>>> Time elapsed 0.014515 sec\n",
214 |       "24.600000 + 24.600000 = 49.200001 \n",
215 |       "11.400000 + 11.400000 = 22.799999 \n",
216 |       "9.800000 + 9.800000 = 19.600000 \n",
217 |       "15.000000 + 15.000000 = 30.000000 \n",
218 |       "0.800000 + 0.800000 = 1.600000 \n",
219 |       "22.700001 + 22.700001 = 45.400002 \n",
220 |       "8.800000 + 8.800000 = 17.600000 \n",
221 |       "17.700001 + 17.700001 = 35.400002 \n",
222 |       "5.100000 + 5.100000 = 10.200000 \n",
223 |       "3.800000 + 3.800000 = 7.600000 \n",
224 |       "Arrays match. \n",
225 |       "\n",
226 |       "==19639== Profiling application: ./addvector\n",
227 |       "==19639== Profiling result:\n",
228 |       "            Type  Time(%)      Time     Calls       Avg       Min       Max  Name\n",
229 |       " GPU activities:   61.59%  86.326ms         2  43.163ms  43.142ms  43.184ms  [CUDA memcpy HtoD]\n",
230 |       "                   28.89%  40.487ms         1  40.487ms  40.487ms  40.487ms  [CUDA memcpy DtoH]\n",
231 |       "                    9.52%  13.347ms         1  13.347ms  13.347ms  13.347ms  sumArraysOnDevice(float*, float*, float*, int)\n",
232 |       "      API calls:   40.42%  166.13ms         3  55.378ms  263.99us  165.59ms  cudaMalloc\n",
233 |       "                   30.71%  126.23ms         3  42.076ms  40.685ms  43.332ms  cudaMemcpy\n",
234 |       "                   16.15%  66.370ms         1  66.370ms  66.370ms  66.370ms  cudaDeviceReset\n",
235 |       "                    8.37%  34.394ms         3  11.465ms  364.77us  26.767ms  cudaFree\n",
236 |       "                    3.52%  14.469ms         1  14.469ms  14.469ms  14.469ms  cudaDeviceSynchronize\n",
237 |       "                    0.65%  2.6564ms        94  28.259us     256ns  1.1985ms  cuDeviceGetAttribute\n",
238 |       "                    0.10%  402.69us         1  402.69us  402.69us  402.69us  cuDeviceGetName\n",
239 |       "                    0.07%  279.80us         1  279.80us  279.80us  279.80us  cuDeviceTotalMem\n",
240 |       "                    0.01%  35.872us         1  35.872us  35.872us  35.872us  cudaLaunch\n",
241 |       "                    0.00%  3.0730us         4     768ns     181ns  2.1630us  cudaSetupArgument\n",
242 |       "                    0.00%  2.3970us         2  1.1980us     572ns  1.8250us  cuDeviceGetCount\n",
243 |       "                    0.00%  1.6430us         1  1.6430us  1.6430us  1.6430us  cudaConfigureCall\n",
244 |       "                    0.00%  1.0380us         2     519ns     269ns     769ns  cuDeviceGet\n"
245 |      ]
246 |     }
247 |    ],
248 |    "source": [
249 |     "!nvprof --unified-memory-profiling off ./addvector"
250 |    ]
251 |   },
252 |   {
253 |    "cell_type": "code",
254 |    "execution_count": 28,
255 |    "metadata": {},
256 |    "outputs": [
257 |     {
258 |      "name": "stdout",
259 |      "output_type": "stream",
260 |      "text": [
261 |       "Usage: nvprof [options] [application] [application-arguments]\r\n",
262 |       "Options:\r\n",
263 |       "       --aggregate-mode <on|off>\r\n",
264 |       "                        Turn on/off aggregate mode for events and metrics specified\r\n",
265 |       "                        by subsequent \"--events\" and \"--metrics\" options. Those\r\n",
266 |       "                        event/metric values will be collected for each domain instance,\r\n",
267 |       "                        instead of the whole device. Allowed values:\r\n",
268 |       "                        \ton - turn on aggregate mode (default)\r\n",
269 |       "                        \toff - turn off aggregate mode\r\n",
270 |       "\r\n",
271 |       "       --analysis-metrics\r\n",
272 |       "                        Collect profiling data that can be imported to Visual Profiler's\r\n",
273 |       "                        \"analysis\" mode. Note: Use \"--export-profile\" to specify\r\n",
274 |       "                        an export file.\r\n",
275 |       "\r\n",
276 |       "       --annotate-mpi <off|openmpi|mpich>\r\n",
277 |       "                        Automatically annotate MPI calls with NVTX markers. Specify\r\n",
278 |       "                        the MPI implementation installed on your machine. Currently,\r\n",
279 |       "                        Open MPI and MPICH implementations are supported. By default,\r\n",
280 |       "                        this option is off.\r\n",
281 |       "\r\n",
282 |       "       --concurrent-kernels <on|off>\r\n",
283 |       "                        Turn on/off concurrent kernel execution. If concurrent kernel\r\n",
284 |       "                        execution is off, all kernels running on one device will\r\n",
285 |       "                        be serialized. Allowed values:\r\n",
286 |       "                        \ton - turn on concurrent kernel execution (default)\r\n",
287 |       "                        \toff - turn off concurrent kernel execution\r\n",
288 |       "\r\n",
289 |       "       --continuous-sampling-interval <interval>\r\n",
290 |       "                        Set the continuous mode sampling interval in milliseconds.\r\n",
291 |       "                        Minimum is 1 ms. Default is 2 ms.\r\n",
292 |       "\r\n",
293 |       "       --cpu-thread-tracing <on|off>\r\n",
294 |       "                        Collect information about CPU thread API activity.\r\n",
295 |       "                        Allowed values:\r\n",
296 |       "                        \ton  - turn on CPU thread API tracing\r\n",
297 |       "                        \toff - turn off CPU thread API tracing (default)\r\n",
298 |       "\r\n",
299 |       "       --dependency-analysis\r\n",
300 |       "                        Generate event dependency graph for host and device activities\r\n",
301 |       "                        and run dependency analysis.\r\n",
302 |       "\r\n",
303 |       "       --device-buffer-size <size in MBs>\r\n",
304 |       "                        Set the device memory size (in MBs) reserved for storing\r\n",
305 |       "                        profiling data for non-CDP operations, especially for concurrent\r\n",
306 |       "                        kernel tracing, for each buffer on a context. The default\r\n",
307 |       "                        value is 8MB. The size should be a positive integer.\r\n",
308 |       "\r\n",
309 |       "       --device-cdp-buffer-size <size in MBs>\r\n",
310 |       "                        Set the device memory size (in MBs) reserved for storing\r\n",
311 |       "                        profiling data for CDP operations for each buffer on a context.\r\n",
312 |       "                        The default value is 8MB. The size should be a positive\r\n",
313 |       "                        integer.\r\n",
314 |       "\r\n",
315 |       "       --devices <device ids>\r\n",
316 |       "                        Change the scope of subsequent \"--events\", \"--metrics\", \"--query-events\"\r\n",
317 |       "                        and \"--query-metrics\" options.\r\n",
318 |       "                        Allowed values:\r\n",
319 |       "                        \tall - change scope to all valid devices\r\n",
320 |       "                        \tcomma-separated device IDs - change scope to specified\r\n",
321 |       "                        devices\r\n",
322 |       "\r\n",
323 |       "       --event-collection-mode <mode>\r\n",
324 |       "                        Choose event collection mode for all events/metrics Allowed\r\n",
325 |       "                        values:\r\n",
326 |       "                        \tkernel - events/metrics are collected only for durations\r\n",
327 |       "                        of kernel executions (default)\r\n",
328 |       "                        \tcontinuous - events/metrics are collected for duration\r\n",
329 |       "                        of application. This is not applicable for non-tesla devices.\r\n",
330 |       "                        This mode is compatible only with NVLink events/metrics.\r\n",
331 |       "                        This modeis incompatible with \"--profile-all-processes\"\r\n",
332 |       "                        or \"--profile-child-processes\" or \"--replay-mode kernel\"\r\n",
333 |       "                        or \"--replay-mode application\".\r\n",
334 |       "\r\n",
335 |       "  -e,  --events <event names>\r\n",
336 |       "                        Specify the events to be profiled on certain device(s). Multiple\r\n",
337 |       "                        event names separated by comma can be specified. Which device(s)\r\n",
338 |       "                        are profiled is controlled by the \"--devices\" option. Otherwise\r\n",
339 |       "                        events will be collected on all devices.\r\n",
340 |       "                        For a list of available events, use \"--query-events\".\r\n",
341 |       "                        Use \"--events all\" to profile all events available for each\r\n",
342 |       "                        device.\r\n",
343 |       "                        Use \"--devices\" and \"--kernels\" to select a specific kernel\r\n",
344 |       "                        invocation.\r\n",
345 |       "\r\n",
346 |       "       --kernel-latency-timestamps <on|off>\r\n",
347 |       "                        Turn on/off collection of kernel latency timestamps, namely\r\n",
348 |       "                        queued and submitted. The queued timestamp is captured when\r\n",
349 |       "                        a kernel launch command was queued into the CPU command\r\n",
350 |       "                        buffer. The submitted timestamp denotes when the CPU command\r\n",
351 |       "                        buffer containing this kernel launch was submitted to the\r\n",
352 |       "                        GPU. Turning this option on may incur an overhead during\r\n",
353 |       "                        profiling. Allowed values:\r\n",
354 |       "                        \ton - turn on collection of kernel latency timestamps\r\n",
355 |       "                        \toff - turn off collection of kernel latency timestamps\r\n",
356 |       "                        (default)\r\n",
357 |       "\r\n",
358 |       "       --kernels <kernel path syntax>\r\n",
359 |       "                        Change the scope of subsequent \"--events\", \"--metrics\" options.\r\n",
360 |       "                        The syntax is as follows:\r\n",
361 |       "                        \t<kernel name>\r\n",
362 |       "                        \tLimit scope to given kernel name.\r\n",
363 |       "                        or\r\n",
364 |       "                        \t<context id/name>:<stream id/name>:<kernel name>:<invocation>\r\n",
365 |       "                        The context/stream IDs, names, kernel name and invocation\r\n",
366 |       "                        can be regular expressions. Empty string matches any number\r\n",
367 |       "                        or characters. If <context id/name> or <stream id/name>\r\n",
368 |       "                        is a positive number, it's strictly matched against the\r\n",
369 |       "                        CUDA context/stream ID. Otherwise it's treated as a regular\r\n",
370 |       "                        expression and matched against the context/stream name specified\r\n",
371 |       "                        by the NVTX library. If the invocation count is a positive\r\n",
372 |       "                        number, it's strictly matched against the invocation of\r\n",
373 |       "                        the kernel. Otherwise it's treated as a regular expression.\r\n",
374 |       "                        Example: --kernels \"1:foo:bar:2\" will profile any kernel\r\n",
375 |       "                        whose name contains \"bar\" and is the 2nd instance on context\r\n",
376 |       "                        1 and on stream named \"foo\".\r\n",
377 |       "\r\n",
378 |       "  -m,  --metrics <metric names>\r\n",
379 |       "                        Specify the metrics to be profiled on certain device(s).\r\n",
380 |       "                        Multiple metric names separated by comma can be specified.\r\n",
381 |       "                        Which device(s) are profiled is controlled by the \"--devices\"\r\n",
382 |       "                        option. Otherwise metrics will be collected on all devices.\r\n",
383 |       "                        For a list of available metrics, use \"--query-metrics\".\r\n",
384 |       "                        Use \"--metrics all\" to profile all metrics available for\r\n",
385 |       "                        each device.\r\n",
386 |       "                        Use \"--devices\" and \"--kernels\" to select a specific kernel\r\n",
387 |       "                        invocation. \r\n",
388 |       "                        Note: \"--metrics all\" does not include some metrics which\r\n",
389 |       "                        are needed for Visual Profiler's source level analysis.\r\n",
390 |       "                        For that, use \"--analysis-metrics\".\r\n",
391 |       "\r\n",
392 |       "       --pc-sampling-period <period>\r\n",
393 |       "                        Specify PC Sampling period in cycles,  at which the sampling\r\n",
394 |       "                        records will be dumped. Allowed values for the period are\r\n",
395 |       "                        integers between 5 to 31 both inclusive.\r\n",
396 |       "                        This will set the sampling period to (2^period) cycles\r\n",
397 |       "                        Default value is a number between 5 and 12 based on the setup.Note:\r\n",
398 |       "                        Only available for GM20X+.\r\n",
399 |       "                        \r\n",
400 |       "\r\n",
401 |       "       --profile-all-processes\r\n",
402 |       "                        Profile all processes launched by the same user who launched\r\n",
403 |       "                        this nvprof instance. Note: Only one instance of nvprof\r\n",
404 |       "                        can run with this option at the same time. Under this mode,\r\n",
405 |       "                        there's no need to specify an application to run.\r\n",
406 |       "\r\n",
407 |       "       --profile-api-trace <none|runtime|driver|all>\r\n",
408 |       "                        Turn on/off CUDA runtime/driver API tracing. Allowed values:\r\n",
409 |       "                        \tnone - turn off API tracing\r\n",
410 |       "                        \truntime - only turn on CUDA runtime API tracing\r\n",
411 |       "                        \tdriver - only turn on CUDA driver API tracing\r\n",
412 |       "                        \tall - turn on all API tracing (default)\r\n",
413 |       "\r\n",
414 |       "       --profile-child-processes\r\n",
415 |       "                        Profile the application and all child processes launched\r\n",
416 |       "                        by it.\r\n",
417 |       "\r\n",
418 |       "       --profile-from-start <on|off>\r\n",
419 |       "                        Enable/disable profiling from the start of the application.\r\n",
420 |       "                        If it's disabled, the application can use {cu,cuda}Profiler{Start,Stop}\r\n",
421 |       "                        to turn on/off profiling. Allowed values:\r\n",
422 |       "                        \ton - enable profiling from start (default)\r\n",
423 |       "                        \toff - disable profiling from start\r\n",
424 |       "\r\n",
425 |       "       --profiling-semaphore-pool-size <count>\r\n",
426 |       "                        Set the profiling semaphore pool size reserved for storing\r\n",
427 |       "                        profiling data for serialized kernels and memory operations\r\n",
428 |       "                        for each context. The default value is 65536. The size should\r\n",
429 |       "                        be a positive integer.\r\n",
430 |       "\r\n",
431 |       "       --query-events\r\n",
432 |       "                        List all the events available on the device(s). Device(s)\r\n",
433 |       "                        queried can be controlled by the \"--devices\" option.\r\n",
434 |       "\r\n",
435 |       "       --query-metrics\r\n",
436 |       "                        List all the metrics available on the device(s). Device(s)\r\n",
437 |       "                        queried can be controlled by the \"--devices\" option.\r\n",
438 |       "\r\n",
439 |       "       --replay-mode <mode>\r\n",
440 |       "                        Choose replay mode used when not all events/metrics can be\r\n",
441 |       "                        collected in a single run. Allowed values:\r\n",
442 |       "                        \tdisabled - replay is disabled, events/metrics couldn't\r\n",
443 |       "                        be profiled will be dropped\r\n",
444 |       "                        \tkernel - each kernel invocation is replayed (default)\r\n",
445 |       "                        \tapplication - the entire application is replayed.\r\n",
446 |       "                        This modeis incompatible with \"--profile-all-processes\"\r\n",
447 |       "                        or \"profile-child-processes\".\r\n",
448 |       "\r\n",
449 |       "  -a,  --source-level-analysis <source level analysis names>\r\n",
450 |       "                        Specify the source level metrics to be profiled on a certain\r\n",
451 |       "                        kernel invocation. Use \"--devices\" and \"--kernels\" to select\r\n",
452 |       "                        a specific kernel invocation. Allowed values: one or more\r\n",
453 |       "                        of the following, separated by commas\r\n",
454 |       "                        \tglobal_access: global access\r\n",
455 |       "                        \tshared_access: shared access\r\n",
456 |       "                        \tbranch: divergent branch\r\n",
457 |       "                        \tinstruction_execution: instruction execution\r\n",
458 |       "                        \tpc_sampling: pc sampling, available only for GM20X+\r\n",
459 |       "                        Note: Use \"--export-profile\" to specify an export file.\r\n",
460 |       "\r\n",
461 |       "       --system-profiling <on|off>\r\n",
462 |       "                        Turn on/off power, clock, and thermal profiling. Allowed\r\n",
463 |       "                        values:\r\n",
464 |       "                        \ton - turn on system profiling\r\n",
465 |       "                        \toff - turn off system profiling (default)\r\n",
466 |       "\r\n",
467 |       "  -t,  --timeout <seconds>\r\n",
468 |       "                        Set an execution timeout (in seconds) for the CUDA application.\r\n",
469 |       "                        Note: Timeout starts counting from the moment the CUDA driver\r\n",
470 |       "                        is initialized. If the application doesn't call any CUDA\r\n",
471 |       "                        APIs, timeout won't be triggered.\r\n",
472 |       "\r\n",
473 |       "       --track-memory-allocations <on|off>\r\n",
474 |       "                        Turn on/off tracking of memory operations, which involves\r\n",
475 |       "                        recording timestamps, memory size, memory type and program\r\n",
476 |       "                        counters of the memory allocations and frees. Turning this\r\n",
477 |       "                        option on may incur an overhead during profiling. Allowed\r\n",
478 |       "                        values:\r\n",
479 |       "                        \ton - turn on tracking of memory allocations and\r\n",
480 |       "                        free\r\n",
481 |       "                        \toff - turn off tracking of memory allocations and\r\n",
482 |       "                        free (default)\r\n",
483 |       "\r\n",
484 |       "       --unified-memory-profiling <per-process-device|off>\r\n",
485 |       "                        Configure unified memory profiling. Allowed values:\r\n",
486 |       "                        \tper-process-device - collect counts for each process\r\n",
487 |       "                        and each device (default)\r\n",
488 |       "                        \toff - turn off unified memory profiling\r\n",
489 |       "\r\n",
490 |       "       --cpu-profiling <on|off>\r\n",
491 |       "                        Turn on CPU profiling. Note: CPU profiling is not supported\r\n",
492 |       "                        in multi-process mode.\r\n",
493 |       "\r\n",
494 |       "       --cpu-profiling-explain-ccff <filename>\r\n",
495 |       "                        Path to a PGI pgexplain.xml file that should be used to interpret\r\n",
496 |       "                        Common Compiler Feedback Format (CCFF) messages.\r\n",
497 |       "\r\n",
498 |       "       --cpu-profiling-frequency <frequency>\r\n",
499 |       "                        Set the CPU profiling frequency in samples per second. Default\r\n",
500 |       "                        is 100Hz. Maximum is 500Hz.\r\n",
501 |       "\r\n",
502 |       "       --cpu-profiling-max-depth <depth>\r\n",
503 |       "                        Set the maximum depth of each call stack. Zero means no limit.\r\n",
504 |       "                        Default is zero.\r\n",
505 |       "\r\n",
506 |       "       --cpu-profiling-mode <flat|top-down|bottom-up>\r\n",
507 |       "                        Set the output mode of CPU profiling. Allowed values:\r\n",
508 |       "                        \tflat - Show flat profile\r\n",
509 |       "                        \ttop-down - Show parent functions at the top\r\n",
510 |       "                        \tbottom-up - Show parent functions at the bottom\r\n",
511 |       "                        (default)\r\n",
512 |       "\r\n",
513 |       "       --cpu-profiling-percentage-threshold <threshold>\r\n",
514 |       "                        Filter out the entries that are below the set percentage\r\n",
515 |       "                        threshold. The limit should be an integer between 0 and\r\n",
516 |       "                        100, inclusive. Zero means no limit. Default is zero.\r\n",
517 |       "\r\n",
518 |       "       --cpu-profiling-scope <function|instruction>\r\n",
519 |       "                        Choose the profiling scope. Allowed values:\r\n",
520 |       "                        \tfunction - Each level in the stack trace represents\r\n",
521 |       "                        a distinct function (default)\r\n",
522 |       "                        \tinstruction - Each level in the stack trace represents\r\n",
523 |       "                        a distinct instruction address\r\n",
524 |       "\r\n",
525 |       "       --cpu-profiling-show-ccff <on|off>\r\n",
526 |       "                        Choose whether to print Common Compiler Feedback Format (CCFF)\r\n",
527 |       "                        messages embedded in the binary. Note: this option implies\r\n",
528 |       "                        \"--cpu-profiling-scope instruction\".Default is off.\r\n",
529 |       "\r\n",
530 |       "       --cpu-profiling-show-library <on|off>\r\n",
531 |       "                        Choose whether to print the library name for each sample.\r\n",
532 |       "\r\n",
533 |       "       --cpu-profiling-thread-mode <separated|aggregated>\r\n",
534 |       "                        Set the thread mode of CPU profiling. Allowed values:\r\n",
535 |       "                        \tseparated - Show separate profile for each thread\r\n",
536 |       "                        \taggregated - Aggregate data from all threads (default)\r\n",
537 |       "\r\n",
538 |       "       --cpu-profiling-unwind-stack <on|off>\r\n",
539 |       "                        Choose whether to unwind the CPU call-stack at each sample\r\n",
540 |       "                        point. Default is on. \r\n",
541 |       "\r\n",
542 |       "       --openacc-profiling <on|off>\r\n",
543 |       "                        Enable/disable recording information from the OpenACC profiling\r\n",
544 |       "                        interface. Note: if the OpenACC profiling interface is available\r\n",
545 |       "                        depends on the OpenACC runtime. Default is on.\r\n",
546 |       "\r\n",
547 |       "       --context-name <name>\r\n",
548 |       "                        Name of the CUDA context.\r\n",
549 |       "                        \t\"%i\" in the context name string is replaced with\r\n",
550 |       "                        the ID of the context.\r\n",
551 |       "                        \t\"%p\" in the context name string is replaced with\r\n",
552 |       "                        the process ID of the application being profiled.\r\n",
553 |       "                        \t\"%q{<ENV>}\" in the context name string is replaced\r\n",
554 |       "                        with the value of the environment variable \"<ENV>\". If the\r\n",
555 |       "                        environment variable is not set it's an error.\r\n",
556 |       "                        \t\"%h\" in the context name string is replaced with\r\n",
557 |       "                        the hostname of the system.\r\n",
558 |       "                        \t\"%%\" in the context name string is replaced with\r\n",
559 |       "                        \"%\". Any other character following \"%\" is illegal.\r\n",
560 |       "\r\n",
561 |       "       --csv\r\n",
562 |       "                        Use comma-separated values in the output.\r\n",
563 |       "\r\n",
564 |       "       --demangling <on|off>\r\n",
565 |       "                        Turn on/off C++ name demangling of function names. Allowed\r\n",
566 |       "                        values:\r\n",
567 |       "                        \ton - turn on demangling (default)\r\n",
568 |       "                        \toff - turn off demangling\r\n",
569 |       "\r\n",
570 |       "  -u,  --normalized-time-unit <s|ms|us|ns|col|auto>\r\n",
571 |       "                        Specify the unit of time that will be used in the output.\r\n",
572 |       "                        Allowed values:\r\n",
573 |       "                        \ts - second, ms - millisecond, us - microsecond,\r\n",
574 |       "                        ns - nanosecond\r\n",
575 |       "                        \tcol - a fixed unit for each column\r\n",
576 |       "                        \tauto (default) - the scale is chosen for each value\r\n",
577 |       "                        based on its length.\r\n",
578 |       "\r\n",
579 |       "       --openacc-summary-mode <mode>\r\n",
580 |       "                        Set how durations are computed in the OpenACC summary. Allowed\r\n",
581 |       "                        values:\r\n",
582 |       "                        \texclusive: show exclusive times (default)\r\n",
583 |       "                        \tinclusive: show inclusive times\r\n",
584 |       "\r\n",
585 |       "       --print-api-summary\r\n",
586 |       "                        Print a summary of CUDA runtime/driver API calls.\r\n",
587 |       "\r\n",
588 |       "       --print-api-trace\r\n",
589 |       "                        Print CUDA runtime/driver API trace.\r\n",
590 |       "\r\n",
591 |       "       --print-dependency-analysis-trace\r\n",
592 |       "                        Print dependency analysis trace.\r\n",
593 |       "\r\n",
594 |       "       --print-gpu-summary\r\n",
595 |       "                        Print a summary of the activities on the GPU (including CUDA\r\n",
596 |       "                        kernels and memcpy's/memset's).\r\n",
597 |       "\r\n",
598 |       "       --print-gpu-trace\r\n",
599 |       "                        Print individual kernel invocations (including CUDA memcpy's/memset's)\r\n",
600 |       "                        and sort them in chronological order. In event/metric profiling\r\n",
601 |       "                        mode, show events/metrics for each kernel invocation.\r\n",
602 |       "\r\n",
603 |       "       --print-openacc-constructs\r\n",
604 |       "                        Include parent construct names in OpenACC profile.\r\n",
605 |       "\r\n",
606 |       "       --print-openacc-summary\r\n",
607 |       "                        Print a summary of the OpenACC profile.\r\n",
608 |       "\r\n",
609 |       "       --print-openacc-trace\r\n",
610 |       "                        Print a trace of the OpenACC profile.\r\n",
611 |       "\r\n",
612 |       "  -s,  --print-summary\r\n",
613 |       "                        Print a summary of the profiling result on screen. Note:\r\n",
614 |       "                        This is the default unless \"--export-profile\" or other print\r\n",
615 |       "                        options are used.\r\n",
616 |       "\r\n",
617 |       "       --print-summary-per-gpu\r\n",
618 |       "                        Print a summary of the profiling result for each GPU.\r\n",
619 |       "\r\n",
620 |       "       --process-name <name>\r\n",
621 |       "                        Name of the process.\r\n",
622 |       "                        \t\"%p\" in the process name string is replaced with\r\n",
623 |       "                        the process ID of the application being profiled.\r\n",
624 |       "                        \t\"%q{<ENV>}\" in the process name string is replaced\r\n",
625 |       "                        with the value of the environment variable \"<ENV>\". If the\r\n",
626 |       "                        environment variable is not set it's an error.\r\n",
627 |       "                        \t\"%h\" in the process name string is replaced with\r\n",
628 |       "                        the hostname of the system.\r\n",
629 |       "                        \t\"%%\" in the process  name string is replaced with\r\n",
630 |       "                        \"%\". Any other character following \"%\" is illegal.\r\n",
631 |       "\r\n",
632 |       "       --quiet\r\n",
633 |       "                        Suppress all nvprof output.\r\n",
634 |       "\r\n",
635 |       "       --stream-name <name>\r\n",
636 |       "                        Name of the CUDA stream.\r\n",
637 |       "                        \t\"%i\" in the stream name string is replaced with the\r\n",
638 |       "                        ID of the stream.\r\n",
639 |       "                        \t\"%p\" in the stream name string is replaced with\r\n",
640 |       "                        the process ID of the application being profiled.\r\n",
641 |       "                        \t\"%q{<ENV>}\" in the stream name string is replaced\r\n",
642 |       "                        with the value of the environment variable \"<ENV>\". If the\r\n",
643 |       "                        environment variable is not set it's an error.\r\n",
644 |       "                        \t\"%h\" in the stream name string is replaced with\r\n",
645 |       "                        the hostname of the system.\r\n",
646 |       "                        \t\"%%\" in the stream name string is replaced with\r\n",
647 |       "                        \"%\". Any other character following \"%\" is illegal.\r\n",
648 |       "\r\n",
649 |       "  -o,  --export-profile <filename>\r\n",
650 |       "                        Export the result file which can be imported later or opened\r\n",
651 |       "                        by the NVIDIA Visual Profiler.\r\n",
652 |       "                        \t\"%p\" in the file name string is replaced with the\r\n",
653 |       "                        process ID of the application being profiled.\r\n",
654 |       "                        \t\"%q{<ENV>}\" in the file name string is replaced\r\n",
655 |       "                        with the value of the environment variable \"<ENV>\". If the\r\n",
656 |       "                        environment variable is not set it's an error.\r\n",
657 |       "                        \t\"%h\" in the file name string is replaced with the\r\n",
658 |       "                        hostname of the system.\r\n",
659 |       "                        \t\"%%\" in the file name string is replaced with \"%\".\r\n",
660 |       "                        \tAny other character following \"%\" is illegal.\r\n",
661 |       "                        By default, this option disables the summary output. Note:\r\n",
662 |       "                        If the application being profiled creates child processes,\r\n",
663 |       "                        or if '--profile-all-processes' is used, the \"%p\" format\r\n",
664 |       "                        is needed to get correct export files for each process.\r\n",
665 |       "\r\n",
666 |       "  -f,  --force-overwrite\r\n",
667 |       "                        Force overwriting all output files (any existing files will\r\n",
668 |       "                        be overwritten).\r\n",
669 |       "\r\n",
670 |       "  -i,  --import-profile <filename>\r\n",
671 |       "                        Import a result profile from a previous run.\r\n",
672 |       "\r\n",
673 |       "       --log-file <filename>\r\n",
674 |       "                        Make nvprof send all its output to the specified file, or\r\n",
675 |       "                        one of the standard channels. The file will be overwritten.\r\n",
676 |       "                        If the file doesn't exist, a new one will be created.\r\n",
677 |       "                        \t\"%1\" as the whole file name indicates standard output\r\n",
678 |       "                        channel (stdout).\r\n",
679 |       "                        \t\"%2\" as the whole file name indicates standard error\r\n",
680 |       "                        channel (stderr). Note: This is the default.\r\n",
681 |       "                        \t\"%p\" in the file name string is replaced with the\r\n",
682 |       "                        process ID of the application being profiled.\r\n",
683 |       "                        \t\"%q{<ENV>}\" in the file name string is replaced\r\n",
684 |       "                        with the value of the environment variable \"<ENV>\". If the\r\n",
685 |       "                        environment variable is not set it's an error.\r\n",
686 |       "                        \t\"%h\" in the file name string is replaced with the\r\n",
687 |       "                        hostname of the system.\r\n",
688 |       "                        \t\"%%\" in the file name is replaced with \"%\".\r\n",
689 |       "                        \tAny other character following \"%\" is illegal.\r\n",
690 |       "\r\n",
691 |       "       --print-nvlink-topology\r\n",
692 |       "                        Print nvlink topology \r\n",
693 |       "\r\n",
694 |       "  -h,  --help\r\n",
695 |       "                        Print this help information.\r\n",
696 |       "\r\n",
697 |       "  -V,  --version\r\n",
698 |       "                        Print version information of this tool.\r\n",
699 |       "\r\n"
700 |      ]
701 |     }
702 |    ],
703 |    "source": [
704 |     "!nvprof --help"
705 |    ]
706 |   },
707 |   {
708 |    "cell_type": "code",
709 |    "execution_count": null,
710 |    "metadata": {},
711 |    "outputs": [],
712 |    "source": []
713 |   }
714 |  ],
715 |  "metadata": {
716 |   "kernelspec": {
717 |    "display_name": "Python 3",
718 |    "language": "python",
719 |    "name": "python3"
720 |   },
721 |   "language_info": {
722 |    "codemirror_mode": {
723 |     "name": "ipython",
724 |     "version": 3
725 |    },
726 |    "file_extension": ".py",
727 |    "mimetype": "text/x-python",
728 |    "name": "python",
729 |    "nbconvert_exporter": "python",
730 |    "pygments_lexer": "ipython3",
731 |    "version": "3.6.5"
732 |   },
733 |   "toc": {
734 |    "base_numbering": 1,
735 |    "nav_menu": {},
736 |    "number_sections": true,
737 |    "sideBar": true,
738 |    "skip_h1_title": false,
739 |    "title_cell": "Table of Contents",
740 |    "title_sidebar": "Contents",
741 |    "toc_cell": false,
742 |    "toc_position": {},
743 |    "toc_section_display": true,
744 |    "toc_window_display": false
745 |   }
746 |  },
747 |  "nbformat": 4,
748 |  "nbformat_minor": 2
749 | }
750 | 


--------------------------------------------------------------------------------