├── CodingAssignments
├── basic_matrix_mul
│ ├── README.md
│ └── basic_matrix_mul.cu
├── mat_vec_mul
│ ├── README.md
│ └── mat_vec_mul.cu
├── tiled_matrix_mul_shared_mem
│ └── README.md
└── vector_add
│ ├── README.md
│ └── vector_add.cu
├── Colab_Test
├── Intro_to_CUDA_C_Part_1-Student.zip
└── README.md
├── Get_GPU_Properties
├── Makefile
├── Makefile_gmatch_cu
├── Makefile_gpu_props_cu
├── common
│ ├── book.h
│ ├── common_functions.h
│ ├── constants_32bp.h
│ ├── db_file_names.h
│ ├── gpu_arch_constants.h
│ └── qry_file_names.h
├── get_device_properties
└── get_device_properties.cu
├── README.md
├── TechDocs
├── NVIDIA-Turing-Architecture-Whitepaper.pdf
└── nsight_profiler_explained.pdf
└── common
└── GL
├── glext.h
└── glut.h
/CodingAssignments/basic_matrix_mul/README.md:
--------------------------------------------------------------------------------
1 | ## Basic Matrix Multiplication
2 |
3 | *This program performs matrix-matrix multiplication with each thread calculating only one output element*
4 |
5 | ---
6 |
7 |
8 | In the host code:
9 |
10 | - We allocate memory for the input and output matrices.
11 | - We initialize the input matrices with random values and copy them to the device.
12 | - We launch the kernel.
13 | - We copy the output matrix to the host.
14 | - And finally we free both host and device memory.
15 |
16 | In the kernel function we check if the thread is within the bounds of the matrix.
17 |
18 | If the thread is within the bounds we calculate the output element by multiplying the input elements of the two matrices residing in the global memory and we store the result in the output matrix.
19 |
20 | For each iteration in the for-loop the kernel function has to perform two load operations and two arithmetic operations. This deteriorates the performance of the kernel due to long-latency load operations.
--------------------------------------------------------------------------------
/CodingAssignments/basic_matrix_mul/basic_matrix_mul.cu:
--------------------------------------------------------------------------------
1 | /*/
2 | *
3 | * This program implements matrix-matrix multiplication in its simplest form.
4 | *
5 | * Compile with:
6 | * nvcc basic_matrix_mul.cu
7 | *
8 | * Run with:
9 | * ./a.out
10 | *
11 | /*/
12 |
13 | #include <stdio.h>
14 |
15 | #define DEBUG
16 |
17 | // sgemm (single precision general matrix-matrix multiply): computes C = A * B.
18 | // A is numARows x numAColumns and B is numBRows x numBColumns, both row-major; each
19 | // thread of a 2D launch produces one element of C. numBRows is accepted but not read.
20 | __global__ void sgemm(const float * __restrict__ A, const float * __restrict__ B, float *C,
21 |                       int numARows, int numAColumns, int numBRows, int numBColumns) {
22 |     const int row = blockIdx.y * blockDim.y + threadIdx.y;
23 |     const int col = blockIdx.x * blockDim.x + threadIdx.x;
24 |
25 |     if (row >= numARows || col >= numBColumns) return;  // threads past the matrix edge do nothing
26 |
27 |     float sum = 0.0f;
28 |     for (int k = 0; k < numAColumns; k++)  // offsets are widened to size_t so they cannot overflow int
29 |         sum += A[(size_t)row * numAColumns + k] * B[(size_t)k * numBColumns + col];
30 |
31 |     C[(size_t)row * numBColumns + col] = sum;
32 | }
33 |
34 |
35 | // Abort with a readable message when a CUDA runtime call fails.
36 | static void checkCuda(cudaError_t err, const char *what) {
37 |     if (err == cudaSuccess) return;
38 |     fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
39 |     exit(EXIT_FAILURE);
40 | }
41 |
42 | #ifdef DEBUG
43 | // Print a rows x cols row-major matrix preceded by its label.
44 | static void printMatrix(const char *label, const float *m, int rows, int cols) {
45 |     printf("%s:\n", label);
46 |     for (int i = 0; i < rows; i++) {
47 |         for (int j = 0; j < cols; j++)
48 |             printf("%f ", m[(size_t)i * cols + j]);
49 |         printf("\n");
50 |     }
51 | }
52 | #endif
53 |
54 | // Host driver: reads the dimensions of A and B from the command line, fills both
55 | // matrices with random values in [0, 1], computes C = A * B on the GPU with sgemm
56 | // and, when DEBUG is defined, prints A, B and C. Returns 0 on success and 1 on
57 | // bad arguments or host allocation failure; CUDA failures exit via checkCuda.
58 | int main(int argc, char **argv) {
59 |     if (argc != 5) {
60 |         printf("Usage: ./a.out <numARows> <numAColumns> <numBRows> <numBColumns>\n");
61 |         return 1;
62 |     }
63 |
64 |     const int numARows = atoi(argv[1]);
65 |     const int numAColumns = atoi(argv[2]);
66 |     const int numBRows = atoi(argv[3]);
67 |     const int numBColumns = atoi(argv[4]);
68 |     const int numCRows = numARows;
69 |     const int numCColumns = numBColumns;
70 |
71 |     // atoi yields 0 for non-numeric text; reject that and negatives before sizing buffers.
72 |     if (numARows <= 0 || numAColumns <= 0 || numBRows <= 0 || numBColumns <= 0) {
73 |         printf("Matrix dimensions must be positive integers\n");
74 |         return 1;
75 |     }
76 |     if (numAColumns != numBRows) {
77 |         printf("Number of columns in A must be the same as the number of rows in B\n");
78 |         return 1;
79 |     }
80 |
81 |     // Byte counts are computed in size_t so large matrices do not overflow int.
82 |     const size_t bytesA = (size_t)numARows * numAColumns * sizeof(float);
83 |     const size_t bytesB = (size_t)numBRows * numBColumns * sizeof(float);
84 |     const size_t bytesC = (size_t)numCRows * numCColumns * sizeof(float);
85 |
86 |     // Allocate memory on host
87 |     float *hostA = (float *) malloc(bytesA);
88 |     float *hostB = (float *) malloc(bytesB);
89 |     float *hostC = (float *) malloc(bytesC);
90 |     if (hostA == NULL || hostB == NULL || hostC == NULL) {
91 |         printf("Host memory allocation failed\n");
92 |         free(hostA);
93 |         free(hostB);
94 |         free(hostC);
95 |         return 1;
96 |     }
97 |
98 |     // Allocate memory on device
99 |     float *deviceA, *deviceB, *deviceC;
100 |     checkCuda(cudaMalloc((void **) &deviceA, bytesA), "cudaMalloc A");
101 |     checkCuda(cudaMalloc((void **) &deviceB, bytesB), "cudaMalloc B");
102 |     checkCuda(cudaMalloc((void **) &deviceC, bytesC), "cudaMalloc C");
103 |
104 |     // Initialize host memory (all of A first, then all of B, in row-major order)
105 |     srand(time(NULL));
106 |     for (size_t i = 0; i < (size_t)numARows * numAColumns; i++)
107 |         hostA[i] = rand() / (float) RAND_MAX;
108 |     for (size_t i = 0; i < (size_t)numBRows * numBColumns; i++)
109 |         hostB[i] = rand() / (float) RAND_MAX;
110 |
111 | #ifdef DEBUG
112 |     // Show input matrices
113 |     printMatrix("A", hostA, numARows, numAColumns);
114 |     printf("\n");
115 |     printMatrix("B", hostB, numBRows, numBColumns);
116 |     printf("\n");
117 | #endif
118 |
119 |     // Copy host memory to device
120 |     checkCuda(cudaMemcpy(deviceA, hostA, bytesA, cudaMemcpyHostToDevice), "copy A to device");
121 |     checkCuda(cudaMemcpy(deviceB, hostB, bytesB, cudaMemcpyHostToDevice), "copy B to device");
122 |
123 |     // Launch kernel: one thread per element of C, 16x16 threads per block, with the
124 |     // grid rounded up by integer ceil-div so partially filled tiles are covered.
125 |     dim3 blockDim(16, 16);
126 |     dim3 gridDim((numCColumns + blockDim.x - 1) / blockDim.x,
127 |                  (numCRows + blockDim.y - 1) / blockDim.y);
128 |     sgemm<<<gridDim, blockDim>>>(deviceA, deviceB, deviceC,
129 |                                  numARows, numAColumns,
130 |                                  numBRows, numBColumns);
131 |     checkCuda(cudaGetLastError(), "sgemm launch");  // launch-configuration errors surface here
132 |
133 |     // Copy device memory to host; any error raised while the kernel ran is reported here
134 |     checkCuda(cudaMemcpy(hostC, deviceC, bytesC, cudaMemcpyDeviceToHost), "copy C to host");
135 |
136 | #ifdef DEBUG
137 |     // Print results
138 |     printMatrix("C", hostC, numCRows, numCColumns);
139 | #endif
140 |
141 |     // Free memory
142 |     free(hostA);
143 |     free(hostB);
144 |     free(hostC);
145 |     checkCuda(cudaFree(deviceA), "cudaFree A");
146 |     checkCuda(cudaFree(deviceB), "cudaFree B");
147 |     checkCuda(cudaFree(deviceC), "cudaFree C");
148 |
149 |     return 0;
150 | }
--------------------------------------------------------------------------------
/CodingAssignments/mat_vec_mul/README.md:
--------------------------------------------------------------------------------
1 | ## Matrix-Vector Multiplication
2 |
3 | *Code for Exercise 2 from [here](../../exercises/README.MD)*
4 |
5 | ---
6 |
7 | The host code:
8 |
9 | - Allocates memory for the input and output matrices on host and initializes the memory.
10 | - Allocates memory for the input and output matrices on device and copies the input matrices to the device.
11 | - Launches the kernel.
12 | - Copies the output matrix from the device to the host and prints the results.
13 | - Frees the memory on the device and host.
14 |
15 |
16 | The kernel **mat_vec_mul** uses one thread for each element of the output vector A. Each thread uses a *for-loop* to iterate over one row of the B matrix and over the C vector to compute its element of A. The load operations for the B matrix are not coalesced, so the kernel's performance suffers from these high-latency load operations.
--------------------------------------------------------------------------------
/CodingAssignments/mat_vec_mul/mat_vec_mul.cu:
--------------------------------------------------------------------------------
1 |
2 | #include <stdio.h>
3 | #include <stdlib.h>
4 |
5 | //#define DEBUG
6 |
7 |
8 | // Computes A = B * C, where B is an n x n row-major matrix and A, C are length-n vectors.
9 | __global__ void mat_vec_mul(float *A, const float *B, const float *C, int n)
10 | {
11 |     const int i = blockIdx.x * blockDim.x + threadIdx.x;  // 1D launch: thread i produces A[i]
12 |     if (i >= n) return;  // surplus threads of the last block do nothing
13 |
14 |     // Accumulate in a local variable and write A[i] once, instead of updating A[i] on every iteration.
15 |     float sum = 0.0f;
16 |     for (int k = 0; k < n; k++)
17 |         sum += B[(size_t)i * n + k] * C[k];  // size_t offset: i * n can exceed INT_MAX
18 |     A[i] = sum;
19 | }
20 |
21 |
22 | // Abort with a readable message when a CUDA runtime call fails.
23 | static void checkCuda(cudaError_t err, const char *what)
24 | {
25 |     if (err == cudaSuccess) return;
26 |     fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
27 |     exit(EXIT_FAILURE);
28 | }
29 |
30 | // Host driver: builds an n x n matrix B and a length-n vector C filled with ones,
31 | // computes A = B * C on the GPU with mat_vec_mul and, when DEBUG is defined, prints A.
32 | // Returns 0 on success and 1 on bad arguments or host allocation failure; CUDA
33 | // failures exit via checkCuda.
34 | int main(int argc, char **argv)
35 | {
36 |     if (argc != 2) {
37 |         printf("Usage: ./a.out <n>\n");
38 |         return 1;
39 |     }
40 |
41 |     const int n = atoi(argv[1]);
42 |     if (n <= 0) {  // atoi yields 0 for non-numeric text
43 |         printf("n must be a positive integer\n");
44 |         return 1;
45 |     }
46 |
47 |     // Sizes are computed in size_t: n * n overflows int once n exceeds 46340.
48 |     const size_t vecBytes = (size_t)n * sizeof(float);
49 |     const size_t matBytes = (size_t)n * n * sizeof(float);
50 |
51 |     // Allocate memory on host
52 |     float *h_A = (float *)malloc(vecBytes);
53 |     float *h_B = (float *)malloc(matBytes);
54 |     float *h_C = (float *)malloc(vecBytes);
55 |     if (h_A == NULL || h_B == NULL || h_C == NULL) {
56 |         printf("Host memory allocation failed\n");
57 |         free(h_A);
58 |         free(h_B);
59 |         free(h_C);
60 |         return 1;
61 |     }
62 |
63 |     // Initialize host memory
64 |     for (int i = 0; i < n; i++) {
65 |         h_C[i] = 1;
66 |         for (int j = 0; j < n; j++)
67 |             h_B[(size_t)i * n + j] = 1;
68 |     }
69 |
70 |     // Allocate memory on device
71 |     float *d_A, *d_B, *d_C;
72 |     checkCuda(cudaMalloc((void **)&d_A, vecBytes), "cudaMalloc d_A");
73 |     checkCuda(cudaMalloc((void **)&d_B, matBytes), "cudaMalloc d_B");
74 |     checkCuda(cudaMalloc((void **)&d_C, vecBytes), "cudaMalloc d_C");
75 |
76 |     // Copy host memory to device memory
77 |     checkCuda(cudaMemcpy(d_C, h_C, vecBytes, cudaMemcpyHostToDevice), "copy C to device");
78 |     checkCuda(cudaMemcpy(d_B, h_B, matBytes, cudaMemcpyHostToDevice), "copy B to device");
79 |
80 |     // Launch the kernel: one thread per output element, 128 threads per block,
81 |     // with the block count rounded up by integer ceil-div.
82 |     dim3 dimBlock(128);
83 |     dim3 dimGrid((n + dimBlock.x - 1) / dimBlock.x);
84 |     mat_vec_mul<<<dimGrid, dimBlock>>>(d_A, d_B, d_C, n);
85 |     checkCuda(cudaGetLastError(), "mat_vec_mul launch");  // launch-configuration errors surface here
86 |
87 |     // Copy device memory to host memory; errors raised while the kernel ran are reported here
88 |     checkCuda(cudaMemcpy(h_A, d_A, vecBytes, cudaMemcpyDeviceToHost), "copy A to host");
89 |
90 | #ifdef DEBUG
91 |     // Print the result
92 |     for (int i = 0; i < n; i++)
93 |         printf("%f ", h_A[i]);
94 |     printf("\n");
95 | #endif
96 |
97 |     // Free device memory
98 |     checkCuda(cudaFree(d_A), "cudaFree d_A");
99 |     checkCuda(cudaFree(d_B), "cudaFree d_B");
100 |     checkCuda(cudaFree(d_C), "cudaFree d_C");
101 |
102 |     // Free host memory
103 |     free(h_A);
104 |     free(h_B);
105 |     free(h_C);
106 |
107 |     return 0;
108 | }
--------------------------------------------------------------------------------
/CodingAssignments/tiled_matrix_mul_shared_mem/README.md:
--------------------------------------------------------------------------------
1 | ## Tiled Matrix Multiplication Using Shared Memory
2 |
3 | *This program performs matrix-matrix multiplication using shared memory to reduce the number of load operations.*
4 |
5 | ---
6 |
7 |
8 | In the host code:
9 | - We allocate memory for the input and output matrices.
10 | - We initialize the input matrices with random values.
11 | - We copy the input matrices to the device.
12 | - We launch the kernel.
13 | - We copy the output matrix to the host.
14 | - And finally we free both host and device memory.
15 |
16 | In the kernel function first we declare the shared memory for the matrix multiplication and compute the row and column that each thread will load from.
17 |
18 | The number of iterations (phases) in the for-loop is equal to the number of columns of the A matrix (which is also the number of rows of the B matrix) divided by TILE_WIDTH.
19 |
20 | The number of rows of the A matrix and the number of columns of the B matrix are handled by the number of blocks in the grid.
21 |
22 | Inside the for-loop we check whether each load falls within the bounds of the A and B matrices.
23 | - If it does, we load the corresponding element.
24 | - If it does not, we store 0 instead so that the result will not be corrupted with random values.
25 |
26 | After that we synchronize the threads so that the dot product is computed only after all the threads have finished loading the values and no random values are left in the shared memory.
27 |
28 | Then we accumulate the dot product and we synchronize the threads again so that no thread will start loading new values (in the next iteration) before the dot product is computed.
29 |
30 | Finally if the current thread is within the output matrix bound we store the computed element to the output matrix.
--------------------------------------------------------------------------------
/CodingAssignments/vector_add/README.md:
--------------------------------------------------------------------------------
1 | ## Vector Add
2 |
3 | *This program performs vector addition*
4 |
5 | ---
6 |
7 |
8 | In the host code:
9 |
10 | - We allocate memory for the input and output vectors.
11 | - We initialize the input vectors with random values and copy them to the device.
12 | - We launch the kernel.
13 | - We copy the output vector to the host.
14 | - And finally we free both host and device memory.
15 |
16 | The kernel first checks if the thread's i variable is inside the vector bounds and then performs the add operation between the input vectors and stores the output to the output vector.
--------------------------------------------------------------------------------
/CodingAssignments/vector_add/vector_add.cu:
--------------------------------------------------------------------------------
1 | /*/
2 | *
3 | * A program that implements vector addition
4 | *
5 | * Compile with:
6 | * nvcc vector_add.cu
7 | *
8 | * Run with:
9 | * ./a.out
10 | *
11 | /*/
12 |
13 | #include <stdio.h>
14 |
15 | #define DEBUG
16 |
17 | // Element-wise sum for a 1D launch: out[i] = in1[i] + in2[i] for every index i below n.
18 | __global__ void vec_add(int *in1, int *in2, int *out, int n)
19 | {
20 |     const int idx = threadIdx.x + blockDim.x * blockIdx.x;
21 |     // Threads whose index falls past the end of the vectors write nothing.
22 |     if (idx < n) {
23 |         out[idx] = in1[idx] + in2[idx];
24 |     }
25 | }
26 |
27 |
28 | // Abort with a readable message when a CUDA runtime call fails.
29 | static void checkCuda(cudaError_t err, const char *what)
30 | {
31 |     if (err == cudaSuccess) return;
32 |     fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(err));
33 |     exit(EXIT_FAILURE);
34 | }
35 |
36 | // Host driver: reads the block size and the vector length from the command line,
37 | // fills two vectors with random integers in [0, 99], adds them on the GPU with
38 | // vec_add and, when DEBUG is defined, prints every sum. Returns 0 on success and
39 | // 1 on bad arguments or host allocation failure; CUDA failures exit via checkCuda.
40 | int main(int argc, char* argv[])
41 | {
42 |     if (argc != 3)
43 |     {
44 |         printf("Usage: %s <blockSize> <inputLength>\n", argv[0]);
45 |         return 1;
46 |     }
47 |
48 |     const int blockSize = (int)strtol(argv[1], NULL, 10);
49 |     const int inputLength = (int)strtol(argv[2], NULL, 10);
50 |
51 |     // strtol yields 0 for non-numeric text, and a zero block size would divide by zero below.
52 |     if (blockSize <= 0 || inputLength <= 0)
53 |     {
54 |         printf("blockSize and inputLength must be positive integers\n");
55 |         return 1;
56 |     }
57 |
58 |     const size_t bytes = (size_t)inputLength * sizeof(int);
59 |
60 |     // Allocate host memory for the input and output data
61 |     int *hostInput1 = (int*)malloc(bytes);
62 |     int *hostInput2 = (int*)malloc(bytes);
63 |     int *hostOutput = (int*)malloc(bytes);
64 |     if (hostInput1 == NULL || hostInput2 == NULL || hostOutput == NULL)
65 |     {
66 |         printf("Host memory allocation failed\n");
67 |         free(hostInput1);
68 |         free(hostInput2);
69 |         free(hostOutput);
70 |         return 1;
71 |     }
72 |
73 |     // Allocate device memory for the input and output data
74 |     int *deviceInput1, *deviceInput2, *deviceOutput;
75 |     checkCuda(cudaMalloc((void**) &deviceInput1, bytes), "cudaMalloc input 1");
76 |     checkCuda(cudaMalloc((void**) &deviceInput2, bytes), "cudaMalloc input 2");
77 |     checkCuda(cudaMalloc((void**) &deviceOutput, bytes), "cudaMalloc output");
78 |
79 |     // Initialize the host vectors
80 |     srand(time(NULL));
81 |     for (int i = 0; i < inputLength; i++)
82 |     {
83 |         hostInput1[i] = rand() % 100;
84 |         hostInput2[i] = rand() % 100;
85 |     }
86 |
87 |     // Copy the host input data to the device
88 |     checkCuda(cudaMemcpy(deviceInput1, hostInput1, bytes, cudaMemcpyHostToDevice), "copy input 1 to device");
89 |     checkCuda(cudaMemcpy(deviceInput2, hostInput2, bytes, cudaMemcpyHostToDevice), "copy input 2 to device");
90 |
91 |     // Launch the kernel with enough blocks of blockSize threads to cover every element;
92 |     // the remainder form of ceil-div cannot overflow for lengths close to INT_MAX.
93 |     const int gridSize = inputLength / blockSize + (inputLength % blockSize != 0);
94 |     vec_add<<<gridSize, blockSize>>>(deviceInput1, deviceInput2, deviceOutput, inputLength);
95 |     checkCuda(cudaGetLastError(), "vec_add launch");  // e.g. a block size above the device limit
96 |
97 |     // Copy the device output data to the host; errors raised while the kernel ran are reported here
98 |     checkCuda(cudaMemcpy(hostOutput, deviceOutput, bytes, cudaMemcpyDeviceToHost), "copy output to host");
99 |
100 | #ifdef DEBUG
101 |     // Print the results
102 |     for (int i = 0; i < inputLength; i++)
103 |         printf("%d + %d = %d\n", hostInput1[i], hostInput2[i], hostOutput[i]);
104 | #endif
105 |
106 |     // Free the device memory
107 |     checkCuda(cudaFree(deviceInput1), "cudaFree input 1");
108 |     checkCuda(cudaFree(deviceInput2), "cudaFree input 2");
109 |     checkCuda(cudaFree(deviceOutput), "cudaFree output");
110 |
111 |     // Free the host memory
112 |     free(hostInput1);
113 |     free(hostInput2);
114 |     free(hostOutput);
115 |
116 |     return 0;
117 | }
--------------------------------------------------------------------------------
/Colab_Test/Intro_to_CUDA_C_Part_1-Student.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZenoRobotics/CUDA_C_and_GPU/87dadf7d66311aa0db03ff163fafa7c545cc5298/Colab_Test/Intro_to_CUDA_C_Part_1-Student.zip
--------------------------------------------------------------------------------
/Colab_Test/README.md:
--------------------------------------------------------------------------------
1 | # Test Colab Jupyter Notebook Setup and Usage for: Part I of the Intro to CUDA C/C++ and GPU Arch. Course
2 |
3 | ## Steps
4 |
5 | 1) Download Zip file.
6 | 2) Unzip and extract "Intro_toCUDA_C_Part_1" folder to your Google Drive
7 | 3) Double click on the Intro_to_CUDA_C_PART_1.ipynb (Jupyter Notebook) file in the folder.
8 | 4) Follow the steps in the Notebook.
9 |
10 | Good luck!
11 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/Makefile:
--------------------------------------------------------------------------------
1 | # -lcurand -lcublas -lcusparse -lcufft -lnpp -lcudart
2 |
3 | get_device_properties : get_device_properties.cu
4 | nvcc -I. -I/usr/local/cuda-5.0/include/ get_device_properties.cu -o get_device_properties
5 |
6 | .PHONY: clean # clean creates no file named "clean", so it must run even if such a file exists
7 | clean:
8 | rm -f *.o *~ core .depend get_device_properties
9 |
10 | depend .depend dep:
11 | $(CC) $(CFLAGS) -M *.c > $@
12 |
13 |
14 | ifeq (.depend,$(wildcard .depend))
15 | include .depend
16 | endif
17 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/Makefile_gmatch_cu:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZenoRobotics/CUDA_C_and_GPU/87dadf7d66311aa0db03ff163fafa7c545cc5298/Get_GPU_Properties/Makefile_gmatch_cu
--------------------------------------------------------------------------------
/Get_GPU_Properties/Makefile_gpu_props_cu:
--------------------------------------------------------------------------------
1 | # -lcurand -lcublas -lcusparse -lcufft -lnpp -lcudart
2 |
3 | get_device_properties : get_device_properties.cu
4 | nvcc -I. -I/usr/local/cuda/include/ get_device_properties.cu -o get_device_properties
5 |
6 | .PHONY: clean # clean creates no file named "clean", so it must run even if such a file exists
7 | clean:
8 | rm -f *.o *~ core .depend get_device_properties
9 |
10 | depend .depend dep:
11 | $(CC) $(CFLAGS) -M *.c > $@
12 |
13 |
14 | ifeq (.depend,$(wildcard .depend))
15 | include .depend
16 | endif
17 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/common/book.h:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 1993-2010 NVIDIA Corporation. All rights reserved.
3 | *
4 | * NVIDIA Corporation and its licensors retain all intellectual property and
5 | * proprietary rights in and to this software and related documentation.
6 | * Any use, reproduction, disclosure, or distribution of this software
7 | * and related documentation without an express license agreement from
8 | * NVIDIA Corporation is strictly prohibited.
9 | *
10 | * Please refer to the applicable NVIDIA end user license agreement (EULA)
11 | * associated with this source code for terms and conditions that govern
12 | * your use of this NVIDIA software.
13 | *
14 | */
15 |
16 |
17 | #ifndef __BOOK_H__
18 | #define __BOOK_H__
19 | #include <stdio.h>
20 | #include <stdlib.h>
21 | //#include
22 | #include <cuda_runtime.h>
23 |
24 |
25 | // Report a failed CUDA runtime call (error text, file and line) and terminate the process.
26 | static void HandleError( cudaError_t err, const char *file, int line ) {
27 |     // A successful call needs no handling.
28 |     if (err == cudaSuccess)
29 |         return;
30 |     const char *message = cudaGetErrorString( err );
31 |     printf( "%s in %s at line %d\n", message, file, line );
32 |     exit( EXIT_FAILURE );
33 | }
34 | #define HANDLE_ERROR( err ) (HandleError( err, __FILE__, __LINE__ ))
35 |
36 | // Exits with a message when host pointer a is NULL; a is parenthesised so any pointer expression can be passed.
37 | #define HANDLE_NULL( a ) {if ((a) == NULL) { \
38 | printf( "Host memory failed in %s at line %d\n", \
39 | __FILE__, __LINE__ ); \
40 | exit( EXIT_FAILURE );}}
41 | /*
42 | template< typename T >
43 | void swap( T& a, T& b ) {
44 | T t = a;
45 | a = b;
46 | b = t;
47 | }
48 | */
49 |
50 | void* big_random_block( int size ) {
51 | int i=0;
52 | unsigned char *data = (unsigned char*)malloc( size );
53 | HANDLE_NULL( data );
54 | for (i=0; i 360) hue -= 360;
75 | else if (hue < 0) hue += 360;
76 |
77 | if (hue < 60)
78 | return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
79 | if (hue < 180)
80 | return (unsigned char)(255 * n2);
81 | if (hue < 240)
82 | return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
83 | return (unsigned char)(255 * n1);
84 | }
85 |
86 | /*
87 | __global__ void float_to_color( unsigned char *optr,
88 | const float *outSrc ) {
89 | // map from threadIdx/BlockIdx to pixel position
90 | int x = threadIdx.x + blockIdx.x * blockDim.x;
91 | int y = threadIdx.y + blockIdx.y * blockDim.y;
92 | int offset = x + y * blockDim.x * gridDim.x;
93 |
94 | float l = outSrc[offset];
95 | float s = 1;
96 | int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
97 | float m1, m2;
98 |
99 | if (l <= 0.5f)
100 | m2 = l * (1 + s);
101 | else
102 | m2 = l + s - l * s;
103 | m1 = 2 * l - m2;
104 |
105 | optr[offset*4 + 0] = value( m1, m2, h+120 );
106 | optr[offset*4 + 1] = value( m1, m2, h );
107 | optr[offset*4 + 2] = value( m1, m2, h -120 );
108 | optr[offset*4 + 3] = 255;
109 | }
110 |
111 | __global__ void float_to_color( uchar4 *optr,
112 | const float *outSrc ) {
113 |
114 | // map from threadIdx/BlockIdx to pixel position
115 | int x = threadIdx.x + blockIdx.x * blockDim.x;
116 | int y = threadIdx.y + blockIdx.y * blockDim.y;
117 | int offset = x + y * blockDim.x * gridDim.x;
118 |
119 | float l = outSrc[offset];
120 | float s = 1;
121 | int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
122 | float m1, m2;
123 |
124 | if (l <= 0.5f)
125 | m2 = l * (1 + s);
126 | else
127 | m2 = l + s - l * s;
128 | m1 = 2 * l - m2;
129 |
130 | optr[offset].x = value( m1, m2, h+120 );
131 | optr[offset].y = value( m1, m2, h );
132 | optr[offset].z = value( m1, m2, h -120 );
133 | optr[offset].w = 255;
134 | }
135 |
136 | */
137 |
138 | #if _WIN32
139 | //Windows threads.
140 | #include <windows.h>
141 |
142 | typedef HANDLE CUTThread;
143 | typedef unsigned (WINAPI *CUT_THREADROUTINE)(void *);
144 |
145 | #define CUT_THREADPROC unsigned WINAPI
146 | #define CUT_THREADEND return 0
147 |
148 | #else
149 | //POSIX threads.
150 | #include <pthread.h>
151 |
152 | typedef pthread_t CUTThread;
153 | typedef void *(*CUT_THREADROUTINE)(void *);
154 |
155 | #define CUT_THREADPROC void
156 | #define CUT_THREADEND
157 | #endif
158 |
159 | //Create thread.
160 | CUTThread start_thread( CUT_THREADROUTINE, void *data );
161 |
162 | //Wait for thread to finish.
163 | void end_thread( CUTThread thread );
164 |
165 | //Destroy thread.
166 | void destroy_thread( CUTThread thread );
167 |
168 | //Wait for multiple threads.
169 | void wait_for_threads( const CUTThread *threads, int num );
170 |
171 | #if _WIN32
172 | //Create a Win32 thread that runs func with data as its argument; returns the thread handle.
173 | CUTThread start_thread(CUT_THREADROUTINE func, void *data){
174 | return CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)func, data, 0, NULL);
175 | }
176 |
177 | //Wait (with no timeout) for thread to finish, then close its handle.
178 | void end_thread(CUTThread thread){
179 | WaitForSingleObject(thread, INFINITE);
180 | CloseHandle(thread);
181 | }
182 |
183 | //Destroy thread: terminate it with exit code 0 and close its handle.
184 | void destroy_thread( CUTThread thread ){
185 | TerminateThread(thread, 0);
186 | CloseHandle(thread);
187 | }
188 |
189 | //Wait (with no timeout) on the num handles in threads, then close every handle.
190 | void wait_for_threads(const CUTThread * threads, int num){
191 | WaitForMultipleObjects(num, threads, true, INFINITE);
192 |
193 | for(int i = 0; i < num; i++)
194 | CloseHandle(threads[i]);
195 | }
196 |
197 | #else
198 | //Create a POSIX thread with default attributes that runs func(data); returns its id.
199 | CUTThread start_thread(CUT_THREADROUTINE func, void * data){
200 | pthread_t thread;
201 | pthread_create(&thread, NULL, func, data); // NOTE(review): the return code is not checked
202 | return thread;
203 | }
204 |
205 | //Wait for thread to finish by joining it; the thread's return value is discarded.
206 | void end_thread(CUTThread thread){
207 | pthread_join(thread, NULL);
208 | }
209 |
210 | //Destroy thread: sends a cancellation request with pthread_cancel; the thread is not joined here.
211 | void destroy_thread( CUTThread thread ){
212 | pthread_cancel(thread);
213 | }
214 |
215 | //Wait for multiple threads: calls end_thread on each of the num entries in array order.
216 | void wait_for_threads(const CUTThread * threads, int num){
217 | int i = 0;
218 |
219 | for(i = 0; i < num; i++)
220 | end_thread( threads[i] );
221 | }
222 |
223 | #endif
224 |
225 |
226 |
227 |
228 | #endif // __BOOK_H__
229 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/common/common_functions.h:
--------------------------------------------------------------------------------
1 | // File: common_functions.h
2 |
3 |
4 | //Shared Functions
5 |
6 | long skip_header_data(FILE *);
7 | int power(int , int );
8 | unsigned int hash_algorithm(unsigned int , int, unsigned int );
9 | unsigned int hash_algorithm_32bp(long long , int, unsigned int);
10 | long long sequence_reverse_complement(long long , int );
11 | bool check_for_valid_nucleotide(char);
12 | unsigned int nucleotide_to_uint(char);
13 | long long bp_string_to_uint(char *string, int x_mer_size);
14 |
15 |
16 | // Skips leading lines of fin that do not start with a valid nucleotide and returns
17 | // the file offset at which the first line that does start with one begins (or the
18 | // offset reached when input runs out). The first nucleotide character itself has
19 | // already been read on return, so a caller that needs it must seek back to the
20 | // returned offset.
21 | long skip_header_data(FILE *fin) {
22 |     long current_file_addr = ftell(fin); // Get current file position
23 |     int ch;
24 |
25 |     while ((ch = fgetc(fin)) != EOF && !check_for_valid_nucleotide((char)ch)) {
26 |         // Discard the rest of this line character by character: a line longer than
27 |         // a fixed buffer can then never be mistaken for sequence data, and an empty
28 |         // line (ch is already '\n') does not swallow the line that follows it.
29 |         while (ch != '\n' && ch != EOF)
30 |             ch = fgetc(fin);
31 |         current_file_addr = ftell(fin);
32 |     }
33 |
34 |     return current_file_addr;
35 | }
37 |
38 | // Integer exponentiation by repeated multiplication: returns x raised to the power y,
39 | // and 1 when y is zero or negative. No overflow detection is performed.
40 | int power(int x, int y) {
41 |     int product = 1;
42 |     for (int remaining = y; remaining > 0; --remaining)
43 |         product *= x;
44 |     return product;
45 | }
47 |
48 | // Returns floor(log2(num)) for num >= 1 by counting how many right shifts reduce
49 | // num to 1. Values below 1 have no logarithm and yield 0.
50 | int log_2(int num) {
51 |     int shift_cnt = 0;
52 |
53 |     // Test before shifting so that num <= 1 (including 0 and negatives) terminates
54 |     // immediately instead of looping forever.
55 |     while (num > 1) {
56 |         num >>= 1;
57 |         shift_cnt += 1;
58 |     }
59 |
60 |     return shift_cnt;
61 | }
67 |
68 | // Hashes base_x: two XOR mixes of right-shifted copies of base_x are each reduced
69 | // with bit_mask, added together, and the sum is reduced with bit_mask again.
70 | // x_mer_size is accepted but not used.
71 | unsigned int hash_algorithm(unsigned int base_x, int x_mer_size, unsigned int bit_mask) {
72 |     /* Earlier shift sets, kept for reference:
73 |        //2x  upper: 0, 9, 19        lower: 16, 6, 23
74 |        //4x  upper: 0, 9, 19, 18    lower: 16, 6, 23
75 |     */
76 |     const unsigned int mix_upper = base_x ^ (base_x >> 8) ^ (base_x >> 19) ^ (base_x >> 18);
77 |     const unsigned int mix_lower = (base_x >> 16) ^ (base_x >> 9) ^ (base_x >> 23) ^ (base_x >> 18);
78 |
79 |     // Each mix is masked before the addition; the final mask drops any carry out of the mask width.
80 |     return ((mix_lower & bit_mask) + (mix_upper & bit_mask)) & bit_mask;
81 | }
91 |
92 |
93 | // Hashes the 64-bit value base_x: two XOR mixes of shifted copies (one term of each
94 | // pre-masked with a fixed bit pattern) are reduced with HASH_MASK, added, and the
95 | // sum is reduced with HASH_MASK again. x_mer_size and bit_mask are accepted but
96 | // not used.
97 | unsigned int hash_algorithm_32bp(long long base_x, int x_mer_size, unsigned int bit_mask) {
98 |     /* original shift sets, masked with bit_mask:
99 |        upper: 46, 0, 9, 39      lower: 16, 30, 45, 23
100 |     */
101 |     const long long mix_upper = ((base_x >> 44) & 0xf0f) ^ base_x ^ (base_x >> 9);
102 |     const long long mix_lower = ((base_x >> 51) & 0x5a5) ^ (base_x >> 16) ^ (base_x >> 30);
103 |
104 |     const unsigned int upper = (unsigned int)(mix_upper & HASH_MASK);
105 |     const unsigned int lower = (unsigned int)(mix_lower & HASH_MASK);
106 |
107 |     return (lower + upper) & HASH_MASK;
108 | }
112 |
113 |
114 | // Returns the reverse complement of a sequence packed at 2 bits per base: each of
115 | // the x_mer_size base codes is complemented with XOR 3 and the base order is
116 | // reversed, so the base in the highest bit pair of orig_sequence ends up in the
117 | // lowest bit pair of the result. x_mer_size must not exceed 32 (64 bits).
118 | long long sequence_reverse_complement(long long orig_sequence,int x_mer_size) {
119 |     // All arithmetic is unsigned 64-bit: a 32-bit temporary shifted left by up to
120 |     // (x_mer_size - 1) * 2 bits is undefined once the shift count reaches 32.
121 |     const unsigned long long packed = (unsigned long long)orig_sequence;
122 |     unsigned long long rev_comp_seq = 0;
123 |
124 |     for (int i = 0; i < x_mer_size; i++) {
125 |         const unsigned long long comp_base = ((packed >> (((x_mer_size - i) - 1) * 2)) & 3ULL) ^ 3ULL;
126 |         rev_comp_seq |= comp_base << (i * 2);
127 |     }
128 |
129 |     return (long long)rev_comp_seq;
130 | }
131 |
132 |
133 | // Returns TRUE when nucleotide is one of A, C, G or T in either case, FALSE otherwise.
134 | bool check_for_valid_nucleotide(char nucleotide) {
135 |     switch (nucleotide) {
136 |         case 'A': case 'a':
137 |         case 'C': case 'c':
138 |         case 'G': case 'g':
139 |         case 'T': case 't':
140 |             return TRUE;
141 |         default:
142 |             return FALSE;
143 |     }
144 | }
178 |
179 | // Maps a nucleotide letter (either case) to its 2-bit code: A=0, C=1, G=2, T=3.
180 | // Every other character also maps to 0.
181 | unsigned int nucleotide_to_uint(char nucleotide) {
182 |     switch (nucleotide) {
183 |         case 'C': case 'c': return 1;
184 |         case 'G': case 'g': return 2;
185 |         case 'T': case 't': return 3;
186 |         default:            return 0;  // 'A', 'a' and every unrecognised character
187 |     }
188 | }
219 |
220 |
221 | // Packs the first x_mer_size characters of string at 2 bits per base (A=0, C=1, G=2,
222 | // T=3, either case; any other character counts as 0). string[0] lands in the highest
223 | // bit pair used and string[x_mer_size - 1] in bits 1..0. string must hold at least
224 | // x_mer_size characters, and x_mer_size must not exceed 32.
225 | long long bp_string_to_uint(char *string, int x_mer_size) {
226 |     // Accumulate unsigned: with 32 bases the first code is shifted into bit 63, which
227 |     // overflows a signed long long.
228 |     unsigned long long sequence = 0;
229 |
230 |     for (int i = 0; i < x_mer_size; i++) {
231 |         unsigned long long code;
232 |         switch (string[i]) {
233 |             case 'C': case 'c': code = 1; break;
234 |             case 'G': case 'g': code = 2; break;
235 |             case 'T': case 't': code = 3; break;
236 |             default:            code = 0;  // 'A', 'a' and every unrecognised character
237 |         }
238 |         sequence |= code << ((x_mer_size - 1 - i) * 2);
239 |     }
240 |
241 |     return (long long)sequence;
242 | }
268 |
269 |
270 |
271 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/common/constants_32bp.h:
--------------------------------------------------------------------------------
1 | // File: constants_32bp.h
2 |
3 | // Constants (expression macros are parenthesised so they expand safely inside larger expressions)
4 | #define NUM_OF_SEQUENCES_IN_CACHE (16 * 1024) //8 * 1024 // Number of database sequences stored in cache
5 | #define NUM_OF_ENTRIES_IN_PTR_ARRAYS (8 * 1024) //4 * 1024
6 | #define HASH_MASK 0x1fff //Correlates to TABLE_LENGTH
7 | #define NUM_OF_LUTS_USED 12 // Number of parallel LUTs used to find first occurrence
8 | #define NUM_OF_BPS_PER_QRY_SEQ 32 // Particular to the current algorithm
9 | #define X_MER_SIZE NUM_OF_BPS_PER_QRY_SEQ
10 | #define NUM_OF_BITS_FOR_QRY_SEQ_ID 16
11 | #define NUM_OF_BITS_FOR_DB_SEQ_ID 16
12 | #define NUM_OF_BITS_FOR_TOTAL_RSLT_WORD 16 // 13 bits for 1st occurrence offset + 3'b000
13 | #define MAX_HITS 300 // Max number of hits/ptrs per bin recorded
14 | #define NUM_OF_BITS_PER_NUCLEOTIDE 2
15 | #define NUM_OF_BYTES_PER_WORD 4
16 |
17 | #define QRY_SEGMENT_ID_INDICATOR 0x80000000 // Upper 3 bits of the 32 bit data result indicates that
18 | // the Qry segment ID # can be found in the lower 29 bits
19 | #define DB_SEGMENT_ID_INDICATOR 0xa0000000 // Upper 3 bits of the 32 bit data result indicates that
20 | // the DB segment ID # can be found in the lower 29 bits
21 | #define ADDITION_SEARCH_REQD_INDICATOR 0xc0000000 // Upper 3 bits of the 32 bit data result indicates that
22 | // additional search required because # of unique matches is
23 | // greater than # of unique lookup brams in FPGA. Seq Id is
24 | // located in the lower 16 bits.
25 |
26 | //Visual/Analyzed Report Processing Constants
27 |
28 | #define SHOT_GUN_OVERSAMPLE_FACTOR 5 // Temporary variable. This value will be set and passed in main scripts.
29 | #define NUM_OF_WORDS_PER_MATCH 4 // Number of 32 bit words per match
30 | #define OVERSAMPLE_CUSION_FACTOR 5 // Extra entries factor above oversample factor
31 | #define NUM_OF_RSLT_RECORD_ENTRIES_PER_DB_OFFSET (SHOT_GUN_OVERSAMPLE_FACTOR*NUM_OF_WORDS_PER_MATCH*OVERSAMPLE_CUSION_FACTOR) // parenthesised so the product survives division or modulo at the use site
32 |
33 | // Other
34 | //create boolean logic
35 | #ifndef BOOLEAN
36 | typedef int bool;
37 | #endif
38 | #define FALSE 0
39 | #define TRUE 1
40 |
41 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/common/db_file_names.h:
--------------------------------------------------------------------------------
1 | //File: db_file_names.h
2 | //
3 | //Used by db_preprocessor.c and gmatch.cu files
4 |
5 | const char *BIN_UNIQUE_CNT_FILE = "../db_files/unique_cnt_file.txt"; // Number of unique values per bin file
6 | const char *CACHE_PTR_FILE_1 = "../db_files/cache_ptr_file_1.txt"; // Ptrs to "Cache" Memory 1
7 | const char *CACHE_PTR_FILE_2 = "../db_files/cache_ptr_file_2.txt"; // Ptrs to "Cache" Memory 2
8 | const char *CACHE_PTR_FILE_3 = "../db_files/cache_ptr_file_3.txt"; // Ptrs to "Cache" Memory 3
9 | const char *CACHE_PTR_FILE_4 = "../db_files/cache_ptr_file_4.txt"; // Ptrs to "Cache" Memory 4
10 | const char *CACHE_PTR_FILE_5 = "../db_files/cache_ptr_file_5.txt"; // Ptrs to "Cache" Memory 5
11 | const char *CACHE_PTR_FILE_6 = "../db_files/cache_ptr_file_6.txt"; // Ptrs to "Cache" Memory 6
12 | const char *CACHE_PTR_FILE_7 = "../db_files/cache_ptr_file_7.txt"; // Ptrs to "Cache" Memory 7
13 | const char *CACHE_PTR_FILE_8 = "../db_files/cache_ptr_file_8.txt"; // Ptrs to "Cache" Memory 8
14 | const char *CACHE_PTR_FILE_9 = "../db_files/cache_ptr_file_9.txt"; // Ptrs to "Cache" Memory 9
15 | const char *CACHE_PTR_FILE_10 = "../db_files/cache_ptr_file_10.txt"; // Ptrs to "Cache" Memory 10
16 | const char *CACHE_PTR_FILE_11 = "../db_files/cache_ptr_file_11.txt"; // Ptrs to "Cache" Memory 11
17 | const char *CACHE_PTR_FILE_12 = "../db_files/cache_ptr_file_12.txt"; // Ptrs to "Cache" Memory 12
18 | const char *CACHE_PTR_FILE_13 = "../db_files/cache_ptr_file_13.txt"; // Ptrs to "Cache" Memory 13
19 | const char *CACHE_PTR_FILE_14 = "../db_files/cache_ptr_file_14.txt"; // Ptrs to "Cache" Memory 14
20 | const char *CACHE_FILE = "../db_files/cache_file.txt"; // Database Segment File
21 | const char *CACHE_FILE_RAW = "../db_files/cache_file_raw.txt"; // Database Segment File, "raw" variant per the file name (exact format not visible here - TODO confirm)
22 | const char *STATS_FILE = "../db_files/stats.txt"; // presumably run statistics; confirm against db_preprocessor.c
23 | const char *NUM_DB_SEGS = "../db_files/num_db_segs.txt"; // Number of DB Segments Processed
24 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/common/gpu_arch_constants.h:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZenoRobotics/CUDA_C_and_GPU/87dadf7d66311aa0db03ff163fafa7c545cc5298/Get_GPU_Properties/common/gpu_arch_constants.h
--------------------------------------------------------------------------------
/Get_GPU_Properties/common/qry_file_names.h:
--------------------------------------------------------------------------------
1 | //File: qry_file_names.h
2 | //
3 | //Used by bp_to_hex_converter.c and gmatch.cu files
4 |
5 | const char *QUERY_RAW = "../qry_files/query_data_bp_format.txt"; // Query data in "bp" format per the file name (presumably base-pair text - TODO confirm)
6 | const char *QUERY_HEX = "../qry_files/query_data_hex_format.txt"; // Query data in hex format per the file name (presumably written by bp_to_hex_converter.c - TODO confirm)
7 | const char *QUERY_CNT = "../qry_files/query_sequence_count.txt"; // Query sequence count file, per the file name
8 |
--------------------------------------------------------------------------------
/Get_GPU_Properties/get_device_properties:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZenoRobotics/CUDA_C_and_GPU/87dadf7d66311aa0db03ff163fafa7c545cc5298/Get_GPU_Properties/get_device_properties
--------------------------------------------------------------------------------
/Get_GPU_Properties/get_device_properties.cu:
--------------------------------------------------------------------------------
1 | //************************************************
2 | // Name: get_device_properties.c
3 | // Copyright: ZenoMachines, LLC
4 | // Author: Peter J. Zeno
5 | // Date: 06/08/11
6 | // Description: Gets/Displays all GPUs on the computer
7 | // along with their properties.
8 | //
9 | //************************************************
10 | // USAGE:
11 | // ./get_device_properties
12 | //
13 |
14 | #include "./common/book.h"
15 |
16 | //output files
17 | //const char *OUTFILE = "./common/gpu_arch_constants.h";
18 | //const char *MAKEFILE = "Makefile_gmatch_cu";
19 |
20 | //file handles
21 | //FILE *fout;
22 | //FILE *fmake;
23 |
24 | int main( void ) { // Entry point: prints the properties of every CUDA device the runtime reports
25 | // Flow: query the device count, then fetch and print one cudaDeviceProp record per device index.
26 | cudaDeviceProp prop; // reused for each device; overwritten on every loop iteration
27 | int count; // filled in by cudaGetDeviceCount below
28 | int i=0; // device index used by the loop
29 | /* Disabled: opening of the generated output files (OUTFILE / MAKEFILE are also commented out above).
30 | //open output files
31 | if ((fout = fopen(OUTFILE,"w+")) == NULL)
32 | printf("Cannot open %s for writing",OUTFILE);
33 | if ((fmake = fopen(MAKEFILE,"w+")) == NULL)
34 | printf("Cannot open %s for writing",MAKEFILE);
35 | */
36 | HANDLE_ERROR( cudaGetDeviceCount( &count ) ) ; // HANDLE_ERROR presumably comes from ./common/book.h; its failure behavior is not visible here
37 | // Visit each device index in [0, count).
38 | for (i=0; i< count; i++) {
39 | HANDLE_ERROR( cudaGetDeviceProperties( &prop, i ) ) ;
40 | // General section: name, compute capability, clocks, copy overlap, kernel timeout.
41 | printf( "\n\n" ) ;
42 | printf( "--- General Information for device %d ---\n\n" , i ) ;
43 | printf( "Name: %s\n", prop.name ) ;
44 | printf( "Compute capability: %d.%d\n" , prop.major, prop.minor ) ;
45 | printf( "Device Clock rate: %d MHz\n" , prop.clockRate/1000 ) ; //clockRate comes in kilohertz; integer divide by 1000 to print MHz
46 | printf( "Memory Clock rate: %d MHz\n" , prop.memoryClockRate/1000 ) ; //memoryClockRate comes in kilohertz; integer divide by 1000 to print MHz
47 | printf( "Device copy overlap: " ) ;
48 | if ( prop.deviceOverlap)
49 | printf( "Enabled\n" ) ;
50 | else
51 | printf( "Disabled\n" ) ;
52 | printf( "Kernel execution timeout: " ) ;
53 | if ( prop.kernelExecTimeoutEnabled)
54 | printf( "Enabled\n" ) ;
55 | else
56 | printf( "Disabled\n" ) ;
57 | printf( "\n" ) ;
58 | printf( "--- Memory Information for device %d ---\n\n" , i ) ;
59 | printf( "Total global mem: %4.1f MBs\n" , (double) prop.totalGlobalMem/(1024 * 1024) ) ; // cast to double first so the division keeps a fractional part for %4.1f
60 | printf( "Total constant Mem: %lu KBs\n" , (long unsigned int) prop.totalConstMem/1024 ) ; // integer division: whole KBs only
61 | printf( "Max mem pitch: %lu MBs\n", (long unsigned int) prop.memPitch/(1024 * 1024) ) ; // integer division: whole MBs only
62 | printf( "Texture Alignment: %lu\n" , (long unsigned int) prop.textureAlignment ) ;
63 | printf( " \n" ) ;
64 | printf( "--- MP Information for device %d --- \n\n", i ) ;
65 | printf( "Multiprocessor count: %d\n" , prop.multiProcessorCount ) ;
66 | printf( "Shared mem per block: %lu KBs\n", (long unsigned int) prop.sharedMemPerBlock/1024 ) ; // integer division: whole KBs only
67 | printf( "Registers per block: %d K\n", prop.regsPerBlock/1024 ) ; // integer division: whole multiples of 1024 registers
68 | printf( "Threads in warp: %d\n", prop.warpSize ) ;
69 | printf( "Max threads per block: %d\n" , prop.maxThreadsPerBlock ) ;
70 | printf( "Max threads per MP: %d\n" , prop.maxThreadsPerMultiProcessor);
71 | printf( "Max thread dimensions: (%d, %d, %d) \n" ,
72 | prop.maxThreadsDim[0] , prop.maxThreadsDim[1] ,
73 | prop.maxThreadsDim[2] ) ;
74 | printf( "Max grid dimensions: (%d, %d, %d) \n" ,
75 | prop.maxGridSize[0] , prop.maxGridSize[1] ,
76 | prop.maxGridSize[2] ) ;
77 | printf( " \n" ) ;
78 | }
79 | /* NOTE(review): disabled code. It reads `prop` after the loop, so if re-enabled it would use whichever device was queried last.
80 | //Create Makefile for gmatch
81 | fprintf(fmake,"# Makefile for gmatch.cu program. \n");
82 | fprintf(fmake,"# Created by get_device_properties program. \n\n\n");
83 | fprintf(fmake,"gmatch : gmatch.cu \n");
84 | fprintf(fmake," nvcc -I. -I/usr/local/cuda/include/ -I/usr/local/cuda/include/crt/ -L/usr/local/cuda/lib64/ -lcuda --ptxas-options=-v -arch=sm_%d%d gmatch.cu -o gmatch \n\n\n", prop.major, prop.minor);
85 | fprintf(fmake,"clean: \n");
86 | fprintf(fmake," rm -f *.o *~ core .depend gmatch \n\n\n");
87 | fprintf(fmake,"depend .depend dep: \n");
88 | fprintf(fmake," $(CC) $(CFLAGS) -M *.c > $@ \n\n\n");
89 |
90 | //Create GPU Specific Constants header file
91 | fprintf(fout,"//GPU Specific Constants Header File. \n");
92 | fprintf(fout,"// \n");
93 | fprintf(fout,"//Created by get_device_properties program. \n\n\n");
94 | fprintf(fout,"const int blocksPerGrid = %d;\n" , prop.multiProcessorCount);
95 | fprintf(fout,"const int max_num_of_threads_per_block = %d;\n" , prop.maxThreadsPerBlock);
96 | fprintf(fout,"\n\n");
97 |
98 |
99 | //close output files
100 | fclose(fout);
101 | fclose(fmake);
102 | */
103 | } // no explicit return statement: C++ main implicitly returns 0
104 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Intro to CUDA C and GPU Architecture Course - 6 hr and extended versions
2 |
3 | Note: Book used for this course is "Programming Massively Parallel Processors - A Hands-on Approach"
4 | The newest edition is currently the 4th. However, a free PDF of the 3rd edition is available, so feel free
5 | to use the free PDF found here:
6 |
7 | http://gpu.di.unimi.it/books/PMPP-3rd-Edition.pdf
8 |
9 | ## Links to 3rd edition and materials:
10 |
11 | https://shop.elsevier.com/books/programming-massively-parallel-processors/kirk/978-0-12-811986-0
12 |
13 | ## Book resources root links:
14 | https://booksite.elsevier.com/9780128119860/
15 | https://booksite.elsevier.com/9780128119860/lecture.php (Extra Lecture Slides)
16 |
17 | ## Labs for Course link:
18 | https://github.com/R100001/Programming-Massively-Parallel-Processors/tree/master
19 |
20 | ## CUDA C++ Programming Guide link:
21 | https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html
22 |
23 |
24 |
25 | # Course Outline:
26 | - nVidia GPU Architecture to Support CUDA
27 | - Intro to CUDA C and Host program format
28 | - CUDA threads, blocks, and indexing
29 | - nvcc compiler
30 | - kernel launch
31 | - memory management
32 | - kernel and host code synchronization
33 | - Tensor Cores: Architecture and NN Application
34 | - CUDA Memory Hierarchy
35 | - Shared memory and thread synchronization
36 | - DRAM Circuit Operation Considerations: Access Types, Latency Caused by Non-Batch Fetches
37 | - Performance Considerations
38 | - Brief Coverage of PyTorch with CUDA, cuDNN, and cuVSLAM
39 |
40 | The course uses Jupyter Notebook - Colab. If you have a GPU on your computer and wish to use a different application or command-line execution, please feel free to do so.
41 |
42 | * Follow these directions to get acquainted with running CUDA code on the Jupyter Notebook platform: \
43 | https://www.geeksforgeeks.org/how-to-run-cuda-c-c-on-jupyter-notebook-in-google-colaboratory/
44 |
45 |
46 | ## Key Course Takeaways
47 | - Why Nvidia GPU Architectures Changed to General Purpose Processing Architectures (CUDA Arch)
48 | - CPU vs GPU Hardware Architecture: Key differences in unit processor's complexity and why.
49 | - GPU Hardware Basic Components Used For CUDA General Purpose Processing
50 | - GPU Hardware to Software Vocabulary Mapping/Translation
51 | - Block and Thread Level Indexing Concept (through Lecture and Programming Homework Problems).
52 | - Memory Hierarchy
53 | - DRAM Circuit Operation Considerations: Access Types, Latency Caused by Non-Batch Fetches
54 | - Memory Coalescing vs Non-Coalesced Access Pattern Impact on Performance
55 | - Performance Considerations
56 | - CUDA, Numba, Cupy, Tensorflow, Pytorch relations
57 |
58 | ## Prerequisites
59 | - Working Knowledge of C
60 | - Exposure to Basic Computer Architecture
61 |
62 | ## GPU Access for Gaining Programming Experience
63 | Methods:
64 | 1) Nvidia GPU installed on your own computer (via Windows, Linux, or Mac OS)
65 | 2) Use of Google Colab-Notebook through your web browser to gain free access of GPU via Cloud Service.
66 |
67 | ## Installing CUDA on Windows
68 | https://docs.nvidia.com/cuda/cuda-installation-guide-microsoft-windows/index.html
69 |
70 | ## Verify CUDA Install (RHEL or Ubuntu) and Toolkit
71 | https://xcat-docs.readthedocs.io/en/stable/advanced/gpu/nvidia/verify_cuda_install.html
72 |
73 | ## Other Links of Possible Interest or Reference
74 | https://pytorch.org/docs/stable/notes/cuda.html
75 |
76 |
77 |
78 | # Signup Instructions
79 | Information about signing up for the 5hr courses I offer (The Zeno Institute of Robotics and Artificial Intelligence) is as follows:
80 |
81 | You can purchase some of these courses through the [ZenoRobotics.com](https://www.zenorobotics.com/courses) website. Other payment options are Venmo, PayPal, Zelle, or Cash App. There is a $5/course savings for using Zelle. Please contact me for payment details for non-website methods.
82 |
83 | Once you pay, I will send you a link to the booking calendar where you can setup your times.
84 |
85 | When selecting your hour slots, please only choose an hour block for the first hour meeting. This will give me a chance to find out about your HW & SW setup, point out links to get you started, etc. Please limit any single day session to 2 hours max to give you time to absorb the concepts and do some programming/homework problems. Also, you don’t have to book all 5 hr time slots at once. You can select them as time progresses if you wish.
86 |
87 |
88 |
89 | # Additional Learning Resources Links
90 |
91 | ## YouTube
92 | Tom Nurkkala - Video talks for various Computer Science courses at Taylor University:
93 |
94 | - CUDA Hardware \
95 | https://www.youtube.com/watch?v=kUqkOAU84bA
96 |
97 | - Intro to GPU Programming \
98 | https://www.youtube.com/watch?v=G-EimI4q-TQ
99 |
100 | CUDA University Courses
101 |
102 | University of Illinois : Current Course: ECE408/CS483
103 | Taught by Professor Wen-mei W. Hwu and David Kirk, NVIDIA CUDA Scientist. \
104 | https://developer.nvidia.com/educators/existing-courses#2
105 |
106 | Other:
107 |
108 | - Data Access Pattern Matters: How CUDA Programming Works | GTC 2022 (6:55 and on) \
109 | https://www.youtube.com/watch?v=n6M8R8-PlnE
110 |
111 | - Tutorial: CUDA programming in Python with numba and cupy: \
112 | https://www.youtube.com/watch?v=9bBsvpg-Xlk
113 |
114 |
115 | ## Code Links
116 |
117 | - CUDA Samples \
118 | https://github.com/nvidia/cuda-samples
119 |
120 | - Programming-Massively-Parallel-Processors Learning Material (Reading/Images, Exercises, & Labs) \
121 | https://github.com/R100001/Programming-Massively-Parallel-Processors/tree/master
122 |
123 | - CUDA Concepts Cheat Sheet \
124 | https://kdm.icm.edu.pl/Tutorials/GPU-intro/introduction.en/
125 |
126 | ## Colab
127 |
128 | - How to Use a GPU In Google Colab \
129 | https://www.geeksforgeeks.org/how-to-use-gpu-in-google-colab/ \
130 | https://www.geeksforgeeks.org/how-to-run-cuda-c-c-on-jupyter-notebook-in-google-colaboratory/
131 |
132 | - How to Use Colab \
133 | https://www.geeksforgeeks.org/how-to-use-google-colab/
134 |
135 | - How to use GPU acceleration in PyTorch \
136 | https://www.geeksforgeeks.org/how-to-use-gpu-acceleration-in-pytorch/
137 |
138 | - Colab Site \
139 | https://colab.research.google.com
140 |
141 | - Example CUDA GPU Use Github/Notebook \
142 | https://colab.research.google.com/github/ShimaaElabd/CUDA-GPU-Contrast-Enhancement/blob/master/CUDA_GPU.ipynb#scrollTo=mgH5HreZ2WS9
143 |
144 | - Example: GPU calculation in python with Cupy and Numba \
145 | https://colab.research.google.com/drive/15IDLiUMRJbKqZUZPccyigudINCD5uZ71?usp=sharing
146 |
147 |
148 | ## PTX and SASS
149 |
150 | - Parallel Thread Execution (PTX) \
151 | https://docs.nvidia.com/cuda/parallel-thread-execution/index.html
152 |
153 | - PTX and SASS Assembly Debugging \
154 | https://docs.nvidia.com/gameworks/content/developertools/desktop/ptx_sass_assembly_debugging.htm
155 |
156 |
157 | ## PyCUDA
158 |
159 | https://pypi.org/project/pycuda/
160 |
161 |
162 | ## Cupy
163 |
164 | - About \
165 | https://cupy.dev/
166 |
167 | - Interoperability \
168 | https://docs.cupy.dev/en/stable/user_guide/interoperability.html
169 |
170 |
171 |
172 |
173 |
174 |
175 |
--------------------------------------------------------------------------------
/TechDocs/NVIDIA-Turing-Architecture-Whitepaper.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZenoRobotics/CUDA_C_and_GPU/87dadf7d66311aa0db03ff163fafa7c545cc5298/TechDocs/NVIDIA-Turing-Architecture-Whitepaper.pdf
--------------------------------------------------------------------------------
/TechDocs/nsight_profiler_explained.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/ZenoRobotics/CUDA_C_and_GPU/87dadf7d66311aa0db03ff163fafa7c545cc5298/TechDocs/nsight_profiler_explained.pdf
--------------------------------------------------------------------------------
/common/GL/glut.h:
--------------------------------------------------------------------------------
1 | #ifndef __glut_h__
2 | #define __glut_h__
3 |
4 | /* Copyright (c) Mark J. Kilgard, 1994, 1995, 1996, 1998. */
5 |
6 | /* This program is freely distributable without licensing fees and is
7 | provided without guarantee or warrantee expressed or implied. This
8 | program is -not- in the public domain. */
9 |
10 | #if defined(_WIN32)
11 |
12 | /* GLUT 3.7 now tries to avoid including
13 | to avoid name space pollution, but Win32's
14 | needs APIENTRY and WINGDIAPI defined properly. */
15 | # if 0
16 | /* This would put tons of macros and crap in our clean name space. */
17 | # define WIN32_LEAN_AND_MEAN
18 | # include
19 | # else
20 | /* XXX This is from Win32's */
21 | # ifndef APIENTRY
22 | # define GLUT_APIENTRY_DEFINED
23 | # if (_MSC_VER >= 800) || defined(_STDCALL_SUPPORTED) || defined(__BORLANDC__) || defined(__LCC__)
24 | # define APIENTRY __stdcall
25 | # else
26 | # define APIENTRY
27 | # endif
28 | # endif
29 | /* XXX This is from Win32's */
30 | # ifndef CALLBACK
31 | # if (defined(_M_MRX000) || defined(_M_IX86) || defined(_M_ALPHA) || defined(_M_PPC)) && !defined(MIDL_PASS) || defined(__LCC__)
32 | # define CALLBACK __stdcall
33 | # else
34 | # define CALLBACK
35 | # endif
36 | # endif
37 | /* XXX Hack for lcc compiler. It doesn't support __declspec(dllimport), just __stdcall. */
38 | # if defined( __LCC__ )
39 | # undef WINGDIAPI
40 | # define WINGDIAPI __stdcall
41 | # else
42 | /* XXX This is from Win32's and */
43 | # ifndef WINGDIAPI
44 | # define GLUT_WINGDIAPI_DEFINED
45 | # define WINGDIAPI __declspec(dllimport)
46 | # endif
47 | # endif
48 | /* XXX This is from Win32's */
49 | # ifndef _WCHAR_T_DEFINED
50 | typedef unsigned short wchar_t;
51 | # define _WCHAR_T_DEFINED
52 | # endif
53 | # endif
54 |
55 | /* To disable automatic library usage for GLUT, define GLUT_NO_LIB_PRAGMA
56 | in your compile preprocessor options. */
57 | # if !defined(GLUT_BUILDING_LIB) && !defined(GLUT_NO_LIB_PRAGMA)
58 | # pragma comment (lib, "winmm.lib") /* link with Windows MultiMedia lib */
59 | /* To enable automatic SGI OpenGL for Windows library usage for GLUT,
60 | define GLUT_USE_SGI_OPENGL in your compile preprocessor options. */
61 | # ifdef GLUT_USE_SGI_OPENGL
62 | # pragma comment (lib, "opengl.lib") /* link with SGI OpenGL for Windows lib */
63 | # pragma comment (lib, "glu.lib") /* link with SGI OpenGL Utility lib */
64 | # pragma comment (lib, "glut.lib") /* link with Win32 GLUT for SGI OpenGL lib */
65 | # else
66 | # pragma comment (lib, "opengl32.lib") /* link with Microsoft OpenGL lib */
67 | # pragma comment (lib, "glu32.lib") /* link with Microsoft OpenGL Utility lib */
68 | # pragma comment (lib, "glut32.lib") /* link with Win32 GLUT lib */
69 | # endif
70 | # endif
71 |
72 | /* To disable supression of annoying warnings about floats being promoted
73 | to doubles, define GLUT_NO_WARNING_DISABLE in your compile preprocessor
74 | options. */
75 | # ifndef GLUT_NO_WARNING_DISABLE
76 | # pragma warning (disable:4244) /* Disable bogus VC++ 4.2 conversion warnings. */
77 | # pragma warning (disable:4305) /* VC++ 5.0 version of above warning. */
78 | # endif
79 |
80 | /* Win32 has an annoying issue where there are multiple C run-time
81 | libraries (CRTs). If the executable is linked with a different CRT
82 | from the GLUT DLL, the GLUT DLL will not share the same CRT static
83 | data seen by the executable. In particular, atexit callbacks registered
84 | in the executable will not be called if GLUT calls its (different)
85 | exit routine). GLUT is typically built with the
86 | "/MD" option (the CRT with multithreading DLL support), but the Visual
87 | C++ linker default is "/ML" (the single threaded CRT).
88 |
89 | One workaround to this issue is requiring users to always link with
90 | the same CRT as GLUT is compiled with. That requires users supply a
91 | non-standard option. GLUT 3.7 has its own built-in workaround where
92 | the executable's "exit" function pointer is covertly passed to GLUT.
93 | GLUT then calls the executable's exit function pointer to ensure that
94 | any "atexit" calls registered by the application are called if GLUT
95 | needs to exit.
96 |
97 | Note that the __glut*WithExit routines should NEVER be called directly.
98 | To avoid the atexit workaround, #define GLUT_DISABLE_ATEXIT_HACK. */
99 |
100 | /* XXX This is from Win32's */
101 | # if !defined(_MSC_VER) && !defined(__cdecl)
102 | /* Define __cdecl for non-Microsoft compilers. */
103 | # define __cdecl
104 | # define GLUT_DEFINED___CDECL
105 | # endif
106 | # ifndef _CRTIMP
107 | # ifdef _NTSDK
108 | /* Definition compatible with NT SDK */
109 | # define _CRTIMP
110 | # else
111 | /* Current definition */
112 | # ifdef _DLL
113 | # define _CRTIMP __declspec(dllimport)
114 | # else
115 | # define _CRTIMP
116 | # endif
117 | # endif
118 | # define GLUT_DEFINED__CRTIMP
119 | # endif
120 |
121 | /* GLUT API entry point declarations for Win32. */
122 | # ifdef GLUT_BUILDING_LIB
123 | # define GLUTAPI __declspec(dllexport)
124 | # else
125 | # ifdef _DLL
126 | # define GLUTAPI __declspec(dllimport)
127 | # else
128 | # define GLUTAPI extern
129 | # endif
130 | # endif
131 |
132 | /* GLUT callback calling convention for Win32. */
133 | # define GLUTCALLBACK __cdecl
134 |
135 | #endif /* _WIN32 */
136 |
137 | #include
138 | #include
139 |
140 | #ifdef __cplusplus
141 | extern "C" {
142 | #endif
143 |
144 | #if defined(_WIN32)
145 | # ifndef GLUT_BUILDING_LIB
146 | extern _CRTIMP void __cdecl exit(int);
147 | # endif
148 | #else
149 | /* non-Win32 case. */
150 | /* Define APIENTRY and CALLBACK to nothing if we aren't on Win32. */
151 | # define APIENTRY
152 | # define GLUT_APIENTRY_DEFINED
153 | # define CALLBACK
154 | /* Define GLUTAPI and GLUTCALLBACK as below if we aren't on Win32. */
155 | # define GLUTAPI extern
156 | # define GLUTCALLBACK
157 | /* Prototype exit for the non-Win32 case (see above). */
158 | extern void exit(int);
159 | #endif
160 |
161 | /**
162 | GLUT API revision history:
163 |
164 | GLUT_API_VERSION is updated to reflect incompatible GLUT
165 | API changes (interface changes, semantic changes, deletions,
166 | or additions).
167 |
168 | GLUT_API_VERSION=1 First public release of GLUT. 11/29/94
169 |
170 | GLUT_API_VERSION=2 Added support for OpenGL/GLX multisampling,
171 | extension. Supports new input devices like tablet, dial and button
172 | box, and Spaceball. Easy to query OpenGL extensions.
173 |
174 | GLUT_API_VERSION=3 glutMenuStatus added.
175 |
176 | GLUT_API_VERSION=4 glutInitDisplayString, glutWarpPointer,
177 | glutBitmapLength, glutStrokeLength, glutWindowStatusFunc, dynamic
178 | video resize subAPI, glutPostWindowRedisplay, glutKeyboardUpFunc,
179 | glutSpecialUpFunc, glutIgnoreKeyRepeat, glutSetKeyRepeat,
180 | glutJoystickFunc, glutForceJoystickFunc (NOT FINALIZED!).
181 | **/
182 | #ifndef GLUT_API_VERSION /* allow this to be overriden */
183 | #define GLUT_API_VERSION 3
184 | #endif
185 |
186 | /**
187 | GLUT implementation revision history:
188 |
189 | GLUT_XLIB_IMPLEMENTATION is updated to reflect both GLUT
190 | API revisions and implementation revisions (ie, bug fixes).
191 |
192 | GLUT_XLIB_IMPLEMENTATION=1 mjk's first public release of
193 | GLUT Xlib-based implementation. 11/29/94
194 |
195 | GLUT_XLIB_IMPLEMENTATION=2 mjk's second public release of
196 | GLUT Xlib-based implementation providing GLUT version 2
197 | interfaces.
198 |
199 | GLUT_XLIB_IMPLEMENTATION=3 mjk's GLUT 2.2 images. 4/17/95
200 |
201 | GLUT_XLIB_IMPLEMENTATION=4 mjk's GLUT 2.3 images. 6/?/95
202 |
203 | GLUT_XLIB_IMPLEMENTATION=5 mjk's GLUT 3.0 images. 10/?/95
204 |
205 | GLUT_XLIB_IMPLEMENTATION=7 mjk's GLUT 3.1+ with glutWarpPoitner. 7/24/96
206 |
207 | GLUT_XLIB_IMPLEMENTATION=8 mjk's GLUT 3.1+ with glutWarpPoitner
208 | and video resize. 1/3/97
209 |
210 | GLUT_XLIB_IMPLEMENTATION=9 mjk's GLUT 3.4 release with early GLUT 4 routines.
211 |
212 | GLUT_XLIB_IMPLEMENTATION=11 Mesa 2.5's GLUT 3.6 release.
213 |
214 | GLUT_XLIB_IMPLEMENTATION=12 mjk's GLUT 3.6 release with early GLUT 4 routines + signal handling.
215 |
216 | GLUT_XLIB_IMPLEMENTATION=13 mjk's GLUT 3.7 beta with GameGLUT support.
217 |
218 | GLUT_XLIB_IMPLEMENTATION=14 mjk's GLUT 3.7 beta with f90gl friend interface.
219 |
220 | GLUT_XLIB_IMPLEMENTATION=15 mjk's GLUT 3.7 beta sync'ed with Mesa
221 | **/
222 | #ifndef GLUT_XLIB_IMPLEMENTATION /* Allow this to be overriden. */
223 | #define GLUT_XLIB_IMPLEMENTATION 15
224 | #endif
225 |
226 | /* Display mode bit masks. */
227 | #define GLUT_RGB 0
228 | #define GLUT_RGBA GLUT_RGB
229 | #define GLUT_INDEX 1
230 | #define GLUT_SINGLE 0
231 | #define GLUT_DOUBLE 2
232 | #define GLUT_ACCUM 4
233 | #define GLUT_ALPHA 8
234 | #define GLUT_DEPTH 16
235 | #define GLUT_STENCIL 32
236 | #if (GLUT_API_VERSION >= 2)
237 | #define GLUT_MULTISAMPLE 128
238 | #define GLUT_STEREO 256
239 | #endif
240 | #if (GLUT_API_VERSION >= 3)
241 | #define GLUT_LUMINANCE 512
242 | #endif
243 |
244 | /* Mouse buttons. */
245 | #define GLUT_LEFT_BUTTON 0
246 | #define GLUT_MIDDLE_BUTTON 1
247 | #define GLUT_RIGHT_BUTTON 2
248 |
249 | /* Mouse button state. */
250 | #define GLUT_DOWN 0
251 | #define GLUT_UP 1
252 |
253 | #if (GLUT_API_VERSION >= 2)
254 | /* function keys */
255 | #define GLUT_KEY_F1 1
256 | #define GLUT_KEY_F2 2
257 | #define GLUT_KEY_F3 3
258 | #define GLUT_KEY_F4 4
259 | #define GLUT_KEY_F5 5
260 | #define GLUT_KEY_F6 6
261 | #define GLUT_KEY_F7 7
262 | #define GLUT_KEY_F8 8
263 | #define GLUT_KEY_F9 9
264 | #define GLUT_KEY_F10 10
265 | #define GLUT_KEY_F11 11
266 | #define GLUT_KEY_F12 12
267 | /* directional keys */
268 | #define GLUT_KEY_LEFT 100
269 | #define GLUT_KEY_UP 101
270 | #define GLUT_KEY_RIGHT 102
271 | #define GLUT_KEY_DOWN 103
272 | #define GLUT_KEY_PAGE_UP 104
273 | #define GLUT_KEY_PAGE_DOWN 105
274 | #define GLUT_KEY_HOME 106
275 | #define GLUT_KEY_END 107
276 | #define GLUT_KEY_INSERT 108
277 | #endif
278 |
279 | /* Entry/exit state. */
280 | #define GLUT_LEFT 0
281 | #define GLUT_ENTERED 1
282 |
283 | /* Menu usage state. */
284 | #define GLUT_MENU_NOT_IN_USE 0
285 | #define GLUT_MENU_IN_USE 1
286 |
287 | /* Visibility state. */
288 | #define GLUT_NOT_VISIBLE 0
289 | #define GLUT_VISIBLE 1
290 |
291 | /* Window status state. */
292 | #define GLUT_HIDDEN 0
293 | #define GLUT_FULLY_RETAINED 1
294 | #define GLUT_PARTIALLY_RETAINED 2
295 | #define GLUT_FULLY_COVERED 3
296 |
297 | /* Color index component selection values. */
298 | #define GLUT_RED 0
299 | #define GLUT_GREEN 1
300 | #define GLUT_BLUE 2
301 |
302 | #if defined(_WIN32)
303 | /* Stroke font constants (use these in GLUT program). */
304 | #define GLUT_STROKE_ROMAN ((void*)0)
305 | #define GLUT_STROKE_MONO_ROMAN ((void*)1)
306 |
307 | /* Bitmap font constants (use these in GLUT program). */
308 | #define GLUT_BITMAP_9_BY_15 ((void*)2)
309 | #define GLUT_BITMAP_8_BY_13 ((void*)3)
310 | #define GLUT_BITMAP_TIMES_ROMAN_10 ((void*)4)
311 | #define GLUT_BITMAP_TIMES_ROMAN_24 ((void*)5)
312 | #if (GLUT_API_VERSION >= 3)
313 | #define GLUT_BITMAP_HELVETICA_10 ((void*)6)
314 | #define GLUT_BITMAP_HELVETICA_12 ((void*)7)
315 | #define GLUT_BITMAP_HELVETICA_18 ((void*)8)
316 | #endif
317 | #else
318 | /* Stroke font opaque addresses (use constants instead in source code). */
319 | GLUTAPI void *glutStrokeRoman;
320 | GLUTAPI void *glutStrokeMonoRoman;
321 |
322 | /* Stroke font constants (use these in GLUT program). */
323 | #define GLUT_STROKE_ROMAN (&glutStrokeRoman)
324 | #define GLUT_STROKE_MONO_ROMAN (&glutStrokeMonoRoman)
325 |
326 | /* Bitmap font opaque addresses (use constants instead in source code). */
327 | GLUTAPI void *glutBitmap9By15;
328 | GLUTAPI void *glutBitmap8By13;
329 | GLUTAPI void *glutBitmapTimesRoman10;
330 | GLUTAPI void *glutBitmapTimesRoman24;
331 | GLUTAPI void *glutBitmapHelvetica10;
332 | GLUTAPI void *glutBitmapHelvetica12;
333 | GLUTAPI void *glutBitmapHelvetica18;
334 |
335 | /* Bitmap font constants (use these in GLUT program). */
336 | #define GLUT_BITMAP_9_BY_15 (&glutBitmap9By15)
337 | #define GLUT_BITMAP_8_BY_13 (&glutBitmap8By13)
338 | #define GLUT_BITMAP_TIMES_ROMAN_10 (&glutBitmapTimesRoman10)
339 | #define GLUT_BITMAP_TIMES_ROMAN_24 (&glutBitmapTimesRoman24)
340 | #if (GLUT_API_VERSION >= 3)
341 | #define GLUT_BITMAP_HELVETICA_10 (&glutBitmapHelvetica10)
342 | #define GLUT_BITMAP_HELVETICA_12 (&glutBitmapHelvetica12)
343 | #define GLUT_BITMAP_HELVETICA_18 (&glutBitmapHelvetica18)
344 | #endif
345 | #endif
346 |
347 | /* glutGet parameters. */
348 | #define GLUT_WINDOW_X ((GLenum) 100)
349 | #define GLUT_WINDOW_Y ((GLenum) 101)
350 | #define GLUT_WINDOW_WIDTH ((GLenum) 102)
351 | #define GLUT_WINDOW_HEIGHT ((GLenum) 103)
352 | #define GLUT_WINDOW_BUFFER_SIZE ((GLenum) 104)
353 | #define GLUT_WINDOW_STENCIL_SIZE ((GLenum) 105)
354 | #define GLUT_WINDOW_DEPTH_SIZE ((GLenum) 106)
355 | #define GLUT_WINDOW_RED_SIZE ((GLenum) 107)
356 | #define GLUT_WINDOW_GREEN_SIZE ((GLenum) 108)
357 | #define GLUT_WINDOW_BLUE_SIZE ((GLenum) 109)
358 | #define GLUT_WINDOW_ALPHA_SIZE ((GLenum) 110)
359 | #define GLUT_WINDOW_ACCUM_RED_SIZE ((GLenum) 111)
360 | #define GLUT_WINDOW_ACCUM_GREEN_SIZE ((GLenum) 112)
361 | #define GLUT_WINDOW_ACCUM_BLUE_SIZE ((GLenum) 113)
362 | #define GLUT_WINDOW_ACCUM_ALPHA_SIZE ((GLenum) 114)
363 | #define GLUT_WINDOW_DOUBLEBUFFER ((GLenum) 115)
364 | #define GLUT_WINDOW_RGBA ((GLenum) 116)
365 | #define GLUT_WINDOW_PARENT ((GLenum) 117)
366 | #define GLUT_WINDOW_NUM_CHILDREN ((GLenum) 118)
367 | #define GLUT_WINDOW_COLORMAP_SIZE ((GLenum) 119)
368 | #if (GLUT_API_VERSION >= 2)
369 | #define GLUT_WINDOW_NUM_SAMPLES ((GLenum) 120)
370 | #define GLUT_WINDOW_STEREO ((GLenum) 121)
371 | #endif
372 | #if (GLUT_API_VERSION >= 3)
373 | #define GLUT_WINDOW_CURSOR ((GLenum) 122)
374 | #endif
375 | #define GLUT_SCREEN_WIDTH ((GLenum) 200)
376 | #define GLUT_SCREEN_HEIGHT ((GLenum) 201)
377 | #define GLUT_SCREEN_WIDTH_MM ((GLenum) 202)
378 | #define GLUT_SCREEN_HEIGHT_MM ((GLenum) 203)
379 | #define GLUT_MENU_NUM_ITEMS ((GLenum) 300)
380 | #define GLUT_DISPLAY_MODE_POSSIBLE ((GLenum) 400)
381 | #define GLUT_INIT_WINDOW_X ((GLenum) 500)
382 | #define GLUT_INIT_WINDOW_Y ((GLenum) 501)
383 | #define GLUT_INIT_WINDOW_WIDTH ((GLenum) 502)
384 | #define GLUT_INIT_WINDOW_HEIGHT ((GLenum) 503)
385 | #define GLUT_INIT_DISPLAY_MODE ((GLenum) 504)
386 | #if (GLUT_API_VERSION >= 2)
387 | #define GLUT_ELAPSED_TIME ((GLenum) 700)
388 | #endif
389 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
390 | #define GLUT_WINDOW_FORMAT_ID ((GLenum) 123)
391 | #endif
392 |
393 | #if (GLUT_API_VERSION >= 2)
394 | /* glutDeviceGet parameters. */
395 | #define GLUT_HAS_KEYBOARD ((GLenum) 600)
396 | #define GLUT_HAS_MOUSE ((GLenum) 601)
397 | #define GLUT_HAS_SPACEBALL ((GLenum) 602)
398 | #define GLUT_HAS_DIAL_AND_BUTTON_BOX ((GLenum) 603)
399 | #define GLUT_HAS_TABLET ((GLenum) 604)
400 | #define GLUT_NUM_MOUSE_BUTTONS ((GLenum) 605)
401 | #define GLUT_NUM_SPACEBALL_BUTTONS ((GLenum) 606)
402 | #define GLUT_NUM_BUTTON_BOX_BUTTONS ((GLenum) 607)
403 | #define GLUT_NUM_DIALS ((GLenum) 608)
404 | #define GLUT_NUM_TABLET_BUTTONS ((GLenum) 609)
405 | #endif
406 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
407 | #define GLUT_DEVICE_IGNORE_KEY_REPEAT ((GLenum) 610)
408 | #define GLUT_DEVICE_KEY_REPEAT ((GLenum) 611)
409 | #define GLUT_HAS_JOYSTICK ((GLenum) 612)
410 | #define GLUT_OWNS_JOYSTICK ((GLenum) 613)
411 | #define GLUT_JOYSTICK_BUTTONS ((GLenum) 614)
412 | #define GLUT_JOYSTICK_AXES ((GLenum) 615)
413 | #define GLUT_JOYSTICK_POLL_RATE ((GLenum) 616)
414 | #endif
415 |
416 | #if (GLUT_API_VERSION >= 3)
417 | /* glutLayerGet parameters. */
418 | #define GLUT_OVERLAY_POSSIBLE ((GLenum) 800)
419 | #define GLUT_LAYER_IN_USE ((GLenum) 801)
420 | #define GLUT_HAS_OVERLAY ((GLenum) 802)
421 | #define GLUT_TRANSPARENT_INDEX ((GLenum) 803)
422 | #define GLUT_NORMAL_DAMAGED ((GLenum) 804)
423 | #define GLUT_OVERLAY_DAMAGED ((GLenum) 805)
424 |
425 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
426 | /* glutVideoResizeGet parameters. */
427 | #define GLUT_VIDEO_RESIZE_POSSIBLE ((GLenum) 900)
428 | #define GLUT_VIDEO_RESIZE_IN_USE ((GLenum) 901)
429 | #define GLUT_VIDEO_RESIZE_X_DELTA ((GLenum) 902)
430 | #define GLUT_VIDEO_RESIZE_Y_DELTA ((GLenum) 903)
431 | #define GLUT_VIDEO_RESIZE_WIDTH_DELTA ((GLenum) 904)
432 | #define GLUT_VIDEO_RESIZE_HEIGHT_DELTA ((GLenum) 905)
433 | #define GLUT_VIDEO_RESIZE_X ((GLenum) 906)
434 | #define GLUT_VIDEO_RESIZE_Y ((GLenum) 907)
435 | #define GLUT_VIDEO_RESIZE_WIDTH ((GLenum) 908)
436 | #define GLUT_VIDEO_RESIZE_HEIGHT ((GLenum) 909)
437 | #endif
438 |
439 | /* glutUseLayer parameters. */
440 | #define GLUT_NORMAL ((GLenum) 0)
441 | #define GLUT_OVERLAY ((GLenum) 1)
442 |
443 | /* glutGetModifiers return mask. */
444 | #define GLUT_ACTIVE_SHIFT 1
445 | #define GLUT_ACTIVE_CTRL 2
446 | #define GLUT_ACTIVE_ALT 4
447 |
448 | /* glutSetCursor parameters. */
449 | /* Basic arrows. */
450 | #define GLUT_CURSOR_RIGHT_ARROW 0
451 | #define GLUT_CURSOR_LEFT_ARROW 1
452 | /* Symbolic cursor shapes. */
453 | #define GLUT_CURSOR_INFO 2
454 | #define GLUT_CURSOR_DESTROY 3
455 | #define GLUT_CURSOR_HELP 4
456 | #define GLUT_CURSOR_CYCLE 5
457 | #define GLUT_CURSOR_SPRAY 6
458 | #define GLUT_CURSOR_WAIT 7
459 | #define GLUT_CURSOR_TEXT 8
460 | #define GLUT_CURSOR_CROSSHAIR 9
461 | /* Directional cursors. */
462 | #define GLUT_CURSOR_UP_DOWN 10
463 | #define GLUT_CURSOR_LEFT_RIGHT 11
464 | /* Sizing cursors. */
465 | #define GLUT_CURSOR_TOP_SIDE 12
466 | #define GLUT_CURSOR_BOTTOM_SIDE 13
467 | #define GLUT_CURSOR_LEFT_SIDE 14
468 | #define GLUT_CURSOR_RIGHT_SIDE 15
469 | #define GLUT_CURSOR_TOP_LEFT_CORNER 16
470 | #define GLUT_CURSOR_TOP_RIGHT_CORNER 17
471 | #define GLUT_CURSOR_BOTTOM_RIGHT_CORNER 18
472 | #define GLUT_CURSOR_BOTTOM_LEFT_CORNER 19
473 | /* Inherit from parent window. */
474 | #define GLUT_CURSOR_INHERIT 100
475 | /* Blank cursor. */
476 | #define GLUT_CURSOR_NONE 101
477 | /* Fullscreen crosshair (if available). */
478 | #define GLUT_CURSOR_FULL_CROSSHAIR 102
479 | #endif
480 |
481 | /* GLUT initialization sub-API. */
482 | GLUTAPI void APIENTRY glutInit(int *argcp, char **argv);
483 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK) /* Active only when _WIN32 is defined and GLUT_DISABLE_ATEXIT_HACK is not. */
484 | GLUTAPI void APIENTRY __glutInitWithExit(int *argcp, char **argv, void (__cdecl *exitfunc)(int)); /* Takes the glutInit arguments plus an exit-function pointer. */
485 | #ifndef GLUT_BUILDING_LIB /* Wrapper and glutInit redirect below are skipped when GLUT_BUILDING_LIB is defined (presumably the GLUT library's own build -- confirm). */
486 | static void APIENTRY glutInit_ATEXIT_HACK(int *argcp, char **argv) { __glutInitWithExit(argcp, argv, exit); } /* Forwards argcp/argv unchanged and passes `exit` as exitfunc. */
487 | #define glutInit glutInit_ATEXIT_HACK
488 | #endif /* !GLUT_BUILDING_LIB */
489 | #endif /* _WIN32 && !GLUT_DISABLE_ATEXIT_HACK */
490 | GLUTAPI void APIENTRY glutInitDisplayMode(unsigned int mode);
491 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
492 | GLUTAPI void APIENTRY glutInitDisplayString(const char *string);
493 | #endif
494 | GLUTAPI void APIENTRY glutInitWindowPosition(int x, int y);
495 | GLUTAPI void APIENTRY glutInitWindowSize(int width, int height);
496 | GLUTAPI void APIENTRY glutMainLoop(void);
497 |
498 | /* GLUT window sub-API. */
499 | GLUTAPI int APIENTRY glutCreateWindow(const char *title);
500 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK) /* Active only when _WIN32 is defined and GLUT_DISABLE_ATEXIT_HACK is not. */
501 | GLUTAPI int APIENTRY __glutCreateWindowWithExit(const char *title, void (__cdecl *exitfunc)(int)); /* Takes the glutCreateWindow argument plus an exit-function pointer. */
502 | #ifndef GLUT_BUILDING_LIB /* Wrapper and glutCreateWindow redirect below are skipped when GLUT_BUILDING_LIB is defined (presumably the GLUT library's own build -- confirm). */
503 | static int APIENTRY glutCreateWindow_ATEXIT_HACK(const char *title) { return __glutCreateWindowWithExit(title, exit); } /* Forwards title, passes `exit` as exitfunc, and returns the callee's result. */
504 | #define glutCreateWindow glutCreateWindow_ATEXIT_HACK
505 | #endif /* !GLUT_BUILDING_LIB */
506 | #endif /* _WIN32 && !GLUT_DISABLE_ATEXIT_HACK */
507 | GLUTAPI int APIENTRY glutCreateSubWindow(int win, int x, int y, int width, int height);
508 | GLUTAPI void APIENTRY glutDestroyWindow(int win);
509 | GLUTAPI void APIENTRY glutPostRedisplay(void);
510 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
511 | GLUTAPI void APIENTRY glutPostWindowRedisplay(int win);
512 | #endif
513 | GLUTAPI void APIENTRY glutSwapBuffers(void);
514 | GLUTAPI int APIENTRY glutGetWindow(void);
515 | GLUTAPI void APIENTRY glutSetWindow(int win);
516 | GLUTAPI void APIENTRY glutSetWindowTitle(const char *title);
517 | GLUTAPI void APIENTRY glutSetIconTitle(const char *title);
518 | GLUTAPI void APIENTRY glutPositionWindow(int x, int y);
519 | GLUTAPI void APIENTRY glutReshapeWindow(int width, int height);
520 | GLUTAPI void APIENTRY glutPopWindow(void);
521 | GLUTAPI void APIENTRY glutPushWindow(void);
522 | GLUTAPI void APIENTRY glutIconifyWindow(void);
523 | GLUTAPI void APIENTRY glutShowWindow(void);
524 | GLUTAPI void APIENTRY glutHideWindow(void);
525 | #if (GLUT_API_VERSION >= 3)
526 | GLUTAPI void APIENTRY glutFullScreen(void);
527 | GLUTAPI void APIENTRY glutSetCursor(int cursor);
528 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
529 | GLUTAPI void APIENTRY glutWarpPointer(int x, int y);
530 | #endif
531 |
532 | /* GLUT overlay sub-API. */
533 | GLUTAPI void APIENTRY glutEstablishOverlay(void);
534 | GLUTAPI void APIENTRY glutRemoveOverlay(void);
535 | GLUTAPI void APIENTRY glutUseLayer(GLenum layer);
536 | GLUTAPI void APIENTRY glutPostOverlayRedisplay(void);
537 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 11)
538 | GLUTAPI void APIENTRY glutPostWindowOverlayRedisplay(int win);
539 | #endif
540 | GLUTAPI void APIENTRY glutShowOverlay(void);
541 | GLUTAPI void APIENTRY glutHideOverlay(void);
542 | #endif
543 |
544 | /* GLUT menu sub-API. */
545 | GLUTAPI int APIENTRY glutCreateMenu(void (GLUTCALLBACK *func)(int));
546 | #if defined(_WIN32) && !defined(GLUT_DISABLE_ATEXIT_HACK) /* Active only when _WIN32 is defined and GLUT_DISABLE_ATEXIT_HACK is not. */
547 | GLUTAPI int APIENTRY __glutCreateMenuWithExit(void (GLUTCALLBACK *func)(int), void (__cdecl *exitfunc)(int)); /* Takes the glutCreateMenu callback plus an exit-function pointer. */
548 | #ifndef GLUT_BUILDING_LIB /* Wrapper and glutCreateMenu redirect below are skipped when GLUT_BUILDING_LIB is defined (presumably the GLUT library's own build -- confirm). */
549 | static int APIENTRY glutCreateMenu_ATEXIT_HACK(void (GLUTCALLBACK *func)(int)) { return __glutCreateMenuWithExit(func, exit); } /* Forwards func, passes `exit` as exitfunc, and returns the callee's result. */
550 | #define glutCreateMenu glutCreateMenu_ATEXIT_HACK
551 | #endif /* !GLUT_BUILDING_LIB */
552 | #endif /* _WIN32 && !GLUT_DISABLE_ATEXIT_HACK */
553 | GLUTAPI void APIENTRY glutDestroyMenu(int menu);
554 | GLUTAPI int APIENTRY glutGetMenu(void);
555 | GLUTAPI void APIENTRY glutSetMenu(int menu);
556 | GLUTAPI void APIENTRY glutAddMenuEntry(const char *label, int value);
557 | GLUTAPI void APIENTRY glutAddSubMenu(const char *label, int submenu);
558 | GLUTAPI void APIENTRY glutChangeToMenuEntry(int item, const char *label, int value);
559 | GLUTAPI void APIENTRY glutChangeToSubMenu(int item, const char *label, int submenu);
560 | GLUTAPI void APIENTRY glutRemoveMenuItem(int item);
561 | GLUTAPI void APIENTRY glutAttachMenu(int button);
562 | GLUTAPI void APIENTRY glutDetachMenu(int button);
563 |
564 | /* GLUT window callback sub-API. */
565 | GLUTAPI void APIENTRY glutDisplayFunc(void (GLUTCALLBACK *func)(void));
566 | GLUTAPI void APIENTRY glutReshapeFunc(void (GLUTCALLBACK *func)(int width, int height));
567 | GLUTAPI void APIENTRY glutKeyboardFunc(void (GLUTCALLBACK *func)(unsigned char key, int x, int y));
568 | GLUTAPI void APIENTRY glutMouseFunc(void (GLUTCALLBACK *func)(int button, int state, int x, int y));
569 | GLUTAPI void APIENTRY glutMotionFunc(void (GLUTCALLBACK *func)(int x, int y));
570 | GLUTAPI void APIENTRY glutPassiveMotionFunc(void (GLUTCALLBACK *func)(int x, int y));
571 | GLUTAPI void APIENTRY glutEntryFunc(void (GLUTCALLBACK *func)(int state));
572 | GLUTAPI void APIENTRY glutVisibilityFunc(void (GLUTCALLBACK *func)(int state));
573 | GLUTAPI void APIENTRY glutIdleFunc(void (GLUTCALLBACK *func)(void));
574 | GLUTAPI void APIENTRY glutTimerFunc(unsigned int millis, void (GLUTCALLBACK *func)(int value), int value);
575 | GLUTAPI void APIENTRY glutMenuStateFunc(void (GLUTCALLBACK *func)(int state));
576 | #if (GLUT_API_VERSION >= 2)
577 | GLUTAPI void APIENTRY glutSpecialFunc(void (GLUTCALLBACK *func)(int key, int x, int y));
578 | GLUTAPI void APIENTRY glutSpaceballMotionFunc(void (GLUTCALLBACK *func)(int x, int y, int z));
579 | GLUTAPI void APIENTRY glutSpaceballRotateFunc(void (GLUTCALLBACK *func)(int x, int y, int z));
580 | GLUTAPI void APIENTRY glutSpaceballButtonFunc(void (GLUTCALLBACK *func)(int button, int state));
581 | GLUTAPI void APIENTRY glutButtonBoxFunc(void (GLUTCALLBACK *func)(int button, int state));
582 | GLUTAPI void APIENTRY glutDialsFunc(void (GLUTCALLBACK *func)(int dial, int value));
583 | GLUTAPI void APIENTRY glutTabletMotionFunc(void (GLUTCALLBACK *func)(int x, int y));
584 | GLUTAPI void APIENTRY glutTabletButtonFunc(void (GLUTCALLBACK *func)(int button, int state, int x, int y));
585 | #if (GLUT_API_VERSION >= 3)
586 | GLUTAPI void APIENTRY glutMenuStatusFunc(void (GLUTCALLBACK *func)(int status, int x, int y));
587 | GLUTAPI void APIENTRY glutOverlayDisplayFunc(void (GLUTCALLBACK *func)(void));
588 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
589 | GLUTAPI void APIENTRY glutWindowStatusFunc(void (GLUTCALLBACK *func)(int state));
590 | #endif
591 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
592 | GLUTAPI void APIENTRY glutKeyboardUpFunc(void (GLUTCALLBACK *func)(unsigned char key, int x, int y));
593 | GLUTAPI void APIENTRY glutSpecialUpFunc(void (GLUTCALLBACK *func)(int key, int x, int y));
594 | GLUTAPI void APIENTRY glutJoystickFunc(void (GLUTCALLBACK *func)(unsigned int buttonMask, int x, int y, int z), int pollInterval);
595 | #endif
596 | #endif
597 | #endif
598 |
599 | /* GLUT color index sub-API. */
600 | GLUTAPI void APIENTRY glutSetColor(int, GLfloat red, GLfloat green, GLfloat blue);
601 | GLUTAPI GLfloat APIENTRY glutGetColor(int ndx, int component);
602 | GLUTAPI void APIENTRY glutCopyColormap(int win);
603 |
604 | /* GLUT state retrieval sub-API. */
605 | GLUTAPI int APIENTRY glutGet(GLenum type);
606 | GLUTAPI int APIENTRY glutDeviceGet(GLenum type);
607 | #if (GLUT_API_VERSION >= 2)
608 | /* GLUT extension support sub-API */
609 | GLUTAPI int APIENTRY glutExtensionSupported(const char *name);
610 | #endif
611 | #if (GLUT_API_VERSION >= 3)
612 | GLUTAPI int APIENTRY glutGetModifiers(void);
613 | GLUTAPI int APIENTRY glutLayerGet(GLenum type);
614 | #endif
615 |
616 | /* GLUT font sub-API */
617 | GLUTAPI void APIENTRY glutBitmapCharacter(void *font, int character);
618 | GLUTAPI int APIENTRY glutBitmapWidth(void *font, int character);
619 | GLUTAPI void APIENTRY glutStrokeCharacter(void *font, int character);
620 | GLUTAPI int APIENTRY glutStrokeWidth(void *font, int character);
621 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
622 | GLUTAPI int APIENTRY glutBitmapLength(void *font, const unsigned char *string);
623 | GLUTAPI int APIENTRY glutStrokeLength(void *font, const unsigned char *string);
624 | #endif
625 |
626 | /* GLUT pre-built models sub-API */
627 | GLUTAPI void APIENTRY glutWireSphere(GLdouble radius, GLint slices, GLint stacks);
628 | GLUTAPI void APIENTRY glutSolidSphere(GLdouble radius, GLint slices, GLint stacks);
629 | GLUTAPI void APIENTRY glutWireCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
630 | GLUTAPI void APIENTRY glutSolidCone(GLdouble base, GLdouble height, GLint slices, GLint stacks);
631 | GLUTAPI void APIENTRY glutWireCube(GLdouble size);
632 | GLUTAPI void APIENTRY glutSolidCube(GLdouble size);
633 | GLUTAPI void APIENTRY glutWireTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
634 | GLUTAPI void APIENTRY glutSolidTorus(GLdouble innerRadius, GLdouble outerRadius, GLint sides, GLint rings);
635 | GLUTAPI void APIENTRY glutWireDodecahedron(void);
636 | GLUTAPI void APIENTRY glutSolidDodecahedron(void);
637 | GLUTAPI void APIENTRY glutWireTeapot(GLdouble size);
638 | GLUTAPI void APIENTRY glutSolidTeapot(GLdouble size);
639 | GLUTAPI void APIENTRY glutWireOctahedron(void);
640 | GLUTAPI void APIENTRY glutSolidOctahedron(void);
641 | GLUTAPI void APIENTRY glutWireTetrahedron(void);
642 | GLUTAPI void APIENTRY glutSolidTetrahedron(void);
643 | GLUTAPI void APIENTRY glutWireIcosahedron(void);
644 | GLUTAPI void APIENTRY glutSolidIcosahedron(void);
645 |
646 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 9)
647 | /* GLUT video resize sub-API. */
648 | GLUTAPI int APIENTRY glutVideoResizeGet(GLenum param);
649 | GLUTAPI void APIENTRY glutSetupVideoResizing(void);
650 | GLUTAPI void APIENTRY glutStopVideoResizing(void);
651 | GLUTAPI void APIENTRY glutVideoResize(int x, int y, int width, int height);
652 | GLUTAPI void APIENTRY glutVideoPan(int x, int y, int width, int height);
653 |
654 | /* GLUT debugging sub-API. */
655 | GLUTAPI void APIENTRY glutReportErrors(void);
656 | #endif
657 |
658 | #if (GLUT_API_VERSION >= 4 || GLUT_XLIB_IMPLEMENTATION >= 13)
659 | /* GLUT device control sub-API. */
660 | /* glutSetKeyRepeat modes. */
661 | #define GLUT_KEY_REPEAT_OFF 0
662 | #define GLUT_KEY_REPEAT_ON 1
663 | #define GLUT_KEY_REPEAT_DEFAULT 2
664 |
665 | /* Joystick button masks. */
666 | #define GLUT_JOYSTICK_BUTTON_A 1
667 | #define GLUT_JOYSTICK_BUTTON_B 2
668 | #define GLUT_JOYSTICK_BUTTON_C 4
669 | #define GLUT_JOYSTICK_BUTTON_D 8
670 |
671 | GLUTAPI void APIENTRY glutIgnoreKeyRepeat(int ignore);
672 | GLUTAPI void APIENTRY glutSetKeyRepeat(int repeatMode);
673 | GLUTAPI void APIENTRY glutForceJoystickFunc(void);
674 |
675 | /* GLUT game mode sub-API. */
676 | /* glutGameModeGet. */
677 | #define GLUT_GAME_MODE_ACTIVE ((GLenum) 0)
678 | #define GLUT_GAME_MODE_POSSIBLE ((GLenum) 1)
679 | #define GLUT_GAME_MODE_WIDTH ((GLenum) 2)
680 | #define GLUT_GAME_MODE_HEIGHT ((GLenum) 3)
681 | #define GLUT_GAME_MODE_PIXEL_DEPTH ((GLenum) 4)
682 | #define GLUT_GAME_MODE_REFRESH_RATE ((GLenum) 5)
683 | #define GLUT_GAME_MODE_DISPLAY_CHANGED ((GLenum) 6)
684 |
685 | GLUTAPI void APIENTRY glutGameModeString(const char *string);
686 | GLUTAPI int APIENTRY glutEnterGameMode(void);
687 | GLUTAPI void APIENTRY glutLeaveGameMode(void);
688 | GLUTAPI int APIENTRY glutGameModeGet(GLenum mode);
689 | #endif
690 |
691 | #ifdef __cplusplus
692 | }
693 |
694 | #endif
695 |
696 | #ifdef GLUT_APIENTRY_DEFINED /* Cleanup: drop APIENTRY and its marker; the marker is presumably set earlier in this header when it supplied APIENTRY itself -- confirm. */
697 | # undef GLUT_APIENTRY_DEFINED
698 | # undef APIENTRY
699 | #endif
700 |
701 | #ifdef GLUT_WINGDIAPI_DEFINED /* Cleanup: drop WINGDIAPI and its marker; the marker is presumably set earlier in this header when it supplied WINGDIAPI itself -- confirm. */
702 | # undef GLUT_WINGDIAPI_DEFINED
703 | # undef WINGDIAPI
704 | #endif
705 |
706 | #ifdef GLUT_DEFINED___CDECL /* Cleanup: drop __cdecl and its marker; the marker is presumably set earlier in this header when it supplied __cdecl itself -- confirm. */
707 | # undef GLUT_DEFINED___CDECL
708 | # undef __cdecl
709 | #endif
710 |
711 | #ifdef GLUT_DEFINED__CRTIMP /* Cleanup: drop _CRTIMP and its marker; the marker is presumably set earlier in this header when it supplied _CRTIMP itself -- confirm. */
712 | # undef GLUT_DEFINED__CRTIMP
713 | # undef _CRTIMP
714 | #endif
715 |
716 | #endif /* __glut_h__ */
717 |
--------------------------------------------------------------------------------